summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorAndreas Gruenbacher <agruenba@redhat.com>2022-08-05 19:37:03 +0300
committerAndreas Gruenbacher <agruenba@redhat.com>2022-08-05 19:37:03 +0300
commit446279168e030fd0ed68e2bba336bef8bb3da352 (patch)
tree0944d4dc64b07f74a6cf73d2ce803a789f8d786d /fs
parent44dab005fd4298c209ea32ffda6131cad4358d21 (diff)
parent6feaec81477af0390a41470cbd6b353f68dd853c (diff)
downloadlinux-446279168e030fd0ed68e2bba336bef8bb3da352.tar.xz
Merge part of branch 'for-next.instantiate' into for-next
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c4
-rw-r--r--fs/9p/fid.c22
-rw-r--r--fs/9p/v9fs.c2
-rw-r--r--fs/9p/v9fs.h10
-rw-r--r--fs/9p/vfs_addr.c28
-rw-r--r--fs/9p/vfs_inode.c13
-rw-r--r--fs/9p/vfs_inode_dotl.c3
-rw-r--r--fs/Kconfig22
-rw-r--r--fs/Kconfig.binfmt2
-rw-r--r--fs/afs/callback.c2
-rw-r--r--fs/afs/dir.c37
-rw-r--r--fs/afs/dir_edit.c10
-rw-r--r--fs/afs/dir_silly.c4
-rw-r--r--fs/afs/dynroot.c2
-rw-r--r--fs/afs/file.c10
-rw-r--r--fs/afs/fs_operation.c6
-rw-r--r--fs/afs/inode.c41
-rw-r--r--fs/afs/internal.h23
-rw-r--r--fs/afs/misc.c5
-rw-r--r--fs/afs/rotate.c4
-rw-r--r--fs/afs/rxrpc.c8
-rw-r--r--fs/afs/security.c3
-rw-r--r--fs/afs/super.c6
-rw-r--r--fs/afs/volume.c3
-rw-r--r--fs/afs/write.c24
-rw-r--r--fs/attr.c26
-rw-r--r--fs/btrfs/block-group.h1
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/disk-io.c13
-rw-r--r--fs/btrfs/extent-tree.c20
-rw-r--r--fs/btrfs/extent_io.c3
-rw-r--r--fs/btrfs/file.c92
-rw-r--r--fs/btrfs/inode.c3
-rw-r--r--fs/btrfs/locking.c3
-rw-r--r--fs/btrfs/reflink.c16
-rw-r--r--fs/btrfs/super.c47
-rw-r--r--fs/btrfs/zoned.c27
-rw-r--r--fs/btrfs/zoned.h5
-rw-r--r--fs/ceph/addr.c58
-rw-r--r--fs/ceph/cache.c4
-rw-r--r--fs/ceph/cache.h2
-rw-r--r--fs/ceph/caps.c179
-rw-r--r--fs/ceph/file.c2
-rw-r--r--fs/ceph/inode.c45
-rw-r--r--fs/ceph/mds_client.c125
-rw-r--r--fs/ceph/mds_client.h2
-rw-r--r--fs/ceph/quota.c19
-rw-r--r--fs/ceph/snap.c8
-rw-r--r--fs/ceph/super.c3
-rw-r--r--fs/ceph/super.h39
-rw-r--r--fs/ceph/xattr.c24
-rw-r--r--fs/cifs/Makefile4
-rw-r--r--fs/cifs/cifs_debug.c23
-rw-r--r--fs/cifs/cifs_swn.c4
-rw-r--r--fs/cifs/cifsencrypt.c8
-rw-r--r--fs/cifs/cifsfs.c22
-rw-r--r--fs/cifs/cifsfs.h7
-rw-r--r--fs/cifs/cifsglob.h218
-rw-r--r--fs/cifs/cifsproto.h16
-rw-r--r--fs/cifs/cifssmb.c5
-rw-r--r--fs/cifs/connect.c195
-rw-r--r--fs/cifs/dfs_cache.c96
-rw-r--r--fs/cifs/file.c28
-rw-r--r--fs/cifs/fs_context.c33
-rw-r--r--fs/cifs/fs_context.h4
-rw-r--r--fs/cifs/fscache.c8
-rw-r--r--fs/cifs/fscache.h10
-rw-r--r--fs/cifs/inode.c4
-rw-r--r--fs/cifs/misc.c51
-rw-r--r--fs/cifs/readdir.c179
-rw-r--r--fs/cifs/sess.c213
-rw-r--r--fs/cifs/smb1ops.c6
-rw-r--r--fs/cifs/smb2inode.c7
-rw-r--r--fs/cifs/smb2misc.c12
-rw-r--r--fs/cifs/smb2ops.c222
-rw-r--r--fs/cifs/smb2pdu.c37
-rw-r--r--fs/cifs/smb2pdu.h22
-rw-r--r--fs/cifs/smb2transport.c7
-rw-r--r--fs/cifs/smbdirect.c8
-rw-r--r--fs/cifs/trace.h40
-rw-r--r--fs/cifs/transport.c48
-rw-r--r--fs/dax.c120
-rw-r--r--fs/erofs/fscache.c1
-rw-r--r--fs/erofs/inode.c5
-rw-r--r--fs/erofs/zdata.c167
-rw-r--r--fs/erofs/zdata.h50
-rw-r--r--fs/exec.c14
-rw-r--r--fs/exfat/namei.c4
-rw-r--r--fs/exportfs/expfs.c5
-rw-r--r--fs/ext2/dir.c9
-rw-r--r--fs/ext2/inode.c3
-rw-r--r--fs/ext4/inode.c2
-rw-r--r--fs/ext4/mballoc.c26
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/namei.c3
-rw-r--r--fs/ext4/page-io.c2
-rw-r--r--fs/ext4/resize.c10
-rw-r--r--fs/ext4/super.c172
-rw-r--r--fs/ext4/xattr.c3
-rw-r--r--fs/f2fs/checkpoint.c16
-rw-r--r--fs/f2fs/data.c213
-rw-r--r--fs/f2fs/debug.c18
-rw-r--r--fs/f2fs/dir.c3
-rw-r--r--fs/f2fs/f2fs.h133
-rw-r--r--fs/f2fs/file.c307
-rw-r--r--fs/f2fs/gc.c186
-rw-r--r--fs/f2fs/hash.c11
-rw-r--r--fs/f2fs/inline.c29
-rw-r--r--fs/f2fs/inode.c34
-rw-r--r--fs/f2fs/iostat.c31
-rw-r--r--fs/f2fs/namei.c55
-rw-r--r--fs/f2fs/node.c31
-rw-r--r--fs/f2fs/node.h1
-rw-r--r--fs/f2fs/segment.c460
-rw-r--r--fs/f2fs/segment.h40
-rw-r--r--fs/f2fs/super.c88
-rw-r--r--fs/f2fs/verity.c2
-rw-r--r--fs/fat/fat.h14
-rw-r--r--fs/fat/fatent.c7
-rw-r--r--fs/fat/file.c14
-rw-r--r--fs/fat/inode.c19
-rw-r--r--fs/fat/misc.c78
-rw-r--r--fs/fat/namei_vfat.c4
-rw-r--r--fs/fcntl.c9
-rw-r--r--fs/file.c113
-rw-r--r--fs/file_table.c9
-rw-r--r--fs/freevxfs/vxfs.h27
-rw-r--r--fs/freevxfs/vxfs_bmap.c26
-rw-r--r--fs/freevxfs/vxfs_dir.h27
-rw-r--r--fs/freevxfs/vxfs_extern.h27
-rw-r--r--fs/freevxfs/vxfs_fshead.c26
-rw-r--r--fs/freevxfs/vxfs_fshead.h27
-rw-r--r--fs/freevxfs/vxfs_immed.c26
-rw-r--r--fs/freevxfs/vxfs_inode.c26
-rw-r--r--fs/freevxfs/vxfs_inode.h27
-rw-r--r--fs/freevxfs/vxfs_lookup.c26
-rw-r--r--fs/freevxfs/vxfs_olt.c26
-rw-r--r--fs/freevxfs/vxfs_olt.h27
-rw-r--r--fs/freevxfs/vxfs_subr.c26
-rw-r--r--fs/freevxfs/vxfs_super.c26
-rw-r--r--fs/fs-writeback.c39
-rw-r--r--fs/fsopen.c4
-rw-r--r--fs/fuse/dax.c4
-rw-r--r--fs/fuse/virtio_fs.c6
-rw-r--r--fs/gfs2/glock.c187
-rw-r--r--fs/gfs2/glock.h2
-rw-r--r--fs/gfs2/glops.c31
-rw-r--r--fs/gfs2/incore.h6
-rw-r--r--fs/gfs2/main.c1
-rw-r--r--fs/gfs2/ops_fstype.c2
-rw-r--r--fs/gfs2/quota.c28
-rw-r--r--fs/gfs2/rgrp.c3
-rw-r--r--fs/gfs2/rgrp.h2
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/hugetlbfs/inode.c110
-rw-r--r--fs/inode.c2
-rw-r--r--fs/internal.h3
-rw-r--r--fs/io_uring.c706
-rw-r--r--fs/jbd2/transaction.c2
-rw-r--r--fs/jffs2/erase.c6
-rw-r--r--fs/jffs2/fs.c1
-rw-r--r--fs/jfs/Makefile2
-rw-r--r--fs/jfs/inode.c18
-rw-r--r--fs/jfs/jfs_dmap.c71
-rw-r--r--fs/jfs/jfs_dtree.c298
-rw-r--r--fs/jfs/jfs_extent.c255
-rw-r--r--fs/jfs/jfs_logmgr.c8
-rw-r--r--fs/jfs/jfs_mount.c4
-rw-r--r--fs/jfs/jfs_txnmgr.c34
-rw-r--r--fs/jfs/jfs_xtree.c961
-rw-r--r--fs/jfs/jfs_xtree.h4
-rw-r--r--fs/kernfs/dir.c31
-rw-r--r--fs/kernfs/file.c47
-rw-r--r--fs/ksmbd/connection.c22
-rw-r--r--fs/ksmbd/connection.h27
-rw-r--r--fs/ksmbd/ksmbd_netlink.h3
-rw-r--r--fs/ksmbd/misc.c10
-rw-r--r--fs/ksmbd/smb2misc.c2
-rw-r--r--fs/ksmbd/smb2pdu.c126
-rw-r--r--fs/ksmbd/smb_common.c4
-rw-r--r--fs/ksmbd/smbacl.c1
-rw-r--r--fs/ksmbd/transport_ipc.c3
-rw-r--r--fs/ksmbd/transport_rdma.c363
-rw-r--r--fs/ksmbd/transport_rdma.h8
-rw-r--r--fs/locks.c61
-rw-r--r--fs/namei.c89
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/netfs/buffered_read.c9
-rw-r--r--fs/netfs/internal.h2
-rw-r--r--fs/netfs/objects.c8
-rw-r--r--fs/nfs/callback_proc.c1
-rw-r--r--fs/nfs/dir.c1
-rw-r--r--fs/nfs/direct.c23
-rw-r--r--fs/nfs/file.c68
-rw-r--r--fs/nfs/filelayout/filelayout.c7
-rw-r--r--fs/nfs/fscache.c7
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/nfs4file.c5
-rw-r--r--fs/nfs/nfs4namespace.c9
-rw-r--r--fs/nfs/nfs4proc.c182
-rw-r--r--fs/nfs/nfs4state.c29
-rw-r--r--fs/nfs/nfs4xdr.c99
-rw-r--r--fs/nfs/pagelist.c3
-rw-r--r--fs/nfs/pnfs.c23
-rw-r--r--fs/nfs/pnfs.h1
-rw-r--r--fs/nfs/unlink.c8
-rw-r--r--fs/nfs/write.c54
-rw-r--r--fs/nfsd/filecache.c77
-rw-r--r--fs/nfsd/filecache.h2
-rw-r--r--fs/nfsd/nfs3proc.c141
-rw-r--r--fs/nfsd/nfs4proc.c264
-rw-r--r--fs/nfsd/nfs4state.c353
-rw-r--r--fs/nfsd/nfs4xdr.c2
-rw-r--r--fs/nfsd/nfscache.c2
-rw-r--r--fs/nfsd/nfsctl.c20
-rw-r--r--fs/nfsd/nfsd.h5
-rw-r--r--fs/nfsd/state.h31
-rw-r--r--fs/nfsd/trace.h34
-rw-r--r--fs/nfsd/vfs.c255
-rw-r--r--fs/nfsd/vfs.h14
-rw-r--r--fs/nfsd/xdr4.h1
-rw-r--r--fs/notify/dnotify/dnotify.c13
-rw-r--r--fs/notify/fanotify/fanotify.c24
-rw-r--r--fs/notify/fanotify/fanotify.h12
-rw-r--r--fs/notify/fanotify/fanotify_user.c104
-rw-r--r--fs/notify/fdinfo.c21
-rw-r--r--fs/notify/fsnotify.c89
-rw-r--r--fs/notify/group.c32
-rw-r--r--fs/notify/inotify/inotify.h19
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c2
-rw-r--r--fs/notify/inotify/inotify_user.c47
-rw-r--r--fs/notify/mark.c112
-rw-r--r--fs/ntfs/file.c4
-rw-r--r--fs/ntfs3/file.c12
-rw-r--r--fs/ntfs3/frecord.c10
-rw-r--r--fs/ntfs3/fslog.c12
-rw-r--r--fs/ntfs3/inode.c9
-rw-r--r--fs/ntfs3/super.c10
-rw-r--r--fs/ntfs3/xattr.c136
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c12
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c21
-rw-r--r--fs/ocfs2/dlmfs/userdlm.c17
-rw-r--r--fs/ocfs2/inode.c4
-rw-r--r--fs/ocfs2/journal.c33
-rw-r--r--fs/ocfs2/journal.h2
-rw-r--r--fs/ocfs2/quota_local.c10
-rw-r--r--fs/ocfs2/reservations.c4
-rw-r--r--fs/ocfs2/reservations.h9
-rw-r--r--fs/ocfs2/super.c180
-rw-r--r--fs/open.c75
-rw-r--r--fs/overlayfs/copy_up.c90
-rw-r--r--fs/overlayfs/dir.c147
-rw-r--r--fs/overlayfs/export.c5
-rw-r--r--fs/overlayfs/file.c56
-rw-r--r--fs/overlayfs/inode.c68
-rw-r--r--fs/overlayfs/namei.c53
-rw-r--r--fs/overlayfs/overlayfs.h232
-rw-r--r--fs/overlayfs/ovl_entry.h7
-rw-r--r--fs/overlayfs/readdir.c48
-rw-r--r--fs/overlayfs/super.c57
-rw-r--r--fs/overlayfs/util.c103
-rw-r--r--fs/pipe.c33
-rw-r--r--fs/proc/base.c22
-rw-r--r--fs/proc/generic.c3
-rw-r--r--fs/proc/kcore.c14
-rw-r--r--fs/proc/meminfo.c7
-rw-r--r--fs/proc/proc_net.c3
-rw-r--r--fs/proc/proc_sysctl.c93
-rw-r--r--fs/proc/task_mmu.c9
-rw-r--r--fs/proc/vmcore.c130
-rw-r--r--fs/quota/dquot.c10
-rw-r--r--fs/read_write.c16
-rw-r--r--fs/seq_file.c32
-rw-r--r--fs/smbfs_common/smb2pdu.h108
-rw-r--r--fs/smbfs_common/smbfsctl.h6
-rw-r--r--fs/stat.c2
-rw-r--r--fs/sync.c9
-rw-r--r--fs/sysv/super.c4
-rw-r--r--fs/tracefs/inode.c2
-rw-r--r--fs/ubifs/budget.c7
-rw-r--r--fs/ubifs/xattr.c2
-rw-r--r--fs/userfaultfd.c32
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/libxfs/xfs_ag.c3
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c12
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h2
-rw-r--r--fs/xfs/libxfs/xfs_attr.c1683
-rw-r--r--fs/xfs/libxfs/xfs_attr.h195
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c64
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c37
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.h6
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c167
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h58
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c9
-rw-r--r--fs/xfs/libxfs/xfs_btree.c185
-rw-r--r--fs/xfs/libxfs/xfs_btree.h26
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c15
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h28
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h9
-rw-r--r--fs/xfs/libxfs/xfs_defer.c97
-rw-r--r--fs/xfs/libxfs/xfs_defer.h3
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c8
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h8
-rw-r--r--fs/xfs/libxfs/xfs_format.h189
-rw-r--r--fs/xfs/libxfs/xfs_fs.h41
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c8
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c118
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c51
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h76
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h87
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h16
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c75
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h50
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c14
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h13
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c161
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h7
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c9
-rw-r--r--fs/xfs/libxfs/xfs_sb.c80
-rw-r--r--fs/xfs/libxfs/xfs_shared.h24
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c225
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h16
-rw-r--r--fs/xfs/libxfs/xfs_types.h11
-rw-r--r--fs/xfs/scrub/bmap.c26
-rw-r--r--fs/xfs/scrub/common.c2
-rw-r--r--fs/xfs/scrub/inode.c20
-rw-r--r--fs/xfs/scrub/rtbitmap.c9
-rw-r--r--fs/xfs/scrub/scrub.c17
-rw-r--r--fs/xfs/xfs_acl.c7
-rw-r--r--fs/xfs/xfs_acl.h8
-rw-r--r--fs/xfs/xfs_attr_item.c885
-rw-r--r--fs/xfs/xfs_attr_item.h54
-rw-r--r--fs/xfs/xfs_attr_list.c1
-rw-r--r--fs/xfs/xfs_bmap_item.c27
-rw-r--r--fs/xfs/xfs_bmap_util.c27
-rw-r--r--fs/xfs/xfs_buf_item.h24
-rw-r--r--fs/xfs/xfs_buf_item_recover.c66
-rw-r--r--fs/xfs/xfs_dquot.c18
-rw-r--r--fs/xfs/xfs_dquot.h8
-rw-r--r--fs/xfs/xfs_error.c9
-rw-r--r--fs/xfs/xfs_error.h20
-rw-r--r--fs/xfs/xfs_extfree_item.c23
-rw-r--r--fs/xfs/xfs_file.c26
-rw-r--r--fs/xfs/xfs_filestream.c7
-rw-r--r--fs/xfs/xfs_fsmap.c6
-rw-r--r--fs/xfs/xfs_fsops.c14
-rw-r--r--fs/xfs/xfs_globals.c1
-rw-r--r--fs/xfs/xfs_icache.c9
-rw-r--r--fs/xfs/xfs_icreate_item.c1
-rw-r--r--fs/xfs/xfs_inode.c82
-rw-r--r--fs/xfs/xfs_inode.h29
-rw-r--r--fs/xfs/xfs_inode_item.c48
-rw-r--r--fs/xfs/xfs_inode_item_recover.c145
-rw-r--r--fs/xfs/xfs_ioctl.c13
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c33
-rw-r--r--fs/xfs/xfs_iops.c7
-rw-r--r--fs/xfs/xfs_itable.c15
-rw-r--r--fs/xfs/xfs_itable.h5
-rw-r--r--fs/xfs/xfs_iwalk.h2
-rw-r--r--fs/xfs/xfs_log.c766
-rw-r--r--fs/xfs/xfs_log.h97
-rw-r--r--fs/xfs/xfs_log_cil.c391
-rw-r--r--fs/xfs/xfs_log_priv.h92
-rw-r--r--fs/xfs/xfs_log_recover.c95
-rw-r--r--fs/xfs/xfs_message.c58
-rw-r--r--fs/xfs/xfs_message.h61
-rw-r--r--fs/xfs/xfs_mount.c92
-rw-r--r--fs/xfs/xfs_mount.h50
-rw-r--r--fs/xfs/xfs_ondisk.h2
-rw-r--r--fs/xfs/xfs_qm.c18
-rw-r--r--fs/xfs/xfs_qm.h5
-rw-r--r--fs/xfs/xfs_qm_syscalls.c26
-rw-r--r--fs/xfs/xfs_quotaops.c8
-rw-r--r--fs/xfs/xfs_refcount_item.c25
-rw-r--r--fs/xfs/xfs_reflink.c100
-rw-r--r--fs/xfs/xfs_rmap_item.c25
-rw-r--r--fs/xfs/xfs_rtalloc.c41
-rw-r--r--fs/xfs/xfs_rtalloc.h9
-rw-r--r--fs/xfs/xfs_super.c38
-rw-r--r--fs/xfs/xfs_super.h1
-rw-r--r--fs/xfs/xfs_symlink.c5
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_sysfs.c24
-rw-r--r--fs/xfs/xfs_trace.h100
-rw-r--r--fs/xfs/xfs_trans.c52
-rw-r--r--fs/xfs/xfs_trans.h38
-rw-r--r--fs/xfs/xfs_trans_dquot.c4
-rw-r--r--fs/xfs/xfs_xattr.c96
-rw-r--r--fs/xfs/xfs_xattr.h13
-rw-r--r--fs/zonefs/super.c111
393 files changed, 12055 insertions, 9255 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 1c8dc696d516..cebba4eaa0b5 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -62,12 +62,12 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
version = cpu_to_le32(v9inode->qid.version);
path = cpu_to_le64(v9inode->qid.path);
v9ses = v9fs_inode2v9ses(inode);
- v9inode->netfs_ctx.cache =
+ v9inode->netfs.cache =
fscache_acquire_cookie(v9fs_session_cache(v9ses),
0,
&path, sizeof(path),
&version, sizeof(version),
- i_size_read(&v9inode->vfs_inode));
+ i_size_read(&v9inode->netfs.inode));
p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
inode, v9fs_inode_cookie(v9inode));
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 79df61fe0e59..baf2b152229e 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -152,7 +152,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
const unsigned char **wnames, *uname;
int i, n, l, clone, access;
struct v9fs_session_info *v9ses;
- struct p9_fid *fid, *old_fid = NULL;
+ struct p9_fid *fid, *old_fid;
v9ses = v9fs_dentry2v9ses(dentry);
access = v9ses->flags & V9FS_ACCESS_MASK;
@@ -194,13 +194,12 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
if (IS_ERR(fid))
return fid;
+ refcount_inc(&fid->count);
v9fs_fid_add(dentry->d_sb->s_root, fid);
}
/* If we are root ourself just return that */
- if (dentry->d_sb->s_root == dentry) {
- refcount_inc(&fid->count);
+ if (dentry->d_sb->s_root == dentry)
return fid;
- }
/*
* Do a multipath walk with attached root.
* When walking parent we need to make sure we
@@ -212,6 +211,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
fid = ERR_PTR(n);
goto err_out;
}
+ old_fid = fid;
clone = 1;
i = 0;
while (i < n) {
@@ -221,19 +221,15 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
* walk to ensure none of the patch component change
*/
fid = p9_client_walk(fid, l, &wnames[i], clone);
+ /* non-cloning walk will return the same fid */
+ if (fid != old_fid) {
+ p9_client_clunk(old_fid);
+ old_fid = fid;
+ }
if (IS_ERR(fid)) {
- if (old_fid) {
- /*
- * If we fail, clunk fid which are mapping
- * to path component and not the last component
- * of the path.
- */
- p9_client_clunk(old_fid);
- }
kfree(wnames);
goto err_out;
}
- old_fid = fid;
i += l;
clone = 0;
}
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index e28ddf763b3b..0129de2ea31a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -625,7 +625,7 @@ static void v9fs_inode_init_once(void *foo)
struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;
memset(&v9inode->qid, 0, sizeof(v9inode->qid));
- inode_init_once(&v9inode->vfs_inode);
+ inode_init_once(&v9inode->netfs.inode);
}
/**
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index ec0e8df3b2eb..6acabc2e7dc9 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -109,11 +109,7 @@ struct v9fs_session_info {
#define V9FS_INO_INVALID_ATTR 0x01
struct v9fs_inode {
- struct {
- /* These must be contiguous */
- struct inode vfs_inode; /* the VFS's inode record */
- struct netfs_i_context netfs_ctx; /* Netfslib context */
- };
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct p9_qid qid;
unsigned int cache_validity;
struct p9_fid *writeback_fid;
@@ -122,13 +118,13 @@ struct v9fs_inode {
static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
{
- return container_of(inode, struct v9fs_inode, vfs_inode);
+ return container_of(inode, struct v9fs_inode, netfs.inode);
}
static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode)
{
#ifdef CONFIG_9P_FSCACHE
- return netfs_i_cookie(&v9inode->vfs_inode);
+ return netfs_i_cookie(&v9inode->netfs);
#else
return NULL;
#endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 8ce82ff1e40a..d0833fa69faf 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -58,21 +58,33 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
*/
static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
{
+ struct inode *inode = file_inode(file);
+ struct v9fs_inode *v9inode = V9FS_I(inode);
struct p9_fid *fid = file->private_data;
+ BUG_ON(!fid);
+
+ /* we might need to read from a fid that was opened write-only
+ * for read-modify-write of page cache, use the writeback fid
+ * for that */
+ if (rreq->origin == NETFS_READ_FOR_WRITE &&
+ (fid->mode & O_ACCMODE) == O_WRONLY) {
+ fid = v9inode->writeback_fid;
+ BUG_ON(!fid);
+ }
+
refcount_inc(&fid->count);
rreq->netfs_priv = fid;
return 0;
}
/**
- * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_request
- * @mapping: unused mapping of request to cleanup
- * @priv: private data to cleanup, a fid, guaranted non-null.
+ * v9fs_free_request - Cleanup request initialized by v9fs_init_rreq
+ * @rreq: The I/O request to clean up
*/
-static void v9fs_req_cleanup(struct address_space *mapping, void *priv)
+static void v9fs_free_request(struct netfs_io_request *rreq)
{
- struct p9_fid *fid = priv;
+ struct p9_fid *fid = rreq->netfs_priv;
p9_client_clunk(fid);
}
@@ -94,9 +106,9 @@ static int v9fs_begin_cache_operation(struct netfs_io_request *rreq)
const struct netfs_request_ops v9fs_req_ops = {
.init_request = v9fs_init_request,
+ .free_request = v9fs_free_request,
.begin_cache_operation = v9fs_begin_cache_operation,
.issue_read = v9fs_issue_read,
- .cleanup = v9fs_req_cleanup,
};
/**
@@ -140,7 +152,7 @@ static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
transferred_or_error != -ENOBUFS) {
version = cpu_to_le32(v9inode->qid.version);
fscache_invalidate(v9fs_inode_cookie(v9inode), &version,
- i_size_read(&v9inode->vfs_inode), 0);
+ i_size_read(&v9inode->netfs.inode), 0);
}
}
@@ -274,7 +286,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
* file. We need to do this before we get a lock on the page in case
* there's more than one writer competing for the same cache block.
*/
- retval = netfs_write_begin(filp, mapping, pos, len, &folio, fsdata);
+ retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata);
if (retval < 0)
return retval;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 55367ecb9442..3d8297714772 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -234,7 +234,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
v9inode->writeback_fid = NULL;
v9inode->cache_validity = 0;
mutex_init(&v9inode->v_mutex);
- return &v9inode->vfs_inode;
+ return &v9inode->netfs.inode;
}
/**
@@ -252,7 +252,8 @@ void v9fs_free_inode(struct inode *inode)
*/
static void v9fs_set_netfs_context(struct inode *inode)
{
- netfs_i_context_init(inode, &v9fs_req_ops);
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ netfs_inode_init(&v9inode->netfs, &v9fs_req_ops);
}
int v9fs_init_inode(struct v9fs_session_info *v9ses,
@@ -1250,15 +1251,15 @@ static const char *v9fs_vfs_get_link(struct dentry *dentry,
return ERR_PTR(-ECHILD);
v9ses = v9fs_dentry2v9ses(dentry);
- fid = v9fs_fid_lookup(dentry);
+ if (!v9fs_proto_dotu(v9ses))
+ return ERR_PTR(-EBADF);
+
p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
+ fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return ERR_CAST(fid);
- if (!v9fs_proto_dotu(v9ses))
- return ERR_PTR(-EBADF);
-
st = p9_client_stat(fid);
p9_client_clunk(fid);
if (IS_ERR(st))
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index d17502a738a9..b6eb1160296c 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -274,6 +274,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+ p9_client_clunk(dfid);
goto out;
}
@@ -285,6 +286,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (err) {
p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",
err);
+ p9_client_clunk(dfid);
goto error;
}
err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
@@ -292,6 +294,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (err < 0) {
p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n",
err);
+ p9_client_clunk(dfid);
goto error;
}
v9fs_invalidate_inode_attr(dir);
diff --git a/fs/Kconfig b/fs/Kconfig
index 30b751c7f11a..5976eb33535f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -245,19 +245,27 @@ config HUGETLBFS
config HUGETLB_PAGE
def_bool HUGETLBFS
-config HUGETLB_PAGE_FREE_VMEMMAP
+#
+# Select this config option from the architecture Kconfig, if it is preferred
+# to enable the feature of minimizing overhead of struct page associated with
+# each HugeTLB page.
+#
+config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+ bool
+
+config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
def_bool HUGETLB_PAGE
- depends on X86_64
+ depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
depends on SPARSEMEM_VMEMMAP
-config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON
- bool "Default freeing vmemmap pages of HugeTLB to on"
+config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
+ bool "Default optimizing vmemmap pages of HugeTLB to on"
default n
- depends on HUGETLB_PAGE_FREE_VMEMMAP
+ depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
help
- When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap
+ When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap
pages associated with each HugeTLB page is default off. Say Y here
- to enable freeing vmemmap pages of HugeTLB by default. It can then
+ to enable optimizing vmemmap pages of HugeTLB by default. It can then
be disabled on the command line via hugetlb_free_vmemmap=off.
config MEMFD_CREATE
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 32dff7ba3dda..21e154516bf2 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -58,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y if !BINFMT_ELF
- depends on (ARM || (SUPERH && !MMU))
+ depends on ARM || ((M68K || SUPERH) && !MMU)
select ELFCORE
help
ELF FDPIC binaries are based on ELF, but allow the individual load
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 1b4d5809808d..a484fa642808 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -30,7 +30,7 @@ void afs_invalidate_mmap_work(struct work_struct *work)
{
struct afs_vnode *vnode = container_of(work, struct afs_vnode, cb_work);
- unmap_mapping_pages(vnode->vfs_inode.i_mapping, 0, 0, false);
+ unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
}
void afs_server_init_callback_work(struct work_struct *work)
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 94aa7356248e..56ae5cd5184f 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -109,7 +109,7 @@ struct afs_lookup_cookie {
*/
static void afs_dir_read_cleanup(struct afs_read *req)
{
- struct address_space *mapping = req->vnode->vfs_inode.i_mapping;
+ struct address_space *mapping = req->vnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t last = req->nr_pages - 1;
@@ -153,7 +153,7 @@ static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio,
block = kmap_local_folio(folio, offset);
if (block->hdr.magic != AFS_DIR_MAGIC) {
printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n",
- __func__, dvnode->vfs_inode.i_ino,
+ __func__, dvnode->netfs.inode.i_ino,
pos, offset, size, ntohs(block->hdr.magic));
trace_afs_dir_check_failed(dvnode, pos + offset, i_size);
kunmap_local(block);
@@ -183,7 +183,7 @@ error:
static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
{
union afs_xdr_dir_block *block;
- struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct address_space *mapping = dvnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t last = req->nr_pages - 1;
size_t offset, size;
@@ -217,7 +217,7 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
*/
static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
{
- struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct address_space *mapping = dvnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t last = req->nr_pages - 1;
int ret = 0;
@@ -269,7 +269,7 @@ static int afs_dir_open(struct inode *inode, struct file *file)
static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
__acquires(&dvnode->validate_lock)
{
- struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct address_space *mapping = dvnode->netfs.inode.i_mapping;
struct afs_read *req;
loff_t i_size;
int nr_pages, i;
@@ -287,7 +287,7 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
req->cleanup = afs_dir_read_cleanup;
expand:
- i_size = i_size_read(&dvnode->vfs_inode);
+ i_size = i_size_read(&dvnode->netfs.inode);
if (i_size < 2048) {
ret = afs_bad(dvnode, afs_file_error_dir_small);
goto error;
@@ -305,7 +305,7 @@ expand:
req->actual_len = i_size; /* May change */
req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */
req->data_version = dvnode->status.data_version; /* May change */
- iov_iter_xarray(&req->def_iter, READ, &dvnode->vfs_inode.i_mapping->i_pages,
+ iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages,
0, i_size);
req->iter = &req->def_iter;
@@ -463,8 +463,11 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
}
/* skip if starts before the current position */
- if (offset < curr)
+ if (offset < curr) {
+ if (next > curr)
+ ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent);
continue;
+ }
/* found the next entry */
if (!dir_emit(ctx, dire->u.name, nlen,
@@ -894,7 +897,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
out_op:
if (op->error == 0) {
- inode = &op->file[1].vnode->vfs_inode;
+ inode = &op->file[1].vnode->netfs.inode;
op->file[1].vnode = NULL;
}
@@ -1136,7 +1139,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
afs_stat_v(dir, n_reval);
/* search the directory for this vnode */
- ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key, &dir_version);
+ ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, key, &dir_version);
switch (ret) {
case 0:
/* the filename maps to something */
@@ -1167,7 +1170,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
_debug("%pd: file deleted (uq %u -> %u I:%u)",
dentry, fid.unique,
vnode->fid.unique,
- vnode->vfs_inode.i_generation);
+ vnode->netfs.inode.i_generation);
goto not_found;
}
goto out_valid;
@@ -1365,7 +1368,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
if (d_really_is_positive(dentry)) {
struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
- clear_nlink(&vnode->vfs_inode);
+ clear_nlink(&vnode->netfs.inode);
set_bit(AFS_VNODE_DELETED, &vnode->flags);
clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
@@ -1484,8 +1487,8 @@ static void afs_dir_remove_link(struct afs_operation *op)
/* Already done */
} else if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
write_seqlock(&vnode->cb_lock);
- drop_nlink(&vnode->vfs_inode);
- if (vnode->vfs_inode.i_nlink == 0) {
+ drop_nlink(&vnode->netfs.inode);
+ if (vnode->netfs.inode.i_nlink == 0) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
__afs_break_callback(vnode, afs_cb_break_for_unlink);
}
@@ -1501,7 +1504,7 @@ static void afs_dir_remove_link(struct afs_operation *op)
op->error = ret;
}
- _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, op->error);
+ _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error);
}
static void afs_unlink_success(struct afs_operation *op)
@@ -1677,8 +1680,8 @@ static void afs_link_success(struct afs_operation *op)
afs_update_dentry_version(op, dvp, op->dentry);
if (op->dentry_2->d_parent == op->dentry->d_parent)
afs_update_dentry_version(op, dvp, op->dentry_2);
- ihold(&vp->vnode->vfs_inode);
- d_instantiate(op->dentry, &vp->vnode->vfs_inode);
+ ihold(&vp->vnode->netfs.inode);
+ d_instantiate(op->dentry, &vp->vnode->netfs.inode);
}
static void afs_link_put(struct afs_operation *op)
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index d98e109ecee9..0ab7752d1b75 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -109,7 +109,7 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
*/
static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)
{
- struct address_space *mapping = vnode->vfs_inode.i_mapping;
+ struct address_space *mapping = vnode->netfs.inode.i_mapping;
struct folio *folio;
folio = __filemap_get_folio(mapping, index,
@@ -216,7 +216,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
_enter(",,{%d,%s},", name->len, name->name);
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
(i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
@@ -336,7 +336,7 @@ found_space:
if (b < AFS_DIR_BLOCKS_WITH_CTR)
meta->meta.alloc_ctrs[b] -= need_slots;
- inode_inc_iversion_raw(&vnode->vfs_inode);
+ inode_inc_iversion_raw(&vnode->netfs.inode);
afs_stat_v(vnode, n_dir_cr);
_debug("Insert %s in %u[%u]", name->name, b, slot);
@@ -383,7 +383,7 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,
_enter(",,{%d,%s},", name->len, name->name);
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
if (i_size < AFS_DIR_BLOCK_SIZE ||
i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
(i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
@@ -463,7 +463,7 @@ found_dirent:
if (b < AFS_DIR_BLOCKS_WITH_CTR)
meta->meta.alloc_ctrs[b] += need_slots;
- inode_set_iversion_raw(&vnode->vfs_inode, vnode->status.data_version);
+ inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version);
afs_stat_v(vnode, n_dir_rm);
_debug("Remove %s from %u[%u]", name->name, b, slot);
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 45cfd50a9521..bb5807e87fa4 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -131,7 +131,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
goto out;
} while (!d_is_negative(sdentry));
- ihold(&vnode->vfs_inode);
+ ihold(&vnode->netfs.inode);
ret = afs_do_silly_rename(dvnode, vnode, dentry, sdentry, key);
switch (ret) {
@@ -148,7 +148,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
d_drop(sdentry);
}
- iput(&vnode->vfs_inode);
+ iput(&vnode->netfs.inode);
dput(sdentry);
out:
_leave(" = %d", ret);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index f120bcb8bf73..d7d9402ff718 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
/* there shouldn't be an existing inode */
BUG_ON(!(inode->i_state & I_NEW));
- netfs_i_context_init(inode, NULL);
+ netfs_inode_init(&vnode->netfs, NULL);
inode->i_size = 0;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
if (root) {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index a8e8832179e4..42118a4f3383 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -194,7 +194,7 @@ int afs_release(struct inode *inode, struct file *file)
afs_put_wb_key(af->wb);
if ((file->f_mode & FMODE_WRITE)) {
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
afs_set_cache_aux(vnode, &aux);
fscache_unuse_cookie(afs_vnode_cache(vnode), &aux, &i_size);
} else {
@@ -325,7 +325,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)
fsreq->iter = &fsreq->def_iter;
iov_iter_xarray(&fsreq->def_iter, READ,
- &fsreq->vnode->vfs_inode.i_mapping->i_pages,
+ &fsreq->vnode->netfs.inode.i_mapping->i_pages,
fsreq->pos, fsreq->len);
afs_fetch_data(fsreq->vnode, fsreq);
@@ -382,17 +382,17 @@ static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;
}
-static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv)
+static void afs_free_request(struct netfs_io_request *rreq)
{
- key_put(netfs_priv);
+ key_put(rreq->netfs_priv);
}
const struct netfs_request_ops afs_req_ops = {
.init_request = afs_init_request,
+ .free_request = afs_free_request,
.begin_cache_operation = afs_begin_cache_operation,
.check_write_begin = afs_check_write_begin,
.issue_read = afs_issue_read,
- .cleanup = afs_priv_cleanup,
};
int afs_write_inode(struct inode *inode, struct writeback_control *wbc)
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index d222dfbe976b..7a3803ce3a22 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -232,14 +232,14 @@ int afs_put_operation(struct afs_operation *op)
if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode)
clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags);
if (op->file[0].put_vnode)
- iput(&op->file[0].vnode->vfs_inode);
+ iput(&op->file[0].vnode->netfs.inode);
if (op->file[1].put_vnode)
- iput(&op->file[1].vnode->vfs_inode);
+ iput(&op->file[1].vnode->netfs.inode);
if (op->more_files) {
for (i = 0; i < op->nr_files - 2; i++)
if (op->more_files[i].put_vnode)
- iput(&op->more_files[i].vnode->vfs_inode);
+ iput(&op->more_files[i].vnode->netfs.inode);
kfree(op->more_files);
}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 30b066299d39..64dab70d4a4f 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
*/
static void afs_set_netfs_context(struct afs_vnode *vnode)
{
- netfs_i_context_init(&vnode->vfs_inode, &afs_req_ops);
+ netfs_inode_init(&vnode->netfs, &afs_req_ops);
}
/*
@@ -96,7 +96,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_flags |= S_NOATIME;
inode->i_uid = make_kuid(&init_user_ns, status->owner);
inode->i_gid = make_kgid(&init_user_ns, status->group);
- set_nlink(&vnode->vfs_inode, status->nlink);
+ set_nlink(&vnode->netfs.inode, status->nlink);
switch (status->type) {
case AFS_FTYPE_FILE:
@@ -139,7 +139,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
afs_set_netfs_context(vnode);
vnode->invalid_before = status->data_version;
- inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
+ inode_set_iversion_raw(&vnode->netfs.inode, status->data_version);
if (!vp->scb.have_cb) {
/* it's a symlink we just created (the fileserver
@@ -163,7 +163,7 @@ static void afs_apply_status(struct afs_operation *op,
{
struct afs_file_status *status = &vp->scb.status;
struct afs_vnode *vnode = vp->vnode;
- struct inode *inode = &vnode->vfs_inode;
+ struct inode *inode = &vnode->netfs.inode;
struct timespec64 t;
umode_t mode;
bool data_changed = false;
@@ -246,7 +246,7 @@ static void afs_apply_status(struct afs_operation *op,
* idea of what the size should be that's not the same as
* what's on the server.
*/
- vnode->netfs_ctx.remote_i_size = status->size;
+ vnode->netfs.remote_i_size = status->size;
if (change_size) {
afs_set_i_size(vnode, status->size);
inode->i_ctime = t;
@@ -289,7 +289,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
*/
if (vp->scb.status.abort_code == VNOVNODE) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
- clear_nlink(&vnode->vfs_inode);
+ clear_nlink(&vnode->netfs.inode);
__afs_break_callback(vnode, afs_cb_break_for_deleted);
op->flags &= ~AFS_OPERATION_DIR_CONFLICT;
}
@@ -306,8 +306,8 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
if (vp->scb.have_cb)
afs_apply_callback(op, vp);
} else if (vp->op_unlinked && !(op->flags & AFS_OPERATION_DIR_CONFLICT)) {
- drop_nlink(&vnode->vfs_inode);
- if (vnode->vfs_inode.i_nlink == 0) {
+ drop_nlink(&vnode->netfs.inode);
+ if (vnode->netfs.inode.i_nlink == 0) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
__afs_break_callback(vnode, afs_cb_break_for_deleted);
}
@@ -326,7 +326,7 @@ static void afs_fetch_status_success(struct afs_operation *op)
struct afs_vnode *vnode = vp->vnode;
int ret;
- if (vnode->vfs_inode.i_state & I_NEW) {
+ if (vnode->netfs.inode.i_state & I_NEW) {
ret = afs_inode_init_from_status(op, vp, vnode);
op->error = ret;
if (ret == 0)
@@ -430,7 +430,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
struct afs_vnode_cache_aux aux;
if (vnode->status.type != AFS_FTYPE_FILE) {
- vnode->netfs_ctx.cache = NULL;
+ vnode->netfs.cache = NULL;
return;
}
@@ -457,7 +457,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp)
{
struct afs_vnode_param *dvp = &op->file[0];
- struct super_block *sb = dvp->vnode->vfs_inode.i_sb;
+ struct super_block *sb = dvp->vnode->netfs.inode.i_sb;
struct afs_vnode *vnode;
struct inode *inode;
int ret;
@@ -582,10 +582,10 @@ static void afs_zap_data(struct afs_vnode *vnode)
/* nuke all the non-dirty pages that aren't locked, mapped or being
* written back in a regular file and completely discard the pages in a
* directory or symlink */
- if (S_ISREG(vnode->vfs_inode.i_mode))
- invalidate_remote_inode(&vnode->vfs_inode);
+ if (S_ISREG(vnode->netfs.inode.i_mode))
+ invalidate_remote_inode(&vnode->netfs.inode);
else
- invalidate_inode_pages2(vnode->vfs_inode.i_mapping);
+ invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
}
/*
@@ -683,8 +683,8 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
key_serial(key));
if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
- if (vnode->vfs_inode.i_nlink)
- clear_nlink(&vnode->vfs_inode);
+ if (vnode->netfs.inode.i_nlink)
+ clear_nlink(&vnode->netfs.inode);
goto valid;
}
@@ -745,7 +745,8 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path,
_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
- if (!(query_flags & AT_STATX_DONT_SYNC) &&
+ if (vnode->volume &&
+ !(query_flags & AT_STATX_DONT_SYNC) &&
!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
key = afs_request_key(vnode->volume->cell);
if (IS_ERR(key))
@@ -826,7 +827,7 @@ void afs_evict_inode(struct inode *inode)
static void afs_setattr_success(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
- struct inode *inode = &vp->vnode->vfs_inode;
+ struct inode *inode = &vp->vnode->netfs.inode;
loff_t old_i_size = i_size_read(inode);
op->setattr.old_i_size = old_i_size;
@@ -843,7 +844,7 @@ static void afs_setattr_success(struct afs_operation *op)
static void afs_setattr_edit_file(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
- struct inode *inode = &vp->vnode->vfs_inode;
+ struct inode *inode = &vp->vnode->netfs.inode;
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
loff_t size = op->setattr.attr->ia_size;
@@ -875,7 +876,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | ATTR_TOUCH;
struct afs_operation *op;
struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
- struct inode *inode = &vnode->vfs_inode;
+ struct inode *inode = &vnode->netfs.inode;
loff_t i_size;
int ret;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a30995901266..a6f25d9e75b5 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -619,12 +619,7 @@ enum afs_lock_state {
* leak from one inode to another.
*/
struct afs_vnode {
- struct {
- /* These must be contiguous */
- struct inode vfs_inode; /* the VFS's inode record */
- struct netfs_i_context netfs_ctx; /* Netfslib context */
- };
-
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct afs_volume *volume; /* volume on which vnode resides */
struct afs_fid fid; /* the file identifier for this inode */
struct afs_file_status status; /* AFS status info for this file */
@@ -675,7 +670,7 @@ struct afs_vnode {
static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)
{
#ifdef CONFIG_AFS_FSCACHE
- return netfs_i_cookie(&vnode->vfs_inode);
+ return netfs_i_cookie(&vnode->netfs);
#else
return NULL;
#endif
@@ -685,7 +680,7 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode,
struct fscache_cookie *cookie)
{
#ifdef CONFIG_AFS_FSCACHE
- vnode->netfs_ctx.cache = cookie;
+ vnode->netfs.cache = cookie;
#endif
}
@@ -892,7 +887,7 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl
afs_set_cache_aux(vnode, &aux);
fscache_invalidate(afs_vnode_cache(vnode), &aux,
- i_size_read(&vnode->vfs_inode), flags);
+ i_size_read(&vnode->netfs.inode), flags);
}
/*
@@ -1217,7 +1212,7 @@ static inline struct afs_net *afs_i2net(struct inode *inode)
static inline struct afs_net *afs_v2net(struct afs_vnode *vnode)
{
- return afs_i2net(&vnode->vfs_inode);
+ return afs_i2net(&vnode->netfs.inode);
}
static inline struct afs_net *afs_sock2net(struct sock *sk)
@@ -1593,12 +1588,12 @@ extern void yfs_fs_store_opaque_acl2(struct afs_operation *);
*/
static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
{
- return container_of(inode, struct afs_vnode, vfs_inode);
+ return container_of(inode, struct afs_vnode, netfs.inode);
}
static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
{
- return &vnode->vfs_inode;
+ return &vnode->netfs.inode;
}
/*
@@ -1621,8 +1616,8 @@ static inline void afs_update_dentry_version(struct afs_operation *op,
*/
static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size)
{
- i_size_write(&vnode->vfs_inode, size);
- vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1;
+ i_size_write(&vnode->netfs.inode, size);
+ vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1;
}
/*
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 1d1a8debe472..933e67fcdab1 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -163,8 +163,11 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
return;
case -ECONNABORTED:
+ error = afs_abort_to_error(abort_code);
+ fallthrough;
+ case -ENETRESET: /* Responded, but we seem to have changed address */
e->responded = true;
- e->error = afs_abort_to_error(abort_code);
+ e->error = error;
return;
}
}
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 79e1a5f6701b..a840c3588ebb 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -292,6 +292,10 @@ bool afs_select_fileserver(struct afs_operation *op)
op->error = error;
goto iterate_address;
+ case -ENETRESET:
+ pr_warn("kAFS: Peer reset %s (op=%x)\n",
+ op->type ? op->type->name : "???", op->debug_id);
+ fallthrough;
case -ECONNRESET:
_debug("call reset");
op->error = error;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 23a1a92d64bb..a5434f3e57c6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -537,6 +537,8 @@ static void afs_deliver_to_call(struct afs_call *call)
case -ENODATA:
case -EBADMSG:
case -EMSGSIZE:
+ case -ENOMEM:
+ case -EFAULT:
abort_code = RXGEN_CC_UNMARSHAL;
if (state != AFS_CALL_CL_AWAIT_REPLY)
abort_code = RXGEN_SS_UNMARSHAL;
@@ -544,7 +546,7 @@ static void afs_deliver_to_call(struct afs_call *call)
abort_code, ret, "KUM");
goto local_abort;
default:
- abort_code = RX_USER_ABORT;
+ abort_code = RX_CALL_DEAD;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
abort_code, ret, "KER");
goto local_abort;
@@ -836,7 +838,7 @@ void afs_send_empty_reply(struct afs_call *call)
case -ENOMEM:
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
- RX_USER_ABORT, -ENOMEM, "KOO");
+ RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
fallthrough;
default:
_leave(" [error]");
@@ -878,7 +880,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
if (n == -ENOMEM) {
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
- RX_USER_ABORT, -ENOMEM, "KOO");
+ RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
}
_leave(" [error]");
}
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3c7a8fc4f93f..7c6a63a30394 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -219,8 +219,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
* yet.
*/
size++;
- new = kzalloc(sizeof(struct afs_permits) +
- sizeof(struct afs_permit) * size, GFP_NOFS);
+ new = kzalloc(struct_size(new, permits, size), GFP_NOFS);
if (!new)
goto out_put;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 1fea195b0b27..95d713074dc8 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -659,7 +659,7 @@ static void afs_i_init_once(void *_vnode)
struct afs_vnode *vnode = _vnode;
memset(vnode, 0, sizeof(*vnode));
- inode_init_once(&vnode->vfs_inode);
+ inode_init_once(&vnode->netfs.inode);
mutex_init(&vnode->io_lock);
init_rwsem(&vnode->validate_lock);
spin_lock_init(&vnode->wb_lock);
@@ -700,8 +700,8 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
init_rwsem(&vnode->rmdir_lock);
INIT_WORK(&vnode->cb_work, afs_invalidate_mmap_work);
- _leave(" = %p", &vnode->vfs_inode);
- return &vnode->vfs_inode;
+ _leave(" = %p", &vnode->netfs.inode);
+ return &vnode->netfs.inode;
}
static void afs_free_inode(struct inode *inode)
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 94a3d247924b..cc665cef0abe 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -9,8 +9,7 @@
#include <linux/slab.h>
#include "internal.h"
-unsigned __read_mostly afs_volume_gc_delay = 10;
-unsigned __read_mostly afs_volume_record_life = 60 * 60;
+static unsigned __read_mostly afs_volume_record_life = 60 * 60;
/*
* Insert a volume into a cell. If there's an existing volume record, that is
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 5224e346fbad..2c885b22de34 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -60,7 +60,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
* file. We need to do this before we get a lock on the page in case
* there's more than one writer competing for the same cache block.
*/
- ret = netfs_write_begin(file, mapping, pos, len, &folio, fsdata);
+ ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata);
if (ret < 0)
return ret;
@@ -146,10 +146,10 @@ int afs_write_end(struct file *file, struct address_space *mapping,
write_end_pos = pos + copied;
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
if (write_end_pos > i_size) {
write_seqlock(&vnode->cb_lock);
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
if (write_end_pos > i_size)
afs_set_i_size(vnode, write_end_pos);
write_sequnlock(&vnode->cb_lock);
@@ -257,7 +257,7 @@ static void afs_redirty_pages(struct writeback_control *wbc,
*/
static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len)
{
- struct address_space *mapping = vnode->vfs_inode.i_mapping;
+ struct address_space *mapping = vnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t end;
@@ -354,7 +354,6 @@ static const struct afs_operation_ops afs_store_data_operation = {
static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos,
bool laundering)
{
- struct netfs_i_context *ictx = &vnode->netfs_ctx;
struct afs_operation *op;
struct afs_wb_key *wbk = NULL;
loff_t size = iov_iter_count(iter);
@@ -385,9 +384,9 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t
op->store.write_iter = iter;
op->store.pos = pos;
op->store.size = size;
- op->store.i_size = max(pos + size, ictx->remote_i_size);
+ op->store.i_size = max(pos + size, vnode->netfs.remote_i_size);
op->store.laundering = laundering;
- op->mtime = vnode->vfs_inode.i_mtime;
+ op->mtime = vnode->netfs.inode.i_mtime;
op->flags |= AFS_OPERATION_UNINTR;
op->ops = &afs_store_data_operation;
@@ -554,7 +553,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
struct iov_iter iter;
unsigned long priv;
unsigned int offset, to, len, max_len;
- loff_t i_size = i_size_read(&vnode->vfs_inode);
+ loff_t i_size = i_size_read(&vnode->netfs.inode);
bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode));
long count = wbc->nr_to_write;
@@ -636,6 +635,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
case -EKEYEXPIRED:
case -EKEYREJECTED:
case -EKEYREVOKED:
+ case -ENETRESET:
afs_redirty_pages(wbc, mapping, start, len);
mapping_set_error(mapping, ret);
break;
@@ -844,7 +844,7 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
_enter("{%llx:%llu},{%zu},",
vnode->fid.vid, vnode->fid.vnode, count);
- if (IS_SWAPFILE(&vnode->vfs_inode)) {
+ if (IS_SWAPFILE(&vnode->netfs.inode)) {
printk(KERN_INFO
"AFS: Attempt to write to active swap file!\n");
return -EBUSY;
@@ -957,8 +957,8 @@ void afs_prune_wb_keys(struct afs_vnode *vnode)
/* Discard unused keys */
spin_lock(&vnode->wb_lock);
- if (!mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_WRITEBACK) &&
- !mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_DIRTY)) {
+ if (!mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_WRITEBACK) &&
+ !mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_DIRTY)) {
list_for_each_entry_safe(wbk, tmp, &vnode->wb_keys, vnode_link) {
if (refcount_read(&wbk->usage) == 1)
list_move(&wbk->vnode_link, &graveyard);
@@ -1033,6 +1033,6 @@ static void afs_write_to_cache(struct afs_vnode *vnode,
bool caching)
{
fscache_write_to_cache(afs_vnode_cache(vnode),
- vnode->vfs_inode.i_mapping, start, len, i_size,
+ vnode->netfs.inode.i_mapping, start, len, i_size,
afs_write_to_cache_done, vnode, caching);
}
diff --git a/fs/attr.c b/fs/attr.c
index 66899b6e9bd8..dbe996b0dedf 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -61,9 +61,15 @@ static bool chgrp_ok(struct user_namespace *mnt_userns,
const struct inode *inode, kgid_t gid)
{
kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) &&
- (in_group_p(gid) || gid_eq(gid, inode->i_gid)))
- return true;
+ if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) {
+ kgid_t mapped_gid;
+
+ if (gid_eq(gid, inode->i_gid))
+ return true;
+ mapped_gid = mapped_kgid_fs(mnt_userns, i_user_ns(inode), gid);
+ if (in_group_p(mapped_gid))
+ return true;
+ }
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
if (gid_eq(kgid, INVALID_GID) &&
@@ -123,12 +129,20 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
/* Make sure a caller can chmod. */
if (ia_valid & ATTR_MODE) {
+ kgid_t mapped_gid;
+
if (!inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
+
+ if (ia_valid & ATTR_GID)
+ mapped_gid = mapped_kgid_fs(mnt_userns,
+ i_user_ns(inode), attr->ia_gid);
+ else
+ mapped_gid = i_gid_into_mnt(mnt_userns, inode);
+
/* Also check the setgid bit! */
- if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
- i_gid_into_mnt(mnt_userns, inode)) &&
- !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ if (!in_group_p(mapped_gid) &&
+ !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
attr->ia_mode &= ~S_ISGID;
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 3ac668ace50a..35e0e860cc0b 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -104,6 +104,7 @@ struct btrfs_block_group {
unsigned int relocating_repair:1;
unsigned int chunk_item_inserted:1;
unsigned int zone_is_active:1;
+ unsigned int zoned_data_reloc_ongoing:1;
int disk_cache_state;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0e49b1a0c071..415bf1823fb3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1330,6 +1330,8 @@ struct btrfs_replace_extent_info {
* existing extent into a file range.
*/
bool is_new_extent;
+ /* Indicate if we should update the inode's mtime and ctime. */
+ bool update_times;
/* Meaningful only if is_new_extent is true. */
int qgroup_reserved;
/*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 89e94ea2fef5..4ba005c41983 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4632,6 +4632,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
int ret;
set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
+
+ /*
+ * We may have the reclaim task running and relocating a data block group,
+ * in which case it may create delayed iputs. So stop it before we park
+ * the cleaner kthread otherwise we can get new delayed iputs after
+ * parking the cleaner, and that can make the async reclaim task to hang
+ * if it's waiting for delayed iputs to complete, since the cleaner is
+ * parked and can not run delayed iputs - this will make us hang when
+ * trying to stop the async reclaim task.
+ */
+ cancel_work_sync(&fs_info->reclaim_bgs_work);
/*
* We don't want the cleaner to start new transactions, add more delayed
* iputs, etc. while we're closing. We can't use kthread_stop() yet
@@ -4672,8 +4683,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
- cancel_work_sync(&fs_info->reclaim_bgs_work);
-
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0867c5cd6e01..4157ecc27d4b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3832,7 +3832,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
block_group->start == fs_info->data_reloc_bg ||
fs_info->data_reloc_bg == 0);
- if (block_group->ro) {
+ if (block_group->ro || block_group->zoned_data_reloc_ongoing) {
ret = 1;
goto out;
}
@@ -3894,8 +3894,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
out:
if (ret && ffe_ctl->for_treelog)
fs_info->treelog_bg = 0;
- if (ret && ffe_ctl->for_data_reloc)
+ if (ret && ffe_ctl->for_data_reloc &&
+ fs_info->data_reloc_bg == block_group->start) {
+ /*
+ * Do not allow further allocations from this block group.
+ * Compared to increasing the ->ro, setting the
+ * ->zoned_data_reloc_ongoing flag still allows nocow
+ * writers to come in. See btrfs_inc_nocow_writers().
+ *
+ * We need to disable an allocation to avoid an allocation of
+ * regular (non-relocation data) extent. With mix of relocation
+ * extents and regular extents, we can dispatch WRITE commands
+ * (for relocation extents) and ZONE APPEND commands (for
+ * regular extents) at the same time to the same zone, which
+ * easily break the write pointer.
+ */
+ block_group->zoned_data_reloc_ongoing = 1;
fs_info->data_reloc_bg = 0;
+ }
spin_unlock(&fs_info->relocation_bg_lock);
spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&block_group->lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8f6b544ae616..04e36343da3a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5241,13 +5241,14 @@ int extent_writepages(struct address_space *mapping,
*/
btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
ret = extent_write_cache_pages(mapping, wbc, &epd);
- btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
ASSERT(ret <= 0);
if (ret < 0) {
+ btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
end_write_bio(&epd, ret);
return ret;
}
flush_write_bio(&epd);
+ btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
return ret;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1fd827b99c1b..9dfde1af8a64 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2323,25 +2323,62 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
- if (ret != BTRFS_NO_LOG_SYNC) {
+ if (ret == BTRFS_NO_LOG_SYNC) {
+ ret = btrfs_end_transaction(trans);
+ goto out;
+ }
+
+ /* We successfully logged the inode, attempt to sync the log. */
+ if (!ret) {
+ ret = btrfs_sync_log(trans, root, &ctx);
if (!ret) {
- ret = btrfs_sync_log(trans, root, &ctx);
- if (!ret) {
- ret = btrfs_end_transaction(trans);
- goto out;
- }
- }
- if (!full_sync) {
- ret = btrfs_wait_ordered_range(inode, start, len);
- if (ret) {
- btrfs_end_transaction(trans);
- goto out;
- }
+ ret = btrfs_end_transaction(trans);
+ goto out;
}
- ret = btrfs_commit_transaction(trans);
- } else {
+ }
+
+ /*
+ * At this point we need to commit the transaction because we had
+ * btrfs_need_log_full_commit() or some other error.
+ *
+ * If we didn't do a full sync we have to stop the trans handle, wait on
+ * the ordered extents, start it again and commit the transaction. If
+ * we attempt to wait on the ordered extents here we could deadlock with
+ * something like fallocate() that is holding the extent lock trying to
+ * start a transaction while some other thread is trying to commit the
+ * transaction while we (fsync) are currently holding the transaction
+ * open.
+ */
+ if (!full_sync) {
ret = btrfs_end_transaction(trans);
+ if (ret)
+ goto out;
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ if (ret)
+ goto out;
+
+ /*
+ * This is safe to use here because we're only interested in
+ * making sure the transaction that had the ordered extents is
+ * committed. We aren't waiting on anything past this point,
+ * we're purely getting the transaction and committing it.
+ */
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+
+ /*
+ * We committed the transaction and there's no currently
+ * running transaction, this means everything we care
+ * about made it to disk and we are done.
+ */
+ if (ret == -ENOENT)
+ ret = 0;
+ goto out;
+ }
}
+
+ ret = btrfs_commit_transaction(trans);
out:
ASSERT(list_empty(&ctx.list));
err = file_check_and_advance_wb_err(file);
@@ -2719,7 +2756,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
min_size, false);
- BUG_ON(ret);
+ if (WARN_ON(ret))
+ goto out_trans;
trans->block_rsv = rsv;
cur_offset = start;
@@ -2803,6 +2841,25 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
extent_info->file_offset += replace_len;
}
+ /*
+ * We are releasing our handle on the transaction, balance the
+ * dirty pages of the btree inode and flush delayed items, and
+ * then get a new transaction handle, which may now point to a
+ * new transaction in case someone else may have committed the
+ * transaction we used to replace/drop file extent items. So
+ * bump the inode's iversion and update mtime and ctime except
+ * if we are called from a dedupe context. This is because a
+ * power failure/crash may happen after the transaction is
+ * committed and before we finish replacing/dropping all the
+ * file extent items we need.
+ */
+ inode_inc_iversion(&inode->vfs_inode);
+
+ if (!extent_info || extent_info->update_times) {
+ inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
+ inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
+ }
+
ret = btrfs_update_inode(trans, root, inode);
if (ret)
break;
@@ -2819,7 +2876,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
rsv, min_size, false);
- BUG_ON(ret); /* shouldn't happen */
+ if (WARN_ON(ret))
+ break;
trans->block_rsv = rsv;
cur_offset = drop_args.drop_end;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 81737eff92f3..05e0c4a5affd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3195,6 +3195,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->file_offset,
ordered_extent->file_offset +
logical_len);
+ btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
} else {
BUG_ON(root == fs_info->tree_root);
ret = insert_ordered_extent_file_extent(trans, ordered_extent);
@@ -9897,6 +9899,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
extent_info.file_offset = file_offset;
extent_info.extent_buf = (char *)&stack_fi;
extent_info.is_new_extent = true;
+ extent_info.update_times = true;
extent_info.qgroup_reserved = qgroup_released;
extent_info.insertions = 0;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 313d9d685adb..33461b4f9c8b 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -45,7 +45,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne
start_ns = ktime_get_ns();
down_read_nested(&eb->lock, nest);
- eb->lock_owner = current->pid;
trace_btrfs_tree_read_lock(eb, start_ns);
}
@@ -62,7 +61,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb)
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
if (down_read_trylock(&eb->lock)) {
- eb->lock_owner = current->pid;
trace_btrfs_try_tree_read_lock(eb);
return 1;
}
@@ -90,7 +88,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
trace_btrfs_tree_read_unlock(eb);
- eb->lock_owner = 0;
up_read(&eb->lock);
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index c39f8b3a5a4a..a3549d587464 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -344,6 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
int ret;
const u64 len = olen_aligned;
u64 last_dest_end = destoff;
+ u64 prev_extent_end = off;
ret = -ENOMEM;
buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
@@ -363,7 +364,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
key.offset = off;
while (1) {
- u64 next_key_min_offset = key.offset + 1;
struct btrfs_file_extent_item *extent;
u64 extent_gen;
int type;
@@ -431,14 +431,21 @@ process_slot:
* The first search might have left us at an extent item that
* ends before our target range's start, can happen if we have
* holes and NO_HOLES feature enabled.
+ *
+ * Subsequent searches may leave us on a file range we have
+ * processed before - this happens due to a race with ordered
+ * extent completion for a file range that is outside our source
+ * range, but that range was part of a file extent item that
+ * also covered a leading part of our source range.
*/
- if (key.offset + datal <= off) {
+ if (key.offset + datal <= prev_extent_end) {
path->slots[0]++;
goto process_slot;
} else if (key.offset >= off + len) {
break;
}
- next_key_min_offset = key.offset + datal;
+
+ prev_extent_end = key.offset + datal;
size = btrfs_item_size(leaf, slot);
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
size);
@@ -489,6 +496,7 @@ process_slot:
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
clone_info.is_new_extent = false;
+ clone_info.update_times = !no_time_update;
ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
drop_start, new_key.offset + datal - 1,
&clone_info, &trans);
@@ -550,7 +558,7 @@ process_slot:
break;
btrfs_release_path(path);
- key.offset = next_key_min_offset;
+ key.offset = prev_extent_end;
if (fatal_signal_pending(current)) {
ret = -EINTR;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b1fdc6a26c76..6627dd7875ee 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -763,6 +763,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
compress_force = false;
no_compress++;
} else {
+ btrfs_err(info, "unrecognized compression value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -821,8 +823,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_thread_pool:
ret = match_int(&args[0], &intarg);
if (ret) {
+ btrfs_err(info, "unrecognized thread_pool value %s",
+ args[0].from);
goto out;
} else if (intarg == 0) {
+ btrfs_err(info, "invalid value 0 for thread_pool");
ret = -EINVAL;
goto out;
}
@@ -883,8 +888,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_ratio:
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info, "unrecognized metadata_ratio value %s",
+ args[0].from);
goto out;
+ }
info->metadata_ratio = intarg;
btrfs_info(info, "metadata ratio %u",
info->metadata_ratio);
@@ -901,6 +909,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_and_info(info, DISCARD_ASYNC,
"turning on async discard");
} else {
+ btrfs_err(info, "unrecognized discard mode value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -933,6 +943,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_and_info(info, FREE_SPACE_TREE,
"enabling free space tree");
} else {
+ btrfs_err(info, "unrecognized space_cache value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -1014,8 +1026,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_check_integrity_print_mask:
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info,
+ "unrecognized check_integrity_print_mask value %s",
+ args[0].from);
goto out;
+ }
info->check_integrity_print_mask = intarg;
btrfs_info(info, "check_integrity_print_mask 0x%x",
info->check_integrity_print_mask);
@@ -1030,13 +1046,15 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
goto out;
#endif
case Opt_fatal_errors:
- if (strcmp(args[0].from, "panic") == 0)
+ if (strcmp(args[0].from, "panic") == 0) {
btrfs_set_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
- else if (strcmp(args[0].from, "bug") == 0)
+ } else if (strcmp(args[0].from, "bug") == 0) {
btrfs_clear_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
- else {
+ } else {
+ btrfs_err(info, "unrecognized fatal_errors value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -1044,8 +1062,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_commit_interval:
intarg = 0;
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info, "unrecognized commit_interval value %s",
+ args[0].from);
+ ret = -EINVAL;
goto out;
+ }
if (intarg == 0) {
btrfs_info(info,
"using default commit interval %us",
@@ -1059,8 +1081,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_rescue:
ret = parse_rescue_options(info, args[0].from);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_err(info, "unrecognized rescue value %s",
+ args[0].from);
goto out;
+ }
break;
#ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
@@ -1985,6 +2010,14 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto restore;
+ /* V1 cache is not supported for subpage mount. */
+ if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ btrfs_warn(fs_info,
+ "v1 space cache is not supported for page size %lu with sectorsize %u",
+ PAGE_SIZE, fs_info->sectorsize);
+ ret = -EINVAL;
+ goto restore;
+ }
btrfs_remount_begin(fs_info, old_opts, *flags);
btrfs_resize_thread_pool(fs_info,
fs_info->thread_pool_size, old_thread_pool_size);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 11237a913bee..79e8c8cd75ed 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2139,3 +2139,30 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
factor = div64_u64(used * 100, total);
return factor >= fs_info->bg_reclaim_threshold;
}
+
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length)
+{
+ struct btrfs_block_group *block_group;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ block_group = btrfs_lookup_block_group(fs_info, logical);
+ /* It should be called on a previous data relocation block group. */
+ ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
+
+ spin_lock(&block_group->lock);
+ if (!block_group->zoned_data_reloc_ongoing)
+ goto out;
+
+ /* All relocation extents are written. */
+ if (block_group->start + block_group->alloc_offset == logical + length) {
+ /* Now, release this block group for further allocations. */
+ block_group->zoned_data_reloc_ongoing = 0;
+ }
+
+out:
+ spin_unlock(&block_group->lock);
+ btrfs_put_block_group(block_group);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index bb1a189e11f9..6b2eec99162b 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -77,6 +77,8 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -243,6 +245,9 @@ static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
{
return false;
}
+
+static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length) { }
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 7584aa6e5025..6dee88815491 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -256,6 +256,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
struct iov_iter iter;
ssize_t err = 0;
size_t len;
+ int mode;
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
@@ -264,7 +265,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
goto out;
/* We need to fetch the inline data. */
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+ mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
@@ -392,11 +394,10 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
return 0;
}
-static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
+static void ceph_netfs_free_request(struct netfs_io_request *rreq)
{
- struct inode *inode = mapping->host;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int got = (uintptr_t)priv;
+ struct ceph_inode_info *ci = ceph_inode(rreq->inode);
+ int got = (uintptr_t)rreq->netfs_priv;
if (got)
ceph_put_cap_refs(ci, got);
@@ -404,12 +405,12 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
const struct netfs_request_ops ceph_netfs_ops = {
.init_request = ceph_init_request,
+ .free_request = ceph_netfs_free_request,
.begin_cache_operation = ceph_begin_cache_operation,
.issue_read = ceph_netfs_issue_read,
.expand_readahead = ceph_netfs_expand_readahead,
.clamp_length = ceph_netfs_clamp_length,
.check_write_begin = ceph_netfs_check_write_begin,
- .cleanup = ceph_readahead_cleanup,
};
#ifdef CONFIG_CEPH_FSCACHE
@@ -604,8 +605,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
true);
- if (IS_ERR(req))
+ if (IS_ERR(req)) {
+ redirty_page_for_writepage(wbc, page);
return PTR_ERR(req);
+ }
set_page_writeback(page);
if (caching)
@@ -1318,10 +1321,11 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct folio *folio = NULL;
int r;
- r = netfs_write_begin(file, inode->i_mapping, pos, len, &folio, NULL);
+ r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL);
if (r == 0)
folio_wait_fscache(folio);
if (r < 0) {
@@ -1644,7 +1648,7 @@ int ceph_uninline_data(struct file *file)
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_osd_request *req;
+ struct ceph_osd_request *req = NULL;
struct ceph_cap_flush *prealloc_cf;
struct folio *folio = NULL;
u64 inline_version = CEPH_INLINE_NONE;
@@ -1652,10 +1656,23 @@ int ceph_uninline_data(struct file *file)
int err = 0;
u64 len;
+ spin_lock(&ci->i_ceph_lock);
+ inline_version = ci->i_inline_version;
+ spin_unlock(&ci->i_ceph_lock);
+
+ dout("uninline_data %p %llx.%llx inline_version %llu\n",
+ inode, ceph_vinop(inode), inline_version);
+
+ if (inline_version == CEPH_INLINE_NONE)
+ return 0;
+
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
+ if (inline_version == 1) /* initial version, no data */
+ goto out_uninline;
+
folio = read_mapping_folio(inode->i_mapping, 0, file);
if (IS_ERR(folio)) {
err = PTR_ERR(folio);
@@ -1664,17 +1681,6 @@ int ceph_uninline_data(struct file *file)
folio_lock(folio);
- spin_lock(&ci->i_ceph_lock);
- inline_version = ci->i_inline_version;
- spin_unlock(&ci->i_ceph_lock);
-
- dout("uninline_data %p %llx.%llx inline_version %llu\n",
- inode, ceph_vinop(inode), inline_version);
-
- if (inline_version == 1 || /* initial version, no data */
- inline_version == CEPH_INLINE_NONE)
- goto out_unlock;
-
len = i_size_read(inode);
if (len > folio_size(folio))
len = folio_size(folio);
@@ -1739,6 +1745,7 @@ int ceph_uninline_data(struct file *file)
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err);
+out_uninline:
if (!err) {
int dirty;
@@ -1757,8 +1764,10 @@ out_put_req:
if (err == -ECANCELED)
err = 0;
out_unlock:
- folio_unlock(folio);
- folio_put(folio);
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
out:
ceph_free_cap_flush(prealloc_cf);
dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
@@ -1777,7 +1786,6 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
if (!mapping->a_ops->read_folio)
return -ENOEXEC;
- file_accessed(file);
vma->vm_ops = &ceph_vmops;
return 0;
}
@@ -1790,7 +1798,7 @@ enum {
static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
s64 pool, struct ceph_string *pool_ns)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->netfs.inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
struct rb_node **p, *parent;
@@ -1905,7 +1913,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
0, false, true);
err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
- wr_req->r_mtime = ci->vfs_inode.i_mtime;
+ wr_req->r_mtime = ci->netfs.inode.i_mtime;
err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
if (!err)
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index ddea99922073..177d8e8d73fe 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -29,9 +29,9 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
if (!(inode->i_state & I_NEW))
return;
- WARN_ON_ONCE(ci->netfs_ctx.cache);
+ WARN_ON_ONCE(ci->netfs.cache);
- ci->netfs_ctx.cache =
+ ci->netfs.cache =
fscache_acquire_cookie(fsc->fscache, 0,
&ci->i_vino, sizeof(ci->i_vino),
&ci->i_version, sizeof(ci->i_version),
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index 7255b790a4c1..dc502daac49a 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -28,7 +28,7 @@ void ceph_fscache_invalidate(struct inode *inode, bool dio_write);
static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
{
- return netfs_i_cookie(&ci->vfs_inode);
+ return netfs_i_cookie(&ci->netfs);
}
static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 5c14ef04e474..38c930384d41 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -492,7 +492,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
ci->i_hold_caps_max = round_jiffies(jiffies +
opt->caps_wanted_delay_max * HZ);
- dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
+ dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode,
ci->i_hold_caps_max - jiffies);
}
@@ -507,7 +507,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
+ dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode,
ci->i_ceph_flags, ci->i_hold_caps_max);
if (!mdsc->stopping) {
spin_lock(&mdsc->cap_delay_lock);
@@ -531,7 +531,7 @@ no_change:
static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
+ dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode);
spin_lock(&mdsc->cap_delay_lock);
ci->i_ceph_flags |= CEPH_I_FLUSH;
if (!list_empty(&ci->i_cap_delay_list))
@@ -548,7 +548,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
+ dout("__cap_delay_cancel %p\n", &ci->netfs.inode);
if (list_empty(&ci->i_cap_delay_list))
return;
spin_lock(&mdsc->cap_delay_lock);
@@ -568,7 +568,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
* Each time we receive FILE_CACHE anew, we increment
* i_rdcache_gen.
*/
- if (S_ISREG(ci->vfs_inode.i_mode) &&
+ if (S_ISREG(ci->netfs.inode.i_mode) &&
(issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
(had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
ci->i_rdcache_gen++;
@@ -583,14 +583,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
if (issued & CEPH_CAP_FILE_SHARED)
atomic_inc(&ci->i_shared_gen);
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
- dout(" marking %p NOT complete\n", &ci->vfs_inode);
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
+ dout(" marking %p NOT complete\n", &ci->netfs.inode);
__ceph_dir_clear_complete(ci);
}
}
/* Wipe saved layout if we're losing DIR_CREATE caps */
- if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
+ if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
!(issued & CEPH_CAP_DIR_CREATE)) {
ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
@@ -771,7 +771,7 @@ static int __cap_is_valid(struct ceph_cap *cap)
if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
dout("__cap_is_valid %p cap %p issued %s "
- "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
+ "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode,
cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
return 0;
}
@@ -797,7 +797,7 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
if (!__cap_is_valid(cap))
continue;
dout("__ceph_caps_issued %p cap %p issued %s\n",
- &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
+ &ci->netfs.inode, cap, ceph_cap_string(cap->issued));
have |= cap->issued;
if (implemented)
*implemented |= cap->implemented;
@@ -844,12 +844,12 @@ static void __touch_cap(struct ceph_cap *cap)
spin_lock(&s->s_cap_lock);
if (!s->s_cap_iterator) {
- dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
+ dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap,
s->s_mds);
list_move_tail(&cap->session_caps, &s->s_caps);
} else {
dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
- &cap->ci->vfs_inode, cap, s->s_mds);
+ &cap->ci->netfs.inode, cap, s->s_mds);
}
spin_unlock(&s->s_cap_lock);
}
@@ -867,7 +867,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
if ((have & mask) == mask) {
dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
- " (mask %s)\n", ceph_ino(&ci->vfs_inode),
+ " (mask %s)\n", ceph_ino(&ci->netfs.inode),
ceph_cap_string(have),
ceph_cap_string(mask));
return 1;
@@ -879,7 +879,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
continue;
if ((cap->issued & mask) == mask) {
dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
- " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
+ " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap,
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch)
@@ -891,7 +891,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
have |= cap->issued;
if ((have & mask) == mask) {
dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
- " (mask %s)\n", ceph_ino(&ci->vfs_inode),
+ " (mask %s)\n", ceph_ino(&ci->netfs.inode),
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch) {
@@ -919,7 +919,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
int touch)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
int r;
r = __ceph_caps_issued_mask(ci, mask, touch);
@@ -950,7 +950,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
int ret;
spin_lock(&ci->i_ceph_lock);
@@ -969,8 +969,8 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
if (ci->i_rd_ref)
used |= CEPH_CAP_FILE_RD;
if (ci->i_rdcache_ref ||
- (S_ISREG(ci->vfs_inode.i_mode) &&
- ci->vfs_inode.i_data.nrpages))
+ (S_ISREG(ci->netfs.inode.i_mode) &&
+ ci->netfs.inode.i_data.nrpages))
used |= CEPH_CAP_FILE_CACHE;
if (ci->i_wr_ref)
used |= CEPH_CAP_FILE_WR;
@@ -993,11 +993,11 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
struct ceph_mount_options *opt =
- ceph_inode_to_client(&ci->vfs_inode)->mount_options;
+ ceph_inode_to_client(&ci->netfs.inode)->mount_options;
unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
int want = 0;
/* use used_cutoff here, to keep dir's wanted caps longer */
@@ -1050,7 +1050,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
int __ceph_caps_wanted(struct ceph_inode_info *ci)
{
int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
/* we want EXCL if holding caps of dir ops */
if (w & CEPH_CAP_ANY_DIR_OPS)
w |= CEPH_CAP_FILE_EXCL;
@@ -1116,9 +1116,9 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
lockdep_assert_held(&ci->i_ceph_lock);
- dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+ dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode);
- mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc;
+ mdsc = ceph_inode_to_client(&ci->netfs.inode)->mdsc;
/* remove from inode's cap rbtree, and clear auth cap */
rb_erase(&cap->ci_node, &ci->i_caps);
@@ -1169,7 +1169,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
* keep i_snap_realm.
*/
if (ci->i_wr_ref == 0 && ci->i_snap_realm)
- ceph_change_snap_realm(&ci->vfs_inode, NULL);
+ ceph_change_snap_realm(&ci->netfs.inode, NULL);
__cap_delay_cancel(mdsc, ci);
}
@@ -1188,11 +1188,11 @@ void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
lockdep_assert_held(&ci->i_ceph_lock);
- fsc = ceph_inode_to_client(&ci->vfs_inode);
+ fsc = ceph_inode_to_client(&ci->netfs.inode);
WARN_ON_ONCE(ci->i_auth_cap == cap &&
!list_empty(&ci->i_dirty_item) &&
!fsc->blocklisted &&
- !ceph_inode_is_shutdown(&ci->vfs_inode));
+ !ceph_inode_is_shutdown(&ci->netfs.inode));
__ceph_remove_cap(cap, queue_release);
}
@@ -1343,7 +1343,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
int flushing, u64 flush_tid, u64 oldest_flush_tid)
{
struct ceph_inode_info *ci = cap->ci;
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
int held, revoking;
lockdep_assert_held(&ci->i_ceph_lock);
@@ -1440,7 +1440,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
{
struct ceph_msg *msg;
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
if (!msg) {
@@ -1528,7 +1528,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_cap_snap *capsnap;
u64 oldest_flush_tid = 0;
@@ -1577,7 +1577,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
while (first_tid <= last_tid) {
struct ceph_cap *cap = ci->i_auth_cap;
- struct ceph_cap_flush *cf;
+ struct ceph_cap_flush *cf = NULL, *iter;
int ret;
if (!(cap && cap->session == session)) {
@@ -1587,8 +1587,9 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
}
ret = -ENOENT;
- list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
- if (cf->tid >= first_tid) {
+ list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) {
+ if (iter->tid >= first_tid) {
+ cf = iter;
ret = 0;
break;
}
@@ -1621,7 +1622,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
void ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_session *session = NULL;
int mds;
@@ -1681,8 +1682,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
struct ceph_cap_flush **pcf)
{
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
- struct inode *inode = &ci->vfs_inode;
+ ceph_sb_to_client(ci->netfs.inode.i_sb)->mdsc;
+ struct inode *inode = &ci->netfs.inode;
int was = ci->i_dirty_caps;
int dirty = 0;
@@ -1695,7 +1696,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
return 0;
}
- dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
+ dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode,
ceph_cap_string(mask), ceph_cap_string(was),
ceph_cap_string(was | mask));
ci->i_dirty_caps |= mask;
@@ -1711,7 +1712,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
ci->i_snap_realm->cached_context);
}
dout(" inode %p now dirty snapc %p auth cap %p\n",
- &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
+ &ci->netfs.inode, ci->i_head_snapc, ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
list_add(&ci->i_dirty_item, &session->s_cap_dirty);
@@ -1874,7 +1875,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
bool __ceph_should_report_size(struct ceph_inode_info *ci)
{
- loff_t size = i_size_read(&ci->vfs_inode);
+ loff_t size = i_size_read(&ci->netfs.inode);
/* mds will adjust max size according to the reported size */
if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
return false;
@@ -1899,7 +1900,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_cap *cap;
u64 flush_tid, oldest_flush_tid;
@@ -1910,6 +1911,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct rb_node *p;
bool queue_invalidate = false;
bool tried_invalidate = false;
+ bool queue_writeback = false;
if (session)
ceph_get_mds_session(session);
@@ -2062,10 +2064,27 @@ retry:
}
/* completed revocation? going down and there are no caps? */
- if (revoking && (revoking & cap_used) == 0) {
- dout("completed revocation of %s\n",
- ceph_cap_string(cap->implemented & ~cap->issued));
- goto ack;
+ if (revoking) {
+ if ((revoking & cap_used) == 0) {
+ dout("completed revocation of %s\n",
+ ceph_cap_string(cap->implemented & ~cap->issued));
+ goto ack;
+ }
+
+ /*
+ * If the "i_wrbuffer_ref" was increased by mmap or generic
+ * cache write just before the ceph_check_caps() is called,
+ * the Fb capability revoking will fail this time. Then we
+ * must wait for the BDI's delayed work to flush the dirty
+ * pages and to release the "i_wrbuffer_ref", which will cost
+ * at most 5 seconds. That means the MDS needs to wait at
+ * most 5 seconds to finished the Fb capability's revocation.
+ *
+ * Let's queue a writeback for it.
+ */
+ if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
+ (revoking & CEPH_CAP_FILE_BUFFER))
+ queue_writeback = true;
}
/* want more caps from mds? */
@@ -2135,6 +2154,8 @@ ack:
spin_unlock(&ci->i_ceph_lock);
ceph_put_mds_session(session);
+ if (queue_writeback)
+ ceph_queue_writeback(inode);
if (queue_invalidate)
ceph_queue_invalidate(inode);
}
@@ -2218,9 +2239,9 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
}
/*
- * wait for any unsafe requests to complete.
+ * flush the mdlog and wait for any unsafe requests to complete.
*/
-static int unsafe_request_wait(struct inode *inode)
+static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -2336,7 +2357,7 @@ retry:
kfree(sessions);
}
- dout("unsafe_request_wait %p wait on tid %llu %llu\n",
+ dout("%s %p wait on tid %llu %llu\n", __func__,
inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
if (req1) {
ret = !wait_for_completion_timeout(&req1->r_safe_completion,
@@ -2380,7 +2401,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
- err = unsafe_request_wait(inode);
+ err = flush_mdlog_and_wait_inode_unsafe_requests(inode);
/*
* only wait on non-file metadata writeback (the mds
@@ -2446,7 +2467,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_cap *cap;
struct ceph_cap_flush *cf;
int ret;
@@ -2539,7 +2560,7 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
pr_err("%p auth cap %p not mds%d ???\n",
- &ci->vfs_inode, cap, session->s_mds);
+ &ci->netfs.inode, cap, session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2589,7 +2610,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
pr_err("%p auth cap %p not mds%d ???\n",
- &ci->vfs_inode, cap, session->s_mds);
+ &ci->netfs.inode, cap, session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2609,7 +2630,7 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
lockdep_assert_held(&ci->i_ceph_lock);
- dout("%s %p flushing %s\n", __func__, &ci->vfs_inode,
+ dout("%s %p flushing %s\n", __func__, &ci->netfs.inode,
ceph_cap_string(ci->i_flushing_caps));
if (!list_empty(&ci->i_cap_flush_list)) {
@@ -2652,10 +2673,10 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
}
if (got & CEPH_CAP_FILE_BUFFER) {
if (ci->i_wb_ref == 0)
- ihold(&ci->vfs_inode);
+ ihold(&ci->netfs.inode);
ci->i_wb_ref++;
dout("%s %p wb %d -> %d (?)\n", __func__,
- &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
+ &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref);
}
}
@@ -2983,7 +3004,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
return ret;
}
- if (S_ISREG(ci->vfs_inode.i_mode) &&
+ if (S_ISREG(ci->netfs.inode.i_mode) &&
ci->i_inline_version != CEPH_INLINE_NONE &&
(_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
i_size_read(inode) > 0) {
@@ -3073,7 +3094,7 @@ enum put_cap_refs_mode {
static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
enum put_cap_refs_mode mode)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
int last = 0, put = 0, flushsnaps = 0, wake = 0;
bool check_flushsnaps = false;
@@ -3181,11 +3202,10 @@ void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had)
void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc)
{
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap_snap *capsnap = NULL;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_cap_snap *capsnap = NULL, *iter;
int put = 0;
bool last = false;
- bool found = false;
bool flush_snaps = false;
bool complete_capsnap = false;
@@ -3212,14 +3232,14 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
last ? " LAST" : "");
} else {
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- if (capsnap->context == snapc) {
- found = true;
+ list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
+ if (iter->context == snapc) {
+ capsnap = iter;
break;
}
}
- if (!found) {
+ if (!capsnap) {
/*
* The capsnap should already be removed when removing
* auth cap in the case of a forced unmount.
@@ -3678,7 +3698,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
session->s_mds,
&list_first_entry(&session->s_cap_flushing,
struct ceph_inode_info,
- i_flushing_item)->vfs_inode);
+ i_flushing_item)->netfs.inode);
}
}
mdsc->num_cap_flushing--;
@@ -3769,8 +3789,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
u64 follows = le64_to_cpu(m->snap_follows);
- struct ceph_cap_snap *capsnap;
- bool flushed = false;
+ struct ceph_cap_snap *capsnap = NULL, *iter;
bool wake_ci = false;
bool wake_mdsc = false;
@@ -3778,26 +3797,26 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
inode, ci, session->s_mds, follows);
spin_lock(&ci->i_ceph_lock);
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- if (capsnap->follows == follows) {
- if (capsnap->cap_flush.tid != flush_tid) {
+ list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
+ if (iter->follows == follows) {
+ if (iter->cap_flush.tid != flush_tid) {
dout(" cap_snap %p follows %lld tid %lld !="
- " %lld\n", capsnap, follows,
- flush_tid, capsnap->cap_flush.tid);
+ " %lld\n", iter, follows,
+ flush_tid, iter->cap_flush.tid);
break;
}
- flushed = true;
+ capsnap = iter;
break;
} else {
dout(" skipping cap_snap %p follows %lld\n",
- capsnap, capsnap->follows);
+ iter, iter->follows);
}
}
- if (flushed)
+ if (capsnap)
ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc);
spin_unlock(&ci->i_ceph_lock);
- if (flushed) {
+ if (capsnap) {
ceph_put_snap_context(capsnap->context);
ceph_put_cap_snap(capsnap);
if (wake_ci)
@@ -4326,7 +4345,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
break;
list_del_init(&ci->i_cap_delay_list);
- inode = igrab(&ci->vfs_inode);
+ inode = igrab(&ci->netfs.inode);
if (inode) {
spin_unlock(&mdsc->cap_delay_lock);
dout("check_delayed_caps on %p\n", inode);
@@ -4354,7 +4373,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
while (!list_empty(&s->s_cap_dirty)) {
ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
i_dirty_item);
- inode = &ci->vfs_inode;
+ inode = &ci->netfs.inode;
ihold(inode);
dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));
spin_unlock(&mdsc->cap_dirty_lock);
@@ -4388,7 +4407,7 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
int bits = (fmode << 1) | 1;
bool already_opened = false;
int i;
@@ -4422,7 +4441,7 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
*/
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
int bits = (fmode << 1) | 1;
bool is_closed = true;
int i;
@@ -4637,7 +4656,7 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali
lockdep_assert_held(&ci->i_ceph_lock);
dout("removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->vfs_inode);
+ cap, ci, &ci->netfs.inode);
is_auth = (cap == ci->i_auth_cap);
__ceph_remove_cap(cap, false);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8c8226c0feac..da59e836a06e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -205,7 +205,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mount_options *opt =
- ceph_inode_to_client(&ci->vfs_inode)->mount_options;
+ ceph_inode_to_client(&ci->netfs.inode)->mount_options;
struct ceph_file_info *fi;
int ret;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 63113e2a4890..56c53ab3618e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -176,7 +176,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
rb_insert_color(&frag->node, &ci->i_fragtree);
dout("get_or_create_frag added %llx.%llx frag %x\n",
- ceph_vinop(&ci->vfs_inode), f);
+ ceph_vinop(&ci->netfs.inode), f);
return frag;
}
@@ -457,10 +457,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
if (!ci)
return NULL;
- dout("alloc_inode %p\n", &ci->vfs_inode);
+ dout("alloc_inode %p\n", &ci->netfs.inode);
/* Set parameters for the netfs library */
- netfs_i_context_init(&ci->vfs_inode, &ceph_netfs_ops);
+ netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
spin_lock_init(&ci->i_ceph_lock);
@@ -547,7 +547,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_WORK(&ci->i_work, ceph_inode_work);
ci->i_work_mask = 0;
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
- return &ci->vfs_inode;
+ return &ci->netfs.inode;
}
void ceph_free_inode(struct inode *inode)
@@ -578,7 +578,7 @@ void ceph_evict_inode(struct inode *inode)
__ceph_remove_caps(ci);
- if (__ceph_has_any_quota(ci))
+ if (__ceph_has_quota(ci, QUOTA_GET_ANY))
ceph_adjust_quota_realms_count(inode, false);
/*
@@ -1466,10 +1466,12 @@ retry_lookup:
} else if (have_lease) {
if (d_unhashed(dn))
d_add(dn, NULL);
+ }
+
+ if (!d_unhashed(dn) && have_lease)
update_dentry_lease(dir, dn,
rinfo->dlease, session,
req->r_request_started);
- }
goto done;
}
@@ -1884,7 +1886,6 @@ static void ceph_do_invalidate_pages(struct inode *inode)
orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
- ceph_fscache_invalidate(inode, false);
if (invalidate_inode_pages2(inode->i_mapping) < 0) {
pr_err("invalidate_inode_pages2 %llx.%llx failed\n",
ceph_vinop(inode));
@@ -1977,7 +1978,7 @@ static void ceph_inode_work(struct work_struct *work)
{
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
i_work);
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
dout("writeback %p\n", inode);
@@ -2258,6 +2259,30 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return err;
}
+int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
+{
+ int issued = ceph_caps_issued(ceph_inode(inode));
+
+ /*
+ * If any 'x' caps is issued we can just choose the auth MDS
+ * instead of the random replica MDSes. Because only when the
+ * Locker is in LOCK_EXEC state will the loner client could
+ * get the 'x' caps. And if we send the getattr requests to
+ * any replica MDS it must auth pin and tries to rdlock from
+ * the auth MDS, and then the auth MDS need to do the Locker
+ * state transition to LOCK_SYNC. And after that the lock state
+ * will change back.
+ *
+ * This cost much when doing the Locker state transition and
+ * usually will need to revoke caps from clients.
+ */
+ if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
+ || (mask & CEPH_STAT_RSTAT))
+ return USE_AUTH_MDS;
+ else
+ return USE_ANY_MDS;
+}
+
/*
* Verify that we have a lease on the given mask. If not,
* do a getattr against an mds.
@@ -2281,7 +2306,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
return 0;
- mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
+ mode = ceph_try_to_choose_auth_mds(inode, mask);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -2423,7 +2448,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
return -ESTALE;
/* Skip the getattr altogether if we're asked not to sync */
- if (!(flags & AT_STATX_DONT_SYNC)) {
+ if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
err = ceph_do_getattr(inode,
statx_to_caps(request_mask, inode->i_mode),
flags & AT_STATX_FORCE_SYNC);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 00c3de177dd6..33f517d549ce 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -437,7 +437,7 @@ static int ceph_parse_deleg_inos(void **p, void *end,
ceph_decode_32_safe(p, end, sets, bad);
dout("got %u sets of delegated inodes\n", sets);
while (sets--) {
- u64 start, len, ino;
+ u64 start, len;
ceph_decode_64_safe(p, end, start, bad);
ceph_decode_64_safe(p, end, len, bad);
@@ -449,7 +449,7 @@ static int ceph_parse_deleg_inos(void **p, void *end,
continue;
}
while (len--) {
- int err = xa_insert(&s->s_delegated_inos, ino = start++,
+ int err = xa_insert(&s->s_delegated_inos, start++,
DELEGATED_INO_AVAILABLE,
GFP_KERNEL);
if (!err) {
@@ -1564,7 +1564,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
p = session->s_caps.next;
while (p != &session->s_caps) {
cap = list_entry(p, struct ceph_cap, session_caps);
- inode = igrab(&cap->ci->vfs_inode);
+ inode = igrab(&cap->ci->netfs.inode);
if (!inode) {
p = p->next;
continue;
@@ -1622,7 +1622,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
int iputs;
dout("removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->vfs_inode);
+ cap, ci, &ci->netfs.inode);
spin_lock(&ci->i_ceph_lock);
iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
spin_unlock(&ci->i_ceph_lock);
@@ -2651,7 +2651,28 @@ static int __prepare_send_request(struct ceph_mds_session *session,
struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_mds_request_head_old *rhead;
struct ceph_msg *msg;
- int flags = 0;
+ int flags = 0, max_retry;
+
+ /*
+ * The type of 'r_attempts' in kernel 'ceph_mds_request'
+ * is 'int', while in 'ceph_mds_request_head' the type of
+ * 'num_retry' is '__u8'. So in case the request retries
+ * exceeding 256 times, the MDS will receive a incorrect
+ * retry seq.
+ *
+ * In this case it's ususally a bug in MDS and continue
+ * retrying the request makes no sense.
+ *
+ * In future this could be fixed in ceph code, so avoid
+ * using the hardcode here.
+ */
+ max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
+ max_retry = 1 << (max_retry * BITS_PER_BYTE);
+ if (req->r_attempts >= max_retry) {
+ pr_warn_ratelimited("%s request tid %llu seq overflow\n",
+ __func__, req->r_tid);
+ return -EMULTIHOP;
+ }
req->r_attempts++;
if (req->r_inode) {
@@ -2663,7 +2684,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
else
req->r_sent_on_mseq = -1;
}
- dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
+ dout("%s %p tid %lld %s (attempt %d)\n", __func__, req,
req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
@@ -3265,6 +3286,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
int err = -EINVAL;
void *p = msg->front.iov_base;
void *end = p + msg->front.iov_len;
+ bool aborted = false;
ceph_decode_need(&p, end, 2*sizeof(u32), bad);
next_mds = ceph_decode_32(&p);
@@ -3273,16 +3295,41 @@ static void handle_forward(struct ceph_mds_client *mdsc,
mutex_lock(&mdsc->mutex);
req = lookup_get_request(mdsc, tid);
if (!req) {
+ mutex_unlock(&mdsc->mutex);
dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
- goto out; /* dup reply? */
+ return; /* dup reply? */
}
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
dout("forward tid %llu aborted, unregistering\n", tid);
__unregister_request(mdsc, req);
} else if (fwd_seq <= req->r_num_fwd) {
- dout("forward tid %llu to mds%d - old seq %d <= %d\n",
- tid, next_mds, req->r_num_fwd, fwd_seq);
+ /*
+ * The type of 'num_fwd' in ceph 'MClientRequestForward'
+ * is 'int32_t', while in 'ceph_mds_request_head' the
+ * type is '__u8'. So in case the request bounces between
+ * MDSes exceeding 256 times, the client will get stuck.
+ *
+ * In this case it's ususally a bug in MDS and continue
+ * bouncing the request makes no sense.
+ *
+ * In future this could be fixed in ceph code, so avoid
+ * using the hardcode here.
+ */
+ int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
+ max = 1 << (max * BITS_PER_BYTE);
+ if (req->r_num_fwd >= max) {
+ mutex_lock(&req->r_fill_mutex);
+ req->r_err = -EMULTIHOP;
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+ aborted = true;
+ pr_warn_ratelimited("forward tid %llu seq overflow\n",
+ tid);
+ } else {
+ dout("forward tid %llu to mds%d - old seq %d <= %d\n",
+ tid, next_mds, req->r_num_fwd, fwd_seq);
+ }
} else {
/* resend. forward race not possible; mds would drop */
dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
@@ -3294,9 +3341,12 @@ static void handle_forward(struct ceph_mds_client *mdsc,
put_request_session(req);
__do_request(mdsc, req);
}
- ceph_mdsc_put_request(req);
-out:
mutex_unlock(&mdsc->mutex);
+
+ /* kick calling process */
+ if (aborted)
+ complete_request(mdsc, req);
+ ceph_mdsc_put_request(req);
return;
bad:
@@ -3375,13 +3425,17 @@ static void handle_session(struct ceph_mds_session *session,
}
if (msg_version >= 5) {
- u32 flags;
- /* version >= 4, struct_v, struct_cv, len, metric_spec */
- ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 2, bad);
+ u32 flags, len;
+
+ /* version >= 4 */
+ ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */
+ ceph_decode_32_safe(&p, end, len, bad); /* len */
+ ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */
+
/* version >= 5, flags */
- ceph_decode_32_safe(&p, end, flags, bad);
+ ceph_decode_32_safe(&p, end, flags, bad);
if (flags & CEPH_SESSION_BLOCKLISTED) {
- pr_warn("mds%d session blocklisted\n", session->s_mds);
+ pr_warn("mds%d session blocklisted\n", session->s_mds);
blocklisted = true;
}
}
@@ -4396,12 +4450,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
memcpy((void *)(lease + 1) + 4,
dentry->d_name.name, dentry->d_name.len);
spin_unlock(&dentry->d_lock);
- /*
- * if this is a preemptive lease RELEASE, no need to
- * flush request stream, since the actual request will
- * soon follow.
- */
- msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
ceph_con_send(&session->s_con, msg);
}
@@ -4696,15 +4744,17 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
}
/*
- * wait for all write mds requests to flush.
+ * flush the mdlog and wait for all write mds requests to flush.
*/
-static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
+static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
+ u64 want_tid)
{
struct ceph_mds_request *req = NULL, *nextreq;
+ struct ceph_mds_session *last_session = NULL;
struct rb_node *n;
mutex_lock(&mdsc->mutex);
- dout("wait_unsafe_requests want %lld\n", want_tid);
+ dout("%s want %lld\n", __func__, want_tid);
restart:
req = __get_oldest_req(mdsc);
while (req && req->r_tid <= want_tid) {
@@ -4716,14 +4766,32 @@ restart:
nextreq = NULL;
if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
(req->r_op & CEPH_MDS_OP_WRITE)) {
+ struct ceph_mds_session *s = req->r_session;
+
+ if (!s) {
+ req = nextreq;
+ continue;
+ }
+
/* write op */
ceph_mdsc_get_request(req);
if (nextreq)
ceph_mdsc_get_request(nextreq);
+ s = ceph_get_mds_session(s);
mutex_unlock(&mdsc->mutex);
- dout("wait_unsafe_requests wait on %llu (want %llu)\n",
+
+ /* send flush mdlog request to MDS */
+ if (last_session != s) {
+ send_flush_mdlog(s);
+ ceph_put_mds_session(last_session);
+ last_session = s;
+ } else {
+ ceph_put_mds_session(s);
+ }
+ dout("%s wait on %llu (want %llu)\n", __func__,
req->r_tid, want_tid);
wait_for_completion(&req->r_safe_completion);
+
mutex_lock(&mdsc->mutex);
ceph_mdsc_put_request(req);
if (!nextreq)
@@ -4738,7 +4806,8 @@ restart:
req = nextreq;
}
mutex_unlock(&mdsc->mutex);
- dout("wait_unsafe_requests done\n");
+ ceph_put_mds_session(last_session);
+ dout("%s done\n", __func__);
}
void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
@@ -4767,7 +4836,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
dout("sync want tid %lld flush_seq %lld\n",
want_tid, want_flush);
- wait_unsafe_requests(mdsc, want_tid);
+ flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
wait_caps_flush(mdsc, want_flush);
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 33497846e47e..1140aecd82ce 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -579,7 +579,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
struct ceph_inode_info *ci = ceph_inode(inode);
return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
- TASK_INTERRUPTIBLE);
+ TASK_KILLABLE);
}
extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index a338a3ec0dc4..64592adfe48f 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -195,9 +195,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
/*
* This function walks through the snaprealm for an inode and returns the
- * ceph_snap_realm for the first snaprealm that has quotas set (either max_files
- * or max_bytes). If the root is reached, return the root ceph_snap_realm
- * instead.
+ * ceph_snap_realm for the first snaprealm that has quotas set (max_files,
+ * max_bytes, or any, depending on the 'which_quota' argument). If the root is
+ * reached, return the root ceph_snap_realm instead.
*
* Note that the caller is responsible for calling ceph_put_snap_realm() on the
* returned realm.
@@ -209,7 +209,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
* will be restarted.
*/
static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
- struct inode *inode, bool retry)
+ struct inode *inode,
+ enum quota_get_realm which_quota,
+ bool retry)
{
struct ceph_inode_info *ci = NULL;
struct ceph_snap_realm *realm, *next;
@@ -248,7 +250,7 @@ restart:
}
ci = ceph_inode(in);
- has_quota = __ceph_has_any_quota(ci);
+ has_quota = __ceph_has_quota(ci, which_quota);
iput(in);
next = realm->parent;
@@ -279,8 +281,8 @@ restart:
* dropped and we can then restart the whole operation.
*/
down_read(&mdsc->snap_rwsem);
- old_realm = get_quota_realm(mdsc, old, true);
- new_realm = get_quota_realm(mdsc, new, false);
+ old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true);
+ new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false);
if (PTR_ERR(new_realm) == -EAGAIN) {
up_read(&mdsc->snap_rwsem);
if (old_realm)
@@ -483,7 +485,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
bool is_updated = false;
down_read(&mdsc->snap_rwsem);
- realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true);
+ realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root),
+ QUOTA_GET_MAX_BYTES, true);
up_read(&mdsc->snap_rwsem);
if (!realm)
return false;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 322ee5add942..864cdaa0d2bd 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -521,7 +521,7 @@ static bool has_new_snaps(struct ceph_snap_context *o,
static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap **pcapsnap)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_snap_context *old_snapc, *new_snapc;
struct ceph_cap_snap *capsnap = *pcapsnap;
struct ceph_buffer *old_blob = NULL;
@@ -652,7 +652,7 @@ update_snapc:
int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
BUG_ON(capsnap->writing);
@@ -712,7 +712,7 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
spin_lock(&realm->inodes_with_caps_lock);
list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) {
- struct inode *inode = igrab(&ci->vfs_inode);
+ struct inode *inode = igrab(&ci->netfs.inode);
if (!inode)
continue;
spin_unlock(&realm->inodes_with_caps_lock);
@@ -904,7 +904,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
while (!list_empty(&mdsc->snap_flush_list)) {
ci = list_first_entry(&mdsc->snap_flush_list,
struct ceph_inode_info, i_snap_flush_item);
- inode = &ci->vfs_inode;
+ inode = &ci->netfs.inode;
ihold(inode);
spin_unlock(&mdsc->snap_flush_lock);
ceph_flush_snaps(ci, &session);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index e6987d295079..40140805bdcf 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -876,7 +876,7 @@ mempool_t *ceph_wb_pagevec_pool;
static void ceph_inode_init_once(void *foo)
{
struct ceph_inode_info *ci = foo;
- inode_init_once(&ci->vfs_inode);
+ inode_init_once(&ci->netfs.inode);
}
static int __init init_caches(void)
@@ -1119,6 +1119,7 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc)
s->s_time_gran = 1;
s->s_time_min = 0;
s->s_time_max = U32_MAX;
+ s->s_flags |= SB_NODIRATIME | SB_NOATIME;
ret = set_anon_super_fc(s, fc);
if (ret != 0)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 20ceab74e871..f59dac66955b 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -316,11 +316,7 @@ struct ceph_inode_xattrs_info {
* Ceph inode.
*/
struct ceph_inode_info {
- struct {
- /* These must be contiguous */
- struct inode vfs_inode;
- struct netfs_i_context netfs_ctx; /* Netfslib context */
- };
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct ceph_vino i_vino; /* ceph ino + snap */
spinlock_t i_ceph_lock;
@@ -436,7 +432,7 @@ struct ceph_inode_info {
static inline struct ceph_inode_info *
ceph_inode(const struct inode *inode)
{
- return container_of(inode, struct ceph_inode_info, vfs_inode);
+ return container_of(inode, struct ceph_inode_info, netfs.inode);
}
static inline struct ceph_fs_client *
@@ -1022,6 +1018,7 @@ static inline void ceph_queue_flush_snaps(struct inode *inode)
ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS);
}
+extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask);
extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
int mask, bool force);
static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
@@ -1278,9 +1275,29 @@ extern void ceph_fs_debugfs_init(struct ceph_fs_client *client);
extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
/* quota.c */
-static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci)
+
+enum quota_get_realm {
+ QUOTA_GET_MAX_FILES,
+ QUOTA_GET_MAX_BYTES,
+ QUOTA_GET_ANY
+};
+
+static inline bool __ceph_has_quota(struct ceph_inode_info *ci,
+ enum quota_get_realm which)
{
- return ci->i_max_files || ci->i_max_bytes;
+ bool has_quota = false;
+
+ switch (which) {
+ case QUOTA_GET_MAX_BYTES:
+ has_quota = !!ci->i_max_bytes;
+ break;
+ case QUOTA_GET_MAX_FILES:
+ has_quota = !!ci->i_max_files;
+ break;
+ default:
+ has_quota = !!(ci->i_max_files || ci->i_max_bytes);
+ }
+ return has_quota;
}
extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc);
@@ -1289,13 +1306,13 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci,
u64 max_bytes, u64 max_files)
{
bool had_quota, has_quota;
- had_quota = __ceph_has_any_quota(ci);
+ had_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);
ci->i_max_bytes = max_bytes;
ci->i_max_files = max_files;
- has_quota = __ceph_has_any_quota(ci);
+ has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);
if (had_quota != has_quota)
- ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota);
+ ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota);
}
extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index afec84088471..f141f5246163 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -57,7 +57,7 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_string *pool_ns;
s64 pool = ci->i_layout.pool_id;
@@ -69,7 +69,7 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
- dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
+ dout("ceph_vxattrcb_layout %p\n", &ci->netfs.inode);
down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name) {
@@ -161,7 +161,7 @@ static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
char *val, size_t size)
{
ssize_t ret;
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
s64 pool = ci->i_layout.pool_id;
const char *pool_name;
@@ -313,7 +313,7 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
char *val, size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid);
}
@@ -321,7 +321,7 @@ static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci,
char *val, size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
return ceph_fmt_xattr(val, size, "client%lld",
ceph_client_gid(fsc->client));
@@ -366,6 +366,14 @@ static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci,
}
#define XATTR_RSTAT_FIELD(_type, _name) \
XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT)
+#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .exists_cb = NULL, \
+ .flags = VXATTR_FLAG_RSTAT, \
+ }
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
.name = CEPH_XATTR_NAME2(_type, _name, _field), \
@@ -404,7 +412,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_RSTAT_FIELD(dir, rsubdirs),
XATTR_RSTAT_FIELD(dir, rsnaps),
XATTR_RSTAT_FIELD(dir, rbytes),
- XATTR_RSTAT_FIELD(dir, rctime),
+ XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime),
{
.name = "ceph.dir.pin",
.name_size = sizeof("ceph.dir.pin"),
@@ -621,7 +629,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
}
dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n",
- ceph_vinop(&ci->vfs_inode), xattr, name_len, name, val_len, val);
+ ceph_vinop(&ci->netfs.inode), xattr, name_len, name, val_len, val);
return 0;
}
@@ -863,7 +871,7 @@ struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci)
struct ceph_buffer *old_blob = NULL;
void *dest;
- dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
+ dout("__build_xattrs_blob %p\n", &ci->netfs.inode);
if (ci->i_xattrs.dirty) {
int need = __get_required_blob_size(ci, 0, 0);
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index cc8fdcb35b71..8c9f2c00be72 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_CIFS) += cifs.o
cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \
inode.o link.o misc.o netmisc.o smbencrypt.o transport.o \
cifs_unicode.o nterr.o cifsencrypt.o \
- readdir.o ioctl.o sess.o export.o smb1ops.o unc.o winucase.o \
+ readdir.o ioctl.o sess.o export.o unc.o winucase.o \
smb2ops.o smb2maperror.o smb2transport.o \
smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \
dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o
@@ -30,3 +30,5 @@ cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o
cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o
cifs-$(CONFIG_CIFS_ROOT) += cifsroot.o
+
+cifs-$(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) += smb1ops.o
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 9d334816eac0..2cfbac8bb965 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -116,7 +116,8 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
tcon->ses->server->ops->dump_share_caps(m, tcon);
if (tcon->use_witness)
seq_puts(m, " Witness");
-
+ if (tcon->broken_sparse_sup)
+ seq_puts(m, " nosparse");
if (tcon->need_reconnect)
seq_puts(m, "\tDISCONNECTED ");
seq_putc(m, '\n');
@@ -161,6 +162,8 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface)
seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr);
else if (iface->sockaddr.ss_family == AF_INET6)
seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr);
+ if (!iface->is_active)
+ seq_puts(m, "\t\t[for-cleanup]\n");
}
static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
@@ -220,6 +223,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
+ struct cifs_server_iface *iface;
int c, i, j;
seq_puts(m,
@@ -386,7 +390,7 @@ skip_rdma:
(ses->serverNOS == NULL)) {
seq_printf(m, "\n\t%d) Address: %s Uses: %d Capability: 0x%x\tSession Status: %d ",
i, ses->ip_addr, ses->ses_count,
- ses->capabilities, ses->status);
+ ses->capabilities, ses->ses_status);
if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST)
seq_printf(m, "Guest ");
else if (ses->session_flags & SMB2_SESSION_FLAG_IS_NULL)
@@ -398,7 +402,7 @@ skip_rdma:
"\n\tSMB session status: %d ",
i, ses->ip_addr, ses->serverDomain,
ses->ses_count, ses->serverOS, ses->serverNOS,
- ses->capabilities, ses->status);
+ ses->capabilities, ses->ses_status);
}
seq_printf(m, "\n\tSecurity type: %s ",
@@ -418,6 +422,8 @@ skip_rdma:
spin_lock(&ses->chan_lock);
if (CIFS_CHAN_NEEDS_RECONNECT(ses, 0))
seq_puts(m, "\tPrimary channel: DISCONNECTED ");
+ if (CIFS_CHAN_IN_RECONNECT(ses, 0))
+ seq_puts(m, "\t[RECONNECTING] ");
if (ses->chan_count > 1) {
seq_printf(m, "\n\n\tExtra Channels: %zu ",
@@ -426,6 +432,8 @@ skip_rdma:
cifs_dump_channel(m, j, &ses->chans[j]);
if (CIFS_CHAN_NEEDS_RECONNECT(ses, j))
seq_puts(m, "\tDISCONNECTED ");
+ if (CIFS_CHAN_IN_RECONNECT(ses, j))
+ seq_puts(m, "\t[RECONNECTING] ");
}
}
spin_unlock(&ses->chan_lock);
@@ -451,11 +459,10 @@ skip_rdma:
if (ses->iface_count)
seq_printf(m, "\n\n\tServer interfaces: %zu",
ses->iface_count);
- for (j = 0; j < ses->iface_count; j++) {
- struct cifs_server_iface *iface;
-
- iface = &ses->iface_list[j];
- seq_printf(m, "\n\t%d)", j+1);
+ j = 0;
+ list_for_each_entry(iface, &ses->iface_list,
+ iface_head) {
+ seq_printf(m, "\n\t%d)", ++j);
cifs_dump_iface(m, iface);
if (is_ses_using_iface(ses, iface))
seq_puts(m, "\t\t[CONNECTED]\n");
diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c
index 180c234c2f46..1e4c7cc5287f 100644
--- a/fs/cifs/cifs_swn.c
+++ b/fs/cifs/cifs_swn.c
@@ -465,7 +465,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a
int ret = 0;
/* Store the reconnect address */
- mutex_lock(&tcon->ses->server->srv_mutex);
+ cifs_server_lock(tcon->ses->server);
if (cifs_sockaddr_equal(&tcon->ses->server->dstaddr, addr))
goto unlock;
@@ -501,7 +501,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a
cifs_signal_cifsd_for_reconnect(tcon->ses->server, false);
unlock:
- mutex_unlock(&tcon->ses->server->srv_mutex);
+ cifs_server_unlock(tcon->ses->server);
return ret;
}
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 0912d8bbbac1..663cb9db4908 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -236,9 +236,9 @@ int cifs_verify_signature(struct smb_rqst *rqst,
cpu_to_le32(expected_sequence_number);
cifs_pdu->Signature.Sequence.Reserved = 0;
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
rc = cifs_calc_signature(rqst, server, what_we_think_sig_should_be);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
if (rc)
return rc;
@@ -626,7 +626,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
memcpy(ses->auth_key.response + baselen, tiblob, tilen);
- mutex_lock(&ses->server->srv_mutex);
+ cifs_server_lock(ses->server);
rc = cifs_alloc_hash("hmac(md5)",
&ses->server->secmech.hmacmd5,
@@ -678,7 +678,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
unlock:
- mutex_unlock(&ses->server->srv_mutex);
+ cifs_server_unlock(ses->server);
setup_ntlmv2_rsp_ret:
kfree(tiblob);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 2b1a1c029c75..8f2e003e0590 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -377,7 +377,7 @@ cifs_alloc_inode(struct super_block *sb)
cifs_inode->flags = 0;
spin_lock_init(&cifs_inode->writers_lock);
cifs_inode->writers = 0;
- cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
+ cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
cifs_inode->server_eof = 0;
cifs_inode->uniqueid = 0;
cifs_inode->createtime = 0;
@@ -389,12 +389,12 @@ cifs_alloc_inode(struct super_block *sb)
* Can not set i_flags here - they get immediately overwritten to zero
* by the VFS.
*/
- /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */
+ /* cifs_inode->netfs.inode.i_flags = S_NOATIME | S_NOCMTIME; */
INIT_LIST_HEAD(&cifs_inode->openFileList);
INIT_LIST_HEAD(&cifs_inode->llist);
INIT_LIST_HEAD(&cifs_inode->deferred_closes);
spin_lock_init(&cifs_inode->deferred_lock);
- return &cifs_inode->vfs_inode;
+ return &cifs_inode->netfs.inode;
}
static void
@@ -582,6 +582,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",nocase");
if (tcon->nodelete)
seq_puts(s, ",nodelete");
+ if (cifs_sb->ctx->no_sparse)
+ seq_puts(s, ",nosparse");
if (tcon->local_lease)
seq_puts(s, ",locallease");
if (tcon->retry)
@@ -836,7 +838,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
int flags, struct smb3_fs_context *old_ctx)
{
int rc;
- struct super_block *sb;
+ struct super_block *sb = NULL;
struct cifs_sb_info *cifs_sb = NULL;
struct cifs_mnt_data mnt_data;
struct dentry *root;
@@ -932,9 +934,11 @@ out_super:
return root;
out:
if (cifs_sb) {
- kfree(cifs_sb->prepath);
- smb3_cleanup_fs_context(cifs_sb->ctx);
- kfree(cifs_sb);
+ if (!sb || IS_ERR(sb)) { /* otherwise kill_sb will handle */
+ kfree(cifs_sb->prepath);
+ smb3_cleanup_fs_context(cifs_sb->ctx);
+ kfree(cifs_sb);
+ }
}
return root;
}
@@ -1082,7 +1086,7 @@ struct file_system_type cifs_fs_type = {
};
MODULE_ALIAS_FS("cifs");
-static struct file_system_type smb3_fs_type = {
+struct file_system_type smb3_fs_type = {
.owner = THIS_MODULE,
.name = "smb3",
.init_fs_context = smb3_init_fs_context,
@@ -1414,7 +1418,7 @@ cifs_init_once(void *inode)
{
struct cifsInodeInfo *cifsi = inode;
- inode_init_once(&cifsi->vfs_inode);
+ inode_init_once(&cifsi->netfs.inode);
init_rwsem(&cifsi->lock_sem);
}
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index c0542bdcd06b..b17be47a8e59 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -38,7 +38,7 @@ static inline unsigned long cifs_get_time(struct dentry *dentry)
return (unsigned long) dentry->d_fsdata;
}
-extern struct file_system_type cifs_fs_type;
+extern struct file_system_type cifs_fs_type, smb3_fs_type;
extern const struct address_space_operations cifs_addr_ops;
extern const struct address_space_operations cifs_addr_ops_smallbuf;
@@ -152,6 +152,7 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type,
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define SMB3_PRODUCT_BUILD 35
-#define CIFS_VERSION "2.36"
+/* when changing internal version - update following two lines at same time */
+#define SMB3_PRODUCT_BUILD 37
+#define CIFS_VERSION "2.37"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 8de977c359b1..a643c84ff1e9 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -16,6 +16,7 @@
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/utsname.h>
+#include <linux/sched/mm.h>
#include <linux/netfs.h>
#include "cifs_fs_sb.h"
#include "cifsacl.h"
@@ -79,6 +80,9 @@
#define SMB_DNS_RESOLVE_INTERVAL_MIN 120
#define SMB_DNS_RESOLVE_INTERVAL_DEFAULT 600
+/* smb multichannel query server interfaces interval in seconds */
+#define SMB_INTERFACE_POLL_INTERVAL 600
+
/* maximum number of PDUs in one compound */
#define MAX_COMPOUND 5
@@ -106,7 +110,7 @@
* CIFS vfs client Status information (based on what we know.)
*/
-/* associated with each tcp and smb session */
+/* associated with each connection */
enum statusEnum {
CifsNew = 0,
CifsGood,
@@ -114,8 +118,15 @@ enum statusEnum {
CifsNeedReconnect,
CifsNeedNegotiate,
CifsInNegotiate,
- CifsNeedSessSetup,
- CifsInSessSetup,
+};
+
+/* associated with each smb session */
+enum ses_status_enum {
+ SES_NEW = 0,
+ SES_GOOD,
+ SES_EXITING,
+ SES_NEED_RECON,
+ SES_IN_SETUP
};
/* associated with each tree connection to the server */
@@ -621,7 +632,8 @@ struct TCP_Server_Info {
unsigned int in_flight; /* number of requests on the wire to server */
unsigned int max_in_flight; /* max number of requests that were on wire */
spinlock_t req_lock; /* protect the two values above */
- struct mutex srv_mutex;
+ struct mutex _srv_mutex;
+ unsigned int nofs_flag;
struct task_struct *tsk;
char server_GUID[16];
__u16 sec_mode;
@@ -736,6 +748,22 @@ struct TCP_Server_Info {
#endif
};
+static inline void cifs_server_lock(struct TCP_Server_Info *server)
+{
+ unsigned int nofs_flag = memalloc_nofs_save();
+
+ mutex_lock(&server->_srv_mutex);
+ server->nofs_flag = nofs_flag;
+}
+
+static inline void cifs_server_unlock(struct TCP_Server_Info *server)
+{
+ unsigned int nofs_flag = server->nofs_flag;
+
+ mutex_unlock(&server->_srv_mutex);
+ memalloc_nofs_restore(nofs_flag);
+}
+
struct cifs_credits {
unsigned int value;
unsigned int instance;
@@ -908,14 +936,67 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
#endif
struct cifs_server_iface {
+ struct list_head iface_head;
+ struct kref refcount;
size_t speed;
unsigned int rdma_capable : 1;
unsigned int rss_capable : 1;
+ unsigned int is_active : 1; /* unset if non existent */
struct sockaddr_storage sockaddr;
};
+/* release iface when last ref is dropped */
+static inline void
+release_iface(struct kref *ref)
+{
+ struct cifs_server_iface *iface = container_of(ref,
+ struct cifs_server_iface,
+ refcount);
+ list_del_init(&iface->iface_head);
+ kfree(iface);
+}
+
+/*
+ * compare two interfaces a and b
+ * return 0 if everything matches.
+ * return 1 if a has higher link speed, or rdma capable, or rss capable
+ * return -1 otherwise.
+ */
+static inline int
+iface_cmp(struct cifs_server_iface *a, struct cifs_server_iface *b)
+{
+ int cmp_ret = 0;
+
+ WARN_ON(!a || !b);
+ if (a->speed == b->speed) {
+ if (a->rdma_capable == b->rdma_capable) {
+ if (a->rss_capable == b->rss_capable) {
+ cmp_ret = memcmp(&a->sockaddr, &b->sockaddr,
+ sizeof(a->sockaddr));
+ if (!cmp_ret)
+ return 0;
+ else if (cmp_ret > 0)
+ return 1;
+ else
+ return -1;
+ } else if (a->rss_capable > b->rss_capable)
+ return 1;
+ else
+ return -1;
+ } else if (a->rdma_capable > b->rdma_capable)
+ return 1;
+ else
+ return -1;
+ } else if (a->speed > b->speed)
+ return 1;
+ else
+ return -1;
+}
+
struct cifs_chan {
+ unsigned int in_reconnect : 1; /* if session setup in progress for this channel */
struct TCP_Server_Info *server;
+ struct cifs_server_iface *iface; /* interface in use */
__u8 signkey[SMB3_SIGN_KEY_SIZE];
};
@@ -930,7 +1011,7 @@ struct cifs_ses {
struct mutex session_mutex;
struct TCP_Server_Info *server; /* pointer to server info */
int ses_count; /* reference counter */
- enum statusEnum status; /* updates protected by cifs_tcp_ses_lock */
+ enum ses_status_enum ses_status; /* updates protected by cifs_tcp_ses_lock */
unsigned overrideSecFlg; /* if non-zero override global sec flags */
char *serverOS; /* name of operating system underlying server */
char *serverNOS; /* name of network operating system of server */
@@ -944,7 +1025,7 @@ struct cifs_ses {
and after mount option parsing we fill it */
char *domainName;
char *password;
- char *workstation_name;
+ char workstation_name[CIFS_MAX_WORKSTATION_LEN];
struct session_key auth_key;
struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
enum securityEnum sectype; /* what security flavor was specified? */
@@ -967,7 +1048,7 @@ struct cifs_ses {
*/
spinlock_t iface_lock;
/* ========= begin: protected by iface_lock ======== */
- struct cifs_server_iface *iface_list;
+ struct list_head iface_list;
size_t iface_count;
unsigned long iface_last_update; /* jiffies */
/* ========= end: protected by iface_lock ======== */
@@ -977,12 +1058,16 @@ struct cifs_ses {
#define CIFS_MAX_CHANNELS 16
#define CIFS_ALL_CHANNELS_SET(ses) \
((1UL << (ses)->chan_count) - 1)
+#define CIFS_ALL_CHANS_GOOD(ses) \
+ (!(ses)->chans_need_reconnect)
#define CIFS_ALL_CHANS_NEED_RECONNECT(ses) \
((ses)->chans_need_reconnect == CIFS_ALL_CHANNELS_SET(ses))
#define CIFS_SET_ALL_CHANS_NEED_RECONNECT(ses) \
((ses)->chans_need_reconnect = CIFS_ALL_CHANNELS_SET(ses))
#define CIFS_CHAN_NEEDS_RECONNECT(ses, index) \
test_bit((index), &(ses)->chans_need_reconnect)
+#define CIFS_CHAN_IN_RECONNECT(ses, index) \
+ ((ses)->chans[(index)].in_reconnect)
struct cifs_chan chans[CIFS_MAX_CHANNELS];
size_t chan_count;
@@ -1009,6 +1094,58 @@ cap_unix(struct cifs_ses *ses)
return ses->server->vals->cap_unix & ses->capabilities;
}
+/*
+ * common struct for holding inode info when searching for or updating an
+ * inode with new info
+ */
+
+#define CIFS_FATTR_DFS_REFERRAL 0x1
+#define CIFS_FATTR_DELETE_PENDING 0x2
+#define CIFS_FATTR_NEED_REVAL 0x4
+#define CIFS_FATTR_INO_COLLISION 0x8
+#define CIFS_FATTR_UNKNOWN_NLINK 0x10
+#define CIFS_FATTR_FAKE_ROOT_INO 0x20
+
+struct cifs_fattr {
+ u32 cf_flags;
+ u32 cf_cifsattrs;
+ u64 cf_uniqueid;
+ u64 cf_eof;
+ u64 cf_bytes;
+ u64 cf_createtime;
+ kuid_t cf_uid;
+ kgid_t cf_gid;
+ umode_t cf_mode;
+ dev_t cf_rdev;
+ unsigned int cf_nlink;
+ unsigned int cf_dtype;
+ struct timespec64 cf_atime;
+ struct timespec64 cf_mtime;
+ struct timespec64 cf_ctime;
+ u32 cf_cifstag;
+};
+
+struct cached_dirent {
+ struct list_head entry;
+ char *name;
+ int namelen;
+ loff_t pos;
+
+ struct cifs_fattr fattr;
+};
+
+struct cached_dirents {
+ bool is_valid:1;
+ bool is_failed:1;
+ struct dir_context *ctx; /*
+ * Only used to make sure we only take entries
+ * from a single context. Never dereferenced.
+ */
+ struct mutex de_mutex;
+ int pos; /* Expected ctx->pos */
+ struct list_head entries;
+};
+
struct cached_fid {
bool is_valid:1; /* Do we have a useable root fid */
bool file_all_info_is_valid:1;
@@ -1021,6 +1158,7 @@ struct cached_fid {
struct dentry *dentry;
struct work_struct lease_break;
struct smb2_file_all_info file_all_info;
+ struct cached_dirents dirents;
};
/*
@@ -1120,6 +1258,7 @@ struct cifs_tcon {
#ifdef CONFIG_CIFS_DFS_UPCALL
struct list_head ulist; /* cache update list */
#endif
+ struct delayed_work query_interfaces; /* query interfaces workqueue job */
};
/*
@@ -1396,20 +1535,16 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
#define CIFS_CACHE_RW_FLG (CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG)
#define CIFS_CACHE_RHW_FLG (CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG)
-#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE))
+#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE))
#define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG)
-#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE))
+#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE))
/*
* One of these for each file inode
*/
struct cifsInodeInfo {
- struct {
- /* These must be contiguous */
- struct inode vfs_inode; /* the VFS's inode record */
- struct netfs_i_context netfs_ctx; /* Netfslib context */
- };
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
bool can_cache_brlcks;
struct list_head llist; /* locks helb by this inode */
/*
@@ -1448,7 +1583,7 @@ struct cifsInodeInfo {
static inline struct cifsInodeInfo *
CIFS_I(struct inode *inode)
{
- return container_of(inode, struct cifsInodeInfo, vfs_inode);
+ return container_of(inode, struct cifsInodeInfo, netfs.inode);
}
static inline struct cifs_sb_info *
@@ -1641,37 +1776,6 @@ struct file_list {
struct cifsFileInfo *cfile;
};
-/*
- * common struct for holding inode info when searching for or updating an
- * inode with new info
- */
-
-#define CIFS_FATTR_DFS_REFERRAL 0x1
-#define CIFS_FATTR_DELETE_PENDING 0x2
-#define CIFS_FATTR_NEED_REVAL 0x4
-#define CIFS_FATTR_INO_COLLISION 0x8
-#define CIFS_FATTR_UNKNOWN_NLINK 0x10
-#define CIFS_FATTR_FAKE_ROOT_INO 0x20
-
-struct cifs_fattr {
- u32 cf_flags;
- u32 cf_cifsattrs;
- u64 cf_uniqueid;
- u64 cf_eof;
- u64 cf_bytes;
- u64 cf_createtime;
- kuid_t cf_uid;
- kgid_t cf_gid;
- umode_t cf_mode;
- dev_t cf_rdev;
- unsigned int cf_nlink;
- unsigned int cf_dtype;
- struct timespec64 cf_atime;
- struct timespec64 cf_mtime;
- struct timespec64 cf_ctime;
- u32 cf_cifstag;
-};
-
static inline void free_dfs_info_param(struct dfs_info3_param *param)
{
if (param) {
@@ -1911,11 +2015,13 @@ extern mempool_t *cifs_mid_poolp;
/* Operations for different SMB versions */
#define SMB1_VERSION_STRING "1.0"
+#define SMB20_VERSION_STRING "2.0"
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
extern struct smb_version_operations smb1_operations;
extern struct smb_version_values smb1_values;
-#define SMB20_VERSION_STRING "2.0"
extern struct smb_version_operations smb20_operations;
extern struct smb_version_values smb20_values;
+#endif /* CIFS_ALLOW_INSECURE_LEGACY */
#define SMB21_VERSION_STRING "2.1"
extern struct smb_version_operations smb21_operations;
extern struct smb_version_values smb21_values;
@@ -1979,4 +2085,22 @@ static inline bool cifs_is_referral_server(struct cifs_tcon *tcon,
return is_tcon_dfs(tcon) || (ref && (ref->flags & DFSREF_REFERRAL_SERVER));
}
+static inline u64 cifs_flock_len(struct file_lock *fl)
+{
+ return fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1;
+}
+
+static inline size_t ntlmssp_workstation_name_size(const struct cifs_ses *ses)
+{
+ if (WARN_ON_ONCE(!ses || !ses->server))
+ return 0;
+ /*
+ * Make workstation name no more than 15 chars when using insecure dialects as some legacy
+ * servers do require it during NTLMSSP.
+ */
+ if (ses->server->dialect <= SMB20_PROT_ID)
+ return min_t(size_t, sizeof(ses->workstation_name), RFC1001_NAME_LEN_WITH_NULL);
+ return sizeof(ses->workstation_name);
+}
+
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 0df3b24a0bf4..d59aebefa71c 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -619,6 +619,15 @@ unsigned int
cifs_ses_get_chan_index(struct cifs_ses *ses,
struct TCP_Server_Info *server);
void
+cifs_chan_set_in_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
+void
+cifs_chan_clear_in_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
+bool
+cifs_chan_in_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
+void
cifs_chan_set_need_reconnect(struct cifs_ses *ses,
struct TCP_Server_Info *server);
void
@@ -627,6 +636,13 @@ cifs_chan_clear_need_reconnect(struct cifs_ses *ses,
bool
cifs_chan_needs_reconnect(struct cifs_ses *ses,
struct TCP_Server_Info *server);
+bool
+cifs_chan_is_iface_active(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
+int
+cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server);
+int
+SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon);
void extract_unc_hostname(const char *unc, const char **h, size_t *len);
int copy_path_name(char *dst, const char *src);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 47e927c4ff8d..6371b9eebdad 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -75,7 +75,7 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
/* only send once per connect */
spin_lock(&cifs_tcp_ses_lock);
- if ((tcon->ses->status != CifsGood) || (tcon->status != TID_NEED_RECON)) {
+ if ((tcon->ses->ses_status != SES_GOOD) || (tcon->status != TID_NEED_RECON)) {
spin_unlock(&cifs_tcp_ses_lock);
return;
}
@@ -2558,7 +2558,8 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
pLockData->fl_start = le64_to_cpu(parm_data->start);
pLockData->fl_end = pLockData->fl_start +
- le64_to_cpu(parm_data->length) - 1;
+ (le64_to_cpu(parm_data->length) ?
+ le64_to_cpu(parm_data->length) - 1 : 0);
pLockData->fl_pid = -le32_to_cpu(parm_data->pid);
}
}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 42e14f408856..fa29c9aae24b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -97,6 +97,10 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
if (!server->hostname)
return -EINVAL;
+ /* if server hostname isn't populated, there's nothing to do here */
+ if (server->hostname[0] == '\0')
+ return 0;
+
len = strlen(server->hostname) + 3;
unc = kmalloc(len, GFP_KERNEL);
@@ -141,6 +145,25 @@ requeue_resolve:
return rc;
}
+static void smb2_query_server_interfaces(struct work_struct *work)
+{
+ int rc;
+ struct cifs_tcon *tcon = container_of(work,
+ struct cifs_tcon,
+ query_interfaces.work);
+
+ /*
+ * query server network interfaces, in case they change
+ */
+ rc = SMB3_request_interfaces(0, tcon);
+ if (rc) {
+ cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
+ __func__, rc);
+ }
+
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+}
static void cifs_resolve_server(struct work_struct *work)
{
@@ -148,7 +171,7 @@ static void cifs_resolve_server(struct work_struct *work)
struct TCP_Server_Info *server = container_of(work,
struct TCP_Server_Info, resolve.work);
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
/*
* Resolve the hostname again to make sure that IP address is up-to-date.
@@ -159,7 +182,7 @@ static void cifs_resolve_server(struct work_struct *work)
__func__, rc);
}
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
}
/*
@@ -213,7 +236,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
bool mark_smb_session)
{
struct TCP_Server_Info *pserver;
- struct cifs_ses *ses;
+ struct cifs_ses *ses, *nses;
struct cifs_tcon *tcon;
/*
@@ -227,7 +250,20 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) {
+ /* check if iface is still active */
+ if (!cifs_chan_is_iface_active(ses, server)) {
+ /*
+ * HACK: drop the lock before calling
+ * cifs_chan_update_iface to avoid deadlock
+ */
+ ses->ses_count++;
+ spin_unlock(&cifs_tcp_ses_lock);
+ cifs_chan_update_iface(ses, server);
+ spin_lock(&cifs_tcp_ses_lock);
+ ses->ses_count--;
+ }
+
spin_lock(&ses->chan_lock);
if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server))
goto next_session;
@@ -241,7 +277,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses))
goto next_session;
- ses->status = CifsNeedReconnect;
+ ses->ses_status = SES_NEED_RECON;
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
tcon->need_reconnect = true;
@@ -267,7 +303,7 @@ cifs_abort_connection(struct TCP_Server_Info *server)
/* do not want to be sending data on a socket we are freeing */
cifs_dbg(FYI, "%s: tearing down socket\n", __func__);
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
if (server->ssocket) {
cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n", server->ssocket->state,
server->ssocket->flags);
@@ -296,7 +332,7 @@ cifs_abort_connection(struct TCP_Server_Info *server)
mid->mid_flags |= MID_DELETED;
}
spin_unlock(&GlobalMid_Lock);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
list_for_each_entry_safe(mid, nmid, &retry_list, qhead) {
@@ -306,9 +342,9 @@ cifs_abort_connection(struct TCP_Server_Info *server)
}
if (cifs_rdma_enabled(server)) {
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
smbd_destroy(server);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
}
}
@@ -359,7 +395,7 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
do {
try_to_freeze();
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
if (!cifs_swn_set_server_dstaddr(server)) {
/* resolve the hostname again to make sure that IP address is up-to-date */
@@ -372,7 +408,7 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
else
rc = generic_ip_connect(server);
if (rc) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc);
msleep(3000);
} else {
@@ -383,7 +419,7 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
server->tcpStatus = CifsNeedNegotiate;
spin_unlock(&cifs_tcp_ses_lock);
cifs_swn_reset_server_dstaddr(server);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
}
} while (server->tcpStatus == CifsNeedReconnect);
@@ -488,12 +524,12 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server)
do {
try_to_freeze();
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
rc = reconnect_target_unlocked(server, &tl, &target_hint);
if (rc) {
/* Failed to reconnect socket */
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc);
msleep(3000);
continue;
@@ -510,7 +546,7 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server)
server->tcpStatus = CifsNeedNegotiate;
spin_unlock(&cifs_tcp_ses_lock);
cifs_swn_reset_server_dstaddr(server);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
} while (server->tcpStatus == CifsNeedReconnect);
@@ -1565,7 +1601,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
init_waitqueue_head(&tcp_ses->response_q);
init_waitqueue_head(&tcp_ses->request_q);
INIT_LIST_HEAD(&tcp_ses->pending_mid_q);
- mutex_init(&tcp_ses->srv_mutex);
+ mutex_init(&tcp_ses->_srv_mutex);
memcpy(tcp_ses->workstation_RFC1001_name,
ctx->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
memcpy(tcp_ses->server_RFC1001_name,
@@ -1789,7 +1825,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
goto out;
}
- cifs_dbg(FYI, "IPC tcon rc = %d ipc tid = %d\n", rc, tcon->tid);
+ cifs_dbg(FYI, "IPC tcon rc=%d ipc tid=0x%x\n", rc, tcon->tid);
ses->tcon_ipc = tcon;
out:
@@ -1828,7 +1864,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
- if (ses->status == CifsExiting)
+ if (ses->ses_status == SES_EXITING)
continue;
if (!match_session(ses, ctx))
continue;
@@ -1845,10 +1881,9 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
unsigned int rc, xid;
unsigned int chan_count;
struct TCP_Server_Info *server = ses->server;
- cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
spin_lock(&cifs_tcp_ses_lock);
- if (ses->status == CifsExiting) {
+ if (ses->ses_status == SES_EXITING) {
spin_unlock(&cifs_tcp_ses_lock);
return;
}
@@ -1864,13 +1899,13 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
/* ses_count can never go negative */
WARN_ON(ses->ses_count < 0);
- if (ses->status == CifsGood)
- ses->status = CifsExiting;
+ if (ses->ses_status == SES_GOOD)
+ ses->ses_status = SES_EXITING;
spin_unlock(&cifs_tcp_ses_lock);
cifs_free_ipc(ses);
- if (ses->status == CifsExiting && server->ops->logoff) {
+ if (ses->ses_status == SES_EXITING && server->ops->logoff) {
xid = get_xid();
rc = server->ops->logoff(xid, ses);
if (rc)
@@ -1891,9 +1926,11 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
int i;
for (i = 1; i < chan_count; i++) {
- spin_unlock(&ses->chan_lock);
+ if (ses->chans[i].iface) {
+ kref_put(&ses->chans[i].iface->refcount, release_iface);
+ ses->chans[i].iface = NULL;
+ }
cifs_put_tcp_session(ses->chans[i].server, 0);
- spin_lock(&ses->chan_lock);
ses->chans[i].server = NULL;
}
}
@@ -2037,18 +2074,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses)
}
}
- ctx->workstation_name = kstrdup(ses->workstation_name, GFP_KERNEL);
- if (!ctx->workstation_name) {
- cifs_dbg(FYI, "Unable to allocate memory for workstation_name\n");
- rc = -ENOMEM;
- kfree(ctx->username);
- ctx->username = NULL;
- kfree_sensitive(ctx->password);
- ctx->password = NULL;
- kfree(ctx->domainname);
- ctx->domainname = NULL;
- goto out_key_put;
- }
+ strscpy(ctx->workstation_name, ses->workstation_name, sizeof(ctx->workstation_name));
out_key_put:
up_read(&key->sem);
@@ -2090,7 +2116,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
ses = cifs_find_smb_ses(server, ctx);
if (ses) {
cifs_dbg(FYI, "Existing smb sess found (status=%d)\n",
- ses->status);
+ ses->ses_status);
spin_lock(&ses->chan_lock);
if (cifs_chan_needs_reconnect(ses, server)) {
@@ -2157,12 +2183,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
if (!ses->domainName)
goto get_ses_fail;
}
- if (ctx->workstation_name) {
- ses->workstation_name = kstrdup(ctx->workstation_name,
- GFP_KERNEL);
- if (!ses->workstation_name)
- goto get_ses_fail;
- }
+
+ strscpy(ses->workstation_name, ctx->workstation_name, sizeof(ses->workstation_name));
+
if (ctx->domainauto)
ses->domainAuto = ctx->domainauto;
ses->cred_uid = ctx->cred_uid;
@@ -2281,6 +2304,9 @@ cifs_put_tcon(struct cifs_tcon *tcon)
list_del_init(&tcon->tcon_list);
spin_unlock(&cifs_tcp_ses_lock);
+ /* cancel polling of interfaces */
+ cancel_delayed_work_sync(&tcon->query_interfaces);
+
if (tcon->use_witness) {
int rc;
@@ -2509,6 +2535,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
*/
tcon->retry = ctx->retry;
tcon->nocase = ctx->nocase;
+ tcon->broken_sparse_sup = ctx->no_sparse;
if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING)
tcon->nohandlecache = ctx->nohandlecache;
else
@@ -2517,6 +2544,12 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
tcon->local_lease = ctx->local_lease;
INIT_LIST_HEAD(&tcon->pending_opens);
+ /* schedule query interfaces poll */
+ INIT_DELAYED_WORK(&tcon->query_interfaces,
+ smb2_query_server_interfaces);
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+
spin_lock(&cifs_tcp_ses_lock);
list_add(&tcon->tcon_list, &ses->tcon_list);
spin_unlock(&cifs_tcp_ses_lock);
@@ -3420,8 +3453,9 @@ cifs_are_all_path_components_accessible(struct TCP_Server_Info *server,
}
/*
- * Check if path is remote (e.g. a DFS share). Return -EREMOTE if it is,
- * otherwise 0.
+ * Check if path is remote (i.e. a DFS share).
+ *
+ * Return -EREMOTE if it is, otherwise 0 or -errno.
*/
static int is_path_remote(struct mount_ctx *mnt_ctx)
{
@@ -3432,6 +3466,9 @@ static int is_path_remote(struct mount_ctx *mnt_ctx)
struct cifs_tcon *tcon = mnt_ctx->tcon;
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
char *full_path;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ bool nodfs = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS;
+#endif
if (!server->ops->is_path_accessible)
return -EOPNOTSUPP;
@@ -3449,14 +3486,20 @@ static int is_path_remote(struct mount_ctx *mnt_ctx)
rc = server->ops->is_path_accessible(xid, tcon, cifs_sb,
full_path);
#ifdef CONFIG_CIFS_DFS_UPCALL
+ if (nodfs) {
+ if (rc == -EREMOTE)
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* path *might* exist with non-ASCII characters in DFS root
+ * try again with full path (only if nodfs is not set) */
if (rc == -ENOENT && is_tcon_dfs(tcon))
rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, cifs_sb,
full_path);
#endif
- if (rc != 0 && rc != -EREMOTE) {
- kfree(full_path);
- return rc;
- }
+ if (rc != 0 && rc != -EREMOTE)
+ goto out;
if (rc != -EREMOTE) {
rc = cifs_are_all_path_components_accessible(server, xid, tcon,
@@ -3468,6 +3511,7 @@ static int is_path_remote(struct mount_ctx *mnt_ctx)
}
}
+out:
kfree(full_path);
return rc;
}
@@ -3703,6 +3747,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
if (!isdfs)
goto out;
+ /* proceed as DFS mount */
uuid_gen(&mnt_ctx.mount_id);
rc = connect_dfs_root(&mnt_ctx, &tl);
dfs_cache_free_tgts(&tl);
@@ -3960,7 +4005,7 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses,
if (rc == 0) {
spin_lock(&cifs_tcp_ses_lock);
if (server->tcpStatus == CifsInNegotiate)
- server->tcpStatus = CifsNeedSessSetup;
+ server->tcpStatus = CifsGood;
else
rc = -EHOSTDOWN;
spin_unlock(&cifs_tcp_ses_lock);
@@ -3980,22 +4025,39 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
struct nls_table *nls_info)
{
int rc = -ENOSYS;
+ struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
+ struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
bool is_binding = false;
- /* only send once per connect */
spin_lock(&cifs_tcp_ses_lock);
- if ((server->tcpStatus != CifsNeedSessSetup) &&
- (ses->status == CifsGood)) {
+ if (server->dstaddr.ss_family == AF_INET6)
+ scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr);
+ else
+ scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr);
+
+ if (ses->ses_status != SES_GOOD &&
+ ses->ses_status != SES_NEW &&
+ ses->ses_status != SES_NEED_RECON) {
spin_unlock(&cifs_tcp_ses_lock);
return 0;
}
- server->tcpStatus = CifsInSessSetup;
- spin_unlock(&cifs_tcp_ses_lock);
+ /* only send once per connect */
spin_lock(&ses->chan_lock);
+ if (CIFS_ALL_CHANS_GOOD(ses) ||
+ cifs_chan_in_reconnect(ses, server)) {
+ spin_unlock(&ses->chan_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ return 0;
+ }
is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
+ cifs_chan_set_in_reconnect(ses, server);
spin_unlock(&ses->chan_lock);
+ if (!is_binding)
+ ses->ses_status = SES_IN_SETUP;
+ spin_unlock(&cifs_tcp_ses_lock);
+
if (!is_binding) {
ses->capabilities = server->capabilities;
if (!linuxExtEnabled)
@@ -4019,20 +4081,21 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
if (rc) {
cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc);
spin_lock(&cifs_tcp_ses_lock);
- if (server->tcpStatus == CifsInSessSetup)
- server->tcpStatus = CifsNeedSessSetup;
+ if (ses->ses_status == SES_IN_SETUP)
+ ses->ses_status = SES_NEED_RECON;
+ spin_lock(&ses->chan_lock);
+ cifs_chan_clear_in_reconnect(ses, server);
+ spin_unlock(&ses->chan_lock);
spin_unlock(&cifs_tcp_ses_lock);
} else {
spin_lock(&cifs_tcp_ses_lock);
- if (server->tcpStatus == CifsInSessSetup)
- server->tcpStatus = CifsGood;
- /* Even if one channel is active, session is in good state */
- ses->status = CifsGood;
- spin_unlock(&cifs_tcp_ses_lock);
-
+ if (ses->ses_status == SES_IN_SETUP)
+ ses->ses_status = SES_GOOD;
spin_lock(&ses->chan_lock);
+ cifs_chan_clear_in_reconnect(ses, server);
cifs_chan_clear_need_reconnect(ses, server);
spin_unlock(&ses->chan_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
}
return rc;
@@ -4497,7 +4560,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
/* only send once per connect */
spin_lock(&cifs_tcp_ses_lock);
- if (tcon->ses->status != CifsGood ||
+ if (tcon->ses->ses_status != SES_GOOD ||
(tcon->status != TID_NEW &&
tcon->status != TID_NEED_TCON)) {
spin_unlock(&cifs_tcp_ses_lock);
@@ -4565,7 +4628,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
/* only send once per connect */
spin_lock(&cifs_tcp_ses_lock);
- if (tcon->ses->status != CifsGood ||
+ if (tcon->ses->ses_status != SES_GOOD ||
(tcon->status != TID_NEW &&
tcon->status != TID_NEED_TCON)) {
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 956f8e5cf3e7..34a8f3baed5e 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -654,7 +654,7 @@ static struct cache_entry *__lookup_cache_entry(const char *path, unsigned int h
return ce;
}
}
- return ERR_PTR(-EEXIST);
+ return ERR_PTR(-ENOENT);
}
/*
@@ -662,7 +662,7 @@ static struct cache_entry *__lookup_cache_entry(const char *path, unsigned int h
*
* Use whole path components in the match. Must be called with htable_rw_lock held.
*
- * Return ERR_PTR(-EEXIST) if the entry is not found.
+ * Return ERR_PTR(-ENOENT) if the entry is not found.
*/
static struct cache_entry *lookup_cache_entry(const char *path)
{
@@ -710,7 +710,7 @@ static struct cache_entry *lookup_cache_entry(const char *path)
while (e > s && *e != sep)
e--;
}
- return ERR_PTR(-EEXIST);
+ return ERR_PTR(-ENOENT);
}
/**
@@ -1229,6 +1229,30 @@ void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id)
kref_put(&mg->refcount, mount_group_release);
}
+/* Extract share from DFS target and return a pointer to prefix path or NULL */
+static const char *parse_target_share(const char *target, char **share)
+{
+ const char *s, *seps = "/\\";
+ size_t len;
+
+ s = strpbrk(target + 1, seps);
+ if (!s)
+ return ERR_PTR(-EINVAL);
+
+ len = strcspn(s + 1, seps);
+ if (!len)
+ return ERR_PTR(-EINVAL);
+ s += len;
+
+ len = s - target + 1;
+ *share = kstrndup(target, len, GFP_KERNEL);
+ if (!*share)
+ return ERR_PTR(-ENOMEM);
+
+ s = target + len;
+ return s + strspn(s, seps);
+}
+
/**
* dfs_cache_get_tgt_share - parse a DFS target
*
@@ -1242,56 +1266,46 @@ void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id)
int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, char **share,
char **prefix)
{
- char *s, sep, *p;
- size_t len;
- size_t plen1, plen2;
+ char sep;
+ char *target_share;
+ char *ppath = NULL;
+ const char *target_ppath, *dfsref_ppath;
+ size_t target_pplen, dfsref_pplen;
+ size_t len, c;
if (!it || !path || !share || !prefix || strlen(path) < it->it_path_consumed)
return -EINVAL;
- *share = NULL;
- *prefix = NULL;
-
sep = it->it_name[0];
if (sep != '\\' && sep != '/')
return -EINVAL;
- s = strchr(it->it_name + 1, sep);
- if (!s)
- return -EINVAL;
+ target_ppath = parse_target_share(it->it_name, &target_share);
+ if (IS_ERR(target_ppath))
+ return PTR_ERR(target_ppath);
- /* point to prefix in target node */
- s = strchrnul(s + 1, sep);
+ /* point to prefix in DFS referral path */
+ dfsref_ppath = path + it->it_path_consumed;
+ dfsref_ppath += strspn(dfsref_ppath, "/\\");
- /* extract target share */
- *share = kstrndup(it->it_name, s - it->it_name, GFP_KERNEL);
- if (!*share)
- return -ENOMEM;
+ target_pplen = strlen(target_ppath);
+ dfsref_pplen = strlen(dfsref_ppath);
- /* skip separator */
- if (*s)
- s++;
- /* point to prefix in DFS path */
- p = path + it->it_path_consumed;
- if (*p == sep)
- p++;
-
- /* merge prefix paths from DFS path and target node */
- plen1 = it->it_name + strlen(it->it_name) - s;
- plen2 = path + strlen(path) - p;
- if (plen1 || plen2) {
- len = plen1 + plen2 + 2;
- *prefix = kmalloc(len, GFP_KERNEL);
- if (!*prefix) {
- kfree(*share);
- *share = NULL;
+ /* merge prefix paths from DFS referral path and target node */
+ if (target_pplen || dfsref_pplen) {
+ len = target_pplen + dfsref_pplen + 2;
+ ppath = kzalloc(len, GFP_KERNEL);
+ if (!ppath) {
+ kfree(target_share);
return -ENOMEM;
}
- if (plen1)
- scnprintf(*prefix, len, "%.*s%c%.*s", (int)plen1, s, sep, (int)plen2, p);
- else
- strscpy(*prefix, p, len);
+ c = strscpy(ppath, target_ppath, len);
+ if (c && dfsref_pplen)
+ ppath[c] = sep;
+ strlcat(ppath, dfsref_ppath, len);
}
+ *share = target_share;
+ *prefix = ppath;
return 0;
}
@@ -1327,9 +1341,9 @@ static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, c
cifs_dbg(VFS, "%s: failed to convert address \'%s\'. skip address matching.\n",
__func__, ip);
} else {
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
match = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, &sa);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
}
kfree(ip);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 06003bb9cbe9..e64cda7a7610 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1395,7 +1395,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
cifs_dbg(VFS, "Can't push all brlocks!\n");
break;
}
- length = 1 + flock->fl_end - flock->fl_start;
+ length = cifs_flock_len(flock);
if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)
type = CIFS_RDLCK;
else
@@ -1511,7 +1511,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
bool wait_flag, bool posix_lck, unsigned int xid)
{
int rc = 0;
- __u64 length = 1 + flock->fl_end - flock->fl_start;
+ __u64 length = cifs_flock_len(flock);
struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
@@ -1609,7 +1609,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
struct cifsLockInfo *li, *tmp;
- __u64 length = 1 + flock->fl_end - flock->fl_start;
+ __u64 length = cifs_flock_len(flock);
struct list_head tmp_llist;
INIT_LIST_HEAD(&tmp_llist);
@@ -1713,7 +1713,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
unsigned int xid)
{
int rc = 0;
- __u64 length = 1 + flock->fl_end - flock->fl_start;
+ __u64 length = cifs_flock_len(flock);
struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
@@ -2004,7 +2004,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
bool fsuid_only)
{
struct cifsFileInfo *open_file = NULL;
- struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb);
/* only filter by fsuid on multiuser mounts */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
@@ -2060,7 +2060,7 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags,
return rc;
}
- cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+ cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb);
/* only filter by fsuid on multiuser mounts */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
@@ -2777,8 +2777,11 @@ int cifs_flush(struct file *file, fl_owner_t id)
rc = filemap_write_and_wait(inode->i_mapping);
cifs_dbg(FYI, "Flush inode %p file %p rc %d\n", inode, file, rc);
- if (rc)
+ if (rc) {
+ /* get more nuanced writeback errors */
+ rc = filemap_check_wb_err(file->f_mapping, 0);
trace_cifs_flush_err(inode->i_ino, rc);
+ }
return rc;
}
@@ -4666,14 +4669,14 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
/* This inode is open for write at least once */
struct cifs_sb_info *cifs_sb;
- cifs_sb = CIFS_SB(cifsInode->vfs_inode.i_sb);
+ cifs_sb = CIFS_SB(cifsInode->netfs.inode.i_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
/* since no page cache to corrupt on directio
we can change size safely */
return true;
}
- if (i_size_read(&cifsInode->vfs_inode) < end_of_file)
+ if (i_size_read(&cifsInode->netfs.inode) < end_of_file)
return true;
return false;
@@ -4906,6 +4909,10 @@ static int cifs_swap_activate(struct swap_info_struct *sis,
cifs_dbg(FYI, "swap activate\n");
+ if (!swap_file->f_mapping->a_ops->swap_rw)
+ /* Cannot support swap */
+ return -EINVAL;
+
spin_lock(&inode->i_lock);
blocks = inode->i_blocks;
isize = inode->i_size;
@@ -4934,7 +4941,8 @@ static int cifs_swap_activate(struct swap_info_struct *sis,
* from reading or writing the file
*/
- return 0;
+ sis->flags |= SWP_FS_OPS;
+ return add_swap_extent(sis, 0, sis->max, 0);
}
static void cifs_swap_deactivate(struct file *file)
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index a92e9eec521f..8dc0d923ef6a 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -119,6 +119,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_flag_no("persistenthandles", Opt_persistent),
fsparam_flag_no("resilienthandles", Opt_resilient),
fsparam_flag_no("tcpnodelay", Opt_tcp_nodelay),
+ fsparam_flag("nosparse", Opt_nosparse),
fsparam_flag("domainauto", Opt_domainauto),
fsparam_flag("rdma", Opt_rdma),
fsparam_flag("modesid", Opt_modesid),
@@ -312,7 +313,6 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
new_ctx->password = NULL;
new_ctx->server_hostname = NULL;
new_ctx->domainname = NULL;
- new_ctx->workstation_name = NULL;
new_ctx->UNC = NULL;
new_ctx->source = NULL;
new_ctx->iocharset = NULL;
@@ -327,7 +327,6 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
DUP_CTX_STR(UNC);
DUP_CTX_STR(source);
DUP_CTX_STR(domainname);
- DUP_CTX_STR(workstation_name);
DUP_CTX_STR(nodename);
DUP_CTX_STR(iocharset);
@@ -766,8 +765,7 @@ static int smb3_verify_reconfigure_ctx(struct fs_context *fc,
cifs_errorf(fc, "can not change domainname during remount\n");
return -EINVAL;
}
- if (new_ctx->workstation_name &&
- (!old_ctx->workstation_name || strcmp(new_ctx->workstation_name, old_ctx->workstation_name))) {
+ if (strcmp(new_ctx->workstation_name, old_ctx->workstation_name)) {
cifs_errorf(fc, "can not change workstation_name during remount\n");
return -EINVAL;
}
@@ -814,7 +812,6 @@ static int smb3_reconfigure(struct fs_context *fc)
STEAL_STRING(cifs_sb, ctx, username);
STEAL_STRING(cifs_sb, ctx, password);
STEAL_STRING(cifs_sb, ctx, domainname);
- STEAL_STRING(cifs_sb, ctx, workstation_name);
STEAL_STRING(cifs_sb, ctx, nodename);
STEAL_STRING(cifs_sb, ctx, iocharset);
@@ -943,6 +940,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_nolease:
ctx->no_lease = 1;
break;
+ case Opt_nosparse:
+ ctx->no_sparse = 1;
+ break;
case Opt_nodelete:
ctx->nodelete = 1;
break;
@@ -1467,22 +1467,15 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
int smb3_init_fs_context(struct fs_context *fc)
{
- int rc;
struct smb3_fs_context *ctx;
char *nodename = utsname()->nodename;
int i;
ctx = kzalloc(sizeof(struct smb3_fs_context), GFP_KERNEL);
- if (unlikely(!ctx)) {
- rc = -ENOMEM;
- goto err_exit;
- }
+ if (unlikely(!ctx))
+ return -ENOMEM;
- ctx->workstation_name = kstrdup(nodename, GFP_KERNEL);
- if (unlikely(!ctx->workstation_name)) {
- rc = -ENOMEM;
- goto err_exit;
- }
+ strscpy(ctx->workstation_name, nodename, sizeof(ctx->workstation_name));
/*
* does not have to be perfect mapping since field is
@@ -1555,14 +1548,6 @@ int smb3_init_fs_context(struct fs_context *fc)
fc->fs_private = ctx;
fc->ops = &smb3_fs_context_ops;
return 0;
-
-err_exit:
- if (ctx) {
- kfree(ctx->workstation_name);
- kfree(ctx);
- }
-
- return rc;
}
void
@@ -1588,8 +1573,6 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
ctx->source = NULL;
kfree(ctx->domainname);
ctx->domainname = NULL;
- kfree(ctx->workstation_name);
- ctx->workstation_name = NULL;
kfree(ctx->nodename);
ctx->nodename = NULL;
kfree(ctx->iocharset);
diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h
index e54090d9ef36..5f093cb7e9b9 100644
--- a/fs/cifs/fs_context.h
+++ b/fs/cifs/fs_context.h
@@ -62,6 +62,7 @@ enum cifs_param {
Opt_noblocksend,
Opt_noautotune,
Opt_nolease,
+ Opt_nosparse,
Opt_hard,
Opt_soft,
Opt_perm,
@@ -170,7 +171,7 @@ struct smb3_fs_context {
char *server_hostname;
char *UNC;
char *nodename;
- char *workstation_name;
+ char workstation_name[CIFS_MAX_WORKSTATION_LEN];
char *iocharset; /* local code page for mapping to and from Unicode */
char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
@@ -222,6 +223,7 @@ struct smb3_fs_context {
bool noautotune:1;
bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
bool no_lease:1; /* disable requesting leases */
+ bool no_sparse:1; /* do not attempt to set files sparse */
bool fsc:1; /* enable fscache */
bool mfsymlinks:1; /* use Minshall+French Symlinks */
bool multiuser:1;
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index a638b29e9062..23ef56f55ce5 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -101,13 +101,13 @@ void cifs_fscache_get_inode_cookie(struct inode *inode)
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
- cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd);
+ cifs_fscache_fill_coherency(&cifsi->netfs.inode, &cd);
- cifsi->netfs_ctx.cache =
+ cifsi->netfs.cache =
fscache_acquire_cookie(tcon->fscache, 0,
&cifsi->uniqueid, sizeof(cifsi->uniqueid),
&cd, sizeof(cd),
- i_size_read(&cifsi->vfs_inode));
+ i_size_read(&cifsi->netfs.inode));
}
void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update)
@@ -131,7 +131,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode)
if (cookie) {
cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cookie);
fscache_relinquish_cookie(cookie, false);
- cifsi->netfs_ctx.cache = NULL;
+ cifsi->netfs.cache = NULL;
}
}
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 52355c0912ae..aa3b941a5555 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -52,16 +52,16 @@ void cifs_fscache_fill_coherency(struct inode *inode,
struct cifsInodeInfo *cifsi = CIFS_I(inode);
memset(cd, 0, sizeof(*cd));
- cd->last_write_time_sec = cpu_to_le64(cifsi->vfs_inode.i_mtime.tv_sec);
- cd->last_write_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_mtime.tv_nsec);
- cd->last_change_time_sec = cpu_to_le64(cifsi->vfs_inode.i_ctime.tv_sec);
- cd->last_change_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_ctime.tv_nsec);
+ cd->last_write_time_sec = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec);
+ cd->last_write_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec);
+ cd->last_change_time_sec = cpu_to_le64(cifsi->netfs.inode.i_ctime.tv_sec);
+ cd->last_change_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_ctime.tv_nsec);
}
static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode)
{
- return netfs_i_cookie(inode);
+ return netfs_i_cookie(&CIFS_I(inode)->netfs);
}
static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 2f9e7d2f81b6..81da81e18553 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -115,7 +115,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
__func__, cifs_i->uniqueid);
set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags);
/* Invalidate fscache cookie */
- cifs_fscache_fill_coherency(&cifs_i->vfs_inode, &cd);
+ cifs_fscache_fill_coherency(&cifs_i->netfs.inode, &cd);
fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0);
}
@@ -2499,7 +2499,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
u64 len)
{
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
- struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_i->vfs_inode.i_sb);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_i->netfs.inode.i_sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct TCP_Server_Info *server = tcon->ses->server;
struct cifsFileInfo *cfile;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index afaf59c22193..0e84e6fcf8ab 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -69,12 +69,13 @@ sesInfoAlloc(void)
ret_buf = kzalloc(sizeof(struct cifs_ses), GFP_KERNEL);
if (ret_buf) {
atomic_inc(&sesInfoAllocCount);
- ret_buf->status = CifsNew;
+ ret_buf->ses_status = SES_NEW;
++ret_buf->ses_count;
INIT_LIST_HEAD(&ret_buf->smb_ses_list);
INIT_LIST_HEAD(&ret_buf->tcon_list);
mutex_init(&ret_buf->session_mutex);
spin_lock_init(&ret_buf->iface_lock);
+ INIT_LIST_HEAD(&ret_buf->iface_list);
spin_lock_init(&ret_buf->chan_lock);
}
return ret_buf;
@@ -83,6 +84,8 @@ sesInfoAlloc(void)
void
sesInfoFree(struct cifs_ses *buf_to_free)
{
+ struct cifs_server_iface *iface = NULL, *niface = NULL;
+
if (buf_to_free == NULL) {
cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n");
return;
@@ -95,9 +98,12 @@ sesInfoFree(struct cifs_ses *buf_to_free)
kfree_sensitive(buf_to_free->password);
kfree(buf_to_free->user_name);
kfree(buf_to_free->domainName);
- kfree(buf_to_free->workstation_name);
kfree_sensitive(buf_to_free->auth_key.response);
- kfree(buf_to_free->iface_list);
+ spin_lock(&buf_to_free->iface_lock);
+ list_for_each_entry_safe(iface, niface, &buf_to_free->iface_list,
+ iface_head)
+ kref_put(&iface->refcount, release_iface);
+ spin_unlock(&buf_to_free->iface_lock);
kfree_sensitive(buf_to_free);
}
@@ -114,6 +120,8 @@ tconInfoAlloc(void)
kfree(ret_buf);
return NULL;
}
+ INIT_LIST_HEAD(&ret_buf->crfid.dirents.entries);
+ mutex_init(&ret_buf->crfid.dirents.de_mutex);
atomic_inc(&tconInfoAllocCount);
ret_buf->status = TID_NEW;
@@ -536,11 +544,11 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
if (oplock == OPLOCK_EXCLUSIVE) {
cinode->oplock = CIFS_CACHE_WRITE_FLG | CIFS_CACHE_READ_FLG;
cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
} else if (oplock == OPLOCK_READ) {
cinode->oplock = CIFS_CACHE_READ_FLG;
cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
} else
cinode->oplock = 0;
}
@@ -1210,18 +1218,23 @@ static struct super_block *__cifs_get_super(void (*f)(struct super_block *, void
.data = data,
.sb = NULL,
};
+ struct file_system_type **fs_type = (struct file_system_type *[]) {
+ &cifs_fs_type, &smb3_fs_type, NULL,
+ };
- iterate_supers_type(&cifs_fs_type, f, &sd);
-
- if (!sd.sb)
- return ERR_PTR(-EINVAL);
- /*
- * Grab an active reference in order to prevent automounts (DFS links)
- * of expiring and then freeing up our cifs superblock pointer while
- * we're doing failover.
- */
- cifs_sb_active(sd.sb);
- return sd.sb;
+ for (; *fs_type; fs_type++) {
+ iterate_supers_type(*fs_type, f, &sd);
+ if (sd.sb) {
+ /*
+ * Grab an active reference in order to prevent automounts (DFS links)
+ * of expiring and then freeing up our cifs superblock pointer while
+ * we're doing failover.
+ */
+ cifs_sb_active(sd.sb);
+ return sd.sb;
+ }
+ }
+ return ERR_PTR(-EINVAL);
}
static void __cifs_put_super(struct super_block *sb)
@@ -1309,7 +1322,7 @@ int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix)
* for "\<server>\<dfsname>\<linkpath>" DFS reference,
* where <dfsname> contains non-ASCII unicode symbols.
*
- * Check such DFS reference and emulate -ENOENT if it is actual.
+ * Check such DFS reference.
*/
int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid,
struct cifs_tcon *tcon,
@@ -1341,10 +1354,6 @@ int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid,
cifs_dbg(FYI, "DFS ref '%s' is found, emulate -EREMOTE\n",
dfspath);
rc = -EREMOTE;
- } else if (rc == -EEXIST) {
- cifs_dbg(FYI, "DFS ref '%s' is not found, emulate -ENOENT\n",
- dfspath);
- rc = -ENOENT;
} else {
cifs_dbg(FYI, "%s: dfs_cache_find returned %d\n", __func__, rc);
}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 1929e80c09ee..384cabdf47ca 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -840,9 +840,109 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
return rc;
}
+static bool emit_cached_dirents(struct cached_dirents *cde,
+ struct dir_context *ctx)
+{
+ struct cached_dirent *dirent;
+ int rc;
+
+ list_for_each_entry(dirent, &cde->entries, entry) {
+ if (ctx->pos >= dirent->pos)
+ continue;
+ ctx->pos = dirent->pos;
+ rc = dir_emit(ctx, dirent->name, dirent->namelen,
+ dirent->fattr.cf_uniqueid,
+ dirent->fattr.cf_dtype);
+ if (!rc)
+ return rc;
+ }
+ return true;
+}
+
+static void update_cached_dirents_count(struct cached_dirents *cde,
+ struct dir_context *ctx)
+{
+ if (cde->ctx != ctx)
+ return;
+ if (cde->is_valid || cde->is_failed)
+ return;
+
+ cde->pos++;
+}
+
+static void finished_cached_dirents_count(struct cached_dirents *cde,
+ struct dir_context *ctx)
+{
+ if (cde->ctx != ctx)
+ return;
+ if (cde->is_valid || cde->is_failed)
+ return;
+ if (ctx->pos != cde->pos)
+ return;
+
+ cde->is_valid = 1;
+}
+
+static void add_cached_dirent(struct cached_dirents *cde,
+ struct dir_context *ctx,
+ const char *name, int namelen,
+ struct cifs_fattr *fattr)
+{
+ struct cached_dirent *de;
+
+ if (cde->ctx != ctx)
+ return;
+ if (cde->is_valid || cde->is_failed)
+ return;
+ if (ctx->pos != cde->pos) {
+ cde->is_failed = 1;
+ return;
+ }
+ de = kzalloc(sizeof(*de), GFP_ATOMIC);
+ if (de == NULL) {
+ cde->is_failed = 1;
+ return;
+ }
+ de->namelen = namelen;
+ de->name = kstrndup(name, namelen, GFP_ATOMIC);
+ if (de->name == NULL) {
+ kfree(de);
+ cde->is_failed = 1;
+ return;
+ }
+ de->pos = ctx->pos;
+
+ memcpy(&de->fattr, fattr, sizeof(struct cifs_fattr));
+
+ list_add_tail(&de->entry, &cde->entries);
+}
+
+static bool cifs_dir_emit(struct dir_context *ctx,
+ const char *name, int namelen,
+ struct cifs_fattr *fattr,
+ struct cached_fid *cfid)
+{
+ bool rc;
+ ino_t ino = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
+
+ rc = dir_emit(ctx, name, namelen, ino, fattr->cf_dtype);
+ if (!rc)
+ return rc;
+
+ if (cfid) {
+ mutex_lock(&cfid->dirents.de_mutex);
+ add_cached_dirent(&cfid->dirents, ctx, name, namelen,
+ fattr);
+ mutex_unlock(&cfid->dirents.de_mutex);
+ }
+
+ return rc;
+}
+
static int cifs_filldir(char *find_entry, struct file *file,
- struct dir_context *ctx,
- char *scratch_buf, unsigned int max_len)
+ struct dir_context *ctx,
+ char *scratch_buf, unsigned int max_len,
+ struct cached_fid *cfid)
{
struct cifsFileInfo *file_info = file->private_data;
struct super_block *sb = file_inode(file)->i_sb;
@@ -851,7 +951,6 @@ static int cifs_filldir(char *find_entry, struct file *file,
struct cifs_fattr fattr;
struct qstr name;
int rc = 0;
- ino_t ino;
rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level,
file_info->srch_inf.unicode);
@@ -931,8 +1030,8 @@ static int cifs_filldir(char *find_entry, struct file *file,
cifs_prime_dcache(file_dentry(file), &name, &fattr);
- ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
- return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype);
+ return !cifs_dir_emit(ctx, name.name, name.len,
+ &fattr, cfid);
}
@@ -941,8 +1040,9 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
int rc = 0;
unsigned int xid;
int i;
+ struct tcon_link *tlink = NULL;
struct cifs_tcon *tcon;
- struct cifsFileInfo *cifsFile = NULL;
+ struct cifsFileInfo *cifsFile;
char *current_entry;
int num_to_fill = 0;
char *tmp_buf = NULL;
@@ -950,6 +1050,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
unsigned int max_len;
const char *full_path;
void *page = alloc_dentry_path();
+ struct cached_fid *cfid = NULL;
+ struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
xid = get_xid();
@@ -959,6 +1061,54 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
goto rddir2_exit;
}
+ if (file->private_data == NULL) {
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ goto cache_not_found;
+ tcon = tlink_tcon(tlink);
+ } else {
+ cifsFile = file->private_data;
+ tcon = tlink_tcon(cifsFile->tlink);
+ }
+
+ rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid);
+ cifs_put_tlink(tlink);
+ if (rc)
+ goto cache_not_found;
+
+ mutex_lock(&cfid->dirents.de_mutex);
+ /*
+ * If this was reading from the start of the directory
+ * we need to initialize scanning and storing the
+ * directory content.
+ */
+ if (ctx->pos == 0 && cfid->dirents.ctx == NULL) {
+ cfid->dirents.ctx = ctx;
+ cfid->dirents.pos = 2;
+ }
+ /*
+ * If we already have the entire directory cached then
+ * we can just serve the cache.
+ */
+ if (cfid->dirents.is_valid) {
+ if (!dir_emit_dots(file, ctx)) {
+ mutex_unlock(&cfid->dirents.de_mutex);
+ goto rddir2_exit;
+ }
+ emit_cached_dirents(&cfid->dirents, ctx);
+ mutex_unlock(&cfid->dirents.de_mutex);
+ goto rddir2_exit;
+ }
+ mutex_unlock(&cfid->dirents.de_mutex);
+
+ /* Drop the cache while calling initiate_cifs_search and
+ * find_cifs_entry in case there will be reconnects during
+ * query_directory.
+ */
+ close_cached_dir(cfid);
+ cfid = NULL;
+
+ cache_not_found:
/*
* Ensure FindFirst doesn't fail before doing filldir() for '.' and
* '..'. Otherwise we won't be able to notify VFS in case of failure.
@@ -977,7 +1127,6 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
is in current search buffer?
if it before then restart search
if after then keep searching till find it */
-
cifsFile = file->private_data;
if (cifsFile->srch_inf.endOfSearch) {
if (cifsFile->srch_inf.emptyDir) {
@@ -993,12 +1142,18 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
tcon = tlink_tcon(cifsFile->tlink);
rc = find_cifs_entry(xid, tcon, ctx->pos, file, full_path,
&current_entry, &num_to_fill);
+ open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid);
if (rc) {
cifs_dbg(FYI, "fce error %d\n", rc);
goto rddir2_exit;
} else if (current_entry != NULL) {
cifs_dbg(FYI, "entry %lld found\n", ctx->pos);
} else {
+ if (cfid) {
+ mutex_lock(&cfid->dirents.de_mutex);
+ finished_cached_dirents_count(&cfid->dirents, ctx);
+ mutex_unlock(&cfid->dirents.de_mutex);
+ }
cifs_dbg(FYI, "Could not find entry\n");
goto rddir2_exit;
}
@@ -1028,7 +1183,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
*/
*tmp_buf = 0;
rc = cifs_filldir(current_entry, file, ctx,
- tmp_buf, max_len);
+ tmp_buf, max_len, cfid);
if (rc) {
if (rc > 0)
rc = 0;
@@ -1036,6 +1191,12 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
}
ctx->pos++;
+ if (cfid) {
+ mutex_lock(&cfid->dirents.de_mutex);
+ update_cached_dirents_count(&cfid->dirents, ctx);
+ mutex_unlock(&cfid->dirents.de_mutex);
+ }
+
if (ctx->pos ==
cifsFile->srch_inf.index_of_last_entry) {
cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
@@ -1050,6 +1211,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
kfree(tmp_buf);
rddir2_exit:
+ if (cfid)
+ close_cached_dir(cfid);
free_dentry_path(page);
free_xid(xid);
return rc;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 32f478c7a66d..b85718f32b53 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -58,7 +58,7 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface)
spin_lock(&ses->chan_lock);
for (i = 0; i < ses->chan_count; i++) {
- if (is_server_using_iface(ses->chans[i].server, iface)) {
+ if (ses->chans[i].iface == iface) {
spin_unlock(&ses->chan_lock);
return true;
}
@@ -81,11 +81,41 @@ cifs_ses_get_chan_index(struct cifs_ses *ses,
}
/* If we didn't find the channel, it is likely a bug */
+ if (server)
+ cifs_dbg(VFS, "unable to get chan index for server: 0x%llx",
+ server->conn_id);
WARN_ON(1);
return 0;
}
void
+cifs_chan_set_in_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
+{
+ unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ ses->chans[chan_index].in_reconnect = true;
+}
+
+void
+cifs_chan_clear_in_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
+{
+ unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ ses->chans[chan_index].in_reconnect = false;
+}
+
+bool
+cifs_chan_in_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
+{
+ unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ return CIFS_CHAN_IN_RECONNECT(ses, chan_index);
+}
+
+void
cifs_chan_set_need_reconnect(struct cifs_ses *ses,
struct TCP_Server_Info *server)
{
@@ -116,16 +146,24 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses,
return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index);
}
+bool
+cifs_chan_is_iface_active(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
+{
+ unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ return ses->chans[chan_index].iface &&
+ ses->chans[chan_index].iface->is_active;
+}
+
/* returns number of channels added */
int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
{
int old_chan_count, new_chan_count;
int left;
- int i = 0;
int rc = 0;
int tries = 0;
- struct cifs_server_iface *ifaces = NULL;
- size_t iface_count;
+ struct cifs_server_iface *iface = NULL, *niface = NULL;
spin_lock(&ses->chan_lock);
@@ -155,32 +193,16 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
spin_unlock(&ses->chan_lock);
/*
- * Make a copy of the iface list at the time and use that
- * instead so as to not hold the iface spinlock for opening
- * channels
- */
- spin_lock(&ses->iface_lock);
- iface_count = ses->iface_count;
- if (iface_count <= 0) {
- spin_unlock(&ses->iface_lock);
- cifs_dbg(VFS, "no iface list available to open channels\n");
- return 0;
- }
- ifaces = kmemdup(ses->iface_list, iface_count*sizeof(*ifaces),
- GFP_ATOMIC);
- if (!ifaces) {
- spin_unlock(&ses->iface_lock);
- return 0;
- }
- spin_unlock(&ses->iface_lock);
-
- /*
* Keep connecting to same, fastest, iface for all channels as
* long as its RSS. Try next fastest one if not RSS or channel
* creation fails.
*/
+ spin_lock(&ses->iface_lock);
+ iface = list_first_entry(&ses->iface_list, struct cifs_server_iface,
+ iface_head);
+ spin_unlock(&ses->iface_lock);
+
while (left > 0) {
- struct cifs_server_iface *iface;
tries++;
if (tries > 3*ses->chan_max) {
@@ -189,31 +211,128 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
break;
}
- iface = &ifaces[i];
- if (is_ses_using_iface(ses, iface) && !iface->rss_capable) {
- i = (i+1) % iface_count;
- continue;
+ spin_lock(&ses->iface_lock);
+ if (!ses->iface_count) {
+ spin_unlock(&ses->iface_lock);
+ break;
}
- rc = cifs_ses_add_channel(cifs_sb, ses, iface);
- if (rc) {
- cifs_dbg(FYI, "failed to open extra channel on iface#%d rc=%d\n",
- i, rc);
- i = (i+1) % iface_count;
- continue;
+ list_for_each_entry_safe_from(iface, niface, &ses->iface_list,
+ iface_head) {
+ /* skip ifaces that are unusable */
+ if (!iface->is_active ||
+ (is_ses_using_iface(ses, iface) &&
+ !iface->rss_capable)) {
+ continue;
+ }
+
+ /* take ref before unlock */
+ kref_get(&iface->refcount);
+
+ spin_unlock(&ses->iface_lock);
+ rc = cifs_ses_add_channel(cifs_sb, ses, iface);
+ spin_lock(&ses->iface_lock);
+
+ if (rc) {
+ cifs_dbg(VFS, "failed to open extra channel on iface:%pIS rc=%d\n",
+ &iface->sockaddr,
+ rc);
+ kref_put(&iface->refcount, release_iface);
+ continue;
+ }
+
+ cifs_dbg(FYI, "successfully opened new channel on iface:%pIS\n",
+ &iface->sockaddr);
+ break;
}
+ spin_unlock(&ses->iface_lock);
- cifs_dbg(FYI, "successfully opened new channel on iface#%d\n",
- i);
left--;
new_chan_count++;
}
- kfree(ifaces);
return new_chan_count - old_chan_count;
}
/*
+ * update the iface for the channel if necessary.
+ * will return 0 when iface is updated, 1 if removed, 2 otherwise
+ * Must be called with chan_lock held.
+ */
+int
+cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
+{
+ unsigned int chan_index;
+ struct cifs_server_iface *iface = NULL;
+ struct cifs_server_iface *old_iface = NULL;
+ int rc = 0;
+
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ if (!chan_index) {
+ spin_unlock(&ses->chan_lock);
+ return 0;
+ }
+
+ if (ses->chans[chan_index].iface) {
+ old_iface = ses->chans[chan_index].iface;
+ if (old_iface->is_active) {
+ spin_unlock(&ses->chan_lock);
+ return 1;
+ }
+ }
+ spin_unlock(&ses->chan_lock);
+
+ spin_lock(&ses->iface_lock);
+ /* then look for a new one */
+ list_for_each_entry(iface, &ses->iface_list, iface_head) {
+ if (!iface->is_active ||
+ (is_ses_using_iface(ses, iface) &&
+ !iface->rss_capable)) {
+ continue;
+ }
+ kref_get(&iface->refcount);
+ }
+
+ if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) {
+ rc = 1;
+ iface = NULL;
+ cifs_dbg(FYI, "unable to find a suitable iface\n");
+ }
+
+ /* now drop the ref to the current iface */
+ if (old_iface && iface) {
+ kref_put(&old_iface->refcount, release_iface);
+ cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n",
+ &old_iface->sockaddr,
+ &iface->sockaddr);
+ } else if (old_iface) {
+ kref_put(&old_iface->refcount, release_iface);
+ cifs_dbg(FYI, "releasing ref to iface: %pIS\n",
+ &old_iface->sockaddr);
+ } else {
+ WARN_ON(!iface);
+ cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr);
+ }
+ spin_unlock(&ses->iface_lock);
+
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ ses->chans[chan_index].iface = iface;
+
+ /* No iface is found. if secondary chan, drop connection */
+ if (!iface && CIFS_SERVER_IS_CHAN(server))
+ ses->chans[chan_index].server = NULL;
+
+ spin_unlock(&ses->chan_lock);
+
+ if (!iface && CIFS_SERVER_IS_CHAN(server))
+ cifs_put_tcp_session(server, false);
+
+ return rc;
+}
+
+/*
* If server is a channel of ses, return the corresponding enclosing
* cifs_chan otherwise return NULL.
*/
@@ -274,7 +393,10 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
/* Auth */
ctx.domainauto = ses->domainAuto;
ctx.domainname = ses->domainName;
- ctx.server_hostname = ses->server->hostname;
+
+ /* no hostname for extra channels */
+ ctx.server_hostname = "";
+
ctx.username = ses->user_name;
ctx.password = ses->password;
ctx.sectype = ses->sectype;
@@ -322,6 +444,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
spin_unlock(&ses->chan_lock);
goto out;
}
+ chan->iface = iface;
ses->chan_count++;
atomic_set(&ses->chan_seq, 0);
@@ -714,9 +837,9 @@ static int size_of_ntlmssp_blob(struct cifs_ses *ses, int base_size)
else
sz += sizeof(__le16);
- if (ses->workstation_name)
+ if (ses->workstation_name[0])
sz += sizeof(__le16) * strnlen(ses->workstation_name,
- CIFS_MAX_WORKSTATION_LEN);
+ ntlmssp_workstation_name_size(ses));
else
sz += sizeof(__le16);
@@ -960,7 +1083,7 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer,
cifs_security_buffer_from_str(&sec_blob->WorkstationName,
ses->workstation_name,
- CIFS_MAX_WORKSTATION_LEN,
+ ntlmssp_workstation_name_size(ses),
*pbuffer, &tmp,
nls_cp);
@@ -1093,14 +1216,14 @@ sess_establish_session(struct sess_data *sess_data)
struct cifs_ses *ses = sess_data->ses;
struct TCP_Server_Info *server = sess_data->server;
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
if (!server->session_estab) {
if (server->sign) {
server->session_key.response =
kmemdup(ses->auth_key.response,
ses->auth_key.len, GFP_KERNEL);
if (!server->session_key.response) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
return -ENOMEM;
}
server->session_key.len =
@@ -1109,7 +1232,7 @@ sess_establish_session(struct sess_data *sess_data)
server->sequence_number = 0x2;
server->session_estab = true;
}
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
cifs_dbg(FYI, "CIFS session established successfully\n");
return 0;
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index c71c9a44bef4..2e20ee4dab7b 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -38,10 +38,10 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_rqst *rqst,
in_buf->WordCount = 0;
put_bcc(0, in_buf);
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
if (rc) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
return rc;
}
@@ -55,7 +55,7 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_rqst *rqst,
if (rc < 0)
server->sequence_number--;
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n",
get_mid(in_buf), rc);
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index fe5bfa245fa7..8571a459c710 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -362,8 +362,6 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
num_rqst++;
if (cfile) {
- cifsFileInfo_put(cfile);
- cfile = NULL;
rc = compound_send_recv(xid, ses, server,
flags, num_rqst - 2,
&rqst[1], &resp_buftype[1],
@@ -514,8 +512,11 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
if (smb2_data == NULL)
return -ENOMEM;
+ if (strcmp(full_path, ""))
+ rc = -ENOENT;
+ else
+ rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid);
/* If it is a root and its handle is cached then use it */
- rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid);
if (!rc) {
if (tcon->crfid.file_all_info_is_valid) {
move_smb2_info_to_cifs(data,
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 3fe47a88f47d..17813c3d0c6e 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -656,6 +656,12 @@ smb2_is_valid_lease_break(char *buffer)
}
spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "Can not process lease break - no lease matched\n");
+ trace_smb3_lease_not_found(le32_to_cpu(rsp->CurrentLeaseState),
+ le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
+ le64_to_cpu(rsp->hdr.SessionId),
+ *((u64 *)rsp->LeaseKey),
+ *((u64 *)&rsp->LeaseKey[8]));
+
return false;
}
@@ -726,6 +732,10 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
}
spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "No file id matched, oplock break ignored\n");
+ trace_smb3_oplock_not_found(0 /* no xid */, rsp->PersistentFid,
+ le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
+ le64_to_cpu(rsp->hdr.SessionId));
+
return true;
}
@@ -798,7 +808,7 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
if (tcon->ses)
server = tcon->ses->server;
- cifs_server_dbg(FYI, "tid=%u: tcon is closing, skipping async close retry of fid %llu %llu\n",
+ cifs_server_dbg(FYI, "tid=0x%x: tcon is closing, skipping async close retry of fid %llu %llu\n",
tcon->tid, persistent_fid, volatile_fid);
return 0;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index d6aaeff4a30a..8802995b2d3d 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -512,73 +512,41 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
static int
parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
size_t buf_len,
- struct cifs_server_iface **iface_list,
- size_t *iface_count)
+ struct cifs_ses *ses)
{
struct network_interface_info_ioctl_rsp *p;
struct sockaddr_in *addr4;
struct sockaddr_in6 *addr6;
struct iface_info_ipv4 *p4;
struct iface_info_ipv6 *p6;
- struct cifs_server_iface *info;
+ struct cifs_server_iface *info = NULL, *iface = NULL, *niface = NULL;
+ struct cifs_server_iface tmp_iface;
ssize_t bytes_left;
size_t next = 0;
int nb_iface = 0;
- int rc = 0;
-
- *iface_list = NULL;
- *iface_count = 0;
-
- /*
- * Fist pass: count and sanity check
- */
+ int rc = 0, ret = 0;
bytes_left = buf_len;
p = buf;
- while (bytes_left >= sizeof(*p)) {
- nb_iface++;
- next = le32_to_cpu(p->Next);
- if (!next) {
- bytes_left -= sizeof(*p);
- break;
- }
- p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next);
- bytes_left -= next;
- }
-
- if (!nb_iface) {
- cifs_dbg(VFS, "%s: malformed interface info\n", __func__);
- rc = -EINVAL;
- goto out;
- }
-
- /* Azure rounds the buffer size up 8, to a 16 byte boundary */
- if ((bytes_left > 8) || p->Next)
- cifs_dbg(VFS, "%s: incomplete interface info\n", __func__);
-
+ spin_lock(&ses->iface_lock);
/*
- * Second pass: extract info to internal structure
+ * Go through iface_list and do kref_put to remove
+ * any unused ifaces. ifaces in use will be removed
+ * when the last user calls a kref_put on it
*/
-
- *iface_list = kcalloc(nb_iface, sizeof(**iface_list), GFP_KERNEL);
- if (!*iface_list) {
- rc = -ENOMEM;
- goto out;
+ list_for_each_entry_safe(iface, niface, &ses->iface_list,
+ iface_head) {
+ iface->is_active = 0;
+ kref_put(&iface->refcount, release_iface);
}
+ spin_unlock(&ses->iface_lock);
- info = *iface_list;
- bytes_left = buf_len;
- p = buf;
while (bytes_left >= sizeof(*p)) {
- info->speed = le64_to_cpu(p->LinkSpeed);
- info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
- info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0;
-
- cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count);
- cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed);
- cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__,
- le32_to_cpu(p->Capability));
+ memset(&tmp_iface, 0, sizeof(tmp_iface));
+ tmp_iface.speed = le64_to_cpu(p->LinkSpeed);
+ tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
+ tmp_iface.rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0;
switch (p->Family) {
/*
@@ -587,7 +555,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
* conversion explicit in case either one changes.
*/
case INTERNETWORK:
- addr4 = (struct sockaddr_in *)&info->sockaddr;
+ addr4 = (struct sockaddr_in *)&tmp_iface.sockaddr;
p4 = (struct iface_info_ipv4 *)p->Buffer;
addr4->sin_family = AF_INET;
memcpy(&addr4->sin_addr, &p4->IPv4Address, 4);
@@ -599,7 +567,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
&addr4->sin_addr);
break;
case INTERNETWORKV6:
- addr6 = (struct sockaddr_in6 *)&info->sockaddr;
+ addr6 = (struct sockaddr_in6 *)&tmp_iface.sockaddr;
p6 = (struct iface_info_ipv6 *)p->Buffer;
addr6->sin6_family = AF_INET6;
memcpy(&addr6->sin6_addr, &p6->IPv6Address, 16);
@@ -619,46 +587,96 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
goto next_iface;
}
- (*iface_count)++;
- info++;
+ /*
+ * The iface_list is assumed to be sorted by speed.
+ * Check if the new interface exists in that list.
+ * NEVER change iface. it could be in use.
+ * Add a new one instead
+ */
+ spin_lock(&ses->iface_lock);
+ iface = niface = NULL;
+ list_for_each_entry_safe(iface, niface, &ses->iface_list,
+ iface_head) {
+ ret = iface_cmp(iface, &tmp_iface);
+ if (!ret) {
+ /* just get a ref so that it doesn't get picked/freed */
+ iface->is_active = 1;
+ kref_get(&iface->refcount);
+ spin_unlock(&ses->iface_lock);
+ goto next_iface;
+ } else if (ret < 0) {
+ /* all remaining ifaces are slower */
+ kref_get(&iface->refcount);
+ break;
+ }
+ }
+ spin_unlock(&ses->iface_lock);
+
+ /* no match. insert the entry in the list */
+ info = kmalloc(sizeof(struct cifs_server_iface),
+ GFP_KERNEL);
+ if (!info) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ memcpy(info, &tmp_iface, sizeof(tmp_iface));
+
+ /* add this new entry to the list */
+ kref_init(&info->refcount);
+ info->is_active = 1;
+
+ cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, ses->iface_count);
+ cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed);
+ cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__,
+ le32_to_cpu(p->Capability));
+
+ spin_lock(&ses->iface_lock);
+ if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) {
+ list_add_tail(&info->iface_head, &iface->iface_head);
+ kref_put(&iface->refcount, release_iface);
+ } else
+ list_add_tail(&info->iface_head, &ses->iface_list);
+ spin_unlock(&ses->iface_lock);
+
+ ses->iface_count++;
+ ses->iface_last_update = jiffies;
next_iface:
+ nb_iface++;
next = le32_to_cpu(p->Next);
- if (!next)
+ if (!next) {
+ bytes_left -= sizeof(*p);
break;
+ }
p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next);
bytes_left -= next;
}
- if (!*iface_count) {
+ if (!nb_iface) {
+ cifs_dbg(VFS, "%s: malformed interface info\n", __func__);
rc = -EINVAL;
goto out;
}
-out:
- if (rc) {
- kfree(*iface_list);
- *iface_count = 0;
- *iface_list = NULL;
- }
- return rc;
-}
+ /* Azure rounds the buffer size up 8, to a 16 byte boundary */
+ if ((bytes_left > 8) || p->Next)
+ cifs_dbg(VFS, "%s: incomplete interface info\n", __func__);
-static int compare_iface(const void *ia, const void *ib)
-{
- const struct cifs_server_iface *a = (struct cifs_server_iface *)ia;
- const struct cifs_server_iface *b = (struct cifs_server_iface *)ib;
- return a->speed == b->speed ? 0 : (a->speed > b->speed ? -1 : 1);
+ if (!ses->iface_count) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+out:
+ return rc;
}
-static int
+int
SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
{
int rc;
unsigned int ret_data_len = 0;
struct network_interface_info_ioctl_rsp *out_buf = NULL;
- struct cifs_server_iface *iface_list;
- size_t iface_count;
struct cifs_ses *ses = tcon->ses;
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
@@ -674,21 +692,10 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
goto out;
}
- rc = parse_server_interfaces(out_buf, ret_data_len,
- &iface_list, &iface_count);
+ rc = parse_server_interfaces(out_buf, ret_data_len, ses);
if (rc)
goto out;
- /* sort interfaces from fastest to slowest */
- sort(iface_list, iface_count, sizeof(*iface_list), compare_iface, NULL);
-
- spin_lock(&ses->iface_lock);
- kfree(ses->iface_list);
- ses->iface_list = iface_list;
- ses->iface_count = iface_count;
- ses->iface_last_update = jiffies;
- spin_unlock(&ses->iface_lock);
-
out:
kfree(out_buf);
return rc;
@@ -699,6 +706,7 @@ smb2_close_cached_fid(struct kref *ref)
{
struct cached_fid *cfid = container_of(ref, struct cached_fid,
refcount);
+ struct cached_dirent *dirent, *q;
if (cfid->is_valid) {
cifs_dbg(FYI, "clear cached root file handle\n");
@@ -718,6 +726,21 @@ smb2_close_cached_fid(struct kref *ref)
dput(cfid->dentry);
cfid->dentry = NULL;
}
+ /*
+ * Delete all cached dirent names
+ */
+ mutex_lock(&cfid->dirents.de_mutex);
+ list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) {
+ list_del(&dirent->entry);
+ kfree(dirent->name);
+ kfree(dirent);
+ }
+ cfid->dirents.is_valid = 0;
+ cfid->dirents.is_failed = 0;
+ cfid->dirents.ctx = NULL;
+ cfid->dirents.pos = 0;
+ mutex_unlock(&cfid->dirents.de_mutex);
+
}
void close_cached_dir(struct cached_fid *cfid)
@@ -754,14 +777,15 @@ smb2_cached_lease_break(struct work_struct *work)
/*
* Open the and cache a directory handle.
* Only supported for the root handle.
+ * If error then *cfid is not initialized.
*/
int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
const char *path,
struct cifs_sb_info *cifs_sb,
struct cached_fid **cfid)
{
- struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = ses->server;
+ struct cifs_ses *ses;
+ struct TCP_Server_Info *server;
struct cifs_open_parms oparms;
struct smb2_create_rsp *o_rsp = NULL;
struct smb2_query_info_rsp *qi_rsp = NULL;
@@ -776,9 +800,13 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid *pfid;
struct dentry *dentry;
- if (tcon->nohandlecache)
+ if (tcon == NULL || tcon->nohandlecache ||
+ is_smb1_server(tcon->ses->server))
return -ENOTSUPP;
+ ses = tcon->ses;
+ server = ses->server;
+
if (cifs_sb->root == NULL)
return -ENOENT;
@@ -824,7 +852,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
oparms.tcon = tcon;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
+ oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE);
oparms.desired_access = FILE_READ_ATTRIBUTES;
oparms.disposition = FILE_OPEN;
oparms.fid = pfid;
@@ -2695,7 +2723,8 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
memset(rsp_iov, 0, sizeof(rsp_iov));
- rc = open_cached_dir(xid, tcon, path, cifs_sb, &cfid);
+ if (!strcmp(path, ""))
+ open_cached_dir(xid, tcon, path, cifs_sb, &cfid); /* cfid null if open dir failed */
memset(&open_iov, 0, sizeof(open_iov));
rqst[0].rq_iov = open_iov;
@@ -3837,7 +3866,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
if (rc)
goto out;
- if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0)
+ if (cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)
smb2_set_sparse(xid, tcon, cfile, inode, false);
eof = cpu_to_le64(off + len);
@@ -4238,15 +4267,15 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
if (oplock == SMB2_OPLOCK_LEVEL_BATCH) {
cinode->oplock = CIFS_CACHE_RHW_FLG;
cifs_dbg(FYI, "Batch Oplock granted on inode %p\n",
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
} else if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
cinode->oplock = CIFS_CACHE_RW_FLG;
cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
} else if (oplock == SMB2_OPLOCK_LEVEL_II) {
cinode->oplock = CIFS_CACHE_READ_FLG;
cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
} else
cinode->oplock = 0;
}
@@ -4285,7 +4314,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
cinode->oplock = new_oplock;
cifs_dbg(FYI, "%s Lease granted on inode %p\n", message,
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
}
static void
@@ -4323,11 +4352,13 @@ smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
}
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static bool
smb2_is_read_op(__u32 oplock)
{
return oplock == SMB2_OPLOCK_LEVEL_II;
}
+#endif /* CIFS_ALLOW_INSECURE_LEGACY */
static bool
smb21_is_read_op(__u32 oplock)
@@ -5426,7 +5457,7 @@ out:
return rc;
}
-
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct smb_version_operations smb20_operations = {
.compare_fids = smb2_compare_fids,
.setup_request = smb2_setup_request,
@@ -5525,6 +5556,7 @@ struct smb_version_operations smb20_operations = {
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
};
+#endif /* CIFS_ALLOW_INSECURE_LEGACY */
struct smb_version_operations smb21_operations = {
.compare_fids = smb2_compare_fids,
@@ -5856,6 +5888,7 @@ struct smb_version_operations smb311_operations = {
.is_network_name_deleted = smb2_is_network_name_deleted,
};
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct smb_version_values smb20_values = {
.version_string = SMB20_VERSION_STRING,
.protocol_id = SMB20_PROT_ID,
@@ -5876,6 +5909,7 @@ struct smb_version_values smb20_values = {
.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
.create_lease_size = sizeof(struct create_lease),
};
+#endif /* ALLOW_INSECURE_LEGACY */
struct smb_version_values smb21_values = {
.version_string = SMB21_VERSION_STRING,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 1b7ad0c09566..12b4dddaedb0 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -179,7 +179,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
}
}
spin_unlock(&cifs_tcp_ses_lock);
- if ((!tcon->ses) || (tcon->ses->status == CifsExiting) ||
+ if ((!tcon->ses) || (tcon->ses->ses_status == SES_EXITING) ||
(!tcon->ses->server) || !server)
return -EIO;
@@ -288,6 +288,9 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
mutex_unlock(&ses->session_mutex);
rc = -EHOSTDOWN;
goto failed;
+ } else if (rc) {
+ mutex_unlock(&ses->session_mutex);
+ goto out;
}
} else {
mutex_unlock(&ses->session_mutex);
@@ -540,6 +543,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
struct TCP_Server_Info *server, unsigned int *total_len)
{
char *pneg_ctxt;
+ char *hostname = NULL;
unsigned int ctxt_len, neg_context_count;
if (*total_len > 200) {
@@ -567,16 +571,24 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
*total_len += ctxt_len;
pneg_ctxt += ctxt_len;
- ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt,
- server->hostname);
- *total_len += ctxt_len;
- pneg_ctxt += ctxt_len;
-
build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
*total_len += sizeof(struct smb2_posix_neg_context);
pneg_ctxt += sizeof(struct smb2_posix_neg_context);
- neg_context_count = 4;
+ /*
+ * secondary channels don't have the hostname field populated
+ * use the hostname field in the primary channel instead
+ */
+ hostname = CIFS_SERVER_IS_CHAN(server) ?
+ server->primary_server->hostname : server->hostname;
+ if (hostname && (hostname[0] != 0)) {
+ ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt,
+ hostname);
+ *total_len += ctxt_len;
+ pneg_ctxt += ctxt_len;
+ neg_context_count = 4;
+ } else /* second channels do not have a hostname */
+ neg_context_count = 3;
if (server->compress_algorithm) {
build_compression_ctxt((struct smb2_compression_capabilities_context *)
@@ -1369,13 +1381,13 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data)
struct cifs_ses *ses = sess_data->ses;
struct TCP_Server_Info *server = sess_data->server;
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
if (server->ops->generate_signingkey) {
rc = server->ops->generate_signingkey(ses, server);
if (rc) {
cifs_dbg(FYI,
"SMB3 session key generation failed\n");
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
return rc;
}
}
@@ -1383,7 +1395,7 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data)
server->sequence_number = 0x2;
server->session_estab = true;
}
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
cifs_dbg(FYI, "SMB2/3 session established successfully\n");
return rc;
@@ -3899,7 +3911,8 @@ SMB2_echo(struct TCP_Server_Info *server)
cifs_dbg(FYI, "In echo request for conn_id %lld\n", server->conn_id);
spin_lock(&cifs_tcp_ses_lock);
- if (server->tcpStatus == CifsNeedNegotiate) {
+ if (server->ops->need_neg &&
+ server->ops->need_neg(server)) {
spin_unlock(&cifs_tcp_ses_lock);
/* No need to send echo on newly established connections */
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
@@ -5150,6 +5163,8 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
data = &info;
size = sizeof(struct smb2_file_eof_info);
+ trace_smb3_set_eof(xid, persistent_fid, tcon->tid, tcon->ses->Suid, le64_to_cpu(*eof));
+
return send_set_info(xid, tcon, persistent_fid, volatile_fid,
pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE,
0, 1, &data, &size);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index d8c4388b190d..f57881b8464f 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -260,28 +260,6 @@ struct get_retrieval_pointers_refcount_rsp {
struct smb3_extents extents[];
} __packed;
-struct fsctl_set_integrity_information_req {
- __le16 ChecksumAlgorithm;
- __le16 Reserved;
- __le32 Flags;
-} __packed;
-
-struct fsctl_get_integrity_information_rsp {
- __le16 ChecksumAlgorithm;
- __le16 Reserved;
- __le32 Flags;
- __le32 ChecksumChunkSizeInBytes;
- __le32 ClusterSizeInBytes;
-} __packed;
-
-/* Integrity ChecksumAlgorithm choices for above */
-#define CHECKSUM_TYPE_NONE 0x0000
-#define CHECKSUM_TYPE_CRC64 0x0002
-#define CHECKSUM_TYPE_UNCHANGED 0xFFFF /* set only */
-
-/* Integrity flags for above */
-#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001
-
/* See MS-DFSC 2.2.2 */
struct fsctl_get_dfs_referral_req {
__le16 MaxReferralLevel;
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 2af79093b78b..55e79f6ee78d 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -641,7 +641,8 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (!is_signed)
return 0;
spin_lock(&cifs_tcp_ses_lock);
- if (server->tcpStatus == CifsNeedNegotiate) {
+ if (server->ops->need_neg &&
+ server->ops->need_neg(server)) {
spin_unlock(&cifs_tcp_ses_lock);
return 0;
}
@@ -779,7 +780,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server,
return -EAGAIN;
}
- if (ses->status == CifsNew) {
+ if (ses->ses_status == SES_NEW) {
if ((shdr->Command != SMB2_SESSION_SETUP) &&
(shdr->Command != SMB2_NEGOTIATE)) {
spin_unlock(&cifs_tcp_ses_lock);
@@ -788,7 +789,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server,
/* else ok - we are setting up session */
}
- if (ses->status == CifsExiting) {
+ if (ses->ses_status == SES_EXITING) {
if (shdr->Command != SMB2_LOGOFF) {
spin_unlock(&cifs_tcp_ses_lock);
return -EAGAIN;
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 31ef64eb7fbb..5fbbec22bcc8 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -649,7 +649,7 @@ static int smbd_ia_open(
smbd_max_frmr_depth,
info->id->device->attrs.max_fast_reg_page_list_len);
info->mr_type = IB_MR_TYPE_MEM_REG;
- if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
+ if (info->id->device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
info->mr_type = IB_MR_TYPE_SG_GAPS;
info->pd = ib_alloc_pd(info->id->device, 0);
@@ -1350,7 +1350,7 @@ void smbd_destroy(struct TCP_Server_Info *server)
wait_event(info->wait_send_pending,
atomic_read(&info->send_pending) == 0);
- /* It's not posssible for upper layer to get to reassembly */
+ /* It's not possible for upper layer to get to reassembly */
log_rdma_event(INFO, "drain the reassembly queue\n");
do {
spin_lock_irqsave(&info->reassembly_queue_lock, flags);
@@ -1382,9 +1382,9 @@ void smbd_destroy(struct TCP_Server_Info *server)
log_rdma_event(INFO, "freeing mr list\n");
wake_up_interruptible_all(&info->wait_mr);
while (atomic_read(&info->mr_used_count)) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
msleep(1000);
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
}
destroy_mr_list(info);
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index bc279616c513..6b88dc2e364f 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -121,6 +121,44 @@ DEFINE_SMB3_RW_DONE_EVENT(query_dir_done);
DEFINE_SMB3_RW_DONE_EVENT(zero_done);
DEFINE_SMB3_RW_DONE_EVENT(falloc_done);
+/* For logging successful set EOF (truncate) */
+DECLARE_EVENT_CLASS(smb3_eof_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u64 offset),
+ TP_ARGS(xid, fid, tid, sesid, offset),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, offset)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->offset = offset;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->offset)
+)
+
+#define DEFINE_SMB3_EOF_EVENT(name) \
+DEFINE_EVENT(smb3_eof_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 offset), \
+ TP_ARGS(xid, fid, tid, sesid, offset))
+
+DEFINE_SMB3_EOF_EVENT(set_eof);
+
/*
* For handle based calls other than read and write, and get/set info
*/
@@ -158,6 +196,7 @@ DEFINE_SMB3_FD_EVENT(flush_enter);
DEFINE_SMB3_FD_EVENT(flush_done);
DEFINE_SMB3_FD_EVENT(close_enter);
DEFINE_SMB3_FD_EVENT(close_done);
+DEFINE_SMB3_FD_EVENT(oplock_not_found);
DECLARE_EVENT_CLASS(smb3_fd_err_class,
TP_PROTO(unsigned int xid,
@@ -814,6 +853,7 @@ DEFINE_EVENT(smb3_lease_done_class, smb3_##name, \
TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high))
DEFINE_SMB3_LEASE_DONE_EVENT(lease_done);
+DEFINE_SMB3_LEASE_DONE_EVENT(lease_not_found);
DECLARE_EVENT_CLASS(smb3_lease_err_class,
TP_PROTO(__u32 lease_state,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index c667e6ddfe2f..bfc9bd55870a 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -726,7 +726,7 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
struct mid_q_entry **ppmidQ)
{
spin_lock(&cifs_tcp_ses_lock);
- if (ses->status == CifsNew) {
+ if (ses->ses_status == SES_NEW) {
if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) &&
(in_buf->Command != SMB_COM_NEGOTIATE)) {
spin_unlock(&cifs_tcp_ses_lock);
@@ -735,7 +735,7 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
/* else ok - we are setting up session */
}
- if (ses->status == CifsExiting) {
+ if (ses->ses_status == SES_EXITING) {
/* check if SMB session is bad because we are setting it up */
if (in_buf->Command != SMB_COM_LOGOFF_ANDX) {
spin_unlock(&cifs_tcp_ses_lock);
@@ -822,7 +822,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
} else
instance = exist_credits->instance;
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
/*
* We can't use credits obtained from the previous session to send this
@@ -830,14 +830,14 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
* return -EAGAIN in such cases to let callers handle it.
*/
if (instance != server->reconnect_instance) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
add_credits_and_wake_if(server, &credits, optype);
return -EAGAIN;
}
mid = server->ops->setup_async_request(server, rqst);
if (IS_ERR(mid)) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
add_credits_and_wake_if(server, &credits, optype);
return PTR_ERR(mid);
}
@@ -868,7 +868,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
cifs_delete_mid(mid);
}
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
if (rc == 0)
return 0;
@@ -1109,7 +1109,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
* of smb data.
*/
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
/*
* All the parts of the compound chain belong obtained credits from the
@@ -1119,7 +1119,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
* handle it.
*/
if (instance != server->reconnect_instance) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
for (j = 0; j < num_rqst; j++)
add_credits(server, &credits[j], optype);
return -EAGAIN;
@@ -1131,7 +1131,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
revert_current_mid(server, i);
for (j = 0; j < i; j++)
cifs_delete_mid(midQ[j]);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
/* Update # of requests on wire to server */
for (j = 0; j < num_rqst; j++)
@@ -1163,7 +1163,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
server->sequence_number -= 2;
}
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
/*
* If sending failed for some reason or it is an oplock break that we
@@ -1187,12 +1187,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
* Compounding is never used during session establish.
*/
spin_lock(&cifs_tcp_ses_lock);
- if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
+ if ((ses->ses_status == SES_NEW) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
spin_unlock(&cifs_tcp_ses_lock);
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
smb311_update_preauth_hash(ses, server, rqst[0].rq_iov, rqst[0].rq_nvec);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
spin_lock(&cifs_tcp_ses_lock);
}
@@ -1260,15 +1260,15 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
* Compounding is never used during session establish.
*/
spin_lock(&cifs_tcp_ses_lock);
- if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
+ if ((ses->ses_status == SES_NEW) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
struct kvec iov = {
.iov_base = resp_iov[0].iov_base,
.iov_len = resp_iov[0].iov_len
};
spin_unlock(&cifs_tcp_ses_lock);
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
smb311_update_preauth_hash(ses, server, &iov, 1);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
spin_lock(&cifs_tcp_ses_lock);
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -1385,11 +1385,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
and avoid races inside tcp sendmsg code that could cause corruption
of smb data */
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
rc = allocate_mid(ses, in_buf, &midQ);
if (rc) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
/* Update # of requests on wire to server */
add_credits(server, &credits, 0);
return rc;
@@ -1397,7 +1397,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number);
if (rc) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
goto out;
}
@@ -1411,7 +1411,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc < 0)
server->sequence_number -= 2;
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
if (rc < 0)
goto out;
@@ -1530,18 +1530,18 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
and avoid races inside tcp sendmsg code that could cause corruption
of smb data */
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
rc = allocate_mid(ses, in_buf, &midQ);
if (rc) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
return rc;
}
rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number);
if (rc) {
cifs_delete_mid(midQ);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
return rc;
}
@@ -1554,7 +1554,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
if (rc < 0)
server->sequence_number -= 2;
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
if (rc < 0) {
cifs_delete_mid(midQ);
diff --git a/fs/dax.c b/fs/dax.c
index 67a08a32fccb..4155a6107fa1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,6 +24,7 @@
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
+#include <linux/rmap.h>
#include <asm/pgalloc.h>
#define CREATE_TRACE_POINTS
@@ -721,7 +722,8 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter
int id;
id = dax_read_lock();
- rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL);
+ rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
+ &kaddr, NULL);
if (rc < 0) {
dax_read_unlock(id);
return rc;
@@ -789,95 +791,12 @@ static void *dax_insert_entry(struct xa_state *xas,
return entry;
}
-static inline
-unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
-{
- unsigned long address;
-
- address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
- return address;
-}
-
-/* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
- unsigned long pfn)
-{
- struct vm_area_struct *vma;
- pte_t pte, *ptep = NULL;
- pmd_t *pmdp = NULL;
- spinlock_t *ptl;
-
- i_mmap_lock_read(mapping);
- vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
- struct mmu_notifier_range range;
- unsigned long address;
-
- cond_resched();
-
- if (!(vma->vm_flags & VM_SHARED))
- continue;
-
- address = pgoff_address(index, vma);
-
- /*
- * follow_invalidate_pte() will use the range to call
- * mmu_notifier_invalidate_range_start() on our behalf before
- * taking any lock.
- */
- if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
- &pmdp, &ptl))
- continue;
-
- /*
- * No need to call mmu_notifier_invalidate_range() as we are
- * downgrading page table protection not changing it to point
- * to a new page.
- *
- * See Documentation/vm/mmu_notifier.rst
- */
- if (pmdp) {
-#ifdef CONFIG_FS_DAX_PMD
- pmd_t pmd;
-
- if (pfn != pmd_pfn(*pmdp))
- goto unlock_pmd;
- if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
- goto unlock_pmd;
-
- flush_cache_page(vma, address, pfn);
- pmd = pmdp_invalidate(vma, address, pmdp);
- pmd = pmd_wrprotect(pmd);
- pmd = pmd_mkclean(pmd);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-unlock_pmd:
-#endif
- spin_unlock(ptl);
- } else {
- if (pfn != pte_pfn(*ptep))
- goto unlock_pte;
- if (!pte_dirty(*ptep) && !pte_write(*ptep))
- goto unlock_pte;
-
- flush_cache_page(vma, address, pfn);
- pte = ptep_clear_flush(vma, address, ptep);
- pte = pte_wrprotect(pte);
- pte = pte_mkclean(pte);
- set_pte_at(vma->vm_mm, address, ptep, pte);
-unlock_pte:
- pte_unmap_unlock(ptep, ptl);
- }
-
- mmu_notifier_invalidate_range_end(&range);
- }
- i_mmap_unlock_read(mapping);
-}
-
static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
struct address_space *mapping, void *entry)
{
- unsigned long pfn, index, count;
+ unsigned long pfn, index, count, end;
long ret = 0;
+ struct vm_area_struct *vma;
/*
* A page got tagged dirty in DAX mapping? Something is seriously
@@ -935,8 +854,16 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
pfn = dax_to_pfn(entry);
count = 1UL << dax_entry_order(entry);
index = xas->xa_index & ~(count - 1);
+ end = index + count - 1;
+
+ /* Walk all mappings of a given index of a file and writeprotect them */
+ i_mmap_lock_read(mapping);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
+ pfn_mkclean_range(pfn, count, index, vma);
+ cond_resched();
+ }
+ i_mmap_unlock_read(mapping);
- dax_entry_mkclean(mapping, index, pfn);
dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
/*
* After we have flushed the cache, we can clear the dirty tag. There
@@ -1013,7 +940,7 @@ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
id = dax_read_lock();
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
- NULL, pfnp);
+ DAX_ACCESS, NULL, pfnp);
if (length < 0) {
rc = length;
goto out;
@@ -1122,7 +1049,7 @@ static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff,
void *kaddr;
long ret;
- ret = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
+ ret = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL);
if (ret > 0) {
memset(kaddr + offset, 0, size);
dax_flush(dax_dev, kaddr + offset, size);
@@ -1239,6 +1166,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
const size_t size = ALIGN(length + offset, PAGE_SIZE);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
ssize_t map_len;
+ bool recovery = false;
void *kaddr;
if (fatal_signal_pending(current)) {
@@ -1247,7 +1175,14 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
}
map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
- &kaddr, NULL);
+ DAX_ACCESS, &kaddr, NULL);
+ if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
+ map_len = dax_direct_access(dax_dev, pgoff,
+ PHYS_PFN(size), DAX_RECOVERY_WRITE,
+ &kaddr, NULL);
+ if (map_len > 0)
+ recovery = true;
+ }
if (map_len < 0) {
ret = map_len;
break;
@@ -1259,7 +1194,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
if (map_len > end - pos)
map_len = end - pos;
- if (iov_iter_rw(iter) == WRITE)
+ if (recovery)
+ xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
+ map_len, iter);
+ else if (iov_iter_rw(iter) == WRITE)
xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
else
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index a5cc4ed2cd0d..8e01d89c3319 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -17,6 +17,7 @@ static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space
rreq->start = start;
rreq->len = len;
rreq->mapping = mapping;
+ rreq->inode = mapping->host;
INIT_LIST_HEAD(&rreq->subrequests);
refcount_set(&rreq->ref, 1);
return rreq;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index bcc8335b46b3..95a403720e8c 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -288,7 +288,10 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
}
if (erofs_inode_is_data_compressed(vi->datalayout)) {
- err = z_erofs_fill_inode(inode);
+ if (!erofs_is_fscache_mode(inode->i_sb))
+ err = z_erofs_fill_inode(inode);
+ else
+ err = -EOPNOTSUPP;
goto out_unlock;
}
inode->i_mapping->a_ops = &erofs_raw_access_aops;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 95efc127b2ba..724bb57075f6 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -199,7 +199,6 @@ struct z_erofs_decompress_frontend {
struct z_erofs_pagevec_ctor vector;
struct z_erofs_pcluster *pcl, *tailpcl;
- struct z_erofs_collection *cl;
/* a pointer used to pick up inplace I/O pages */
struct page **icpage_ptr;
z_erofs_next_pcluster_t owned_head;
@@ -214,7 +213,7 @@ struct z_erofs_decompress_frontend {
#define DECOMPRESS_FRONTEND_INIT(__i) { \
.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
- .mode = COLLECT_PRIMARY_FOLLOWED }
+ .mode = COLLECT_PRIMARY_FOLLOWED, .backmost = true }
static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
static DEFINE_MUTEX(z_pagemap_global_lock);
@@ -357,7 +356,7 @@ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
return false;
}
-/* callers must be with collection lock held */
+/* callers must be with pcluster lock held */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
struct page *page, enum z_erofs_page_type type,
bool pvec_safereuse)
@@ -372,7 +371,7 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
ret = z_erofs_pagevec_enqueue(&fe->vector, page, type,
pvec_safereuse);
- fe->cl->vcnt += (unsigned int)ret;
+ fe->pcl->vcnt += (unsigned int)ret;
return ret ? 0 : -EAGAIN;
}
@@ -405,12 +404,11 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
f->mode = COLLECT_PRIMARY;
}
-static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe,
- struct inode *inode,
- struct erofs_map_blocks *map)
+static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe,
+ struct inode *inode,
+ struct erofs_map_blocks *map)
{
struct z_erofs_pcluster *pcl = fe->pcl;
- struct z_erofs_collection *cl;
unsigned int length;
/* to avoid unexpected loop formed by corrupted images */
@@ -419,8 +417,7 @@ static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe,
return -EFSCORRUPTED;
}
- cl = z_erofs_primarycollection(pcl);
- if (cl->pageofs != (map->m_la & ~PAGE_MASK)) {
+ if (pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -443,23 +440,21 @@ static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe,
length = READ_ONCE(pcl->length);
}
}
- mutex_lock(&cl->lock);
+ mutex_lock(&pcl->lock);
/* used to check tail merging loop due to corrupted images */
if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
fe->tailpcl = pcl;
z_erofs_try_to_claim_pcluster(fe);
- fe->cl = cl;
return 0;
}
-static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe,
- struct inode *inode,
- struct erofs_map_blocks *map)
+static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe,
+ struct inode *inode,
+ struct erofs_map_blocks *map)
{
bool ztailpacking = map->m_flags & EROFS_MAP_META;
struct z_erofs_pcluster *pcl;
- struct z_erofs_collection *cl;
struct erofs_workgroup *grp;
int err;
@@ -482,17 +477,15 @@ static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe,
/* new pclusters should be claimed as type 1, primary and followed */
pcl->next = fe->owned_head;
+ pcl->pageofs_out = map->m_la & ~PAGE_MASK;
fe->mode = COLLECT_PRIMARY_FOLLOWED;
- cl = z_erofs_primarycollection(pcl);
- cl->pageofs = map->m_la & ~PAGE_MASK;
-
/*
* lock all primary followed works before visible to others
* and mutex_trylock *never* fails for a new pcluster.
*/
- mutex_init(&cl->lock);
- DBG_BUGON(!mutex_trylock(&cl->lock));
+ mutex_init(&pcl->lock);
+ DBG_BUGON(!mutex_trylock(&pcl->lock));
if (ztailpacking) {
pcl->obj.index = 0; /* which indicates ztailpacking */
@@ -519,11 +512,10 @@ static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe,
fe->tailpcl = pcl;
fe->owned_head = &pcl->next;
fe->pcl = pcl;
- fe->cl = cl;
return 0;
err_out:
- mutex_unlock(&cl->lock);
+ mutex_unlock(&pcl->lock);
z_erofs_free_pcluster(pcl);
return err;
}
@@ -535,9 +527,9 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe,
struct erofs_workgroup *grp;
int ret;
- DBG_BUGON(fe->cl);
+ DBG_BUGON(fe->pcl);
- /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */
+ /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
@@ -554,14 +546,14 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe,
fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
} else {
tailpacking:
- ret = z_erofs_register_collection(fe, inode, map);
+ ret = z_erofs_register_pcluster(fe, inode, map);
if (!ret)
goto out;
if (ret != -EEXIST)
return ret;
}
- ret = z_erofs_lookup_collection(fe, inode, map);
+ ret = z_erofs_lookup_pcluster(fe, inode, map);
if (ret) {
erofs_workgroup_put(&fe->pcl->obj);
return ret;
@@ -569,7 +561,7 @@ tailpacking:
out:
z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS,
- fe->cl->pagevec, fe->cl->vcnt);
+ fe->pcl->pagevec, fe->pcl->vcnt);
/* since file-backed online pages are traversed in reverse order */
fe->icpage_ptr = fe->pcl->compressed_pages +
z_erofs_pclusterpages(fe->pcl);
@@ -582,48 +574,36 @@ out:
*/
static void z_erofs_rcu_callback(struct rcu_head *head)
{
- struct z_erofs_collection *const cl =
- container_of(head, struct z_erofs_collection, rcu);
-
- z_erofs_free_pcluster(container_of(cl, struct z_erofs_pcluster,
- primary_collection));
+ z_erofs_free_pcluster(container_of(head,
+ struct z_erofs_pcluster, rcu));
}
void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
{
struct z_erofs_pcluster *const pcl =
container_of(grp, struct z_erofs_pcluster, obj);
- struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl);
-
- call_rcu(&cl->rcu, z_erofs_rcu_callback);
-}
-static void z_erofs_collection_put(struct z_erofs_collection *cl)
-{
- struct z_erofs_pcluster *const pcl =
- container_of(cl, struct z_erofs_pcluster, primary_collection);
-
- erofs_workgroup_put(&pcl->obj);
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}
static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
{
- struct z_erofs_collection *cl = fe->cl;
+ struct z_erofs_pcluster *pcl = fe->pcl;
- if (!cl)
+ if (!pcl)
return false;
z_erofs_pagevec_ctor_exit(&fe->vector, false);
- mutex_unlock(&cl->lock);
+ mutex_unlock(&pcl->lock);
/*
* if all pending pages are added, don't hold its reference
* any longer if the pcluster isn't hosted by ourselves.
*/
if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
- z_erofs_collection_put(cl);
+ erofs_workgroup_put(&pcl->obj);
- fe->cl = NULL;
+ fe->pcl = NULL;
return true;
}
@@ -663,28 +643,23 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
repeat:
cur = end - 1;
- /* lucky, within the range of the current map_blocks */
- if (offset + cur >= map->m_la &&
- offset + cur < map->m_la + map->m_llen) {
- /* didn't get a valid collection previously (very rare) */
- if (!fe->cl)
- goto restart_now;
- goto hitted;
- }
-
- /* go ahead the next map_blocks */
- erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur);
-
- if (z_erofs_collector_end(fe))
- fe->backmost = false;
+ if (offset + cur < map->m_la ||
+ offset + cur >= map->m_la + map->m_llen) {
+ erofs_dbg("out-of-range map @ pos %llu", offset + cur);
- map->m_la = offset + cur;
- map->m_llen = 0;
- err = z_erofs_map_blocks_iter(inode, map, 0);
- if (err)
- goto err_out;
+ if (z_erofs_collector_end(fe))
+ fe->backmost = false;
+ map->m_la = offset + cur;
+ map->m_llen = 0;
+ err = z_erofs_map_blocks_iter(inode, map, 0);
+ if (err)
+ goto err_out;
+ } else {
+ if (fe->pcl)
+ goto hitted;
+ /* didn't get a valid pcluster previously (very rare) */
+ }
-restart_now:
if (!(map->m_flags & EROFS_MAP_MAPPED))
goto hitted;
@@ -766,7 +741,7 @@ retry:
/* bump up the number of spiltted parts of a page */
++spiltted;
/* also update nr_pages */
- fe->cl->nr_pages = max_t(pgoff_t, fe->cl->nr_pages, index + 1);
+ fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1);
next_part:
/* can be used for verification */
map->m_llen = offset + cur - map->m_la;
@@ -821,15 +796,13 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
enum z_erofs_page_type page_type;
bool overlapped, partial;
- struct z_erofs_collection *cl;
int err;
might_sleep();
- cl = z_erofs_primarycollection(pcl);
- DBG_BUGON(!READ_ONCE(cl->nr_pages));
+ DBG_BUGON(!READ_ONCE(pcl->nr_pages));
- mutex_lock(&cl->lock);
- nr_pages = cl->nr_pages;
+ mutex_lock(&pcl->lock);
+ nr_pages = pcl->nr_pages;
if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) {
pages = pages_onstack;
@@ -857,9 +830,9 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
err = 0;
z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS,
- cl->pagevec, 0);
+ pcl->pagevec, 0);
- for (i = 0; i < cl->vcnt; ++i) {
+ for (i = 0; i < pcl->vcnt; ++i) {
unsigned int pagenr;
page = z_erofs_pagevec_dequeue(&ctor, &page_type);
@@ -945,11 +918,11 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
goto out;
llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT;
- if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) {
+ if (nr_pages << PAGE_SHIFT >= pcl->pageofs_out + llen) {
outputsize = llen;
partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH);
} else {
- outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs;
+ outputsize = (nr_pages << PAGE_SHIFT) - pcl->pageofs_out;
partial = true;
}
@@ -963,7 +936,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
.in = compressed_pages,
.out = pages,
.pageofs_in = pcl->pageofs_in,
- .pageofs_out = cl->pageofs,
+ .pageofs_out = pcl->pageofs_out,
.inputsize = inputsize,
.outputsize = outputsize,
.alg = pcl->algorithmformat,
@@ -1012,16 +985,12 @@ out:
else if (pages != pages_onstack)
kvfree(pages);
- cl->nr_pages = 0;
- cl->vcnt = 0;
+ pcl->nr_pages = 0;
+ pcl->vcnt = 0;
- /* all cl locks MUST be taken before the following line */
+ /* pcluster lock MUST be taken before the following line */
WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
-
- /* all cl locks SHOULD be released right now */
- mutex_unlock(&cl->lock);
-
- z_erofs_collection_put(cl);
+ mutex_unlock(&pcl->lock);
return err;
}
@@ -1043,6 +1012,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
owned = READ_ONCE(pcl->next);
z_erofs_decompress_pcluster(io->sb, pcl, pagepool);
+ erofs_workgroup_put(&pcl->obj);
}
}
@@ -1466,22 +1436,19 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
struct page *page;
page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
- if (!page)
- goto skip;
-
- if (PageUptodate(page)) {
- unlock_page(page);
+ if (page) {
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ } else {
+ err = z_erofs_do_read_page(f, page, pagepool);
+ if (err)
+ erofs_err(inode->i_sb,
+ "readmore error at page %lu @ nid %llu",
+ index, EROFS_I(inode)->nid);
+ }
put_page(page);
- goto skip;
}
- err = z_erofs_do_read_page(f, page, pagepool);
- if (err)
- erofs_err(inode->i_sb,
- "readmore error at page %lu @ nid %llu",
- index, EROFS_I(inode)->nid);
- put_page(page);
-skip:
if (cur < PAGE_SIZE)
break;
cur = (index << PAGE_SHIFT) - 1;
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index 800b11c53f57..58053bb5066f 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -12,21 +12,40 @@
#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
#define Z_EROFS_NR_INLINE_PAGEVECS 3
+#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001
+#define Z_EROFS_PCLUSTER_LENGTH_BIT 1
+
+/*
+ * let's leave a type here in case of introducing
+ * another tagged pointer later.
+ */
+typedef void *z_erofs_next_pcluster_t;
+
/*
* Structure fields follow one of the following exclusion rules.
*
* I: Modifiable by initialization/destruction paths and read-only
* for everyone else;
*
- * L: Field should be protected by pageset lock;
+ * L: Field should be protected by the pcluster lock;
*
* A: Field should be accessed / updated in atomic for parallelized code.
*/
-struct z_erofs_collection {
+struct z_erofs_pcluster {
+ struct erofs_workgroup obj;
struct mutex lock;
+ /* A: point to next chained pcluster or TAILs */
+ z_erofs_next_pcluster_t next;
+
+ /* A: lower limit of decompressed length and if full length or not */
+ unsigned int length;
+
/* I: page offset of start position of decompression */
- unsigned short pageofs;
+ unsigned short pageofs_out;
+
+ /* I: page offset of inline compressed data */
+ unsigned short pageofs_in;
/* L: maximum relative page index in pagevec[] */
unsigned short nr_pages;
@@ -41,29 +60,6 @@ struct z_erofs_collection {
/* I: can be used to free the pcluster by RCU. */
struct rcu_head rcu;
};
-};
-
-#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001
-#define Z_EROFS_PCLUSTER_LENGTH_BIT 1
-
-/*
- * let's leave a type here in case of introducing
- * another tagged pointer later.
- */
-typedef void *z_erofs_next_pcluster_t;
-
-struct z_erofs_pcluster {
- struct erofs_workgroup obj;
- struct z_erofs_collection primary_collection;
-
- /* A: point to next chained pcluster or TAILs */
- z_erofs_next_pcluster_t next;
-
- /* A: lower limit of decompressed length and if full length or not */
- unsigned int length;
-
- /* I: page offset of inline compressed data */
- unsigned short pageofs_in;
union {
/* I: physical cluster size in pages */
@@ -80,8 +76,6 @@ struct z_erofs_pcluster {
struct page *compressed_pages[];
};
-#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection)
-
/* let's avoid the valid 32-bit kernel addresses */
/* the chained workgroup has't submitted io (still open) */
diff --git a/fs/exec.c b/fs/exec.c
index e3e55d5e0be1..0989fb8472a1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -758,6 +758,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
unsigned long stack_size;
unsigned long stack_expand;
unsigned long rlim_stack;
+ struct mmu_gather tlb;
#ifdef CONFIG_STACK_GROWSUP
/* Limit stack size */
@@ -812,8 +813,11 @@ int setup_arg_pages(struct linux_binprm *bprm,
vm_flags |= mm->def_flags;
vm_flags |= VM_STACK_INCOMPLETE_SETUP;
- ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
+ tlb_gather_mmu(&tlb, mm);
+ ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end,
vm_flags);
+ tlb_finish_mmu(&tlb);
+
if (ret)
goto out_unlock;
BUG_ON(prev != vma);
@@ -1308,9 +1312,7 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
goto out_unlock;
- if (me->flags & PF_KTHREAD)
- free_kthread_struct(me);
- me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
+ me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC |
PF_NOFREEZE | PF_NO_SETAFFINITY);
flush_thread();
me->personality &= ~bprm->per_clear;
@@ -1955,6 +1957,10 @@ int kernel_execve(const char *kernel_filename,
int fd = AT_FDCWD;
int retval;
+ /* It is non-sense for kernel threads to call execve */
+ if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
+ return -EINVAL;
+
filename = getname_kernel(kernel_filename);
if (IS_ERR(filename))
return PTR_ERR(filename);
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 76acc3721951..c6eaf7e9ea74 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -1198,7 +1198,9 @@ static int __exfat_rename(struct inode *old_parent_inode,
return -ENOENT;
}
- exfat_chain_dup(&olddir, &ei->dir);
+ exfat_chain_set(&olddir, EXFAT_I(old_parent_inode)->start_clu,
+ EXFAT_B_TO_CLU_ROUND_UP(i_size_read(old_parent_inode), sbi),
+ EXFAT_I(old_parent_inode)->flags);
dentry = ei->entry;
ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh);
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 0106eba46d5a..3ef80d000e13 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -145,7 +145,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
if (err)
goto out_err;
dprintk("%s: found name: %s\n", __func__, nbuf);
- tmp = lookup_one_len_unlocked(nbuf, parent, strlen(nbuf));
+ tmp = lookup_one_unlocked(mnt_user_ns(mnt), nbuf, parent, strlen(nbuf));
if (IS_ERR(tmp)) {
dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
err = PTR_ERR(tmp);
@@ -525,7 +525,8 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
}
inode_lock(target_dir->d_inode);
- nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf));
+ nresult = lookup_one(mnt_user_ns(mnt), nbuf,
+ target_dir, strlen(nbuf));
if (!IS_ERR(nresult)) {
if (unlikely(nresult->d_inode != result->d_inode)) {
dput(nresult);
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2c2f179b6977..43de293cef56 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -672,17 +672,14 @@ int ext2_empty_dir (struct inode * inode)
void *page_addr = NULL;
struct page *page = NULL;
unsigned long i, npages = dir_pages(inode);
- int dir_has_error = 0;
for (i = 0; i < npages; i++) {
char *kaddr;
ext2_dirent * de;
- page = ext2_get_page(inode, i, dir_has_error, &page_addr);
+ page = ext2_get_page(inode, i, 0, &page_addr);
- if (IS_ERR(page)) {
- dir_has_error = 1;
- continue;
- }
+ if (IS_ERR(page))
+ goto not_empty;
kaddr = page_addr;
de = (ext2_dirent *)kaddr;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 9e1ecd89f47f..e6b932219803 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -36,7 +36,6 @@
#include <linux/iomap.h>
#include <linux/namei.h>
#include <linux/uio.h>
-#include <linux/dax.h>
#include "ext2.h"
#include "acl.h"
#include "xattr.h"
@@ -1550,7 +1549,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
if (IS_ERR(raw_inode))
return -EIO;
- /* For fields not not tracking in the in-memory inode,
+ /* For fields not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
if (ei->i_state & EXT2_STATE_NEW)
memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3dce7d058985..84c0eb55071d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -829,7 +829,7 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
inode->i_ino, create);
return _ext4_get_block(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
}
/* Maximum number of blocks we map for direct IO at once. */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9f12f29bc346..9e06334771a3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4104,6 +4104,15 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
size = size >> bsbits;
start = start_off >> bsbits;
+ /*
+ * For tiny groups (smaller than 8MB) the chosen allocation
+ * alignment may be larger than group size. Make sure the
+ * alignment does not move allocation to a different group which
+ * makes mballoc fail assertions later.
+ */
+ start = max(start, rounddown(ac->ac_o_ex.fe_logical,
+ (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
+
/* don't cover already allocated blocks in selected range */
if (ar->pleft && start <= ar->lleft) {
size -= ar->lleft + 1 - start;
@@ -4176,7 +4185,22 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
}
rcu_read_unlock();
- if (start + size <= ac->ac_o_ex.fe_logical &&
+ /*
+ * In this function "start" and "size" are normalized for better
+ * alignment and length such that we could preallocate more blocks.
+ * This normalization is done such that original request of
+ * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and
+ * "size" boundaries.
+ * (Note fe_len can be relaxed since FS block allocation API does not
+ * provide gurantee on number of contiguous blocks allocation since that
+ * depends upon free space left, etc).
+ * In case of inode pa, later we use the allocated blocks
+ * [pa_start + fe_logical - pa_lstart, fe_len/size] from the preallocated
+ * range of goal/best blocks [start, size] to put it at the
+ * ac_o_ex.fe_logical extent of this inode.
+ * (See ext4_mb_use_inode_pa() for more details)
+ */
+ if (start + size <= ac->ac_o_ex.fe_logical ||
start > ac->ac_o_ex.fe_logical) {
ext4_msg(ac->ac_sb, KERN_ERR,
"start %lu, size %lu, fe_logical %lu",
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 7a5353a8cfd7..42f590518b4c 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -438,7 +438,7 @@ int ext4_ext_migrate(struct inode *inode)
/*
* Worst case we can touch the allocation bitmaps and a block
- * group descriptor block. We do need need to worry about
+ * group descriptor block. We do need to worry about
* credits for modifying the quota inode.
*/
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 47d0ca4c795b..db4ba99d1ceb 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1929,7 +1929,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
struct dx_hash_info *hinfo)
{
unsigned blocksize = dir->i_sb->s_blocksize;
- unsigned count, continued;
+ unsigned continued;
+ int count;
struct buffer_head *bh2;
ext4_lblk_t newblock;
u32 hash2;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 14695e2b5042..97fa7b4c645f 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -465,7 +465,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
/*
* In the first loop we prepare and mark buffers to submit. We have to
* mark all buffers in the page before submitting so that
- * end_page_writeback() cannot be called from ext4_bio_end_io() when IO
+ * end_page_writeback() cannot be called from ext4_end_bio() when IO
* on the first buffer finishes and we are still working on submitting
* the second buffer.
*/
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 90a941d20dff..8b70a4701293 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -54,6 +54,16 @@ int ext4_resize_begin(struct super_block *sb)
return -EPERM;
/*
+ * If the reserved GDT blocks is non-zero, the resize_inode feature
+ * should always be set.
+ */
+ if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks &&
+ !ext4_has_feature_resize_inode(sb)) {
+ ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero");
+ return -EFSCORRUPTED;
+ }
+
+ /*
* If we are not using the primary superblock/GDT copy don't resize,
* because the user tools have no way of handling this. Probably a
* bad time to do it anyways.
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 450c918d68fc..845f2f8aee5f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -87,7 +87,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
static int ext4_validate_options(struct fs_context *fc);
static int ext4_check_opt_consistency(struct fs_context *fc,
struct super_block *sb);
-static int ext4_apply_options(struct fs_context *fc, struct super_block *sb);
+static void ext4_apply_options(struct fs_context *fc, struct super_block *sb);
static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param);
static int ext4_get_tree(struct fs_context *fc);
static int ext4_reconfigure(struct fs_context *fc);
@@ -1870,31 +1870,12 @@ ext4_sb_read_encoding(const struct ext4_super_block *es)
}
#endif
-static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg)
-{
-#ifdef CONFIG_FS_ENCRYPTION
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int err;
-
- err = fscrypt_set_test_dummy_encryption(sb, arg,
- &sbi->s_dummy_enc_policy);
- if (err) {
- ext4_msg(sb, KERN_WARNING,
- "Error while setting test dummy encryption [%d]", err);
- return err;
- }
- ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
-#endif
- return 0;
-}
-
#define EXT4_SPEC_JQUOTA (1 << 0)
#define EXT4_SPEC_JQFMT (1 << 1)
#define EXT4_SPEC_DATAJ (1 << 2)
#define EXT4_SPEC_SB_BLOCK (1 << 3)
#define EXT4_SPEC_JOURNAL_DEV (1 << 4)
#define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5)
-#define EXT4_SPEC_DUMMY_ENCRYPTION (1 << 6)
#define EXT4_SPEC_s_want_extra_isize (1 << 7)
#define EXT4_SPEC_s_max_batch_time (1 << 8)
#define EXT4_SPEC_s_min_batch_time (1 << 9)
@@ -1911,7 +1892,7 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg)
struct ext4_fs_context {
char *s_qf_names[EXT4_MAXQUOTAS];
- char *test_dummy_enc_arg;
+ struct fscrypt_dummy_policy dummy_enc_policy;
int s_jquota_fmt; /* Format of quota to use */
#ifdef CONFIG_EXT4_DEBUG
int s_fc_debug_max_replay;
@@ -1953,7 +1934,7 @@ static void ext4_fc_free(struct fs_context *fc)
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(ctx->s_qf_names[i]);
- kfree(ctx->test_dummy_enc_arg);
+ fscrypt_free_dummy_policy(&ctx->dummy_enc_policy);
kfree(ctx);
}
@@ -2029,6 +2010,29 @@ static int unnote_qf_name(struct fs_context *fc, int qtype)
}
#endif
+static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param,
+ struct ext4_fs_context *ctx)
+{
+ int err;
+
+ if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
+ ext4_msg(NULL, KERN_WARNING,
+ "test_dummy_encryption option not supported");
+ return -EINVAL;
+ }
+ err = fscrypt_parse_test_dummy_encryption(param,
+ &ctx->dummy_enc_policy);
+ if (err == -EINVAL) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Value of option \"%s\" is unrecognized", param->key);
+ } else if (err == -EEXIST) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Conflicting test_dummy_encryption options");
+ return -EINVAL;
+ }
+ return err;
+}
+
#define EXT4_SET_CTX(name) \
static inline void ctx_set_##name(struct ext4_fs_context *ctx, \
unsigned long flag) \
@@ -2291,29 +2295,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO;
return 0;
case Opt_test_dummy_encryption:
-#ifdef CONFIG_FS_ENCRYPTION
- if (param->type == fs_value_is_flag) {
- ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION;
- ctx->test_dummy_enc_arg = NULL;
- return 0;
- }
- if (*param->string &&
- !(!strcmp(param->string, "v1") ||
- !strcmp(param->string, "v2"))) {
- ext4_msg(NULL, KERN_WARNING,
- "Value of option \"%s\" is unrecognized",
- param->key);
- return -EINVAL;
- }
- ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION;
- ctx->test_dummy_enc_arg = kmemdup_nul(param->string, param->size,
- GFP_KERNEL);
- return 0;
-#else
- ext4_msg(NULL, KERN_WARNING,
- "test_dummy_encryption option not supported");
- return -EINVAL;
-#endif
+ return ext4_parse_test_dummy_encryption(param, ctx);
case Opt_dax:
case Opt_dax_type:
#ifdef CONFIG_FS_DAX
@@ -2504,7 +2486,8 @@ parse_failed:
if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)
m_ctx->journal_ioprio = s_ctx->journal_ioprio;
- ret = ext4_apply_options(fc, sb);
+ ext4_apply_options(fc, sb);
+ ret = 0;
out_free:
if (fc) {
@@ -2673,11 +2656,11 @@ err_jquota_specified:
static int ext4_check_test_dummy_encryption(const struct fs_context *fc,
struct super_block *sb)
{
-#ifdef CONFIG_FS_ENCRYPTION
const struct ext4_fs_context *ctx = fc->fs_private;
const struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
- if (!(ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION))
+ if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy))
return 0;
if (!ext4_has_feature_encrypt(sb)) {
@@ -2691,14 +2674,46 @@ static int ext4_check_test_dummy_encryption(const struct fs_context *fc,
* needed to allow it to be set or changed during remount. We do allow
* it to be specified during remount, but only if there is no change.
*/
- if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE &&
- !sbi->s_dummy_enc_policy.policy) {
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
+ &ctx->dummy_enc_policy))
+ return 0;
ext4_msg(NULL, KERN_WARNING,
- "Can't set test_dummy_encryption on remount");
+ "Can't set or change test_dummy_encryption on remount");
return -EINVAL;
}
-#endif /* CONFIG_FS_ENCRYPTION */
- return 0;
+ /* Also make sure s_mount_opts didn't contain a conflicting value. */
+ if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) {
+ if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
+ &ctx->dummy_enc_policy))
+ return 0;
+ ext4_msg(NULL, KERN_WARNING,
+ "Conflicting test_dummy_encryption options");
+ return -EINVAL;
+ }
+ /*
+ * fscrypt_add_test_dummy_key() technically changes the super_block, so
+ * technically it should be delayed until ext4_apply_options() like the
+ * other changes. But since we never get here for remounts (see above),
+ * and this is the last chance to report errors, we do it here.
+ */
+ err = fscrypt_add_test_dummy_key(sb, &ctx->dummy_enc_policy);
+ if (err)
+ ext4_msg(NULL, KERN_WARNING,
+ "Error adding test dummy encryption key [%d]", err);
+ return err;
+}
+
+static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx,
+ struct super_block *sb)
+{
+ if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) ||
+ /* if already set, it was already verified to be the same */
+ fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy))
+ return;
+ EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy;
+ memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy));
+ ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
}
static int ext4_check_opt_consistency(struct fs_context *fc,
@@ -2785,11 +2800,10 @@ fail_dax_change_remount:
return ext4_check_quota_consistency(fc, sb);
}
-static int ext4_apply_options(struct fs_context *fc, struct super_block *sb)
+static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)
{
struct ext4_fs_context *ctx = fc->fs_private;
struct ext4_sb_info *sbi = fc->s_fs_info;
- int ret = 0;
sbi->s_mount_opt &= ~ctx->mask_s_mount_opt;
sbi->s_mount_opt |= ctx->vals_s_mount_opt;
@@ -2825,11 +2839,7 @@ static int ext4_apply_options(struct fs_context *fc, struct super_block *sb)
#endif
ext4_apply_quota_options(fc, sb);
-
- if (ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION)
- ret = ext4_set_test_dummy_encryption(sb, ctx->test_dummy_enc_arg);
-
- return ret;
+ ext4_apply_test_dummy_encryption(ctx, sb);
}
@@ -4552,9 +4562,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (err < 0)
goto failed_mount;
- err = ext4_apply_options(fc, sb);
- if (err < 0)
- goto failed_mount;
+ ext4_apply_options(fc, sb);
#if IS_ENABLED(CONFIG_UNICODE)
if (ext4_has_feature_casefold(sb) && !sb->s_encoding) {
@@ -5302,14 +5310,6 @@ no_journal:
err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
GFP_KERNEL);
}
- /*
- * Update the checksum after updating free space/inode
- * counters. Otherwise the superblock can have an incorrect
- * checksum in the buffer cache until it is written out and
- * e2fsprogs programs trying to open a file system immediately
- * after it is mounted can fail.
- */
- ext4_superblock_csum_set(sb);
if (!err)
err = percpu_counter_init(&sbi->s_dirs_counter,
ext4_count_dirs(sb), GFP_KERNEL);
@@ -5367,6 +5367,14 @@ no_journal:
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
+ /*
+ * Update the checksum after updating free space/inode counters and
+ * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect
+ * checksum in the buffer cache until it is written out and
+ * e2fsprogs programs trying to open a file system immediately
+ * after it is mounted can fail.
+ */
+ ext4_superblock_csum_set(sb);
if (needs_recovery) {
ext4_msg(sb, KERN_INFO, "recovery complete");
err = ext4_mark_recovery_complete(sb, es);
@@ -5898,7 +5906,6 @@ static void ext4_update_super(struct super_block *sb)
static int ext4_commit_super(struct super_block *sb)
{
struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
- int error = 0;
if (!sbh)
return -EINVAL;
@@ -5907,6 +5914,13 @@ static int ext4_commit_super(struct super_block *sb)
ext4_update_super(sb);
+ lock_buffer(sbh);
+ /* Buffer got discarded which means block device got invalidated */
+ if (!buffer_mapped(sbh)) {
+ unlock_buffer(sbh);
+ return -EIO;
+ }
+
if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
/*
* Oh, dear. A previous attempt to write the
@@ -5921,17 +5935,21 @@ static int ext4_commit_super(struct super_block *sb)
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
}
- BUFFER_TRACE(sbh, "marking dirty");
- mark_buffer_dirty(sbh);
- error = __sync_dirty_buffer(sbh,
- REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
+ get_bh(sbh);
+ /* Clear potential dirty bit if it was journalled update */
+ clear_buffer_dirty(sbh);
+ sbh->b_end_io = end_buffer_write_sync;
+ submit_bh(REQ_OP_WRITE,
+ REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh);
+ wait_on_buffer(sbh);
if (buffer_write_io_error(sbh)) {
ext4_msg(sb, KERN_ERR, "I/O error while writing "
"superblock");
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
+ return -EIO;
}
- return error;
+ return 0;
}
/*
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 042325349098..564e28a1aa94 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1895,11 +1895,10 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
unlock_buffer(bs->bh);
ea_bdebug(bs->bh, "cloning");
- s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
+ s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
error = -ENOMEM;
if (s->base == NULL)
goto cleanup;
- memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
s->first = ENTRY(header(s->base)+1);
header(s->base)->h_refcount = cpu_to_le32(1);
s->here = ENTRY(s->base + offset);
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 456c1e89386a..6d8b2bf14de0 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -98,13 +98,7 @@ repeat:
}
if (unlikely(!PageUptodate(page))) {
- if (page->index == sbi->metapage_eio_ofs) {
- if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO)
- set_ckpt_flags(sbi, CP_ERROR_FLAG);
- } else {
- sbi->metapage_eio_ofs = page->index;
- sbi->metapage_eio_cnt = 0;
- }
+ f2fs_handle_page_eio(sbi, page->index, META);
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
@@ -158,7 +152,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
blkaddr, exist);
set_sbi_flag(sbi, SBI_NEED_FSCK);
- WARN_ON(1);
+ dump_stack();
}
return exist;
}
@@ -196,7 +190,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
f2fs_warn(sbi, "access invalid blkaddr:%u",
blkaddr);
set_sbi_flag(sbi, SBI_NEED_FSCK);
- WARN_ON(1);
+ dump_stack();
return false;
} else {
return __is_bitmap_valid(sbi, blkaddr, type);
@@ -1010,9 +1004,7 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type)
return;
set_inode_flag(inode, flag);
- if (!f2fs_is_volatile_file(inode))
- list_add_tail(&F2FS_I(inode)->dirty_list,
- &sbi->inode_list[type]);
+ list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]);
stat_inc_dirty_inode(sbi, type);
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 8f38c26bb16c..7fcbcf979737 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -69,8 +69,7 @@ static bool __is_cp_guaranteed(struct page *page)
if (f2fs_is_compressed_page(page))
return false;
- if ((S_ISREG(inode->i_mode) &&
- (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) ||
+ if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) ||
page_private_gcing(page))
return true;
return false;
@@ -585,6 +584,34 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode,
return false;
}
+int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi)
+{
+ int i;
+
+ for (i = 0; i < NR_PAGE_TYPE; i++) {
+ int n = (i == META) ? 1 : NR_TEMP_TYPE;
+ int j;
+
+ sbi->write_io[i] = f2fs_kmalloc(sbi,
+ array_size(n, sizeof(struct f2fs_bio_info)),
+ GFP_KERNEL);
+ if (!sbi->write_io[i])
+ return -ENOMEM;
+
+ for (j = HOT; j < n; j++) {
+ init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem);
+ sbi->write_io[i][j].sbi = sbi;
+ sbi->write_io[i][j].bio = NULL;
+ spin_lock_init(&sbi->write_io[i][j].io_lock);
+ INIT_LIST_HEAD(&sbi->write_io[i][j].io_list);
+ INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list);
+ init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock);
+ }
+ }
+
+ return 0;
+}
+
static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
enum page_type type, enum temp_type temp)
{
@@ -2564,7 +2591,12 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
bool ipu_force = false;
int err = 0;
- set_new_dnode(&dn, inode, NULL, NULL, 0);
+ /* Use COW inode to make dnode_of_data for atomic write */
+ if (f2fs_is_atomic_file(inode))
+ set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0);
+ else
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+
if (need_inplace_update(fio) &&
f2fs_lookup_extent_cache(inode, page->index, &ei)) {
fio->old_blkaddr = ei.blk + page->index - ei.fofs;
@@ -2601,6 +2633,7 @@ got_it:
err = -EFSCORRUPTED;
goto out_writepage;
}
+
/*
* If current allocation needs SSR,
* it had better in-place writes for updated data.
@@ -2737,11 +2770,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
write:
if (f2fs_is_drop_cache(inode))
goto out;
- /* we should not write 0'th page having journal header */
- if (f2fs_is_volatile_file(inode) && (!page->index ||
- (!wbc->for_reclaim &&
- f2fs_available_free_memory(sbi, BASE_CHECK))))
- goto redirty_out;
/* Dentry/quota blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) {
@@ -3314,6 +3342,100 @@ unlock_out:
return err;
}
+static int __find_data_block(struct inode *inode, pgoff_t index,
+ block_t *blk_addr)
+{
+ struct dnode_of_data dn;
+ struct page *ipage;
+ struct extent_info ei = {0, };
+ int err = 0;
+
+ ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
+
+ set_new_dnode(&dn, inode, ipage, ipage, 0);
+
+ if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ dn.data_blkaddr = ei.blk + index - ei.fofs;
+ } else {
+ /* hole case */
+ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ if (err) {
+ dn.data_blkaddr = NULL_ADDR;
+ err = 0;
+ }
+ }
+ *blk_addr = dn.data_blkaddr;
+ f2fs_put_dnode(&dn);
+ return err;
+}
+
+static int __reserve_data_block(struct inode *inode, pgoff_t index,
+ block_t *blk_addr, bool *node_changed)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct dnode_of_data dn;
+ struct page *ipage;
+ int err = 0;
+
+ f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
+
+ ipage = f2fs_get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto unlock_out;
+ }
+ set_new_dnode(&dn, inode, ipage, ipage, 0);
+
+ err = f2fs_get_block(&dn, index);
+
+ *blk_addr = dn.data_blkaddr;
+ *node_changed = dn.node_changed;
+ f2fs_put_dnode(&dn);
+
+unlock_out:
+ f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+ return err;
+}
+
+static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi,
+ struct page *page, loff_t pos, unsigned int len,
+ block_t *blk_addr, bool *node_changed)
+{
+ struct inode *inode = page->mapping->host;
+ struct inode *cow_inode = F2FS_I(inode)->cow_inode;
+ pgoff_t index = page->index;
+ int err = 0;
+ block_t ori_blk_addr;
+
+ /* If pos is beyond the end of file, reserve a new block in COW inode */
+ if ((pos & PAGE_MASK) >= i_size_read(inode))
+ return __reserve_data_block(cow_inode, index, blk_addr,
+ node_changed);
+
+ /* Look for the block in COW inode first */
+ err = __find_data_block(cow_inode, index, blk_addr);
+ if (err)
+ return err;
+ else if (*blk_addr != NULL_ADDR)
+ return 0;
+
+ /* Look for the block in the original inode */
+ err = __find_data_block(inode, index, &ori_blk_addr);
+ if (err)
+ return err;
+
+ /* Finally, we should reserve a new block in COW inode for the update */
+ err = __reserve_data_block(cow_inode, index, blk_addr, node_changed);
+ if (err)
+ return err;
+
+ if (ori_blk_addr != NULL_ADDR)
+ *blk_addr = ori_blk_addr;
+ return 0;
+}
+
static int f2fs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
@@ -3321,7 +3443,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page = NULL;
pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT;
- bool need_balance = false, drop_atomic = false;
+ bool need_balance = false;
block_t blkaddr = NULL_ADDR;
int err = 0;
@@ -3332,14 +3454,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
goto fail;
}
- if ((f2fs_is_atomic_file(inode) &&
- !f2fs_available_free_memory(sbi, INMEM_PAGES)) ||
- is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) {
- err = -ENOMEM;
- drop_atomic = true;
- goto fail;
- }
-
/*
* We should check this at this moment to avoid deadlock on inode page
* and #0 page. The locking rule for inline_data conversion should be:
@@ -3387,7 +3501,11 @@ repeat:
*pagep = page;
- err = prepare_write_begin(sbi, page, pos, len,
+ if (f2fs_is_atomic_file(inode))
+ err = prepare_atomic_write_begin(sbi, page, pos, len,
+ &blkaddr, &need_balance);
+ else
+ err = prepare_write_begin(sbi, page, pos, len,
&blkaddr, &need_balance);
if (err)
goto fail;
@@ -3443,8 +3561,6 @@ repeat:
fail:
f2fs_put_page(page, 1);
f2fs_write_failed(inode, pos + len);
- if (drop_atomic)
- f2fs_drop_inmem_pages_all(sbi, false);
return err;
}
@@ -3488,8 +3604,12 @@ static int f2fs_write_end(struct file *file,
set_page_dirty(page);
if (pos + copied > i_size_read(inode) &&
- !f2fs_verity_in_progress(inode))
+ !f2fs_verity_in_progress(inode)) {
f2fs_i_size_write(inode, pos + copied);
+ if (f2fs_is_atomic_file(inode))
+ f2fs_i_size_write(F2FS_I(inode)->cow_inode,
+ pos + copied);
+ }
unlock_out:
f2fs_put_page(page, 1);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
@@ -3522,9 +3642,6 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
inode->i_ino == F2FS_COMPRESS_INO(sbi))
clear_page_private_data(&folio->page);
- if (page_private_atomic(&folio->page))
- return f2fs_drop_inmem_page(inode, &folio->page);
-
folio_detach_private(folio);
}
@@ -3536,10 +3653,6 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
if (folio_test_dirty(folio))
return false;
- /* This is atomic written page, keep Private */
- if (page_private_atomic(&folio->page))
- return false;
-
sbi = F2FS_M_SB(folio->mapping);
if (test_opt(sbi, COMPRESS_CACHE)) {
struct inode *inode = folio->mapping->host;
@@ -3565,18 +3678,6 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping,
folio_mark_uptodate(folio);
BUG_ON(folio_test_swapcache(folio));
- if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) {
- if (!page_private_atomic(&folio->page)) {
- f2fs_register_inmem_page(inode, &folio->page);
- return true;
- }
- /*
- * Previously, this page has been registered, we just
- * return here.
- */
- return false;
- }
-
if (!folio_test_dirty(folio)) {
filemap_dirty_folio(mapping, folio);
f2fs_update_dirty_folio(inode, folio);
@@ -3656,42 +3757,14 @@ out:
int f2fs_migrate_page(struct address_space *mapping,
struct page *newpage, struct page *page, enum migrate_mode mode)
{
- int rc, extra_count;
- struct f2fs_inode_info *fi = F2FS_I(mapping->host);
- bool atomic_written = page_private_atomic(page);
+ int rc, extra_count = 0;
BUG_ON(PageWriteback(page));
- /* migrating an atomic written page is safe with the inmem_lock hold */
- if (atomic_written) {
- if (mode != MIGRATE_SYNC)
- return -EBUSY;
- if (!mutex_trylock(&fi->inmem_lock))
- return -EAGAIN;
- }
-
- /* one extra reference was held for atomic_write page */
- extra_count = atomic_written ? 1 : 0;
rc = migrate_page_move_mapping(mapping, newpage,
page, extra_count);
- if (rc != MIGRATEPAGE_SUCCESS) {
- if (atomic_written)
- mutex_unlock(&fi->inmem_lock);
+ if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- }
-
- if (atomic_written) {
- struct inmem_pages *cur;
-
- list_for_each_entry(cur, &fi->inmem_pages, list)
- if (cur->page == page) {
- cur->page = newpage;
- break;
- }
- mutex_unlock(&fi->inmem_lock);
- put_page(page);
- get_page(newpage);
- }
/* guarantee to start from no stale private field */
set_page_private(newpage, 0);
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index fcdf253cd211..c92625ef16d0 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -91,11 +91,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
si->nquota_files = sbi->nquota_files;
si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
- si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
si->aw_cnt = sbi->atomic_files;
- si->vw_cnt = atomic_read(&sbi->vw_cnt);
si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt);
- si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt);
si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ);
si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE);
si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA);
@@ -167,8 +164,6 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID];
si->io_skip_bggc = sbi->io_skip_bggc;
si->other_skip_bggc = sbi->other_skip_bggc;
- si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC];
- si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC];
si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
/ 2;
@@ -296,7 +291,6 @@ get_cache:
sizeof(struct nat_entry);
si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] *
sizeof(struct nat_entry_set);
- si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
for (i = 0; i < MAX_INO_ENTRY; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
si->cache_mem += atomic_read(&sbi->total_ext_tree) *
@@ -491,10 +485,6 @@ static int stat_show(struct seq_file *s, void *v)
si->bg_data_blks);
seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks,
si->bg_node_blks);
- seq_printf(s, "Skipped : atomic write %llu (%llu)\n",
- si->skipped_atomic_files[BG_GC] +
- si->skipped_atomic_files[FG_GC],
- si->skipped_atomic_files[BG_GC]);
seq_printf(s, "BG skip : IO: %u, Other: %u\n",
si->io_skip_bggc, si->other_skip_bggc);
seq_puts(s, "\nExtent Cache:\n");
@@ -519,10 +509,8 @@ static int stat_show(struct seq_file *s, void *v)
si->flush_list_empty,
si->nr_discarding, si->nr_discarded,
si->nr_discard_cmd, si->undiscard_blks);
- seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), "
- "volatile IO: %4d (Max. %4d)\n",
- si->inmem_pages, si->aw_cnt, si->max_aw_cnt,
- si->vw_cnt, si->max_vw_cnt);
+ seq_printf(s, " - atomic IO: %4d (Max. %4d)\n",
+ si->aw_cnt, si->max_aw_cnt);
seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit);
seq_printf(s, " - nodes: %4d in %4d\n",
si->ndirty_node, si->node_pages);
@@ -623,9 +611,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
for (i = META_CP; i < META_MAX; i++)
atomic_set(&sbi->meta_count[i], 0);
- atomic_set(&sbi->vw_cnt, 0);
atomic_set(&sbi->max_aw_cnt, 0);
- atomic_set(&sbi->max_vw_cnt, 0);
raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_add_tail(&si->stat_list, &f2fs_stat_list);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index a0e51937d92e..d5bd7932fb64 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -82,7 +82,8 @@ int f2fs_init_casefolded_name(const struct inode *dir,
#if IS_ENABLED(CONFIG_UNICODE)
struct super_block *sb = dir->i_sb;
- if (IS_CASEFOLDED(dir)) {
+ if (IS_CASEFOLDED(dir) &&
+ !is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) {
fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab,
GFP_NOFS, false, F2FS_SB(sb));
if (!fname->cf_name.name)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 10d1f138d14f..d9bbecd008d2 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -509,11 +509,11 @@ struct f2fs_filename {
#if IS_ENABLED(CONFIG_UNICODE)
/*
* For casefolded directories: the casefolded name, but it's left NULL
- * if the original name is not valid Unicode, if the directory is both
- * casefolded and encrypted and its encryption key is unavailable, or if
- * the filesystem is doing an internal operation where usr_fname is also
- * NULL. In all these cases we fall back to treating the name as an
- * opaque byte sequence.
+ * if the original name is not valid Unicode, if the original name is
+ * "." or "..", if the directory is both casefolded and encrypted and
+ * its encryption key is unavailable, or if the filesystem is doing an
+ * internal operation where usr_fname is also NULL. In all these cases
+ * we fall back to treating the name as an opaque byte sequence.
*/
struct fscrypt_str cf_name;
#endif
@@ -579,8 +579,8 @@ enum {
/* maximum retry quota flush count */
#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8
-/* maximum retry of EIO'ed meta page */
-#define MAX_RETRY_META_PAGE_EIO 100
+/* maximum retry of EIO'ed page */
+#define MAX_RETRY_PAGE_EIO 100
#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */
@@ -717,7 +717,6 @@ enum {
enum {
GC_FAILURE_PIN,
- GC_FAILURE_ATOMIC,
MAX_GC_FAILURE
};
@@ -739,8 +738,6 @@ enum {
FI_UPDATE_WRITE, /* inode has in-place-update data */
FI_NEED_IPU, /* used for ipu per file */
FI_ATOMIC_FILE, /* indicate atomic file */
- FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */
- FI_VOLATILE_FILE, /* indicate volatile file */
FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */
FI_DROP_CACHE, /* drop dirty page cache */
FI_DATA_EXIST, /* indicate data exists */
@@ -753,7 +750,6 @@ enum {
FI_EXTRA_ATTR, /* indicate file has extra attribute */
FI_PROJ_INHERIT, /* indicate file inherits projectid */
FI_PIN_FILE, /* indicate file should not be gced */
- FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
FI_COMPRESSED_FILE, /* indicate file's data can be compressed */
FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */
@@ -795,11 +791,9 @@ struct f2fs_inode_info {
#endif
struct list_head dirty_list; /* dirty list for dirs and files */
struct list_head gdirty_list; /* linked in global dirty list */
- struct list_head inmem_ilist; /* list for inmem inodes */
- struct list_head inmem_pages; /* inmemory pages managed by f2fs */
- struct task_struct *inmem_task; /* store inmemory task */
- struct mutex inmem_lock; /* lock for inmemory pages */
+ struct task_struct *atomic_write_task; /* store atomic write task */
struct extent_tree *extent_tree; /* cached extent_tree entry */
+ struct inode *cow_inode; /* copy-on-write inode for atomic write */
/* avoid racing between foreground op and gc */
struct f2fs_rwsem i_gc_rwsem[2];
@@ -1093,7 +1087,6 @@ enum count_type {
F2FS_DIRTY_QDATA,
F2FS_DIRTY_NODES,
F2FS_DIRTY_META,
- F2FS_INMEM_PAGES,
F2FS_DIRTY_IMETA,
F2FS_WB_CP_DATA,
F2FS_WB_DATA,
@@ -1118,16 +1111,12 @@ enum count_type {
*/
#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type))
enum page_type {
- DATA,
- NODE,
+ DATA = 0,
+ NODE = 1, /* should not change this */
META,
NR_PAGE_TYPE,
META_FLUSH,
- INMEM, /* the below types are used by tracepoints only. */
- INMEM_DROP,
- INMEM_INVALIDATE,
- INMEM_REVOKE,
- IPU,
+ IPU, /* the below types are used by tracepoints only. */
OPU,
};
@@ -1277,6 +1266,15 @@ struct atgc_management {
unsigned long long age_threshold; /* age threshold */
};
+struct f2fs_gc_control {
+ unsigned int victim_segno; /* target victim segment number */
+ int init_gc_type; /* FG_GC or BG_GC */
+ bool no_bg_gc; /* check the space and stop bg_gc */
+ bool should_migrate_blocks; /* should migrate blocks */
+ bool err_gc_skipped; /* return EAGAIN if GC skipped */
+ unsigned int nr_free_secs; /* # of free sections to do GC */
+};
+
/* For s_flag in struct f2fs_sb_info */
enum {
SBI_IS_DIRTY, /* dirty flag for checkpoint */
@@ -1615,8 +1613,8 @@ struct f2fs_sb_info {
/* keep migration IO order for LFS mode */
struct f2fs_rwsem io_order_lock;
mempool_t *write_io_dummy; /* Dummy pages */
- pgoff_t metapage_eio_ofs; /* EIO page offset */
- int metapage_eio_cnt; /* EIO count */
+ pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */
+ int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
@@ -1719,7 +1717,6 @@ struct f2fs_sb_info {
/* for skip statistic */
unsigned int atomic_files; /* # of opened atomic file */
- unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */
unsigned long long skipped_gc_rwsem; /* FG_GC only */
/* threshold for gc trials on pinned files */
@@ -1750,9 +1747,7 @@ struct f2fs_sb_info {
atomic_t inline_dir; /* # of inline_dentry inodes */
atomic_t compr_inode; /* # of compressed inodes */
atomic64_t compr_blocks; /* # of compressed blocks */
- atomic_t vw_cnt; /* # of volatile writes */
atomic_t max_aw_cnt; /* max # of atomic writes */
- atomic_t max_vw_cnt; /* max # of volatile writes */
unsigned int io_skip_bggc; /* skip background gc for in-flight IO */
unsigned int other_skip_bggc; /* skip background gc for other reasons */
unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */
@@ -1763,7 +1758,7 @@ struct f2fs_sb_info {
unsigned int data_io_flag;
unsigned int node_io_flag;
- /* For sysfs suppport */
+ /* For sysfs support */
struct kobject s_kobj; /* /sys/fs/f2fs/<devname> */
struct completion s_kobj_unregister;
@@ -2606,11 +2601,17 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
{
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi, !sbi->total_valid_block_count);
- f2fs_bug_on(sbi, !sbi->total_valid_node_count);
+ if (unlikely(!sbi->total_valid_block_count ||
+ !sbi->total_valid_node_count)) {
+ f2fs_warn(sbi, "dec_valid_node_count: inconsistent block counts, total_valid_block:%u, total_valid_node:%u",
+ sbi->total_valid_block_count,
+ sbi->total_valid_node_count);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ } else {
+ sbi->total_valid_block_count--;
+ sbi->total_valid_node_count--;
+ }
- sbi->total_valid_node_count--;
- sbi->total_valid_block_count--;
if (sbi->reserved_blocks &&
sbi->current_reserved_blocks < sbi->reserved_blocks)
sbi->current_reserved_blocks++;
@@ -3173,6 +3174,10 @@ static inline int inline_xattr_size(struct inode *inode)
return 0;
}
+/*
+ * Notice: check inline_data flag without inode page lock is unsafe.
+ * It could change at any time by f2fs_convert_inline_page().
+ */
static inline int f2fs_has_inline_data(struct inode *inode)
{
return is_inode_flag_set(inode, FI_INLINE_DATA);
@@ -3203,16 +3208,6 @@ static inline bool f2fs_is_atomic_file(struct inode *inode)
return is_inode_flag_set(inode, FI_ATOMIC_FILE);
}
-static inline bool f2fs_is_commit_atomic_write(struct inode *inode)
-{
- return is_inode_flag_set(inode, FI_ATOMIC_COMMIT);
-}
-
-static inline bool f2fs_is_volatile_file(struct inode *inode)
-{
- return is_inode_flag_set(inode, FI_VOLATILE_FILE);
-}
-
static inline bool f2fs_is_first_block_written(struct inode *inode)
{
return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN);
@@ -3445,6 +3440,8 @@ void f2fs_handle_failed_inode(struct inode *inode);
int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
bool hot, bool set);
struct dentry *f2fs_get_parent(struct dentry *child);
+int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct inode **new_inode);
/*
* dir.c
@@ -3580,11 +3577,8 @@ void f2fs_destroy_node_manager_caches(void);
* segment.c
*/
bool f2fs_need_SSR(struct f2fs_sb_info *sbi);
-void f2fs_register_inmem_page(struct inode *inode, struct page *page);
-void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure);
-void f2fs_drop_inmem_pages(struct inode *inode);
-void f2fs_drop_inmem_page(struct inode *inode, struct page *page);
-int f2fs_commit_inmem_pages(struct inode *inode);
+int f2fs_commit_atomic_write(struct inode *inode);
+void f2fs_abort_atomic_write(struct inode *inode, bool clean);
void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need);
void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg);
int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino);
@@ -3726,6 +3720,7 @@ int f2fs_init_bio_entry_cache(void);
void f2fs_destroy_bio_entry_cache(void);
void f2fs_submit_bio(struct f2fs_sb_info *sbi,
struct bio *bio, enum page_type type);
+int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi);
void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type);
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
struct inode *inode, struct page *page,
@@ -3787,8 +3782,7 @@ extern const struct iomap_ops f2fs_iomap_ops;
int f2fs_start_gc_thread(struct f2fs_sb_info *sbi);
void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi);
block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode);
-int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, bool force,
- unsigned int segno);
+int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control);
void f2fs_build_gc_manager(struct f2fs_sb_info *sbi);
int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count);
int __init f2fs_create_garbage_collection_cache(void);
@@ -3816,7 +3810,6 @@ struct f2fs_stat_info {
int ext_tree, zombie_tree, ext_node;
int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta;
int ndirty_data, ndirty_qdata;
- int inmem_pages;
unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all;
int nats, dirty_nats, sits, dirty_sits;
int free_nids, avail_nids, alloc_nids;
@@ -3834,7 +3827,7 @@ struct f2fs_stat_info {
int inline_xattr, inline_inode, inline_dir, append, update, orphans;
int compr_inode;
unsigned long long compr_blocks;
- int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt;
+ int aw_cnt, max_aw_cnt;
unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
unsigned int bimodal, avg_vblocks;
int util_free, util_valid, util_invalid;
@@ -3846,7 +3839,6 @@ struct f2fs_stat_info {
int bg_node_segs, bg_data_segs;
int tot_blks, data_blks, node_blks;
int bg_data_blks, bg_node_blks;
- unsigned long long skipped_atomic_files[2];
int curseg[NR_CURSEG_TYPE];
int cursec[NR_CURSEG_TYPE];
int curzone[NR_CURSEG_TYPE];
@@ -3946,17 +3938,6 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
if (cur > max) \
atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \
} while (0)
-#define stat_inc_volatile_write(inode) \
- (atomic_inc(&F2FS_I_SB(inode)->vw_cnt))
-#define stat_dec_volatile_write(inode) \
- (atomic_dec(&F2FS_I_SB(inode)->vw_cnt))
-#define stat_update_max_volatile_write(inode) \
- do { \
- int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \
- int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \
- if (cur > max) \
- atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \
- } while (0)
#define stat_inc_seg_count(sbi, type, gc_type) \
do { \
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
@@ -4018,9 +3999,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi);
#define stat_add_compr_blocks(inode, blocks) do { } while (0)
#define stat_sub_compr_blocks(inode, blocks) do { } while (0)
#define stat_update_max_atomic_write(inode) do { } while (0)
-#define stat_inc_volatile_write(inode) do { } while (0)
-#define stat_dec_volatile_write(inode) do { } while (0)
-#define stat_update_max_volatile_write(inode) do { } while (0)
#define stat_inc_meta_count(sbi, blkaddr) do { } while (0)
#define stat_inc_seg_type(sbi, curseg) do { } while (0)
#define stat_inc_block_count(sbi, curseg) do { } while (0)
@@ -4053,6 +4031,7 @@ extern struct kmem_cache *f2fs_inode_entry_slab;
* inline.c
*/
bool f2fs_may_inline_data(struct inode *inode);
+bool f2fs_sanity_check_inline_data(struct inode *inode);
bool f2fs_may_inline_dentry(struct inode *inode);
void f2fs_do_read_inline_data(struct page *page, struct page *ipage);
void f2fs_truncate_inline_inode(struct inode *inode,
@@ -4422,8 +4401,7 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
static inline bool f2fs_may_compress(struct inode *inode)
{
if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) ||
- f2fs_is_atomic_file(inode) ||
- f2fs_is_volatile_file(inode))
+ f2fs_is_atomic_file(inode))
return false;
return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode);
}
@@ -4431,8 +4409,8 @@ static inline bool f2fs_may_compress(struct inode *inode)
static inline void f2fs_i_compr_blocks_update(struct inode *inode,
u64 blocks, bool add)
{
- int diff = F2FS_I(inode)->i_cluster_size - blocks;
struct f2fs_inode_info *fi = F2FS_I(inode);
+ int diff = fi->i_cluster_size - blocks;
/* don't update i_compr_blocks if saved blocks were released */
if (!add && !atomic_read(&fi->i_compr_blocks))
@@ -4540,6 +4518,21 @@ static inline void f2fs_io_schedule_timeout(long timeout)
io_schedule_timeout(timeout);
}
+static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs,
+ enum page_type type)
+{
+ if (unlikely(f2fs_cp_error(sbi)))
+ return;
+
+ if (ofs == sbi->page_eio_ofs[type]) {
+ if (sbi->page_eio_cnt[type]++ == MAX_RETRY_PAGE_EIO)
+ set_ckpt_flags(sbi, CP_ERROR_FLAG);
+ } else {
+ sbi->page_eio_ofs[type] = ofs;
+ sbi->page_eio_cnt[type] = 0;
+ }
+}
+
#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 100637b1adb3..bd14cef1b08f 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -372,7 +372,8 @@ sync_nodes:
f2fs_remove_ino_entry(sbi, ino, APPEND_INO);
clear_inode_flag(inode, FI_APPEND_WRITE);
flush_out:
- if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER)
+ if ((!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) ||
+ (atomic && !test_opt(sbi, NOBARRIER) && f2fs_sb_has_blkzoned(sbi)))
ret = f2fs_issue_flush(sbi, inode->i_ino);
if (!ret) {
f2fs_remove_ino_entry(sbi, ino, UPDATE_INO);
@@ -1437,11 +1438,19 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
ret = -ENOSPC;
break;
}
- if (dn->data_blkaddr != NEW_ADDR) {
- f2fs_invalidate_blocks(sbi, dn->data_blkaddr);
- dn->data_blkaddr = NEW_ADDR;
- f2fs_set_data_blkaddr(dn);
+
+ if (dn->data_blkaddr == NEW_ADDR)
+ continue;
+
+ if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr,
+ DATA_GENERIC_ENHANCE)) {
+ ret = -EFSCORRUPTED;
+ break;
}
+
+ f2fs_invalidate_blocks(sbi, dn->data_blkaddr);
+ dn->data_blkaddr = NEW_ADDR;
+ f2fs_set_data_blkaddr(dn);
}
f2fs_update_extent_cache_range(dn, start, 0, index - start);
@@ -1638,6 +1647,11 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
struct f2fs_map_blocks map = { .m_next_pgofs = NULL,
.m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE,
.m_may_create = true };
+ struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO,
+ .init_gc_type = FG_GC,
+ .should_migrate_blocks = false,
+ .err_gc_skipped = true,
+ .nr_free_secs = 0 };
pgoff_t pg_start, pg_end;
loff_t new_size = i_size_read(inode);
loff_t off_end;
@@ -1675,8 +1689,8 @@ next_alloc:
if (has_not_enough_free_secs(sbi, 0,
GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
f2fs_down_write(&sbi->gc_lock);
- err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
- if (err && err != -ENODATA && err != -EAGAIN)
+ err = f2fs_gc(sbi, &gc_control);
+ if (err && err != -ENODATA)
goto out_err;
}
@@ -1766,6 +1780,10 @@ static long f2fs_fallocate(struct file *file, int mode,
inode_lock(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
if (offset >= inode->i_size)
goto out;
@@ -1804,16 +1822,8 @@ static int f2fs_release_file(struct inode *inode, struct file *filp)
atomic_read(&inode->i_writecount) != 1)
return 0;
- /* some remained atomic pages should discarded */
if (f2fs_is_atomic_file(inode))
- f2fs_drop_inmem_pages(inode);
- if (f2fs_is_volatile_file(inode)) {
- set_inode_flag(inode, FI_DROP_CACHE);
- filemap_fdatawrite(inode->i_mapping);
- clear_inode_flag(inode, FI_DROP_CACHE);
- clear_inode_flag(inode, FI_VOLATILE_FILE);
- stat_dec_volatile_write(inode);
- }
+ f2fs_abort_atomic_write(inode, true);
return 0;
}
@@ -1828,8 +1838,8 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id)
* before dropping file lock, it needs to do in ->flush.
*/
if (f2fs_is_atomic_file(inode) &&
- F2FS_I(inode)->inmem_task == current)
- f2fs_drop_inmem_pages(inode);
+ F2FS_I(inode)->atomic_write_task == current)
+ f2fs_abort_atomic_write(inode, true);
return 0;
}
@@ -1992,6 +2002,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct inode *pinode;
int ret;
if (!inode_owner_or_capable(mnt_userns, inode))
@@ -2014,44 +2025,55 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
goto out;
}
- if (f2fs_is_atomic_file(inode)) {
- if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST))
- ret = -EINVAL;
+ if (f2fs_is_atomic_file(inode))
goto out;
- }
ret = f2fs_convert_inline_inode(inode);
if (ret)
goto out;
- f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
/*
* Should wait end_io to count F2FS_WB_CP_DATA correctly by
* f2fs_is_atomic_file.
*/
if (get_dirty_pages(inode))
- f2fs_warn(F2FS_I_SB(inode), "Unexpected flush for atomic writes: ino=%lu, npages=%u",
+ f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%lu, npages=%u",
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
if (ret) {
- f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
+ goto out;
+ }
+
+ /* Create a COW inode for atomic write */
+ pinode = f2fs_iget(inode->i_sb, fi->i_pino);
+ if (IS_ERR(pinode)) {
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
+ ret = PTR_ERR(pinode);
+ goto out;
+ }
+
+ ret = f2fs_get_tmpfile(mnt_userns, pinode, &fi->cow_inode);
+ iput(pinode);
+ if (ret) {
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
goto out;
}
+ f2fs_i_size_write(fi->cow_inode, i_size_read(inode));
spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
- if (list_empty(&fi->inmem_ilist))
- list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]);
sbi->atomic_files++;
spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
- /* add inode in inmem_list first and set atomic_file */
set_inode_flag(inode, FI_ATOMIC_FILE);
- clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
- f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ set_inode_flag(fi->cow_inode, FI_ATOMIC_FILE);
+ clear_inode_flag(fi->cow_inode, FI_INLINE_DATA);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
- f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
- F2FS_I(inode)->inmem_task = current;
+ f2fs_update_time(sbi, REQ_TIME);
+ fi->atomic_write_task = current;
stat_update_max_atomic_write(inode);
out:
inode_unlock(inode);
@@ -2076,127 +2098,20 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
inode_lock(inode);
- if (f2fs_is_volatile_file(inode)) {
- ret = -EINVAL;
- goto err_out;
- }
-
if (f2fs_is_atomic_file(inode)) {
- ret = f2fs_commit_inmem_pages(inode);
+ ret = f2fs_commit_atomic_write(inode);
if (ret)
- goto err_out;
+ goto unlock_out;
ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
if (!ret)
- f2fs_drop_inmem_pages(inode);
+ f2fs_abort_atomic_write(inode, false);
} else {
ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false);
}
-err_out:
- if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) {
- clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
- ret = -EINVAL;
- }
- inode_unlock(inode);
- mnt_drop_write_file(filp);
- return ret;
-}
-
-static int f2fs_ioc_start_volatile_write(struct file *filp)
-{
- struct inode *inode = file_inode(filp);
- struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
- int ret;
-
- if (!inode_owner_or_capable(mnt_userns, inode))
- return -EACCES;
-
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
-
- ret = mnt_want_write_file(filp);
- if (ret)
- return ret;
-
- inode_lock(inode);
-
- if (f2fs_is_volatile_file(inode))
- goto out;
-
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- goto out;
-
- stat_inc_volatile_write(inode);
- stat_update_max_volatile_write(inode);
-
- set_inode_flag(inode, FI_VOLATILE_FILE);
- f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
-out:
- inode_unlock(inode);
- mnt_drop_write_file(filp);
- return ret;
-}
-
-static int f2fs_ioc_release_volatile_write(struct file *filp)
-{
- struct inode *inode = file_inode(filp);
- struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
- int ret;
-
- if (!inode_owner_or_capable(mnt_userns, inode))
- return -EACCES;
-
- ret = mnt_want_write_file(filp);
- if (ret)
- return ret;
-
- inode_lock(inode);
-
- if (!f2fs_is_volatile_file(inode))
- goto out;
-
- if (!f2fs_is_first_block_written(inode)) {
- ret = truncate_partial_data_page(inode, 0, true);
- goto out;
- }
-
- ret = punch_hole(inode, 0, F2FS_BLKSIZE);
-out:
- inode_unlock(inode);
- mnt_drop_write_file(filp);
- return ret;
-}
-
-static int f2fs_ioc_abort_volatile_write(struct file *filp)
-{
- struct inode *inode = file_inode(filp);
- struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
- int ret;
-
- if (!inode_owner_or_capable(mnt_userns, inode))
- return -EACCES;
-
- ret = mnt_want_write_file(filp);
- if (ret)
- return ret;
-
- inode_lock(inode);
-
- if (f2fs_is_atomic_file(inode))
- f2fs_drop_inmem_pages(inode);
- if (f2fs_is_volatile_file(inode)) {
- clear_inode_flag(inode, FI_VOLATILE_FILE);
- stat_dec_volatile_write(inode);
- ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
- }
-
- clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
-
+unlock_out:
inode_unlock(inode);
-
mnt_drop_write_file(filp);
- f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return ret;
}
@@ -2437,6 +2352,10 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO,
+ .no_bg_gc = false,
+ .should_migrate_blocks = false,
+ .nr_free_secs = 0 };
__u32 sync;
int ret;
@@ -2462,7 +2381,9 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
f2fs_down_write(&sbi->gc_lock);
}
- ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO);
+ gc_control.init_gc_type = sync ? FG_GC : BG_GC;
+ gc_control.err_gc_skipped = sync;
+ ret = f2fs_gc(sbi, &gc_control);
out:
mnt_drop_write_file(filp);
return ret;
@@ -2471,6 +2392,12 @@ out:
static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp));
+ struct f2fs_gc_control gc_control = {
+ .init_gc_type = range->sync ? FG_GC : BG_GC,
+ .no_bg_gc = false,
+ .should_migrate_blocks = false,
+ .err_gc_skipped = range->sync,
+ .nr_free_secs = 0 };
u64 end;
int ret;
@@ -2498,8 +2425,8 @@ do_more:
f2fs_down_write(&sbi->gc_lock);
}
- ret = f2fs_gc(sbi, range->sync, true, false,
- GET_SEGNO(sbi, range->start));
+ gc_control.victim_segno = GET_SEGNO(sbi, range->start);
+ ret = f2fs_gc(sbi, &gc_control);
if (ret) {
if (ret == -EBUSY)
ret = -EAGAIN;
@@ -2674,6 +2601,7 @@ do_map:
}
set_page_dirty(page);
+ set_page_private_gcing(page);
f2fs_put_page(page, 1);
idx++;
@@ -2913,6 +2841,11 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
unsigned int start_segno = 0, end_segno = 0;
unsigned int dev_start_segno = 0, dev_end_segno = 0;
struct f2fs_flush_device range;
+ struct f2fs_gc_control gc_control = {
+ .init_gc_type = FG_GC,
+ .should_migrate_blocks = true,
+ .err_gc_skipped = true,
+ .nr_free_secs = 0 };
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -2956,7 +2889,9 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
sm->last_victim[GC_CB] = end_segno + 1;
sm->last_victim[GC_GREEDY] = end_segno + 1;
sm->last_victim[ALLOC_NEXT] = end_segno + 1;
- ret = f2fs_gc(sbi, true, true, true, start_segno);
+
+ gc_control.victim_segno = start_segno;
+ ret = f2fs_gc(sbi, &gc_control);
if (ret == -EAGAIN)
ret = 0;
else if (ret < 0)
@@ -3017,7 +2952,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
kprojid = make_kprojid(&init_user_ns, (projid_t)projid);
- if (projid_eq(kprojid, F2FS_I(inode)->i_projid))
+ if (projid_eq(kprojid, fi->i_projid))
return 0;
err = -EPERM;
@@ -3037,7 +2972,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
if (err)
goto out_unlock;
- F2FS_I(inode)->i_projid = kprojid;
+ fi->i_projid = kprojid;
inode->i_ctime = current_time(inode);
f2fs_mark_inode_dirty_sync(inode, true);
out_unlock:
@@ -3987,7 +3922,7 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg)
struct f2fs_inode_info *fi = F2FS_I(inode);
pgoff_t page_idx = 0, last_idx;
unsigned int blk_per_seg = sbi->blocks_per_seg;
- int cluster_size = F2FS_I(inode)->i_cluster_size;
+ int cluster_size = fi->i_cluster_size;
int count, ret;
if (!f2fs_sb_has_compression(sbi) ||
@@ -4010,11 +3945,6 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg)
goto out;
}
- if (f2fs_is_mmap_file(inode)) {
- ret = -EBUSY;
- goto out;
- }
-
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
if (ret)
goto out;
@@ -4082,11 +4012,6 @@ static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg)
goto out;
}
- if (f2fs_is_mmap_file(inode)) {
- ret = -EBUSY;
- goto out;
- }
-
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
if (ret)
goto out;
@@ -4136,11 +4061,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case F2FS_IOC_COMMIT_ATOMIC_WRITE:
return f2fs_ioc_commit_atomic_write(filp);
case F2FS_IOC_START_VOLATILE_WRITE:
- return f2fs_ioc_start_volatile_write(filp);
case F2FS_IOC_RELEASE_VOLATILE_WRITE:
- return f2fs_ioc_release_volatile_write(filp);
case F2FS_IOC_ABORT_VOLATILE_WRITE:
- return f2fs_ioc_abort_volatile_write(filp);
+ return -EOPNOTSUPP;
case F2FS_IOC_SHUTDOWN:
return f2fs_ioc_shutdown(filp, arg);
case FITRIM:
@@ -4328,17 +4251,39 @@ out:
static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
+ const loff_t pos = iocb->ki_pos;
ssize_t ret;
if (!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
- if (f2fs_should_use_dio(inode, iocb, to))
- return f2fs_dio_read_iter(iocb, to);
+ if (trace_f2fs_dataread_start_enabled()) {
+ char *p = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL);
+ char *path;
+
+ if (!p)
+ goto skip_read_trace;
+
+ path = dentry_path_raw(file_dentry(iocb->ki_filp), p, PATH_MAX);
+ if (IS_ERR(path)) {
+ kfree(p);
+ goto skip_read_trace;
+ }
- ret = filemap_read(iocb, to, 0);
- if (ret > 0)
- f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret);
+ trace_f2fs_dataread_start(inode, pos, iov_iter_count(to),
+ current->pid, path, current->comm);
+ kfree(p);
+ }
+skip_read_trace:
+ if (f2fs_should_use_dio(inode, iocb, to)) {
+ ret = f2fs_dio_read_iter(iocb, to);
+ } else {
+ ret = filemap_read(iocb, to, 0);
+ if (ret > 0)
+ f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret);
+ }
+ if (trace_f2fs_dataread_end_enabled())
+ trace_f2fs_dataread_end(inode, pos, ret);
return ret;
}
@@ -4630,14 +4575,36 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
/* Possibly preallocate the blocks for the write. */
target_size = iocb->ki_pos + iov_iter_count(from);
preallocated = f2fs_preallocate_blocks(iocb, from, dio);
- if (preallocated < 0)
+ if (preallocated < 0) {
ret = preallocated;
- else
+ } else {
+ if (trace_f2fs_datawrite_start_enabled()) {
+ char *p = f2fs_kmalloc(F2FS_I_SB(inode),
+ PATH_MAX, GFP_KERNEL);
+ char *path;
+
+ if (!p)
+ goto skip_write_trace;
+ path = dentry_path_raw(file_dentry(iocb->ki_filp),
+ p, PATH_MAX);
+ if (IS_ERR(path)) {
+ kfree(p);
+ goto skip_write_trace;
+ }
+ trace_f2fs_datawrite_start(inode, orig_pos, orig_count,
+ current->pid, path, current->comm);
+ kfree(p);
+ }
+skip_write_trace:
/* Do the actual write. */
ret = dio ?
f2fs_dio_write_iter(iocb, from, &may_need_sync):
f2fs_buffered_write_iter(iocb, from);
+ if (trace_f2fs_datawrite_end_enabled())
+ trace_f2fs_datawrite_end(inode, orig_pos, ret);
+ }
+
/* Don't leave any preallocated blocks around past i_size. */
if (preallocated && i_size_read(inode) < target_size) {
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index ea5b93b689cd..d5fb426e0747 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -35,6 +35,10 @@ static int gc_thread_func(void *data)
wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq;
unsigned int wait_ms;
+ struct f2fs_gc_control gc_control = {
+ .victim_segno = NULL_SEGNO,
+ .should_migrate_blocks = false,
+ .err_gc_skipped = false };
wait_ms = gc_th->min_sleep_time;
@@ -141,8 +145,12 @@ do_gc:
if (foreground)
sync_mode = false;
+ gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC;
+ gc_control.no_bg_gc = foreground;
+ gc_control.nr_free_secs = foreground ? 1 : 0;
+
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO))
+ if (f2fs_gc(sbi, &gc_control))
wait_ms = gc_th->no_gc_sleep_time;
if (foreground)
@@ -646,6 +654,54 @@ static void release_victim_entry(struct f2fs_sb_info *sbi)
f2fs_bug_on(sbi, !list_empty(&am->victim_list));
}
+static bool f2fs_pin_section(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+
+ if (!dirty_i->enable_pin_section)
+ return false;
+ if (!test_and_set_bit(secno, dirty_i->pinned_secmap))
+ dirty_i->pinned_secmap_cnt++;
+ return true;
+}
+
+static bool f2fs_pinned_section_exists(struct dirty_seglist_info *dirty_i)
+{
+ return dirty_i->pinned_secmap_cnt;
+}
+
+static bool f2fs_section_is_pinned(struct dirty_seglist_info *dirty_i,
+ unsigned int secno)
+{
+ return dirty_i->enable_pin_section &&
+ f2fs_pinned_section_exists(dirty_i) &&
+ test_bit(secno, dirty_i->pinned_secmap);
+}
+
+static void f2fs_unpin_all_sections(struct f2fs_sb_info *sbi, bool enable)
+{
+ unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
+
+ if (f2fs_pinned_section_exists(DIRTY_I(sbi))) {
+ memset(DIRTY_I(sbi)->pinned_secmap, 0, bitmap_size);
+ DIRTY_I(sbi)->pinned_secmap_cnt = 0;
+ }
+ DIRTY_I(sbi)->enable_pin_section = enable;
+}
+
+static int f2fs_gc_pinned_control(struct inode *inode, int gc_type,
+ unsigned int segno)
+{
+ if (!f2fs_is_pinned_file(inode))
+ return 0;
+ if (gc_type != FG_GC)
+ return -EBUSY;
+ if (!f2fs_pin_section(F2FS_I_SB(inode), segno))
+ f2fs_pin_file_control(inode, true);
+ return -EAGAIN;
+}
+
/*
* This function is called from two paths.
* One is garbage collection and the other is SSR segment selection.
@@ -787,6 +843,9 @@ retry:
if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
goto next;
+ if (gc_type == FG_GC && f2fs_section_is_pinned(dirty_i, secno))
+ goto next;
+
if (is_atgc) {
add_victim_entry(sbi, &p, segno);
goto next;
@@ -1194,18 +1253,9 @@ static int move_data_block(struct inode *inode, block_t bidx,
goto out;
}
- if (f2fs_is_atomic_file(inode)) {
- F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
- F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
- err = -EAGAIN;
- goto out;
- }
-
- if (f2fs_is_pinned_file(inode)) {
- f2fs_pin_file_control(inode, true);
- err = -EAGAIN;
+ err = f2fs_gc_pinned_control(inode, gc_type, segno);
+ if (err)
goto out;
- }
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
@@ -1344,18 +1394,9 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
goto out;
}
- if (f2fs_is_atomic_file(inode)) {
- F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
- F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
- err = -EAGAIN;
- goto out;
- }
- if (f2fs_is_pinned_file(inode)) {
- if (gc_type == FG_GC)
- f2fs_pin_file_control(inode, true);
- err = -EAGAIN;
+ err = f2fs_gc_pinned_control(inode, gc_type, segno);
+ if (err)
goto out;
- }
if (gc_type == BG_GC) {
if (PageWriteback(page)) {
@@ -1475,11 +1516,19 @@ next_step:
ofs_in_node = le16_to_cpu(entry->ofs_in_node);
if (phase == 3) {
+ int err;
+
inode = f2fs_iget(sb, dni.ino);
if (IS_ERR(inode) || is_bad_inode(inode) ||
special_file(inode->i_mode))
continue;
+ err = f2fs_gc_pinned_control(inode, gc_type, segno);
+ if (err == -EAGAIN) {
+ iput(inode);
+ return submitted;
+ }
+
if (!f2fs_down_write_trylock(
&F2FS_I(inode)->i_gc_rwsem[WRITE])) {
iput(inode);
@@ -1699,23 +1748,21 @@ skip:
return seg_freed;
}
-int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
- bool background, bool force, unsigned int segno)
+int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control)
{
- int gc_type = sync ? FG_GC : BG_GC;
+ int gc_type = gc_control->init_gc_type;
+ unsigned int segno = gc_control->victim_segno;
int sec_freed = 0, seg_freed = 0, total_freed = 0;
int ret = 0;
struct cp_control cpc;
- unsigned int init_segno = segno;
struct gc_inode_list gc_list = {
.ilist = LIST_HEAD_INIT(gc_list.ilist),
.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
};
- unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC];
- unsigned long long first_skipped;
unsigned int skipped_round = 0, round = 0;
- trace_f2fs_gc_begin(sbi->sb, sync, background,
+ trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc,
+ gc_control->nr_free_secs,
get_pages(sbi, F2FS_DIRTY_NODES),
get_pages(sbi, F2FS_DIRTY_DENTS),
get_pages(sbi, F2FS_DIRTY_IMETA),
@@ -1726,7 +1773,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
cpc.reason = __get_cp_reason(sbi);
sbi->skipped_gc_rwsem = 0;
- first_skipped = last_skipped;
gc_more:
if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {
ret = -EINVAL;
@@ -1743,8 +1789,7 @@ gc_more:
* threshold, we can make them free by checkpoint. Then, we
* secure free segments which doesn't need fggc any more.
*/
- if (prefree_segments(sbi) &&
- !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
+ if (prefree_segments(sbi)) {
ret = f2fs_write_checkpoint(sbi, &cpc);
if (ret)
goto stop;
@@ -1754,54 +1799,69 @@ gc_more:
}
/* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
- if (gc_type == BG_GC && !background) {
+ if (gc_type == BG_GC && gc_control->no_bg_gc) {
ret = -EINVAL;
goto stop;
}
+retry:
ret = __get_victim(sbi, &segno, gc_type);
- if (ret)
+ if (ret) {
+ /* allow to search victim from sections has pinned data */
+ if (ret == -ENODATA && gc_type == FG_GC &&
+ f2fs_pinned_section_exists(DIRTY_I(sbi))) {
+ f2fs_unpin_all_sections(sbi, false);
+ goto retry;
+ }
goto stop;
+ }
- seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force);
- if (gc_type == FG_GC &&
- seg_freed == f2fs_usable_segs_in_sec(sbi, segno))
- sec_freed++;
+ seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type,
+ gc_control->should_migrate_blocks);
total_freed += seg_freed;
- if (gc_type == FG_GC) {
- if (sbi->skipped_atomic_files[FG_GC] > last_skipped ||
- sbi->skipped_gc_rwsem)
- skipped_round++;
- last_skipped = sbi->skipped_atomic_files[FG_GC];
- round++;
- }
+ if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno))
+ sec_freed++;
if (gc_type == FG_GC)
sbi->cur_victim_sec = NULL_SEGNO;
- if (sync)
+ if (gc_control->init_gc_type == FG_GC ||
+ !has_not_enough_free_secs(sbi,
+ (gc_type == FG_GC) ? sec_freed : 0, 0)) {
+ if (gc_type == FG_GC && sec_freed < gc_control->nr_free_secs)
+ goto go_gc_more;
goto stop;
+ }
- if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
- if (skipped_round <= MAX_SKIP_GC_COUNT ||
- skipped_round * 2 < round) {
- segno = NULL_SEGNO;
- goto gc_more;
+ /* FG_GC stops GC by skip_count */
+ if (gc_type == FG_GC) {
+ if (sbi->skipped_gc_rwsem)
+ skipped_round++;
+ round++;
+ if (skipped_round > MAX_SKIP_GC_COUNT &&
+ skipped_round * 2 >= round) {
+ ret = f2fs_write_checkpoint(sbi, &cpc);
+ goto stop;
}
+ }
- if (first_skipped < last_skipped &&
- (last_skipped - first_skipped) >
- sbi->skipped_gc_rwsem) {
- f2fs_drop_inmem_pages_all(sbi, true);
- segno = NULL_SEGNO;
- goto gc_more;
- }
- if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))
- ret = f2fs_write_checkpoint(sbi, &cpc);
+ /* Write checkpoint to reclaim prefree segments */
+ if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE &&
+ prefree_segments(sbi)) {
+ ret = f2fs_write_checkpoint(sbi, &cpc);
+ if (ret)
+ goto stop;
}
+go_gc_more:
+ segno = NULL_SEGNO;
+ goto gc_more;
+
stop:
SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
- SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;
+ SIT_I(sbi)->last_victim[FLUSH_DEVICE] = gc_control->victim_segno;
+
+ if (gc_type == FG_GC)
+ f2fs_unpin_all_sections(sbi, true);
trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed,
get_pages(sbi, F2FS_DIRTY_NODES),
@@ -1816,7 +1876,7 @@ stop:
put_gc_inode(&gc_list);
- if (sync && !ret)
+ if (gc_control->err_gc_skipped && !ret)
ret = sec_freed ? 0 : -EAGAIN;
return ret;
}
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index 3cb1e7a24740..049ce50cec9b 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -91,7 +91,7 @@ static u32 TEA_hash_name(const u8 *p, size_t len)
/*
* Compute @fname->hash. For all directories, @fname->disk_name must be set.
* For casefolded directories, @fname->usr_fname must be set, and also
- * @fname->cf_name if the filename is valid Unicode.
+ * @fname->cf_name if the filename is valid Unicode and is not "." or "..".
*/
void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname)
{
@@ -110,10 +110,11 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname)
/*
* If the casefolded name is provided, hash it instead of the
* on-disk name. If the casefolded name is *not* provided, that
- * should only be because the name wasn't valid Unicode, so fall
- * back to treating the name as an opaque byte sequence. Note
- * that to handle encrypted directories, the fallback must use
- * usr_fname (plaintext) rather than disk_name (ciphertext).
+ * should only be because the name wasn't valid Unicode or was
+ * "." or "..", so fall back to treating the name as an opaque
+ * byte sequence. Note that to handle encrypted directories,
+ * the fallback must use usr_fname (plaintext) rather than
+ * disk_name (ciphertext).
*/
WARN_ON_ONCE(!fname->usr_fname->name);
if (fname->cf_name.name) {
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index a578bf83b803..bf46a7dfbea2 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -14,21 +14,40 @@
#include "node.h"
#include <trace/events/f2fs.h>
-bool f2fs_may_inline_data(struct inode *inode)
+static bool support_inline_data(struct inode *inode)
{
if (f2fs_is_atomic_file(inode))
return false;
-
if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))
return false;
-
if (i_size_read(inode) > MAX_INLINE_DATA(inode))
return false;
+ return true;
+}
- if (f2fs_post_read_required(inode))
+bool f2fs_may_inline_data(struct inode *inode)
+{
+ if (!support_inline_data(inode))
return false;
- return true;
+ return !f2fs_post_read_required(inode);
+}
+
+bool f2fs_sanity_check_inline_data(struct inode *inode)
+{
+ if (!f2fs_has_inline_data(inode))
+ return false;
+
+ if (!support_inline_data(inode))
+ return true;
+
+ /*
+ * used by sanity_check_inode(), when disk layout fields has not
+ * been synchronized to inmem fields.
+ */
+ return (S_ISREG(inode->i_mode) &&
+ (file_is_encrypt(inode) || file_is_verity(inode) ||
+ (F2FS_I(inode)->i_flags & F2FS_COMPR_FL)));
}
bool f2fs_may_inline_dentry(struct inode *inode)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 83639238a1fe..fc55f5bd1fcc 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -260,8 +260,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
- if (F2FS_I(inode)->extent_tree) {
- struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest;
+ if (fi->extent_tree) {
+ struct extent_info *ei = &fi->extent_tree->largest;
if (ei->len &&
(!f2fs_is_valid_blkaddr(sbi, ei->blk,
@@ -276,8 +276,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
}
}
- if (f2fs_has_inline_data(inode) &&
- (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) {
+ if (f2fs_sanity_check_inline_data(inode)) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix",
__func__, inode->i_ino, inode->i_mode);
@@ -466,10 +465,10 @@ static int do_read_inode(struct inode *inode)
}
}
- F2FS_I(inode)->i_disk_time[0] = inode->i_atime;
- F2FS_I(inode)->i_disk_time[1] = inode->i_ctime;
- F2FS_I(inode)->i_disk_time[2] = inode->i_mtime;
- F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime;
+ fi->i_disk_time[0] = inode->i_atime;
+ fi->i_disk_time[1] = inode->i_ctime;
+ fi->i_disk_time[2] = inode->i_mtime;
+ fi->i_disk_time[3] = fi->i_crtime;
f2fs_put_page(node_page, 1);
stat_inc_inline_xattr(inode);
@@ -745,9 +744,8 @@ void f2fs_evict_inode(struct inode *inode)
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
int err = 0;
- /* some remained atomic pages should discarded */
if (f2fs_is_atomic_file(inode))
- f2fs_drop_inmem_pages(inode);
+ f2fs_abort_atomic_write(inode, true);
trace_f2fs_evict_inode(inode);
truncate_inode_pages_final(&inode->i_data);
@@ -796,8 +794,22 @@ retry:
f2fs_lock_op(sbi);
err = f2fs_remove_inode_page(inode);
f2fs_unlock_op(sbi);
- if (err == -ENOENT)
+ if (err == -ENOENT) {
err = 0;
+
+ /*
+ * in fuzzed image, another node may has the same
+ * block address as inode's, if it was truncated
+ * previously, truncation of inode node will fail.
+ */
+ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) {
+ f2fs_warn(F2FS_I_SB(inode),
+ "f2fs_evict_inode: inconsistent node id, ino:%lu",
+ inode->i_ino);
+ f2fs_inode_synced(inode);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ }
+ }
}
/* give more chances, if ENOMEM case */
diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c
index be599f31d3c4..d84c5f6cc09d 100644
--- a/fs/f2fs/iostat.c
+++ b/fs/f2fs/iostat.c
@@ -91,8 +91,9 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
unsigned int cnt;
struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE];
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
+ unsigned long flags;
- spin_lock_bh(&sbi->iostat_lat_lock);
+ spin_lock_irqsave(&sbi->iostat_lat_lock, flags);
for (idx = 0; idx < MAX_IO_TYPE; idx++) {
for (io = 0; io < NR_PAGE_TYPE; io++) {
cnt = io_lat->bio_cnt[idx][io];
@@ -106,7 +107,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
io_lat->bio_cnt[idx][io] = 0;
}
}
- spin_unlock_bh(&sbi->iostat_lat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);
trace_f2fs_iostat_latency(sbi, iostat_lat);
}
@@ -115,14 +116,15 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)
{
unsigned long long iostat_diff[NR_IO_TYPE];
int i;
+ unsigned long flags;
if (time_is_after_jiffies(sbi->iostat_next_period))
return;
/* Need double check under the lock */
- spin_lock_bh(&sbi->iostat_lock);
+ spin_lock_irqsave(&sbi->iostat_lock, flags);
if (time_is_after_jiffies(sbi->iostat_next_period)) {
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lock, flags);
return;
}
sbi->iostat_next_period = jiffies +
@@ -133,7 +135,7 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)
sbi->prev_rw_iostat[i];
sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];
}
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lock, flags);
trace_f2fs_iostat(sbi, iostat_diff);
@@ -145,25 +147,27 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
int i;
- spin_lock_bh(&sbi->iostat_lock);
+ spin_lock_irq(&sbi->iostat_lock);
for (i = 0; i < NR_IO_TYPE; i++) {
sbi->rw_iostat[i] = 0;
sbi->prev_rw_iostat[i] = 0;
}
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irq(&sbi->iostat_lock);
- spin_lock_bh(&sbi->iostat_lat_lock);
+ spin_lock_irq(&sbi->iostat_lat_lock);
memset(io_lat, 0, sizeof(struct iostat_lat_info));
- spin_unlock_bh(&sbi->iostat_lat_lock);
+ spin_unlock_irq(&sbi->iostat_lat_lock);
}
void f2fs_update_iostat(struct f2fs_sb_info *sbi,
enum iostat_type type, unsigned long long io_bytes)
{
+ unsigned long flags;
+
if (!sbi->iostat_enable)
return;
- spin_lock_bh(&sbi->iostat_lock);
+ spin_lock_irqsave(&sbi->iostat_lock, flags);
sbi->rw_iostat[type] += io_bytes;
if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO)
@@ -172,7 +176,7 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi,
if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO)
sbi->rw_iostat[APP_READ_IO] += io_bytes;
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lock, flags);
f2fs_record_iostat(sbi);
}
@@ -185,6 +189,7 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,
struct f2fs_sb_info *sbi = iostat_ctx->sbi;
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
int idx;
+ unsigned long flags;
if (!sbi->iostat_enable)
return;
@@ -202,12 +207,12 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,
idx = WRITE_ASYNC_IO;
}
- spin_lock_bh(&sbi->iostat_lat_lock);
+ spin_lock_irqsave(&sbi->iostat_lat_lock, flags);
io_lat->sum_lat[idx][iotype] += ts_diff;
io_lat->bio_cnt[idx][iotype]++;
if (ts_diff > io_lat->peak_lat[idx][iotype])
io_lat->peak_lat[idx][iotype] = ts_diff;
- spin_unlock_bh(&sbi->iostat_lat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);
}
void iostat_update_and_unbind_ctx(struct bio *bio, int rw)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 5ed79b29999f..bf00d5057abb 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -37,13 +37,10 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
if (!inode)
return ERR_PTR(-ENOMEM);
- f2fs_lock_op(sbi);
if (!f2fs_alloc_nid(sbi, &ino)) {
- f2fs_unlock_op(sbi);
err = -ENOSPC;
goto fail;
}
- f2fs_unlock_op(sbi);
nid_free = true;
@@ -92,8 +89,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
if (test_opt(sbi, INLINE_XATTR))
set_inode_flag(inode, FI_INLINE_XATTR);
- if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
- set_inode_flag(inode, FI_INLINE_DATA);
if (f2fs_may_inline_dentry(inode))
set_inode_flag(inode, FI_INLINE_DENTRY);
@@ -110,10 +105,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
f2fs_init_extent_tree(inode, NULL);
- stat_inc_inline_xattr(inode);
- stat_inc_inline_inode(inode);
- stat_inc_inline_dir(inode);
-
F2FS_I(inode)->i_flags =
f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED);
@@ -130,6 +121,14 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
set_compress_context(inode);
}
+ /* Should enable inline_data after compression set */
+ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
+ set_inode_flag(inode, FI_INLINE_DATA);
+
+ stat_inc_inline_xattr(inode);
+ stat_inc_inline_inode(inode);
+ stat_inc_inline_dir(inode);
+
f2fs_set_inode_flags(inode);
trace_f2fs_new_inode(inode, 0);
@@ -328,6 +327,9 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
if (!is_extension_exist(name, ext[i], false))
continue;
+ /* Do not use inline_data with compression */
+ stat_dec_inline_inode(inode);
+ clear_inode_flag(inode, FI_INLINE_DATA);
set_compress_context(inode);
return;
}
@@ -461,6 +463,13 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
return 0;
}
+ if (!S_ISDIR(dir->i_mode)) {
+ f2fs_err(sbi, "inconsistent inode status, skip recovering inline_dots inode (ino:%lu, i_mode:%u, pino:%u)",
+ dir->i_ino, dir->i_mode, pino);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ return -ENOTDIR;
+ }
+
err = f2fs_dquot_initialize(dir);
if (err)
return err;
@@ -836,8 +845,8 @@ out:
}
static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode,
- struct inode **whiteout)
+ struct dentry *dentry, umode_t mode, bool is_whiteout,
+ struct inode **new_inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
@@ -851,7 +860,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (whiteout) {
+ if (is_whiteout) {
init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
inode->i_op = &f2fs_special_inode_operations;
} else {
@@ -876,21 +885,25 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
f2fs_add_orphan_inode(inode);
f2fs_alloc_nid_done(sbi, inode->i_ino);
- if (whiteout) {
+ if (is_whiteout) {
f2fs_i_links_write(inode, false);
spin_lock(&inode->i_lock);
inode->i_state |= I_LINKABLE;
spin_unlock(&inode->i_lock);
-
- *whiteout = inode;
} else {
- d_tmpfile(dentry, inode);
+ if (dentry)
+ d_tmpfile(dentry, inode);
+ else
+ f2fs_i_links_write(inode, false);
}
/* link_count was changed by d_tmpfile as well. */
f2fs_unlock_op(sbi);
unlock_new_inode(inode);
+ if (new_inode)
+ *new_inode = inode;
+
f2fs_balance_fs(sbi, true);
return 0;
@@ -911,7 +924,7 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, NULL);
+ return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, false, NULL);
}
static int f2fs_create_whiteout(struct user_namespace *mnt_userns,
@@ -921,7 +934,13 @@ static int f2fs_create_whiteout(struct user_namespace *mnt_userns,
return -EIO;
return __f2fs_tmpfile(mnt_userns, dir, NULL,
- S_IFCHR | WHITEOUT_MODE, whiteout);
+ S_IFCHR | WHITEOUT_MODE, true, whiteout);
+}
+
+int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct inode **new_inode)
+{
+ return __f2fs_tmpfile(mnt_userns, dir, NULL, S_IFREG, false, new_inode);
}
static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 8ccff18560ff..cf6f7fc83c08 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -90,10 +90,6 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
atomic_read(&sbi->total_ext_node) *
sizeof(struct extent_node)) >> PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
- } else if (type == INMEM_PAGES) {
- /* it allows 20% / total_ram for inmemory pages */
- mem_size = get_pages(sbi, F2FS_INMEM_PAGES);
- res = mem_size < (val.totalram / 5);
} else if (type == DISCARD_CACHE) {
mem_size = (atomic_read(&dcc->discard_cmd_cnt) *
sizeof(struct discard_cmd)) >> PAGE_SHIFT;
@@ -1416,8 +1412,7 @@ repeat:
err = read_node_page(page, 0);
if (err < 0) {
- f2fs_put_page(page, 1);
- return ERR_PTR(err);
+ goto out_put_err;
} else if (err == LOCKED_PAGE) {
err = 0;
goto page_hit;
@@ -1443,19 +1438,23 @@ repeat:
goto out_err;
}
page_hit:
- if (unlikely(nid != nid_of_node(page))) {
- f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
+ if (likely(nid == nid_of_node(page)))
+ return page;
+
+ f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
nid, nid_of_node(page), ino_of_node(page),
ofs_of_node(page), cpver_of_node(page),
next_blkaddr_of_node(page));
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- err = -EINVAL;
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ err = -EINVAL;
out_err:
- ClearPageUptodate(page);
- f2fs_put_page(page, 1);
- return ERR_PTR(err);
- }
- return page;
+ ClearPageUptodate(page);
+out_put_err:
+ /* ENOENT comes from read_node_page which is not an error. */
+ if (err != -ENOENT)
+ f2fs_handle_page_eio(sbi, page->index, NODE);
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
}
struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
@@ -1631,7 +1630,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
goto redirty_out;
}
- if (atomic && !test_opt(sbi, NOBARRIER))
+ if (atomic && !test_opt(sbi, NOBARRIER) && !f2fs_sb_has_blkzoned(sbi))
fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
/* should add to global list before clearing PAGECACHE status */
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 4c1d34bfea78..3c09cae058b0 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -147,7 +147,6 @@ enum mem_type {
DIRTY_DENTS, /* indicates dirty dentry pages */
INO_ENTRIES, /* indicates inode entries */
EXTENT_CACHE, /* indicates extent cache */
- INMEM_PAGES, /* indicates inmemory pages */
DISCARD_CACHE, /* indicates memory of cached discard cmds */
COMPRESS_PAGE, /* indicates memory of cached compressed pages */
BASE_CHECK, /* check kernel status */
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 7225ce09f3ab..874c1b9c41a2 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -30,7 +30,7 @@
static struct kmem_cache *discard_entry_slab;
static struct kmem_cache *discard_cmd_slab;
static struct kmem_cache *sit_entry_set_slab;
-static struct kmem_cache *inmem_entry_slab;
+static struct kmem_cache *revoke_entry_slab;
static unsigned long __reverse_ulong(unsigned char *str)
{
@@ -185,301 +185,175 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi)
SM_I(sbi)->min_ssr_sections + reserved_sections(sbi));
}
-void f2fs_register_inmem_page(struct inode *inode, struct page *page)
+void f2fs_abort_atomic_write(struct inode *inode, bool clean)
{
- struct inmem_pages *new;
-
- set_page_private_atomic(page);
-
- new = f2fs_kmem_cache_alloc(inmem_entry_slab,
- GFP_NOFS, true, NULL);
-
- /* add atomic page indices to the list */
- new->page = page;
- INIT_LIST_HEAD(&new->list);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
- /* increase reference count with clean state */
- get_page(page);
- mutex_lock(&F2FS_I(inode)->inmem_lock);
- list_add_tail(&new->list, &F2FS_I(inode)->inmem_pages);
- inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
- mutex_unlock(&F2FS_I(inode)->inmem_lock);
+ if (f2fs_is_atomic_file(inode)) {
+ if (clean)
+ truncate_inode_pages_final(inode->i_mapping);
+ clear_inode_flag(fi->cow_inode, FI_ATOMIC_FILE);
+ iput(fi->cow_inode);
+ fi->cow_inode = NULL;
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
- trace_f2fs_register_inmem_page(page, INMEM);
+ spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+ sbi->atomic_files--;
+ spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+ }
}
-static int __revoke_inmem_pages(struct inode *inode,
- struct list_head *head, bool drop, bool recover,
- bool trylock)
+static int __replace_atomic_write_block(struct inode *inode, pgoff_t index,
+ block_t new_addr, block_t *old_addr, bool recover)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct inmem_pages *cur, *tmp;
- int err = 0;
-
- list_for_each_entry_safe(cur, tmp, head, list) {
- struct page *page = cur->page;
-
- if (drop)
- trace_f2fs_commit_inmem_page(page, INMEM_DROP);
-
- if (trylock) {
- /*
- * to avoid deadlock in between page lock and
- * inmem_lock.
- */
- if (!trylock_page(page))
- continue;
- } else {
- lock_page(page);
- }
-
- f2fs_wait_on_page_writeback(page, DATA, true, true);
-
- if (recover) {
- struct dnode_of_data dn;
- struct node_info ni;
+ struct dnode_of_data dn;
+ struct node_info ni;
+ int err;
- trace_f2fs_commit_inmem_page(page, INMEM_REVOKE);
retry:
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = f2fs_get_dnode_of_data(&dn, page->index,
- LOOKUP_NODE);
- if (err) {
- if (err == -ENOMEM) {
- memalloc_retry_wait(GFP_NOFS);
- goto retry;
- }
- err = -EAGAIN;
- goto next;
- }
-
- err = f2fs_get_node_info(sbi, dn.nid, &ni, false);
- if (err) {
- f2fs_put_dnode(&dn);
- return err;
- }
-
- if (cur->old_addr == NEW_ADDR) {
- f2fs_invalidate_blocks(sbi, dn.data_blkaddr);
- f2fs_update_data_blkaddr(&dn, NEW_ADDR);
- } else
- f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
- cur->old_addr, ni.version, true, true);
- f2fs_put_dnode(&dn);
- }
-next:
- /* we don't need to invalidate this in the sccessful status */
- if (drop || recover) {
- ClearPageUptodate(page);
- clear_page_private_gcing(page);
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE_RA);
+ if (err) {
+ if (err == -ENOMEM) {
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
+ goto retry;
}
- detach_page_private(page);
- set_page_private(page, 0);
- f2fs_put_page(page, 1);
-
- list_del(&cur->list);
- kmem_cache_free(inmem_entry_slab, cur);
- dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+ return err;
}
- return err;
-}
-void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure)
-{
- struct list_head *head = &sbi->inode_list[ATOMIC_FILE];
- struct inode *inode;
- struct f2fs_inode_info *fi;
- unsigned int count = sbi->atomic_files;
- unsigned int looped = 0;
-next:
- spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
- if (list_empty(head)) {
- spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
- return;
+ err = f2fs_get_node_info(sbi, dn.nid, &ni, false);
+ if (err) {
+ f2fs_put_dnode(&dn);
+ return err;
}
- fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist);
- inode = igrab(&fi->vfs_inode);
- if (inode)
- list_move_tail(&fi->inmem_ilist, head);
- spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
- if (inode) {
- if (gc_failure) {
- if (!fi->i_gc_failures[GC_FAILURE_ATOMIC])
- goto skip;
+ if (recover) {
+ /* dn.data_blkaddr is always valid */
+ if (!__is_valid_data_blkaddr(new_addr)) {
+ if (new_addr == NULL_ADDR)
+ dec_valid_block_count(sbi, inode, 1);
+ f2fs_invalidate_blocks(sbi, dn.data_blkaddr);
+ f2fs_update_data_blkaddr(&dn, new_addr);
+ } else {
+ f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
+ new_addr, ni.version, true, true);
}
- set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
- f2fs_drop_inmem_pages(inode);
-skip:
- iput(inode);
- }
- f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
- if (gc_failure) {
- if (++looped >= count)
- return;
- }
- goto next;
-}
-
-void f2fs_drop_inmem_pages(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_inode_info *fi = F2FS_I(inode);
+ } else {
+ blkcnt_t count = 1;
- do {
- mutex_lock(&fi->inmem_lock);
- if (list_empty(&fi->inmem_pages)) {
- fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
-
- spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
- if (!list_empty(&fi->inmem_ilist))
- list_del_init(&fi->inmem_ilist);
- if (f2fs_is_atomic_file(inode)) {
- clear_inode_flag(inode, FI_ATOMIC_FILE);
- sbi->atomic_files--;
- }
- spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+ *old_addr = dn.data_blkaddr;
+ f2fs_truncate_data_blocks_range(&dn, 1);
+ dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count);
+ inc_valid_block_count(sbi, inode, &count);
+ f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr,
+ ni.version, true, false);
+ }
- mutex_unlock(&fi->inmem_lock);
- break;
- }
- __revoke_inmem_pages(inode, &fi->inmem_pages,
- true, false, true);
- mutex_unlock(&fi->inmem_lock);
- } while (1);
+ f2fs_put_dnode(&dn);
+ return 0;
}
-void f2fs_drop_inmem_page(struct inode *inode, struct page *page)
+static void __complete_revoke_list(struct inode *inode, struct list_head *head,
+ bool revoke)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct list_head *head = &fi->inmem_pages;
- struct inmem_pages *cur = NULL;
-
- f2fs_bug_on(sbi, !page_private_atomic(page));
+ struct revoke_entry *cur, *tmp;
- mutex_lock(&fi->inmem_lock);
- list_for_each_entry(cur, head, list) {
- if (cur->page == page)
- break;
+ list_for_each_entry_safe(cur, tmp, head, list) {
+ if (revoke)
+ __replace_atomic_write_block(inode, cur->index,
+ cur->old_addr, NULL, true);
+ list_del(&cur->list);
+ kmem_cache_free(revoke_entry_slab, cur);
}
-
- f2fs_bug_on(sbi, list_empty(head) || cur->page != page);
- list_del(&cur->list);
- mutex_unlock(&fi->inmem_lock);
-
- dec_page_count(sbi, F2FS_INMEM_PAGES);
- kmem_cache_free(inmem_entry_slab, cur);
-
- ClearPageUptodate(page);
- clear_page_private_atomic(page);
- f2fs_put_page(page, 0);
-
- detach_page_private(page);
- set_page_private(page, 0);
-
- trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE);
}
-static int __f2fs_commit_inmem_pages(struct inode *inode)
+static int __f2fs_commit_atomic_write(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
- struct inmem_pages *cur, *tmp;
- struct f2fs_io_info fio = {
- .sbi = sbi,
- .ino = inode->i_ino,
- .type = DATA,
- .op = REQ_OP_WRITE,
- .op_flags = REQ_SYNC | REQ_PRIO,
- .io_type = FS_DATA_IO,
- };
+ struct inode *cow_inode = fi->cow_inode;
+ struct revoke_entry *new;
struct list_head revoke_list;
- bool submit_bio = false;
- int err = 0;
+ block_t blkaddr;
+ struct dnode_of_data dn;
+ pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ pgoff_t off = 0, blen, index;
+ int ret = 0, i;
INIT_LIST_HEAD(&revoke_list);
- list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
- struct page *page = cur->page;
+ while (len) {
+ blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len);
- lock_page(page);
- if (page->mapping == inode->i_mapping) {
- trace_f2fs_commit_inmem_page(page, INMEM);
+ set_new_dnode(&dn, cow_inode, NULL, NULL, 0);
+ ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA);
+ if (ret && ret != -ENOENT) {
+ goto out;
+ } else if (ret == -ENOENT) {
+ ret = 0;
+ if (dn.max_level == 0)
+ goto out;
+ goto next;
+ }
- f2fs_wait_on_page_writeback(page, DATA, true, true);
+ blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode),
+ len);
+ index = off;
+ for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) {
+ blkaddr = f2fs_data_blkaddr(&dn);
- set_page_dirty(page);
- if (clear_page_dirty_for_io(page)) {
- inode_dec_dirty_pages(inode);
- f2fs_remove_dirty_inode(inode);
- }
-retry:
- fio.page = page;
- fio.old_blkaddr = NULL_ADDR;
- fio.encrypted_page = NULL;
- fio.need_lock = LOCK_DONE;
- err = f2fs_do_write_data_page(&fio);
- if (err) {
- if (err == -ENOMEM) {
- memalloc_retry_wait(GFP_NOFS);
- goto retry;
- }
- unlock_page(page);
- break;
+ if (!__is_valid_data_blkaddr(blkaddr)) {
+ continue;
+ } else if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
+ DATA_GENERIC_ENHANCE)) {
+ f2fs_put_dnode(&dn);
+ ret = -EFSCORRUPTED;
+ goto out;
}
- /* record old blkaddr for revoking */
- cur->old_addr = fio.old_blkaddr;
- submit_bio = true;
- }
- unlock_page(page);
- list_move_tail(&cur->list, &revoke_list);
- }
- if (submit_bio)
- f2fs_submit_merged_write_cond(sbi, inode, NULL, 0, DATA);
+ new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS,
+ true, NULL);
- if (err) {
- /*
- * try to revoke all committed pages, but still we could fail
- * due to no memory or other reason, if that happened, EAGAIN
- * will be returned, which means in such case, transaction is
- * already not integrity, caller should use journal to do the
- * recovery or rewrite & commit last transaction. For other
- * error number, revoking was done by filesystem itself.
- */
- err = __revoke_inmem_pages(inode, &revoke_list,
- false, true, false);
+ ret = __replace_atomic_write_block(inode, index, blkaddr,
+ &new->old_addr, false);
+ if (ret) {
+ f2fs_put_dnode(&dn);
+ kmem_cache_free(revoke_entry_slab, new);
+ goto out;
+ }
- /* drop all uncommitted pages */
- __revoke_inmem_pages(inode, &fi->inmem_pages,
- true, false, false);
- } else {
- __revoke_inmem_pages(inode, &revoke_list,
- false, false, false);
+ f2fs_update_data_blkaddr(&dn, NULL_ADDR);
+ new->index = index;
+ list_add_tail(&new->list, &revoke_list);
+ }
+ f2fs_put_dnode(&dn);
+next:
+ off += blen;
+ len -= blen;
}
- return err;
+out:
+ __complete_revoke_list(inode, &revoke_list, ret ? true : false);
+
+ return ret;
}
-int f2fs_commit_inmem_pages(struct inode *inode)
+int f2fs_commit_atomic_write(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
int err;
- f2fs_balance_fs(sbi, true);
+ err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
+ if (err)
+ return err;
f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
-
f2fs_lock_op(sbi);
- set_inode_flag(inode, FI_ATOMIC_COMMIT);
-
- mutex_lock(&fi->inmem_lock);
- err = __f2fs_commit_inmem_pages(inode);
- mutex_unlock(&fi->inmem_lock);
- clear_inode_flag(inode, FI_ATOMIC_COMMIT);
+ err = __f2fs_commit_atomic_write(inode);
f2fs_unlock_op(sbi);
f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
@@ -520,8 +394,15 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
io_schedule();
finish_wait(&sbi->gc_thread->fggc_wq, &wait);
} else {
+ struct f2fs_gc_control gc_control = {
+ .victim_segno = NULL_SEGNO,
+ .init_gc_type = BG_GC,
+ .no_bg_gc = true,
+ .should_migrate_blocks = false,
+ .err_gc_skipped = false,
+ .nr_free_secs = 1 };
f2fs_down_write(&sbi->gc_lock);
- f2fs_gc(sbi, false, false, false, NULL_SEGNO);
+ f2fs_gc(sbi, &gc_control);
}
}
}
@@ -1664,33 +1545,32 @@ static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi,
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
&(dcc->fstrim_list) : &(dcc->wait_list);
- struct discard_cmd *dc, *tmp;
- bool need_wait;
+ struct discard_cmd *dc = NULL, *iter, *tmp;
unsigned int trimmed = 0;
next:
- need_wait = false;
+ dc = NULL;
mutex_lock(&dcc->cmd_lock);
- list_for_each_entry_safe(dc, tmp, wait_list, list) {
- if (dc->lstart + dc->len <= start || end <= dc->lstart)
+ list_for_each_entry_safe(iter, tmp, wait_list, list) {
+ if (iter->lstart + iter->len <= start || end <= iter->lstart)
continue;
- if (dc->len < dpolicy->granularity)
+ if (iter->len < dpolicy->granularity)
continue;
- if (dc->state == D_DONE && !dc->ref) {
- wait_for_completion_io(&dc->wait);
- if (!dc->error)
- trimmed += dc->len;
- __remove_discard_cmd(sbi, dc);
+ if (iter->state == D_DONE && !iter->ref) {
+ wait_for_completion_io(&iter->wait);
+ if (!iter->error)
+ trimmed += iter->len;
+ __remove_discard_cmd(sbi, iter);
} else {
- dc->ref++;
- need_wait = true;
+ iter->ref++;
+ dc = iter;
break;
}
}
mutex_unlock(&dcc->cmd_lock);
- if (need_wait) {
+ if (dc) {
trimmed += __wait_one_discard_bio(sbi, dc);
goto next;
}
@@ -3286,8 +3166,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
return CURSEG_COLD_DATA;
if (file_is_hot(inode) ||
is_inode_flag_set(inode, FI_HOT_DATA) ||
- f2fs_is_atomic_file(inode) ||
- f2fs_is_volatile_file(inode))
+ f2fs_is_atomic_file(inode))
return CURSEG_HOT_DATA;
return f2fs_rw_hint_to_seg_type(inode->i_write_hint);
} else {
@@ -4084,10 +3963,12 @@ static void adjust_sit_entry_set(struct sit_entry_set *ses,
return;
list_for_each_entry_continue(next, head, set_list)
- if (ses->entry_cnt <= next->entry_cnt)
- break;
+ if (ses->entry_cnt <= next->entry_cnt) {
+ list_move_tail(&ses->set_list, &next->set_list);
+ return;
+ }
- list_move_tail(&ses->set_list, &next->set_list);
+ list_move_tail(&ses->set_list, head);
}
static void add_sit_entry(unsigned int segno, struct list_head *head)
@@ -4455,7 +4336,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
unsigned int i, start, end;
unsigned int readed, start_blk = 0;
int err = 0;
- block_t total_node_blocks = 0;
+ block_t sit_valid_blocks[2] = {0, 0};
do {
readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS,
@@ -4480,8 +4361,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
if (err)
return err;
seg_info_from_raw_sit(se, &sit);
- if (IS_NODESEG(se->type))
- total_node_blocks += se->valid_blocks;
+
+ sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks;
if (f2fs_block_unit_discard(sbi)) {
/* build discard map only one time */
@@ -4521,15 +4402,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
sit = sit_in_journal(journal, i);
old_valid_blocks = se->valid_blocks;
- if (IS_NODESEG(se->type))
- total_node_blocks -= old_valid_blocks;
+
+ sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks;
err = check_block_count(sbi, start, &sit);
if (err)
break;
seg_info_from_raw_sit(se, &sit);
- if (IS_NODESEG(se->type))
- total_node_blocks += se->valid_blocks;
+
+ sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks;
if (f2fs_block_unit_discard(sbi)) {
if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
@@ -4551,13 +4432,24 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
}
up_read(&curseg->journal_rwsem);
- if (!err && total_node_blocks != valid_node_count(sbi)) {
+ if (err)
+ return err;
+
+ if (sit_valid_blocks[NODE] != valid_node_count(sbi)) {
f2fs_err(sbi, "SIT is corrupted node# %u vs %u",
- total_node_blocks, valid_node_count(sbi));
- err = -EFSCORRUPTED;
+ sit_valid_blocks[NODE], valid_node_count(sbi));
+ return -EFSCORRUPTED;
}
- return err;
+ if (sit_valid_blocks[DATA] + sit_valid_blocks[NODE] >
+ valid_user_blocks(sbi)) {
+ f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u",
+ sit_valid_blocks[DATA], sit_valid_blocks[NODE],
+ valid_user_blocks(sbi));
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
}
static void init_free_segmap(struct f2fs_sb_info *sbi)
@@ -4637,6 +4529,13 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi)
dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL);
if (!dirty_i->victim_secmap)
return -ENOMEM;
+
+ dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL);
+ if (!dirty_i->pinned_secmap)
+ return -ENOMEM;
+
+ dirty_i->pinned_secmap_cnt = 0;
+ dirty_i->enable_pin_section = true;
return 0;
}
@@ -5225,6 +5124,7 @@ static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ kvfree(dirty_i->pinned_secmap);
kvfree(dirty_i->victim_secmap);
}
@@ -5335,9 +5235,9 @@ int __init f2fs_create_segment_manager_caches(void)
if (!sit_entry_set_slab)
goto destroy_discard_cmd;
- inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry",
- sizeof(struct inmem_pages));
- if (!inmem_entry_slab)
+ revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry",
+ sizeof(struct revoke_entry));
+ if (!revoke_entry_slab)
goto destroy_sit_entry_set;
return 0;
@@ -5356,5 +5256,5 @@ void f2fs_destroy_segment_manager_caches(void)
kmem_cache_destroy(sit_entry_set_slab);
kmem_cache_destroy(discard_cmd_slab);
kmem_cache_destroy(discard_entry_slab);
- kmem_cache_destroy(inmem_entry_slab);
+ kmem_cache_destroy(revoke_entry_slab);
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 5c94caf0c0a1..3f277dfcb131 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -24,6 +24,7 @@
#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA)
#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE)
+#define SE_PAGETYPE(se) ((IS_NODESEG((se)->type) ? NODE : DATA))
static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
unsigned short seg_type)
@@ -224,10 +225,10 @@ struct segment_allocation {
#define MAX_SKIP_GC_COUNT 16
-struct inmem_pages {
+struct revoke_entry {
struct list_head list;
- struct page *page;
block_t old_addr; /* for revoking when fail to commit */
+ pgoff_t index;
};
struct sit_info {
@@ -294,6 +295,9 @@ struct dirty_seglist_info {
struct mutex seglist_lock; /* lock for segment bitmaps */
int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */
unsigned long *victim_secmap; /* background GC victims */
+ unsigned long *pinned_secmap; /* pinned victims from foreground GC */
+ unsigned int pinned_secmap_cnt; /* count of victims which has pinned data */
+ bool enable_pin_section; /* enable pinning section */
};
/* victim selection function for cleaning and SSR */
@@ -572,11 +576,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi));
}
-static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
+static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
+ unsigned int node_blocks, unsigned int dent_blocks)
{
- unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) +
- get_pages(sbi, F2FS_DIRTY_DENTS);
- unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS);
+
unsigned int segno, left_blocks;
int i;
@@ -602,19 +605,28 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
int freed, int needed)
{
- int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
- int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
- int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
+ unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) +
+ get_pages(sbi, F2FS_DIRTY_DENTS) +
+ get_pages(sbi, F2FS_DIRTY_IMETA);
+ unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS);
+ unsigned int node_secs = total_node_blocks / BLKS_PER_SEC(sbi);
+ unsigned int dent_secs = total_dent_blocks / BLKS_PER_SEC(sbi);
+ unsigned int node_blocks = total_node_blocks % BLKS_PER_SEC(sbi);
+ unsigned int dent_blocks = total_dent_blocks % BLKS_PER_SEC(sbi);
+ unsigned int free, need_lower, need_upper;
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return false;
- if (free_sections(sbi) + freed == reserved_sections(sbi) + needed &&
- has_curseg_enough_space(sbi))
+ free = free_sections(sbi) + freed;
+ need_lower = node_secs + dent_secs + reserved_sections(sbi) + needed;
+ need_upper = need_lower + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0);
+
+ if (free > need_upper)
return false;
- return (free_sections(sbi) + freed) <=
- (node_secs + 2 * dent_secs + imeta_secs +
- reserved_sections(sbi) + needed);
+ else if (free <= need_lower)
+ return true;
+ return !has_curseg_enough_space(sbi, node_blocks, dent_blocks);
}
static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index ed3e8b7a8260..37221e94e5ef 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -525,10 +525,11 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
return -EINVAL;
}
f2fs_warn(sbi, "Test dummy encryption mode enabled");
+ return 0;
#else
- f2fs_warn(sbi, "Test dummy encryption mount option ignored");
+ f2fs_warn(sbi, "test_dummy_encryption option not supported");
+ return -EINVAL;
#endif
- return 0;
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -1339,9 +1340,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
spin_lock_init(&fi->i_size_lock);
INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->gdirty_list);
- INIT_LIST_HEAD(&fi->inmem_ilist);
- INIT_LIST_HEAD(&fi->inmem_pages);
- mutex_init(&fi->inmem_lock);
init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
init_f2fs_rwsem(&fi->i_xattr_sem);
@@ -1382,9 +1380,8 @@ static int f2fs_drop_inode(struct inode *inode)
atomic_inc(&inode->i_count);
spin_unlock(&inode->i_lock);
- /* some remained atomic pages should discarded */
if (f2fs_is_atomic_file(inode))
- f2fs_drop_inmem_pages(inode);
+ f2fs_abort_atomic_write(inode, true);
/* should remain fi->extent_tree for writepage */
f2fs_destroy_extent_node(inode);
@@ -1707,18 +1704,23 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
block_t total_count, user_block_count, start_count;
u64 avail_node_count;
+ unsigned int total_valid_node_count;
total_count = le64_to_cpu(sbi->raw_super->block_count);
- user_block_count = sbi->user_block_count;
start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr);
buf->f_type = F2FS_SUPER_MAGIC;
buf->f_bsize = sbi->blocksize;
buf->f_blocks = total_count - start_count;
+
+ spin_lock(&sbi->stat_lock);
+
+ user_block_count = sbi->user_block_count;
+ total_valid_node_count = valid_node_count(sbi);
+ avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
buf->f_bfree = user_block_count - valid_user_blocks(sbi) -
sbi->current_reserved_blocks;
- spin_lock(&sbi->stat_lock);
if (unlikely(buf->f_bfree <= sbi->unusable_block_count))
buf->f_bfree = 0;
else
@@ -1731,14 +1733,12 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
else
buf->f_bavail = 0;
- avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
-
if (avail_node_count > user_block_count) {
buf->f_files = user_block_count;
buf->f_ffree = buf->f_bavail;
} else {
buf->f_files = avail_node_count;
- buf->f_ffree = min(avail_node_count - valid_node_count(sbi),
+ buf->f_ffree = min(avail_node_count - total_valid_node_count,
buf->f_bavail);
}
@@ -2055,7 +2055,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
{
unsigned int s_flags = sbi->sb->s_flags;
struct cp_control cpc;
- unsigned int gc_mode;
+ unsigned int gc_mode = sbi->gc_mode;
int err = 0;
int ret;
block_t unusable;
@@ -2066,14 +2066,25 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
}
sbi->sb->s_flags |= SB_ACTIVE;
+ /* check if we need more GC first */
+ unusable = f2fs_get_unusable_blocks(sbi);
+ if (!f2fs_disable_cp_again(sbi, unusable))
+ goto skip_gc;
+
f2fs_update_time(sbi, DISABLE_TIME);
- gc_mode = sbi->gc_mode;
sbi->gc_mode = GC_URGENT_HIGH;
while (!f2fs_time_over(sbi, DISABLE_TIME)) {
+ struct f2fs_gc_control gc_control = {
+ .victim_segno = NULL_SEGNO,
+ .init_gc_type = FG_GC,
+ .should_migrate_blocks = false,
+ .err_gc_skipped = true,
+ .nr_free_secs = 1 };
+
f2fs_down_write(&sbi->gc_lock);
- err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
+ err = f2fs_gc(sbi, &gc_control);
if (err == -ENODATA) {
err = 0;
break;
@@ -2094,6 +2105,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
goto restore_flag;
}
+skip_gc:
f2fs_down_write(&sbi->gc_lock);
cpc.reason = CP_PAUSE;
set_sbi_flag(sbi, SBI_CP_DISABLED);
@@ -2684,7 +2696,8 @@ int f2fs_quota_sync(struct super_block *sb, int type)
if (!sb_has_quota_active(sb, cnt))
continue;
- inode_lock(dqopt->files[cnt]);
+ if (!f2fs_sb_has_quota_ino(sbi))
+ inode_lock(dqopt->files[cnt]);
/*
* do_quotactl
@@ -2703,7 +2716,8 @@ int f2fs_quota_sync(struct super_block *sb, int type)
f2fs_up_read(&sbi->quota_sem);
f2fs_unlock_op(sbi);
- inode_unlock(dqopt->files[cnt]);
+ if (!f2fs_sb_has_quota_ino(sbi))
+ inode_unlock(dqopt->files[cnt]);
if (ret)
break;
@@ -3648,22 +3662,29 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
struct block_device *bdev = FDEV(devi).bdev;
sector_t nr_sectors = bdev_nr_sectors(bdev);
struct f2fs_report_zones_args rep_zone_arg;
+ u64 zone_sectors;
int ret;
if (!f2fs_sb_has_blkzoned(sbi))
return 0;
+ zone_sectors = bdev_zone_sectors(bdev);
+ if (!is_power_of_2(zone_sectors)) {
+ f2fs_err(sbi, "F2FS does not support non power of 2 zone sizes\n");
+ return -EINVAL;
+ }
+
if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
- SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)))
+ SECTOR_TO_BLOCK(zone_sectors))
return -EINVAL;
- sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev));
+ sbi->blocks_per_blkz = SECTOR_TO_BLOCK(zone_sectors);
if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz !=
__ilog2_u32(sbi->blocks_per_blkz))
return -EINVAL;
sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz);
FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
sbi->log_blocks_per_blkz;
- if (nr_sectors & (bdev_zone_sectors(bdev) - 1))
+ if (nr_sectors & (zone_sectors - 1))
FDEV(devi).nr_blkz++;
FDEV(devi).blkz_seq = f2fs_kvzalloc(sbi,
@@ -4070,30 +4091,9 @@ try_onemore:
set_sbi_flag(sbi, SBI_POR_DOING);
spin_lock_init(&sbi->stat_lock);
- for (i = 0; i < NR_PAGE_TYPE; i++) {
- int n = (i == META) ? 1 : NR_TEMP_TYPE;
- int j;
-
- sbi->write_io[i] =
- f2fs_kmalloc(sbi,
- array_size(n,
- sizeof(struct f2fs_bio_info)),
- GFP_KERNEL);
- if (!sbi->write_io[i]) {
- err = -ENOMEM;
- goto free_bio_info;
- }
-
- for (j = HOT; j < n; j++) {
- init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem);
- sbi->write_io[i][j].sbi = sbi;
- sbi->write_io[i][j].bio = NULL;
- spin_lock_init(&sbi->write_io[i][j].io_lock);
- INIT_LIST_HEAD(&sbi->write_io[i][j].io_list);
- INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list);
- init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock);
- }
- }
+ err = f2fs_init_write_merge_io(sbi);
+ if (err)
+ goto free_bio_info;
init_f2fs_rwsem(&sbi->cp_rwsem);
init_f2fs_rwsem(&sbi->quota_sem);
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 65395ae188aa..7b8f2b41c29b 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -129,7 +129,7 @@ static int f2fs_begin_enable_verity(struct file *filp)
if (f2fs_verity_in_progress(inode))
return -EBUSY;
- if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+ if (f2fs_is_atomic_file(inode))
return -EOPNOTSUPP;
/*
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 02d4d4234956..a415c02ede39 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -126,6 +126,7 @@ struct msdos_inode_info {
struct hlist_node i_fat_hash; /* hash by i_location */
struct hlist_node i_dir_hash; /* hash by i_logstart */
struct rw_semaphore truncate_lock; /* protect bmap against truncate */
+ struct timespec64 i_crtime; /* File creation (birth) time */
struct inode vfs_inode;
};
@@ -433,8 +434,15 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...);
__fat_fs_error(sb, 1, fmt , ## args)
#define fat_fs_error_ratelimit(sb, fmt, args...) \
__fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args)
+
+#define FAT_PRINTK_PREFIX "%sFAT-fs (%s): "
+#define fat_msg(sb, level, fmt, args...) \
+do { \
+ printk_index_subsys_emit(FAT_PRINTK_PREFIX, level, fmt, ##args);\
+ _fat_msg(sb, level, fmt, ##args); \
+} while (0)
__printf(3, 4) __cold
-void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...);
+void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...);
#define fat_msg_ratelimit(sb, level, fmt, args...) \
do { \
if (__ratelimit(&MSDOS_SB(sb)->ratelimit)) \
@@ -446,6 +454,10 @@ extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 __time, __le16 __date, u8 time_cs);
extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 *time, __le16 *date, u8 *time_cs);
+extern struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi,
+ const struct timespec64 *ts);
+extern struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi,
+ const struct timespec64 *ts);
extern int fat_truncate_time(struct inode *inode, struct timespec64 *now,
int flags);
extern int fat_update_time(struct inode *inode, struct timespec64 *now,
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 978ac6751aeb..1db348f8f887 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -94,7 +94,8 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
err_brelse:
brelse(bhs[0]);
err:
- fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr);
+ fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
+ (llu)blocknr);
return -EIO;
}
@@ -107,8 +108,8 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
fatent->bhs[0] = sb_bread(sb, blocknr);
if (!fatent->bhs[0]) {
- fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
- (llu)blocknr);
+ fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
+ (llu)blocknr);
return -EIO;
}
fatent->nr_bhs = 1;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index bf91f977debe..3dae3ed60f3a 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -398,13 +398,21 @@ int fat_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int flags)
{
struct inode *inode = d_inode(path->dentry);
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+
generic_fillattr(mnt_userns, inode, stat);
- stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size;
+ stat->blksize = sbi->cluster_size;
- if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) {
+ if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) {
/* Use i_pos for ino. This is used as fileid of nfs. */
- stat->ino = fat_i_pos_read(MSDOS_SB(inode->i_sb), inode);
+ stat->ino = fat_i_pos_read(sbi, inode);
}
+
+ if (sbi->options.isvfat && request_mask & STATX_BTIME) {
+ stat->result_mask |= STATX_BTIME;
+ stat->btime = MSDOS_I(inode)->i_crtime;
+ }
+
return 0;
}
EXPORT_SYMBOL_GPL(fat_getattr);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 69b4d4ae64d7..a38238d75c08 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -567,12 +567,13 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
& ~((loff_t)sbi->cluster_size - 1)) >> 9;
fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0);
+ inode->i_ctime = inode->i_mtime;
if (sbi->options.isvfat) {
- fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime,
- de->cdate, de->ctime_cs);
fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
+ fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime,
+ de->cdate, de->ctime_cs);
} else
- fat_truncate_time(inode, &inode->i_mtime, S_ATIME|S_CTIME);
+ inode->i_atime = fat_truncate_atime(sbi, &inode->i_mtime);
return 0;
}
@@ -757,6 +758,8 @@ static struct inode *fat_alloc_inode(struct super_block *sb)
ei->i_logstart = 0;
ei->i_attrs = 0;
ei->i_pos = 0;
+ ei->i_crtime.tv_sec = 0;
+ ei->i_crtime.tv_nsec = 0;
return &ei->vfs_inode;
}
@@ -888,10 +891,10 @@ retry:
&raw_entry->date, NULL);
if (sbi->options.isvfat) {
__le16 atime;
- fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime,
- &raw_entry->cdate, &raw_entry->ctime_cs);
fat_time_unix2fat(sbi, &inode->i_atime, &atime,
&raw_entry->adate, NULL);
+ fat_time_unix2fat(sbi, &MSDOS_I(inode)->i_crtime, &raw_entry->ctime,
+ &raw_entry->cdate, &raw_entry->ctime_cs);
}
spin_unlock(&sbi->inode_hash_lock);
mark_buffer_dirty(bh);
@@ -1885,10 +1888,8 @@ out_invalid:
fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem");
out_fail:
- if (fsinfo_inode)
- iput(fsinfo_inode);
- if (fat_inode)
- iput(fat_inode);
+ iput(fsinfo_inode);
+ iput(fat_inode);
unload_nls(sbi->nls_io);
unload_nls(sbi->nls_disk);
fat_reset_iocharset(&sbi->options);
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 91ca3c304211..7e5d6ae305f2 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -42,10 +42,16 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
EXPORT_SYMBOL_GPL(__fat_fs_error);
/**
- * fat_msg() - print preformated FAT specific messages. Every thing what is
- * not fat_fs_error() should be fat_msg().
+ * _fat_msg() - Print a preformatted FAT message based on a superblock.
+ * @sb: A pointer to a &struct super_block
+ * @level: A Kernel printk level constant
+ * @fmt: The printf-style format string to print.
+ *
+ * Everything that is not fat_fs_error() should be fat_msg().
+ *
+ * fat_msg() wraps _fat_msg() for printk indexing.
*/
-void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
+void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -53,7 +59,7 @@ void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf);
+ _printk(FAT_PRINTK_PREFIX "%pV\n", level, sb->s_id, &vaf);
va_end(args);
}
@@ -187,7 +193,7 @@ static long days_in_year[] = {
0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
};
-static inline int fat_tz_offset(struct msdos_sb_info *sbi)
+static inline int fat_tz_offset(const struct msdos_sb_info *sbi)
{
return (sbi->options.tz_set ?
-sbi->options.time_offset :
@@ -275,23 +281,35 @@ static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts)
return (struct timespec64){ ts.tv_sec & ~1ULL, 0 };
}
-static inline struct timespec64 fat_timespec64_trunc_10ms(struct timespec64 ts)
+/*
+ * truncate atime to 24 hour granularity (00:00:00 in local timezone)
+ */
+struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi,
+ const struct timespec64 *ts)
+{
+ /* to localtime */
+ time64_t seconds = ts->tv_sec - fat_tz_offset(sbi);
+ s32 remainder;
+
+ div_s64_rem(seconds, SECS_PER_DAY, &remainder);
+ /* to day boundary, and back to unix time */
+ seconds = seconds + fat_tz_offset(sbi) - remainder;
+
+ return (struct timespec64){ seconds, 0 };
+}
+
+/*
+ * truncate mtime to 2 second granularity
+ */
+struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi,
+ const struct timespec64 *ts)
{
- if (ts.tv_nsec)
- ts.tv_nsec -= ts.tv_nsec % 10000000UL;
- return ts;
+ return fat_timespec64_trunc_2secs(*ts);
}
/*
* truncate the various times with appropriate granularity:
- * root inode:
- * all times always 0
- * all other inodes:
- * mtime - 2 seconds
- * ctime
- * msdos - 2 seconds
- * vfat - 10 milliseconds
- * atime - 24 hours (00:00:00 in local timezone)
+ * all times in root node are always 0
*/
int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags)
{
@@ -306,25 +324,15 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags)
ts = current_time(inode);
}
- if (flags & S_ATIME) {
- /* to localtime */
- time64_t seconds = now->tv_sec - fat_tz_offset(sbi);
- s32 remainder;
-
- div_s64_rem(seconds, SECS_PER_DAY, &remainder);
- /* to day boundary, and back to unix time */
- seconds = seconds + fat_tz_offset(sbi) - remainder;
-
- inode->i_atime = (struct timespec64){ seconds, 0 };
- }
- if (flags & S_CTIME) {
- if (sbi->options.isvfat)
- inode->i_ctime = fat_timespec64_trunc_10ms(*now);
- else
- inode->i_ctime = fat_timespec64_trunc_2secs(*now);
- }
+ if (flags & S_ATIME)
+ inode->i_atime = fat_truncate_atime(sbi, now);
+ /*
+ * ctime and mtime share the same on-disk field, and should be
+ * identical in memory. all mtime updates will be applied to ctime,
+ * but ctime updates are ignored.
+ */
if (flags & S_MTIME)
- inode->i_mtime = fat_timespec64_trunc_2secs(*now);
+ inode->i_mtime = inode->i_ctime = fat_truncate_mtime(sbi, now);
return 0;
}
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 5369d82e0bfb..c573314806cf 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -780,8 +780,6 @@ static int vfat_create(struct user_namespace *mnt_userns, struct inode *dir,
goto out;
}
inode_inc_iversion(inode);
- fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
- /* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
out:
@@ -878,8 +876,6 @@ static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
}
inode_inc_iversion(inode);
set_nlink(inode, 2);
- fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
- /* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f15d885b9796..34a3faa4886d 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -56,11 +56,10 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
arg |= O_NONBLOCK;
/* Pipe packetized mode is controlled by O_DIRECT flag */
- if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) {
- if (!filp->f_mapping || !filp->f_mapping->a_ops ||
- !filp->f_mapping->a_ops->direct_IO)
- return -EINVAL;
- }
+ if (!S_ISFIFO(inode->i_mode) &&
+ (arg & O_DIRECT) &&
+ !(filp->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
if (filp->f_op->check_flags)
error = filp->f_op->check_flags(arg);
diff --git a/fs/file.c b/fs/file.c
index ee9317346702..3bcc1ecc314a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -630,32 +630,23 @@ EXPORT_SYMBOL(fd_install);
* @files: file struct to retrieve file from
* @fd: file descriptor to retrieve file for
*
- * If this functions returns an EINVAL error pointer the fd was beyond the
- * current maximum number of file descriptors for that fdtable.
+ * Context: files_lock must be held.
*
- * Returns: The file associated with @fd, on error returns an error pointer.
+ * Returns: The file associated with @fd (NULL if @fd is not open)
*/
static struct file *pick_file(struct files_struct *files, unsigned fd)
{
+ struct fdtable *fdt = files_fdtable(files);
struct file *file;
- struct fdtable *fdt;
- spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- if (fd >= fdt->max_fds) {
- file = ERR_PTR(-EINVAL);
- goto out_unlock;
- }
+ if (fd >= fdt->max_fds)
+ return NULL;
+
file = fdt->fd[fd];
- if (!file) {
- file = ERR_PTR(-EBADF);
- goto out_unlock;
+ if (file) {
+ rcu_assign_pointer(fdt->fd[fd], NULL);
+ __put_unused_fd(files, fd);
}
- rcu_assign_pointer(fdt->fd[fd], NULL);
- __put_unused_fd(files, fd);
-
-out_unlock:
- spin_unlock(&files->file_lock);
return file;
}
@@ -664,8 +655,10 @@ int close_fd(unsigned fd)
struct files_struct *files = current->files;
struct file *file;
+ spin_lock(&files->file_lock);
file = pick_file(files, fd);
- if (IS_ERR(file))
+ spin_unlock(&files->file_lock);
+ if (!file)
return -EBADF;
return filp_close(file, files);
@@ -702,20 +695,25 @@ static inline void __range_cloexec(struct files_struct *cur_fds,
static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
unsigned int max_fd)
{
+ unsigned n;
+
+ rcu_read_lock();
+ n = last_fd(files_fdtable(cur_fds));
+ rcu_read_unlock();
+ max_fd = min(max_fd, n);
+
while (fd <= max_fd) {
struct file *file;
+ spin_lock(&cur_fds->file_lock);
file = pick_file(cur_fds, fd++);
- if (!IS_ERR(file)) {
+ spin_unlock(&cur_fds->file_lock);
+
+ if (file) {
/* found a valid file to close */
filp_close(file, cur_fds);
cond_resched();
- continue;
}
-
- /* beyond the last fd in that table */
- if (PTR_ERR(file) == -EINVAL)
- return;
}
}
@@ -795,43 +793,25 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
* See close_fd_get_file() below, this variant assumes current->files->file_lock
* is held.
*/
-int __close_fd_get_file(unsigned int fd, struct file **res)
+struct file *__close_fd_get_file(unsigned int fd)
{
- struct files_struct *files = current->files;
- struct file *file;
- struct fdtable *fdt;
-
- fdt = files_fdtable(files);
- if (fd >= fdt->max_fds)
- goto out_err;
- file = fdt->fd[fd];
- if (!file)
- goto out_err;
- rcu_assign_pointer(fdt->fd[fd], NULL);
- __put_unused_fd(files, fd);
- get_file(file);
- *res = file;
- return 0;
-out_err:
- *res = NULL;
- return -ENOENT;
+ return pick_file(current->files, fd);
}
/*
* variant of close_fd that gets a ref on the file for later fput.
- * The caller must ensure that filp_close() called on the file, and then
- * an fput().
+ * The caller must ensure that filp_close() called on the file.
*/
-int close_fd_get_file(unsigned int fd, struct file **res)
+struct file *close_fd_get_file(unsigned int fd)
{
struct files_struct *files = current->files;
- int ret;
+ struct file *file;
spin_lock(&files->file_lock);
- ret = __close_fd_get_file(fd, res);
+ file = pick_file(files, fd);
spin_unlock(&files->file_lock);
- return ret;
+ return file;
}
void do_close_on_exec(struct files_struct *files)
@@ -871,7 +851,7 @@ void do_close_on_exec(struct files_struct *files)
}
static inline struct file *__fget_files_rcu(struct files_struct *files,
- unsigned int fd, fmode_t mask, unsigned int refs)
+ unsigned int fd, fmode_t mask)
{
for (;;) {
struct file *file;
@@ -897,10 +877,9 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
* Such a race can take two forms:
*
* (a) the file ref already went down to zero,
- * and get_file_rcu_many() fails. Just try
- * again:
+ * and get_file_rcu() fails. Just try again:
*/
- if (unlikely(!get_file_rcu_many(file, refs)))
+ if (unlikely(!get_file_rcu(file)))
continue;
/*
@@ -909,11 +888,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
* pointer having changed, because it always goes
* hand-in-hand with 'fdt'.
*
- * If so, we need to put our refs and try again.
+ * If so, we need to put our ref and try again.
*/
if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
unlikely(rcu_dereference_raw(*fdentry) != file)) {
- fput_many(file, refs);
+ fput(file);
continue;
}
@@ -926,37 +905,31 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
}
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
- fmode_t mask, unsigned int refs)
+ fmode_t mask)
{
struct file *file;
rcu_read_lock();
- file = __fget_files_rcu(files, fd, mask, refs);
+ file = __fget_files_rcu(files, fd, mask);
rcu_read_unlock();
return file;
}
-static inline struct file *__fget(unsigned int fd, fmode_t mask,
- unsigned int refs)
-{
- return __fget_files(current->files, fd, mask, refs);
-}
-
-struct file *fget_many(unsigned int fd, unsigned int refs)
+static inline struct file *__fget(unsigned int fd, fmode_t mask)
{
- return __fget(fd, FMODE_PATH, refs);
+ return __fget_files(current->files, fd, mask);
}
struct file *fget(unsigned int fd)
{
- return __fget(fd, FMODE_PATH, 1);
+ return __fget(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fget);
struct file *fget_raw(unsigned int fd)
{
- return __fget(fd, 0, 1);
+ return __fget(fd, 0);
}
EXPORT_SYMBOL(fget_raw);
@@ -966,7 +939,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
task_lock(task);
if (task->files)
- file = __fget_files(task->files, fd, 0, 1);
+ file = __fget_files(task->files, fd, 0);
task_unlock(task);
return file;
@@ -1035,7 +1008,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
return 0;
return (unsigned long)file;
} else {
- file = __fget(fd, mask, 1);
+ file = __fget(fd, mask);
if (!file)
return 0;
return FDPUT_FPUT | (unsigned long)file;
diff --git a/fs/file_table.c b/fs/file_table.c
index ada8fe814db9..5424e3a8df5f 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -368,9 +368,9 @@ EXPORT_SYMBOL_GPL(flush_delayed_fput);
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
-void fput_many(struct file *file, unsigned int refs)
+void fput(struct file *file)
{
- if (atomic_long_sub_and_test(refs, &file->f_count)) {
+ if (atomic_long_dec_and_test(&file->f_count)) {
struct task_struct *task = current;
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
@@ -389,11 +389,6 @@ void fput_many(struct file *file, unsigned int refs)
}
}
-void fput(struct file *file)
-{
- fput_many(file, 1);
-}
-
/*
* synchronous analog of fput(); for kernel threads that might be needed
* in some umount() (and thus can't use flush_delayed_fput() without
diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h
index a41ea0ba6943..bffd156d6434 100644
--- a/fs/freevxfs/vxfs.h
+++ b/fs/freevxfs/vxfs.h
@@ -1,32 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
* Copyright (c) 2016 Krzysztof Blaszkowski
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
*/
#ifndef _VXFS_SUPER_H_
#define _VXFS_SUPER_H_
diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c
index 1fd41cf98b9f..de2a5bccb930 100644
--- a/fs/freevxfs/vxfs_bmap.c
+++ b/fs/freevxfs/vxfs_bmap.c
@@ -1,30 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h
index acc5477b3f23..fbcd603365ad 100644
--- a/fs/freevxfs/vxfs_dir.h
+++ b/fs/freevxfs/vxfs_dir.h
@@ -1,31 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
*/
#ifndef _VXFS_DIR_H_
#define _VXFS_DIR_H_
diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h
index f5c428e21024..3a2180c5e208 100644
--- a/fs/freevxfs/vxfs_extern.h
+++ b/fs/freevxfs/vxfs_extern.h
@@ -1,31 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
*/
#ifndef _VXFS_EXTERN_H_
#define _VXFS_EXTERN_H_
diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c
index a4610a77649e..c1174a3f8990 100644
--- a/fs/freevxfs/vxfs_fshead.c
+++ b/fs/freevxfs/vxfs_fshead.c
@@ -1,31 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
* Copyright (c) 2016 Krzysztof Blaszkowski
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
diff --git a/fs/freevxfs/vxfs_fshead.h b/fs/freevxfs/vxfs_fshead.h
index e026f0c49159..dfd2147599c4 100644
--- a/fs/freevxfs/vxfs_fshead.h
+++ b/fs/freevxfs/vxfs_fshead.h
@@ -1,32 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
* Copyright (c) 2016 Krzysztof Blaszkowski
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
*/
#ifndef _VXFS_FSHEAD_H_
#define _VXFS_FSHEAD_H_
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index a37431e443d3..c2ef9f0debbd 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -1,30 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 1f41b25ef38b..ceb6a12649ba 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -1,31 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
* Copyright (c) 2016 Krzysztof Blaszkowski
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
diff --git a/fs/freevxfs/vxfs_inode.h b/fs/freevxfs/vxfs_inode.h
index f012abed125d..1e9e138d2b33 100644
--- a/fs/freevxfs/vxfs_inode.h
+++ b/fs/freevxfs/vxfs_inode.h
@@ -1,32 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
* Copyright (c) 2016 Krzysztof Blaszkowski
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
*/
#ifndef _VXFS_INODE_H_
#define _VXFS_INODE_H_
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index a51425634f65..f04ba2ed1e1a 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -1,31 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
* Copyright (c) 2016 Krzysztof Blaszkowski
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
diff --git a/fs/freevxfs/vxfs_olt.c b/fs/freevxfs/vxfs_olt.c
index 813da6685151..23f35187c289 100644
--- a/fs/freevxfs/vxfs_olt.c
+++ b/fs/freevxfs/vxfs_olt.c
@@ -1,30 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
diff --git a/fs/freevxfs/vxfs_olt.h b/fs/freevxfs/vxfs_olt.h
index 0c0b0c9fa557..53afba08d617 100644
--- a/fs/freevxfs/vxfs_olt.h
+++ b/fs/freevxfs/vxfs_olt.h
@@ -1,31 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
*/
#ifndef _VXFS_OLT_H_
#define _VXFS_OLT_H_
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 6143ebab940d..0e633d2bfc7d 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -1,30 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 22eed5a73ac2..c3b82f716f9a 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -1,31 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
* Copyright (c) 2016 Krzysztof Blaszkowski
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a1074a26e784..05221366a16d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -120,6 +120,7 @@ static bool inode_io_list_move_locked(struct inode *inode,
struct list_head *head)
{
assert_spin_locked(&wb->list_lock);
+ assert_spin_locked(&inode->i_lock);
list_move(&inode->i_io_list, head);
@@ -738,7 +739,7 @@ EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
* incorrectly attributed).
*
* To resolve this issue, cgroup writeback detects the majority dirtier of
- * an inode and transfers the ownership to it. To avoid unnnecessary
+ * an inode and transfers the ownership to it. To avoid unnecessary
* oscillation, the detection mechanism keeps track of history and gives
* out the switch verdict only if the foreign usage pattern is stable over
* a certain amount of time and/or writeback attempts.
@@ -1365,9 +1366,9 @@ static int move_expired_inodes(struct list_head *delaying_queue,
inode = wb_inode(delaying_queue->prev);
if (inode_dirtied_after(inode, dirtied_before))
break;
+ spin_lock(&inode->i_lock);
list_move(&inode->i_io_list, &tmp);
moved++;
- spin_lock(&inode->i_lock);
inode->i_state |= I_SYNC_QUEUED;
spin_unlock(&inode->i_lock);
if (sb_is_blkdev_sb(inode->i_sb))
@@ -1383,7 +1384,12 @@ static int move_expired_inodes(struct list_head *delaying_queue,
goto out;
}
- /* Move inodes from one superblock together */
+ /*
+ * Although inode's i_io_list is moved from 'tmp' to 'dispatch_queue',
+ * we don't take inode->i_lock here because it is just a pointless overhead.
+ * Inode is already marked as I_SYNC_QUEUED so writeback list handling is
+ * fully under our control.
+ */
while (!list_empty(&tmp)) {
sb = wb_inode(tmp.prev)->i_sb;
list_for_each_prev_safe(pos, node, &tmp) {
@@ -1826,8 +1832,8 @@ static long writeback_sb_inodes(struct super_block *sb,
* We'll have another go at writing back this inode
* when we completed a full scan of b_io.
*/
- spin_unlock(&inode->i_lock);
requeue_io(inode, wb);
+ spin_unlock(&inode->i_lock);
trace_writeback_sb_inodes_requeue(inode);
continue;
}
@@ -2358,6 +2364,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block *sb = inode->i_sb;
int dirtytime = 0;
+ struct bdi_writeback *wb = NULL;
trace_writeback_mark_inode_dirty(inode, flags);
@@ -2410,13 +2417,24 @@ void __mark_inode_dirty(struct inode *inode, int flags)
inode->i_state |= flags;
/*
+ * Grab inode's wb early because it requires dropping i_lock and we
+ * need to make sure following checks happen atomically with dirty
+ * list handling so that we don't move inodes under flush worker's
+ * hands.
+ */
+ if (!was_dirty) {
+ wb = locked_inode_to_wb_and_lock_list(inode);
+ spin_lock(&inode->i_lock);
+ }
+
+ /*
* If the inode is queued for writeback by flush worker, just
* update its dirty state. Once the flush worker is done with
* the inode it will place it on the appropriate superblock
* list, based upon its state.
*/
if (inode->i_state & I_SYNC_QUEUED)
- goto out_unlock_inode;
+ goto out_unlock;
/*
* Only add valid (hashed) inodes to the superblock's
@@ -2424,22 +2442,19 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
if (!S_ISBLK(inode->i_mode)) {
if (inode_unhashed(inode))
- goto out_unlock_inode;
+ goto out_unlock;
}
if (inode->i_state & I_FREEING)
- goto out_unlock_inode;
+ goto out_unlock;
/*
* If the inode was already on b_dirty/b_io/b_more_io, don't
* reposition it (that would break b_dirty time-ordering).
*/
if (!was_dirty) {
- struct bdi_writeback *wb;
struct list_head *dirty_list;
bool wakeup_bdi = false;
- wb = locked_inode_to_wb_and_lock_list(inode);
-
inode->dirtied_when = jiffies;
if (dirtytime)
inode->dirtied_time_when = jiffies;
@@ -2453,6 +2468,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
dirty_list);
spin_unlock(&wb->list_lock);
+ spin_unlock(&inode->i_lock);
trace_writeback_dirty_inode_enqueue(inode);
/*
@@ -2467,6 +2483,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
return;
}
}
+out_unlock:
+ if (wb)
+ spin_unlock(&wb->list_lock);
out_unlock_inode:
spin_unlock(&inode->i_lock);
}
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 27a890aa493a..fc9d2d9fd234 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -119,7 +119,7 @@ SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags)
const char *fs_name;
int ret;
- if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ if (!may_mount())
return -EPERM;
if (flags & ~FSOPEN_CLOEXEC)
@@ -162,7 +162,7 @@ SYSCALL_DEFINE3(fspick, int, dfd, const char __user *, path, unsigned int, flags
unsigned int lookup_flags;
int ret;
- if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ if (!may_mount())
return -EPERM;
if ((flags & ~(FSPICK_CLOEXEC |
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index d7d3a7f06862..10eb50cbf398 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -1241,8 +1241,8 @@ static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker);
id = dax_read_lock();
- nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), NULL,
- NULL);
+ nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size),
+ DAX_ACCESS, NULL, NULL);
dax_read_unlock(id);
if (nr_pages < 0) {
pr_debug("dax_direct_access() returned %ld\n", nr_pages);
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 86b7dbb6a0d4..8db53fa67359 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -752,7 +752,8 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
* offset.
*/
static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
- long nr_pages, void **kaddr, pfn_t *pfn)
+ long nr_pages, enum dax_access_mode mode,
+ void **kaddr, pfn_t *pfn)
{
struct virtio_fs *fs = dax_get_private(dax_dev);
phys_addr_t offset = PFN_PHYS(pgoff);
@@ -772,7 +773,8 @@ static int virtio_fs_zero_page_range(struct dax_device *dax_dev,
long rc;
void *kaddr;
- rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL);
+ rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr,
+ NULL);
if (rc < 0)
return rc;
memset(kaddr, 0, nr_pages << PAGE_SHIFT);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index e540d1290099..0b36a16659b6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -405,10 +405,13 @@ static void do_error(struct gfs2_glock *gl, const int ret)
/**
* demote_incompat_holders - demote incompatible demoteable holders
* @gl: the glock we want to promote
- * @new_gh: the new holder to be promoted
+ * @current_gh: the newly promoted holder
+ *
+ * We're passing the newly promoted holder in @current_gh, but actually, any of
+ * the strong holders would do.
*/
static void demote_incompat_holders(struct gfs2_glock *gl,
- struct gfs2_holder *new_gh)
+ struct gfs2_holder *current_gh)
{
struct gfs2_holder *gh, *tmp;
@@ -424,8 +427,10 @@ static void demote_incompat_holders(struct gfs2_glock *gl,
*/
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
return;
+ if (gh == current_gh)
+ continue;
if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
- !may_grant(gl, new_gh, gh)) {
+ !may_grant(gl, current_gh, gh)) {
/*
* We should not recurse into do_promote because
* __gfs2_glock_dq only calls handle_callback,
@@ -478,8 +483,7 @@ find_first_strong_holder(struct gfs2_glock *gl)
* gfs2_instantiate - Call the glops instantiate function
* @gh: The glock holder
*
- * Returns: 0 if instantiate was successful, 2 if type specific operation is
- * underway, or error.
+ * Returns: 0 if instantiate was successful, or error.
*/
int gfs2_instantiate(struct gfs2_holder *gh)
{
@@ -489,7 +493,7 @@ int gfs2_instantiate(struct gfs2_holder *gh)
again:
if (!test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags))
- return 0;
+ goto done;
/*
* Since we unlock the lockref lock, we set a flag to indicate
@@ -508,37 +512,36 @@ again:
goto again;
}
- ret = glops->go_instantiate(gh);
+ ret = glops->go_instantiate(gl);
if (!ret)
clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags);
clear_and_wake_up_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags);
- return ret;
+ if (ret)
+ return ret;
+
+done:
+ if (glops->go_held)
+ return glops->go_held(gh);
+ return 0;
}
/**
* do_promote - promote as many requests as possible on the current queue
* @gl: The glock
*
- * Returns: 1 if there is a blocked holder at the head of the list, or 2
- * if a type specific operation is underway.
+ * Returns: 1 if there is a blocked holder at the head of the list
*/
static int do_promote(struct gfs2_glock *gl)
-__releases(&gl->gl_lockref.lock)
-__acquires(&gl->gl_lockref.lock)
{
- struct gfs2_holder *gh, *tmp, *first_gh;
+ struct gfs2_holder *gh, *current_gh;
bool incompat_holders_demoted = false;
- bool lock_released;
- int ret;
-restart:
- first_gh = find_first_strong_holder(gl);
- list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
- lock_released = false;
+ current_gh = find_first_strong_holder(gl);
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
continue;
- if (!may_grant(gl, first_gh, gh)) {
+ if (!may_grant(gl, current_gh, gh)) {
/*
* If we get here, it means we may not grant this
* holder for some reason. If this holder is at the
@@ -550,37 +553,14 @@ restart:
do_error(gl, 0);
break;
}
- if (!incompat_holders_demoted) {
- demote_incompat_holders(gl, first_gh);
- incompat_holders_demoted = true;
- first_gh = gh;
- }
- if (test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags) &&
- !(gh->gh_flags & GL_SKIP) && gl->gl_ops->go_instantiate) {
- lock_released = true;
- spin_unlock(&gl->gl_lockref.lock);
- ret = gfs2_instantiate(gh);
- spin_lock(&gl->gl_lockref.lock);
- if (ret) {
- if (ret == 1)
- return 2;
- gh->gh_error = ret;
- list_del_init(&gh->gh_list);
- trace_gfs2_glock_queue(gh, 0);
- gfs2_holder_wake(gh);
- goto restart;
- }
- }
set_bit(HIF_HOLDER, &gh->gh_iflags);
trace_gfs2_promote(gh);
gfs2_holder_wake(gh);
- /*
- * If we released the gl_lockref.lock the holders list may have
- * changed. For that reason, we start again at the start of
- * the holders queue.
- */
- if (lock_released)
- goto restart;
+ if (!incompat_holders_demoted) {
+ current_gh = gh;
+ demote_incompat_holders(gl, current_gh);
+ incompat_holders_demoted = true;
+ }
}
return 0;
}
@@ -658,7 +638,6 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_holder *gh;
unsigned state = ret & LM_OUT_ST_MASK;
- int rv;
spin_lock(&gl->gl_lockref.lock);
trace_gfs2_glock_state_change(gl, state);
@@ -716,6 +695,8 @@ retry:
gfs2_demote_wake(gl);
if (state != LM_ST_UNLOCKED) {
if (glops->go_xmote_bh) {
+ int rv;
+
spin_unlock(&gl->gl_lockref.lock);
rv = glops->go_xmote_bh(gl);
spin_lock(&gl->gl_lockref.lock);
@@ -724,13 +705,10 @@ retry:
goto out;
}
}
- rv = do_promote(gl);
- if (rv == 2)
- goto out_locked;
+ do_promote(gl);
}
out:
clear_bit(GLF_LOCK, &gl->gl_flags);
-out_locked:
spin_unlock(&gl->gl_lockref.lock);
}
@@ -887,7 +865,6 @@ __releases(&gl->gl_lockref.lock)
__acquires(&gl->gl_lockref.lock)
{
struct gfs2_holder *gh = NULL;
- int ret;
if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
return;
@@ -906,18 +883,14 @@ __acquires(&gl->gl_lockref.lock)
} else {
if (test_bit(GLF_DEMOTE, &gl->gl_flags))
gfs2_demote_wake(gl);
- ret = do_promote(gl);
- if (ret == 0)
+ if (do_promote(gl) == 0)
goto out_unlock;
- if (ret == 2)
- goto out;
gh = find_first_waiter(gl);
gl->gl_target = gh->gh_state;
if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
do_error(gl, 0); /* Fail queued try locks */
}
do_xmote(gl, gh, gl->gl_target);
-out:
return;
out_sched:
@@ -1315,6 +1288,25 @@ static void gfs2_glock_update_hold_time(struct gfs2_glock *gl,
}
/**
+ * gfs2_glock_holder_ready - holder is ready and its error code can be collected
+ * @gh: the glock holder
+ *
+ * Called when a glock holder no longer needs to be waited for because it is
+ * now either held (HIF_HOLDER set; gh_error == 0), or acquiring the lock has
+ * failed (gh_error != 0).
+ */
+
+int gfs2_glock_holder_ready(struct gfs2_holder *gh)
+{
+ if (gh->gh_error || (gh->gh_flags & GL_SKIP))
+ return gh->gh_error;
+ gh->gh_error = gfs2_instantiate(gh);
+ if (gh->gh_error)
+ gfs2_glock_dq(gh);
+ return gh->gh_error;
+}
+
+/**
* gfs2_glock_wait - wait on a glock acquisition
* @gh: the glock holder
*
@@ -1328,7 +1320,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
might_sleep();
wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE);
gfs2_glock_update_hold_time(gh->gh_gl, start_time);
- return gh->gh_error;
+ return gfs2_glock_holder_ready(gh);
}
static int glocks_pending(unsigned int num_gh, struct gfs2_holder *ghs)
@@ -1356,7 +1348,6 @@ int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs)
struct gfs2_sbd *sdp = ghs[0].gh_gl->gl_name.ln_sbd;
int i, ret = 0, timeout = 0;
unsigned long start_time = jiffies;
- bool keep_waiting;
might_sleep();
/*
@@ -1366,53 +1357,33 @@ int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs)
for (i = 0; i < num_gh; i++)
timeout += ghs[i].gh_gl->gl_hold_time << 1;
-wait_for_dlm:
if (!wait_event_timeout(sdp->sd_async_glock_wait,
- !glocks_pending(num_gh, ghs), timeout))
+ !glocks_pending(num_gh, ghs), timeout)) {
ret = -ESTALE; /* request timed out. */
+ goto out;
+ }
- /*
- * If dlm granted all our requests, we need to adjust the glock
- * minimum hold time values according to how long we waited.
- *
- * If our request timed out, we need to repeatedly release any held
- * glocks we acquired thus far to allow dlm to acquire the remaining
- * glocks without deadlocking. We cannot currently cancel outstanding
- * glock acquisitions.
- *
- * The HIF_WAIT bit tells us which requests still need a response from
- * dlm.
- *
- * If dlm sent us any errors, we return the first error we find.
- */
- keep_waiting = false;
for (i = 0; i < num_gh; i++) {
- /* Skip holders we have already dequeued below. */
- if (!gfs2_holder_queued(&ghs[i]))
- continue;
- /* Skip holders with a pending DLM response. */
- if (test_bit(HIF_WAIT, &ghs[i].gh_iflags)) {
- keep_waiting = true;
- continue;
- }
+ struct gfs2_holder *gh = &ghs[i];
+ int ret2;
- if (test_bit(HIF_HOLDER, &ghs[i].gh_iflags)) {
- if (ret == -ESTALE)
- gfs2_glock_dq(&ghs[i]);
- else
- gfs2_glock_update_hold_time(ghs[i].gh_gl,
- start_time);
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags)) {
+ gfs2_glock_update_hold_time(gh->gh_gl,
+ start_time);
}
+ ret2 = gfs2_glock_holder_ready(gh);
if (!ret)
- ret = ghs[i].gh_error;
+ ret = ret2;
}
- if (keep_waiting)
- goto wait_for_dlm;
+out:
+ if (ret) {
+ for (i = 0; i < num_gh; i++) {
+ struct gfs2_holder *gh = &ghs[i];
- /*
- * At this point, we've either acquired all locks or released them all.
- */
+ gfs2_glock_dq(gh);
+ }
+ }
return ret;
}
@@ -1491,10 +1462,10 @@ __acquires(&gl->gl_lockref.lock)
if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
if (test_bit(GLF_LOCK, &gl->gl_flags)) {
- struct gfs2_holder *first_gh;
+ struct gfs2_holder *current_gh;
- first_gh = find_first_strong_holder(gl);
- try_futile = !may_grant(gl, first_gh, gh);
+ current_gh = find_first_strong_holder(gl);
+ try_futile = !may_grant(gl, current_gh, gh);
}
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
goto fail;
@@ -2242,20 +2213,6 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
glock_hash_walk(dump_glock_func, sdp);
}
-void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
-{
- struct gfs2_glock *gl = ip->i_gl;
- int ret;
-
- ret = gfs2_truncatei_resume(ip);
- gfs2_glock_assert_withdraw(gl, ret == 0);
-
- spin_lock(&gl->gl_lockref.lock);
- clear_bit(GLF_LOCK, &gl->gl_flags);
- run_queue(gl, 1);
- spin_unlock(&gl->gl_lockref.lock);
-}
-
static const char *state2str(unsigned state)
{
switch(state) {
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c0ae9100a0bc..5aed8b500cf5 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -213,6 +213,7 @@ extern void gfs2_holder_uninit(struct gfs2_holder *gh);
extern int gfs2_glock_nq(struct gfs2_holder *gh);
extern int gfs2_glock_poll(struct gfs2_holder *gh);
extern int gfs2_instantiate(struct gfs2_holder *gh);
+extern int gfs2_glock_holder_ready(struct gfs2_holder *gh);
extern int gfs2_glock_wait(struct gfs2_holder *gh);
extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
extern void gfs2_glock_dq(struct gfs2_holder *gh);
@@ -273,7 +274,6 @@ extern void gfs2_cancel_delete_work(struct gfs2_glock *gl);
extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl);
extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp);
extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
-extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
extern void gfs2_glock_free(struct gfs2_glock *gl);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 392800f082a6..49210a2e7ce7 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -485,35 +485,33 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
* Returns: errno
*/
-static int inode_go_instantiate(struct gfs2_holder *gh)
+static int inode_go_instantiate(struct gfs2_glock *gl)
+{
+ struct gfs2_inode *ip = gl->gl_object;
+
+ if (!ip) /* no inode to populate - read it in later */
+ return 0;
+
+ return gfs2_inode_refresh(ip);
+}
+
+static int inode_go_held(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *ip = gl->gl_object;
int error = 0;
if (!ip) /* no inode to populate - read it in later */
- goto out;
-
- error = gfs2_inode_refresh(ip);
- if (error)
- goto out;
+ return 0;
if (gh->gh_state != LM_ST_DEFERRED)
inode_dio_wait(&ip->i_inode);
if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
(gl->gl_state == LM_ST_EXCLUSIVE) &&
- (gh->gh_state == LM_ST_EXCLUSIVE)) {
- spin_lock(&sdp->sd_trunc_lock);
- if (list_empty(&ip->i_trunc_list))
- list_add(&ip->i_trunc_list, &sdp->sd_trunc_list);
- spin_unlock(&sdp->sd_trunc_lock);
- wake_up(&sdp->sd_quota_wait);
- error = 1;
- }
+ (gh->gh_state == LM_ST_EXCLUSIVE))
+ error = gfs2_truncatei_resume(ip);
-out:
return error;
}
@@ -737,6 +735,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
.go_inval = inode_go_inval,
.go_demote_ok = inode_go_demote_ok,
.go_instantiate = inode_go_instantiate,
+ .go_held = inode_go_held,
.go_dump = inode_go_dump,
.go_type = LM_TYPE_INODE,
.go_flags = GLOF_ASPACE | GLOF_LRU | GLOF_LVB,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 8c00fb389ae5..d09d9892cd05 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -219,7 +219,8 @@ struct gfs2_glock_operations {
int (*go_xmote_bh)(struct gfs2_glock *gl);
void (*go_inval) (struct gfs2_glock *gl, int flags);
int (*go_demote_ok) (const struct gfs2_glock *gl);
- int (*go_instantiate) (struct gfs2_holder *gh);
+ int (*go_instantiate) (struct gfs2_glock *gl);
+ int (*go_held)(struct gfs2_holder *gh);
void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl,
const char *fs_id_buf);
void (*go_callback)(struct gfs2_glock *gl, bool remote);
@@ -396,7 +397,6 @@ struct gfs2_inode {
atomic_t i_sizehint; /* hint of the write size */
struct rw_semaphore i_rw_mutex;
struct list_head i_ordered;
- struct list_head i_trunc_list;
__be64 *i_hash_cache;
u32 i_entries;
u32 i_diskflags;
@@ -784,8 +784,6 @@ struct gfs2_sbd {
struct mutex sd_quota_mutex;
struct mutex sd_quota_sync_mutex;
wait_queue_head_t sd_quota_wait;
- struct list_head sd_trunc_list;
- spinlock_t sd_trunc_lock;
unsigned int sd_quota_slots;
unsigned long *sd_quota_bitmap;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 244187e3e70f..d94791527dcb 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -38,7 +38,6 @@ static void gfs2_init_inode_once(void *foo)
inode_init_once(&ip->i_inode);
atomic_set(&ip->i_sizehint, 0);
init_rwsem(&ip->i_rw_mutex);
- INIT_LIST_HEAD(&ip->i_trunc_list);
INIT_LIST_HEAD(&ip->i_ordered);
ip->i_qadata = NULL;
gfs2_holder_mark_uninitialized(&ip->i_rgd_gh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c9b423c874a3..549879929c84 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -106,8 +106,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
mutex_init(&sdp->sd_quota_mutex);
mutex_init(&sdp->sd_quota_sync_mutex);
init_waitqueue_head(&sdp->sd_quota_wait);
- INIT_LIST_HEAD(&sdp->sd_trunc_list);
- spin_lock_init(&sdp->sd_trunc_lock);
spin_lock_init(&sdp->sd_bitmap_lock);
INIT_LIST_HEAD(&sdp->sd_sc_inodes_list);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 59d727a4ae2c..a6667e8d781f 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1517,25 +1517,6 @@ static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
}
}
-static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
-{
- struct gfs2_inode *ip;
-
- while(1) {
- ip = NULL;
- spin_lock(&sdp->sd_trunc_lock);
- if (!list_empty(&sdp->sd_trunc_list)) {
- ip = list_first_entry(&sdp->sd_trunc_list,
- struct gfs2_inode, i_trunc_list);
- list_del_init(&ip->i_trunc_list);
- }
- spin_unlock(&sdp->sd_trunc_lock);
- if (ip == NULL)
- return;
- gfs2_glock_finish_truncate(ip);
- }
-}
-
void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) {
if (!sdp->sd_statfs_force_sync) {
sdp->sd_statfs_force_sync = 1;
@@ -1558,7 +1539,6 @@ int gfs2_quotad(void *data)
unsigned long quotad_timeo = 0;
unsigned long t = 0;
DEFINE_WAIT(wait);
- int empty;
while (!kthread_should_stop()) {
@@ -1579,19 +1559,13 @@ int gfs2_quotad(void *data)
quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
&quotad_timeo, &tune->gt_quota_quantum);
- /* Check for & recover partially truncated inodes */
- quotad_check_trunc_list(sdp);
-
try_to_freeze();
bypass:
t = min(quotad_timeo, statfs_timeo);
prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
- spin_lock(&sdp->sd_trunc_lock);
- empty = list_empty(&sdp->sd_trunc_list);
- spin_unlock(&sdp->sd_trunc_lock);
- if (empty && !sdp->sd_statfs_force_sync)
+ if (!sdp->sd_statfs_force_sync)
t -= schedule_timeout(t);
else
t = 0;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index b9b8faefbe84..f602fb844951 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1196,9 +1196,8 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd)
* Returns: errno
*/
-int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh)
+int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl)
{
- struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_rgrpd *rgd = gl->gl_object;
struct gfs2_sbd *sdp = rgd->rd_sbd;
unsigned int length = rgd->rd_length;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index fd6ee8bfe18d..00b30cf893af 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -31,7 +31,7 @@ extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
-extern int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh);
+extern int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl);
extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index bdb773e5c88f..b5b0f285b27f 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1196,7 +1196,7 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode)
gfs2_glock_dq(gh);
return false;
}
- return true;
+ return gfs2_glock_holder_ready(gh) == 0;
}
/**
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 2de9ca5d260d..02eb72351b15 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -195,7 +195,6 @@ out:
* Called under mmap_write_lock(mm).
*/
-#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long
hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
@@ -206,7 +205,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
info.flags = 0;
info.length = len;
info.low_limit = current->mm->mmap_base;
- info.high_limit = arch_get_mmap_end(addr);
+ info.high_limit = arch_get_mmap_end(addr, len, flags);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
return vm_unmapped_area(&info);
@@ -237,21 +236,22 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = current->mm->mmap_base;
- info.high_limit = arch_get_mmap_end(addr);
+ info.high_limit = arch_get_mmap_end(addr, len, flags);
addr = vm_unmapped_area(&info);
}
return addr;
}
-static unsigned long
-hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
+unsigned long
+generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct hstate *h = hstate_file(file);
- const unsigned long mmap_end = arch_get_mmap_end(addr);
+ const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
if (len & ~huge_page_mask(h))
return -EINVAL;
@@ -283,6 +283,15 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
pgoff, flags);
}
+
+#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+static unsigned long
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags);
+}
#endif
static size_t
@@ -405,7 +414,8 @@ static void remove_huge_page(struct page *page)
}
static void
-hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
+hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
+ zap_flags_t zap_flags)
{
struct vm_area_struct *vma;
@@ -439,7 +449,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
}
unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
- NULL);
+ NULL, zap_flags);
}
}
@@ -517,7 +527,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vmdelete_list(&mapping->i_mmap,
index * pages_per_huge_page(h),
- (index + 1) * pages_per_huge_page(h));
+ (index + 1) * pages_per_huge_page(h),
+ ZAP_FLAG_DROP_MARKER);
i_mmap_unlock_write(mapping);
}
@@ -583,46 +594,85 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
i_mmap_lock_write(mapping);
i_size_write(inode, offset);
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
- hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
+ hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
+ ZAP_FLAG_DROP_MARKER);
i_mmap_unlock_write(mapping);
remove_inode_hugepages(inode, offset, LLONG_MAX);
}
+static void hugetlbfs_zero_partial_page(struct hstate *h,
+ struct address_space *mapping,
+ loff_t start,
+ loff_t end)
+{
+ pgoff_t idx = start >> huge_page_shift(h);
+ struct folio *folio;
+
+ folio = filemap_lock_folio(mapping, idx);
+ if (!folio)
+ return;
+
+ start = start & ~huge_page_mask(h);
+ end = end & ~huge_page_mask(h);
+ if (!end)
+ end = huge_page_size(h);
+
+ folio_zero_segment(folio, (size_t)start, (size_t)end);
+
+ folio_unlock(folio);
+ folio_put(folio);
+}
+
static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
+ struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
+ struct address_space *mapping = inode->i_mapping;
struct hstate *h = hstate_inode(inode);
loff_t hpage_size = huge_page_size(h);
loff_t hole_start, hole_end;
/*
- * For hole punch round up the beginning offset of the hole and
- * round down the end.
+ * hole_start and hole_end indicate the full pages within the hole.
*/
hole_start = round_up(offset, hpage_size);
hole_end = round_down(offset + len, hpage_size);
- if (hole_end > hole_start) {
- struct address_space *mapping = inode->i_mapping;
- struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
+ inode_lock(inode);
- inode_lock(inode);
+ /* protected by i_rwsem */
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
+ inode_unlock(inode);
+ return -EPERM;
+ }
- /* protected by i_rwsem */
- if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
- inode_unlock(inode);
- return -EPERM;
- }
+ i_mmap_lock_write(mapping);
- i_mmap_lock_write(mapping);
+ /* If range starts before first full page, zero partial page. */
+ if (offset < hole_start)
+ hugetlbfs_zero_partial_page(h, mapping,
+ offset, min(offset + len, hole_start));
+
+ /* Unmap users of full pages in the hole. */
+ if (hole_end > hole_start) {
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap,
- hole_start >> PAGE_SHIFT,
- hole_end >> PAGE_SHIFT);
- i_mmap_unlock_write(mapping);
- remove_inode_hugepages(inode, hole_start, hole_end);
- inode_unlock(inode);
+ hole_start >> PAGE_SHIFT,
+ hole_end >> PAGE_SHIFT, 0);
}
+ /* If range extends beyond last full page, zero partial page. */
+ if ((offset + len) > hole_end && (offset + len) > hole_start)
+ hugetlbfs_zero_partial_page(h, mapping,
+ hole_end, offset + len);
+
+ i_mmap_unlock_write(mapping);
+
+ /* Remove full pages from the file. */
+ if (hole_end > hole_start)
+ remove_inode_hugepages(inode, hole_start, hole_end);
+
+ inode_unlock(inode);
+
return 0;
}
@@ -1048,12 +1098,12 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
if (sbinfo->spool) {
long free_pages;
- spin_lock(&sbinfo->spool->lock);
+ spin_lock_irq(&sbinfo->spool->lock);
buf->f_blocks = sbinfo->spool->max_hpages;
free_pages = sbinfo->spool->max_hpages
- sbinfo->spool->used_hpages;
buf->f_bavail = buf->f_bfree = free_pages;
- spin_unlock(&sbinfo->spool->lock);
+ spin_unlock_irq(&sbinfo->spool->lock);
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
}
diff --git a/fs/inode.c b/fs/inode.c
index 9d9b422504d1..bd4da9c5207e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -27,7 +27,7 @@
* Inode locking rules:
*
* inode->i_lock protects:
- * inode->i_state, inode->i_hash, __iget()
+ * inode->i_state, inode->i_hash, __iget(), inode->i_io_list
* Inode LRU list locks protect:
* inode->i_sb->s_inode_lru, inode->i_lru
* inode->i_sb->s_inode_list_lock protects:
diff --git a/fs/internal.h b/fs/internal.h
index 9a6c233ee7f1..87e96b9024ce 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -84,6 +84,7 @@ extern int __mnt_want_write_file(struct file *);
extern void __mnt_drop_write_file(struct file *);
extern void dissolve_on_fput(struct vfsmount *);
+extern bool may_mount(void);
int path_mount(const char *dev_name, struct path *path,
const char *type_page, unsigned long flags, void *data_page);
@@ -125,7 +126,7 @@ extern struct file *do_file_open_root(const struct path *,
const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
-extern int __close_fd_get_file(unsigned int fd, struct file **res);
+extern struct file *__close_fd_get_file(unsigned int fd);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9f1c682d7caf..5ff2cdb425bc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -112,7 +112,8 @@
IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
- REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA)
+ REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
+ REQ_F_ASYNC_DATA)
#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
IO_REQ_CLEAN_FLAGS)
@@ -297,8 +298,8 @@ struct io_buffer_list {
/* below is for ring provided buffers */
__u16 buf_nr_pages;
__u16 nr_entries;
- __u32 head;
- __u32 mask;
+ __u16 head;
+ __u16 mask;
};
struct io_buffer {
@@ -540,6 +541,7 @@ struct io_uring_task {
const struct io_ring_ctx *last;
struct io_wq *io_wq;
struct percpu_counter inflight;
+ atomic_t inflight_tracked;
atomic_t in_idle;
spinlock_t task_lock;
@@ -781,12 +783,6 @@ struct io_msg {
u32 len;
};
-struct io_nop {
- struct file *file;
- u64 extra1;
- u64 extra2;
-};
-
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -848,6 +844,7 @@ enum {
REQ_F_SINGLE_POLL_BIT,
REQ_F_DOUBLE_POLL_BIT,
REQ_F_PARTIAL_IO_BIT,
+ REQ_F_CQE32_INIT_BIT,
REQ_F_APOLL_MULTISHOT_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
@@ -917,6 +914,8 @@ enum {
REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
/* fast poll multishot mode */
REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
+ /* ->extra1 and ->extra2 are initialised */
+ REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT),
};
struct async_poll {
@@ -991,7 +990,6 @@ struct io_kiocb {
struct io_msg msg;
struct io_xattr xattr;
struct io_socket sock;
- struct io_nop nop;
struct io_uring_cmd uring_cmd;
};
@@ -1118,7 +1116,6 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_NOP] = {
.audit_skip = 1,
.iopoll = 1,
- .buffer_select = 1,
},
[IORING_OP_READV] = {
.needs_file = 1,
@@ -1355,8 +1352,6 @@ static void io_clean_op(struct io_kiocb *req);
static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
static struct file *io_file_get_normal(struct io_kiocb *req, int fd);
-static void io_drop_inflight_file(struct io_kiocb *req);
-static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags);
static void io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
@@ -1366,7 +1361,9 @@ static int io_req_prep_async(struct io_kiocb *req);
static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
unsigned int issue_flags, u32 slot_index);
-static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
+static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags,
+ unsigned int offset);
+static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
static void io_eventfd_signal(struct io_ring_ctx *ctx);
@@ -1726,9 +1723,16 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return;
- /* don't recycle if we already did IO to this buffer */
- if (req->flags & REQ_F_PARTIAL_IO)
+ /*
+ * For legacy provided buffer mode, don't recycle if we already did
+ * IO to this buffer. For ring-mapped provided buffer mode, we should
+ * increment ring->head to explicitly monopolize the buffer to avoid
+ * multiple use.
+ */
+ if ((req->flags & REQ_F_BUFFER_SELECTED) &&
+ (req->flags & REQ_F_PARTIAL_IO))
return;
+
/*
* We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
* the flag and hence ensure that bl->head doesn't get incremented.
@@ -1736,8 +1740,13 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
*/
if (req->flags & REQ_F_BUFFER_RING) {
if (req->buf_list) {
- req->buf_index = req->buf_list->bgid;
- req->flags &= ~REQ_F_BUFFER_RING;
+ if (req->flags & REQ_F_PARTIAL_IO) {
+ req->buf_list->head++;
+ req->buf_list = NULL;
+ } else {
+ req->buf_index = req->buf_list->bgid;
+ req->flags &= ~REQ_F_BUFFER_RING;
+ }
}
return;
}
@@ -1757,9 +1766,29 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
bool cancel_all)
__must_hold(&req->ctx->timeout_lock)
{
+ struct io_kiocb *req;
+
if (task && head->task != task)
return false;
- return cancel_all;
+ if (cancel_all)
+ return true;
+
+ io_for_each_link(req, head) {
+ if (req->flags & REQ_F_INFLIGHT)
+ return true;
+ }
+ return false;
+}
+
+static bool io_match_linked(struct io_kiocb *head)
+{
+ struct io_kiocb *req;
+
+ io_for_each_link(req, head) {
+ if (req->flags & REQ_F_INFLIGHT)
+ return true;
+ }
+ return false;
}
/*
@@ -1769,9 +1798,24 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all)
{
+ bool matched;
+
if (task && head->task != task)
return false;
- return cancel_all;
+ if (cancel_all)
+ return true;
+
+ if (head->flags & REQ_F_LINK_TIMEOUT) {
+ struct io_ring_ctx *ctx = head->ctx;
+
+ /* protect against races with linked timeouts */
+ spin_lock_irq(&ctx->timeout_lock);
+ matched = io_match_linked(head);
+ spin_unlock_irq(&ctx->timeout_lock);
+ } else {
+ matched = io_match_linked(head);
+ }
+ return matched;
}
static inline bool req_has_async_data(struct io_kiocb *req)
@@ -1927,6 +1971,14 @@ static inline bool io_req_ffs_set(struct io_kiocb *req)
return req->flags & REQ_F_FIXED_FILE;
}
+static inline void io_req_track_inflight(struct io_kiocb *req)
+{
+ if (!(req->flags & REQ_F_INFLIGHT)) {
+ req->flags |= REQ_F_INFLIGHT;
+ atomic_inc(&req->task->io_uring->inflight_tracked);
+ }
+}
+
static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
if (WARN_ON_ONCE(!req->link))
@@ -2395,94 +2447,66 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
return true;
}
-static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
- s32 res, u32 cflags)
+static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
{
struct io_uring_cqe *cqe;
- /*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
- */
- cqe = io_get_cqe(ctx);
- if (likely(cqe)) {
- WRITE_ONCE(cqe->user_data, user_data);
- WRITE_ONCE(cqe->res, res);
- WRITE_ONCE(cqe->flags, cflags);
- return true;
- }
- return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
-}
+ if (!(ctx->flags & IORING_SETUP_CQE32)) {
+ trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags, 0, 0);
-static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- struct io_uring_cqe *cqe;
+ /*
+ * If we can't get a cq entry, userspace overflowed the
+ * submission (by quite a lot). Increment the overflow count in
+ * the ring.
+ */
+ cqe = io_get_cqe(ctx);
+ if (likely(cqe)) {
+ memcpy(cqe, &req->cqe, sizeof(*cqe));
+ return true;
+ }
- trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
- req->cqe.res, req->cqe.flags, 0, 0);
+ return io_cqring_event_overflow(ctx, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags,
+ 0, 0);
+ } else {
+ u64 extra1 = 0, extra2 = 0;
- /*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
- */
- cqe = io_get_cqe(ctx);
- if (likely(cqe)) {
- memcpy(cqe, &req->cqe, sizeof(*cqe));
- return true;
- }
- return io_cqring_event_overflow(ctx, req->cqe.user_data,
- req->cqe.res, req->cqe.flags, 0, 0);
-}
+ if (req->flags & REQ_F_CQE32_INIT) {
+ extra1 = req->extra1;
+ extra2 = req->extra2;
+ }
-static inline bool __io_fill_cqe32_req_filled(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- struct io_uring_cqe *cqe;
- u64 extra1 = req->extra1;
- u64 extra2 = req->extra2;
+ trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags, extra1, extra2);
- trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
- req->cqe.res, req->cqe.flags, extra1, extra2);
+ /*
+ * If we can't get a cq entry, userspace overflowed the
+ * submission (by quite a lot). Increment the overflow count in
+ * the ring.
+ */
+ cqe = io_get_cqe(ctx);
+ if (likely(cqe)) {
+ memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe));
+ WRITE_ONCE(cqe->big_cqe[0], extra1);
+ WRITE_ONCE(cqe->big_cqe[1], extra2);
+ return true;
+ }
- /*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
- */
- cqe = io_get_cqe(ctx);
- if (likely(cqe)) {
- memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe));
- cqe->big_cqe[0] = extra1;
- cqe->big_cqe[1] = extra2;
- return true;
+ return io_cqring_event_overflow(ctx, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags,
+ extra1, extra2);
}
-
- return io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res,
- req->cqe.flags, extra1, extra2);
}
-static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
-{
- trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags, 0, 0);
- return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags);
-}
-
-static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags,
- u64 extra1, u64 extra2)
+static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
+ s32 res, u32 cflags)
{
- struct io_ring_ctx *ctx = req->ctx;
struct io_uring_cqe *cqe;
- if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
- return;
- if (req->flags & REQ_F_CQE_SKIP)
- return;
-
- trace_io_uring_complete(ctx, req, req->cqe.user_data, res, cflags,
- extra1, extra2);
+ ctx->cq_extra++;
+ trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
/*
* If we can't get a cq entry, userspace overflowed the
@@ -2491,23 +2515,17 @@ static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags
*/
cqe = io_get_cqe(ctx);
if (likely(cqe)) {
- WRITE_ONCE(cqe->user_data, req->cqe.user_data);
+ WRITE_ONCE(cqe->user_data, user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
- WRITE_ONCE(cqe->big_cqe[0], extra1);
- WRITE_ONCE(cqe->big_cqe[1], extra2);
- return;
- }
-
- io_cqring_event_overflow(ctx, req->cqe.user_data, res, cflags, extra1, extra2);
-}
-static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
- s32 res, u32 cflags)
-{
- ctx->cq_extra++;
- trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
- return __io_fill_cqe(ctx, user_data, res, cflags);
+ if (ctx->flags & IORING_SETUP_CQE32) {
+ WRITE_ONCE(cqe->big_cqe[0], 0);
+ WRITE_ONCE(cqe->big_cqe[1], 0);
+ }
+ return true;
+ }
+ return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
}
static void __io_req_complete_put(struct io_kiocb *req)
@@ -2544,16 +2562,11 @@ static void __io_req_complete_put(struct io_kiocb *req)
static void __io_req_complete_post(struct io_kiocb *req, s32 res,
u32 cflags)
{
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe_req(req, res, cflags);
- __io_req_complete_put(req);
-}
-
-static void __io_req_complete_post32(struct io_kiocb *req, s32 res,
- u32 cflags, u64 extra1, u64 extra2)
-{
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe32_req(req, res, cflags, extra1, extra2);
+ if (!(req->flags & REQ_F_CQE_SKIP)) {
+ req->cqe.res = res;
+ req->cqe.flags = cflags;
+ __io_fill_cqe_req(req->ctx, req);
+ }
__io_req_complete_put(req);
}
@@ -2568,18 +2581,6 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags)
io_cqring_ev_posted(ctx);
}
-static void io_req_complete_post32(struct io_kiocb *req, s32 res,
- u32 cflags, u64 extra1, u64 extra2)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- spin_lock(&ctx->completion_lock);
- __io_req_complete_post32(req, res, cflags, extra1, extra2);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
-}
-
static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
u32 cflags)
{
@@ -2597,19 +2598,6 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
io_req_complete_post(req, res, cflags);
}
-static inline void __io_req_complete32(struct io_kiocb *req,
- unsigned int issue_flags, s32 res,
- u32 cflags, u64 extra1, u64 extra2)
-{
- if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
- io_req_complete_state(req, res, cflags);
- req->extra1 = extra1;
- req->extra2 = extra2;
- } else {
- io_req_complete_post32(req, res, cflags, extra1, extra2);
- }
-}
-
static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
if (res < 0)
@@ -2988,8 +2976,6 @@ static void __io_req_task_work_add(struct io_kiocb *req,
unsigned long flags;
bool running;
- io_drop_inflight_file(req);
-
spin_lock_irqsave(&tctx->task_lock, flags);
wq_list_add_tail(&req->io_task_work.node, list);
running = tctx->task_running;
@@ -3158,12 +3144,8 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
- if (!(req->flags & REQ_F_CQE_SKIP)) {
- if (!(ctx->flags & IORING_SETUP_CQE32))
- __io_fill_cqe_req_filled(ctx, req);
- else
- __io_fill_cqe32_req_filled(ctx, req);
- }
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ __io_fill_cqe_req(ctx, req);
}
io_commit_cqring(ctx);
@@ -3282,7 +3264,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
nr_events++;
if (unlikely(req->flags & REQ_F_CQE_SKIP))
continue;
- __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0));
+
+ req->cqe.flags = io_put_kbuf(req, 0);
+ __io_fill_cqe_req(req->ctx, req);
}
if (unlikely(!nr_events))
@@ -3453,7 +3437,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
if (unlikely(res != req->cqe.res)) {
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
io_rw_should_reissue(req)) {
- req->flags |= REQ_F_REISSUE;
+ req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
return true;
}
req_set_fail(req);
@@ -3503,7 +3487,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
kiocb_end_write(req);
if (unlikely(res != req->cqe.res)) {
if (res == -EAGAIN && io_rw_should_reissue(req)) {
- req->flags |= REQ_F_REISSUE;
+ req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
return;
}
req->cqe.res = res;
@@ -3633,6 +3617,20 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
int ret;
kiocb->ki_pos = READ_ONCE(sqe->off);
+ /* used for fixed read/write too - just read unconditionally */
+ req->buf_index = READ_ONCE(sqe->buf_index);
+
+ if (req->opcode == IORING_OP_READ_FIXED ||
+ req->opcode == IORING_OP_WRITE_FIXED) {
+ struct io_ring_ctx *ctx = req->ctx;
+ u16 index;
+
+ if (unlikely(req->buf_index >= ctx->nr_user_bufs))
+ return -EFAULT;
+ index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
+ req->imu = ctx->user_bufs[index];
+ io_req_set_rsrc_node(req, ctx, 0);
+ }
ioprio = READ_ONCE(sqe->ioprio);
if (ioprio) {
@@ -3645,12 +3643,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
kiocb->ki_ioprio = get_current_ioprio();
}
- req->imu = NULL;
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
req->rw.flags = READ_ONCE(sqe->rw_flags);
- /* used for fixed read/write too - just read unconditionally */
- req->buf_index = READ_ONCE(sqe->buf_index);
return 0;
}
@@ -3782,20 +3777,9 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter
static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
unsigned int issue_flags)
{
- struct io_mapped_ubuf *imu = req->imu;
- u16 index, buf_index = req->buf_index;
-
- if (likely(!imu)) {
- struct io_ring_ctx *ctx = req->ctx;
-
- if (unlikely(buf_index >= ctx->nr_user_bufs))
- return -EFAULT;
- io_req_set_rsrc_node(req, ctx, issue_flags);
- index = array_index_nospec(buf_index, ctx->nr_user_bufs);
- imu = READ_ONCE(ctx->user_bufs[index]);
- req->imu = imu;
- }
- return __io_import_fixed(req, rw, iter, imu);
+ if (WARN_ON_ONCE(!req->imu))
+ return -EFAULT;
+ return __io_import_fixed(req, rw, iter, req->imu);
}
static int io_buffer_add_list(struct io_ring_ctx *ctx,
@@ -3832,19 +3816,17 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
{
struct io_uring_buf_ring *br = bl->buf_ring;
struct io_uring_buf *buf;
- __u32 head = bl->head;
+ __u16 head = bl->head;
- if (unlikely(smp_load_acquire(&br->tail) == head)) {
- io_ring_submit_unlock(req->ctx, issue_flags);
+ if (unlikely(smp_load_acquire(&br->tail) == head))
return NULL;
- }
head &= bl->mask;
if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
buf = &br->bufs[head];
} else {
int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
- int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1;
+ int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
buf = page_address(bl->buf_pages[index]);
buf += off;
}
@@ -3854,7 +3836,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
req->buf_list = bl;
req->buf_index = buf->bid;
- if (issue_flags & IO_URING_F_UNLOCKED) {
+ if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
/*
* If we came in unlocked, we have no choice but to consume the
* buffer here. This does mean it'll be pinned until the IO
@@ -4176,6 +4158,16 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
return 0;
}
+static int io_readv_prep_async(struct io_kiocb *req)
+{
+ return io_rw_prep_async(req, READ);
+}
+
+static int io_writev_prep_async(struct io_kiocb *req)
+{
+ return io_rw_prep_async(req, WRITE);
+}
+
/*
* This is our waitqueue callback handler, registered through __folio_lock_async()
* when we initially tried to do the IO with the iocb armed our waitqueue.
@@ -5025,10 +5017,18 @@ void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
req->uring_cmd.task_work_cb = task_work_cb;
req->io_task_work.func = io_uring_cmd_work;
- io_req_task_prio_work_add(req);
+ io_req_task_work_add(req);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task);
+static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
+ u64 extra1, u64 extra2)
+{
+ req->extra1 = extra1;
+ req->extra2 = extra2;
+ req->flags |= REQ_F_CQE32_INIT;
+}
+
/*
* Called by consumers of io_uring_cmd, if they originally returned
* -EIOCBQUEUED upon receiving the command.
@@ -5039,10 +5039,10 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
if (ret < 0)
req_set_fail(req);
+
if (req->ctx->flags & IORING_SETUP_CQE32)
- __io_req_complete32(req, 0, ret, 0, res2, 0);
- else
- io_req_complete(req, ret);
+ io_req_set_cqe32_extra(req, res2, 0);
+ io_req_complete(req, ret);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_done);
@@ -5103,42 +5103,6 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
-static int io_shutdown_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
-#if defined(CONFIG_NET)
- if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
- sqe->buf_index || sqe->splice_fd_in))
- return -EINVAL;
-
- req->shutdown.how = READ_ONCE(sqe->len);
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
-}
-
-static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
-{
-#if defined(CONFIG_NET)
- struct socket *sock;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- sock = sock_from_file(req->file);
- if (unlikely(!sock))
- return -ENOTSOCK;
-
- ret = __sys_shutdown_sock(sock, req->shutdown.how);
- io_req_complete(req, ret);
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
-}
-
static int __io_splice_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
@@ -5240,14 +5204,6 @@ done:
static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- /*
- * If the ring is setup with CQE32, relay back addr/addr
- */
- if (req->ctx->flags & IORING_SETUP_CQE32) {
- req->nop.extra1 = READ_ONCE(sqe->addr);
- req->nop.extra2 = READ_ONCE(sqe->addr2);
- }
-
return 0;
}
@@ -5256,23 +5212,7 @@ static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
*/
static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
{
- unsigned int cflags;
- void __user *buf;
-
- if (req->flags & REQ_F_BUFFER_SELECT) {
- size_t len = 1;
-
- buf = io_buffer_select(req, &len, issue_flags);
- if (!buf)
- return -ENOBUFS;
- }
-
- cflags = io_put_kbuf(req, issue_flags);
- if (!(req->ctx->flags & IORING_SETUP_CQE32))
- __io_req_complete(req, issue_flags, 0, cflags);
- else
- __io_req_complete32(req, issue_flags, 0, cflags,
- req->nop.extra1, req->nop.extra2);
+ __io_req_complete(req, issue_flags, 0, 0);
return 0;
}
@@ -5445,15 +5385,11 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx)
unsigned long nr = ctx->nr_user_files;
int ret;
- if (table->alloc_hint >= nr)
- table->alloc_hint = 0;
-
do {
ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint);
- if (ret != nr) {
- table->alloc_hint = ret + 1;
+ if (ret != nr)
return ret;
- }
+
if (!table->alloc_hint)
break;
@@ -5464,6 +5400,10 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx)
return -ENFILE;
}
+/*
+ * Note when io_fixed_fd_install() returns error value, it will ensure
+ * fput() is called correspondingly.
+ */
static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
struct file *file, unsigned int file_slot)
{
@@ -5471,26 +5411,24 @@ static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
struct io_ring_ctx *ctx = req->ctx;
int ret;
+ io_ring_submit_lock(ctx, issue_flags);
+
if (alloc_slot) {
- io_ring_submit_lock(ctx, issue_flags);
ret = io_file_bitmap_get(ctx);
- if (unlikely(ret < 0)) {
- io_ring_submit_unlock(ctx, issue_flags);
- return ret;
- }
-
+ if (unlikely(ret < 0))
+ goto err;
file_slot = ret;
} else {
file_slot--;
}
ret = io_install_fixed_file(req, file, issue_flags, file_slot);
- if (alloc_slot) {
- io_ring_submit_unlock(ctx, issue_flags);
- if (!ret)
- return file_slot;
- }
-
+ if (!ret && alloc_slot)
+ ret = file_slot;
+err:
+ io_ring_submit_unlock(ctx, issue_flags);
+ if (unlikely(ret < 0))
+ fput(file);
return ret;
}
@@ -5990,7 +5928,7 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)
struct files_struct *files = current->files;
struct io_close *close = &req->close;
struct fdtable *fdt;
- struct file *file = NULL;
+ struct file *file;
int ret = -EBADF;
if (req->close.file_slot) {
@@ -6008,7 +5946,6 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)
lockdep_is_held(&files->file_lock));
if (!file || file->f_op == &io_uring_fops) {
spin_unlock(&files->file_lock);
- file = NULL;
goto err;
}
@@ -6018,21 +5955,16 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
}
- ret = __close_fd_get_file(close->fd, &file);
+ file = __close_fd_get_file(close->fd);
spin_unlock(&files->file_lock);
- if (ret < 0) {
- if (ret == -ENOENT)
- ret = -EBADF;
+ if (!file)
goto err;
- }
/* No ->flush() or already async, safely close from here */
ret = filp_close(file, current->files);
err:
if (ret < 0)
req_set_fail(req);
- if (file)
- fput(file);
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -6063,6 +5995,34 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
}
#if defined(CONFIG_NET)
+static int io_shutdown_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
+ sqe->buf_index || sqe->splice_fd_in))
+ return -EINVAL;
+
+ req->shutdown.how = READ_ONCE(sqe->len);
+ return 0;
+}
+
+static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct socket *sock;
+ int ret;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ return -EAGAIN;
+
+ sock = sock_from_file(req->file);
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ ret = __sys_shutdown_sock(sock, req->shutdown.how);
+ io_req_complete(req, ret);
+ return 0;
+}
+
static bool io_net_retry(struct socket *sock, int flags)
{
if (!(flags & MSG_WAITALL))
@@ -6117,8 +6077,6 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(sqe->file_index))
return -EINVAL;
- if (unlikely(sqe->addr2 || sqe->file_index))
- return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
@@ -6355,8 +6313,6 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(sqe->file_index))
return -EINVAL;
- if (unlikely(sqe->addr2 || sqe->file_index))
- return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
@@ -6674,8 +6630,8 @@ static int io_socket(struct io_kiocb *req, unsigned int issue_flags)
fd_install(fd, file);
ret = fd;
} else {
- ret = io_install_fixed_file(req, file, issue_flags,
- sock->file_slot - 1);
+ ret = io_fixed_fd_install(req, issue_flags, file,
+ sock->file_slot);
}
__io_req_complete(req, issue_flags, ret, 0);
return 0;
@@ -6767,6 +6723,7 @@ IO_NETOP_PREP_ASYNC(recvmsg);
IO_NETOP_PREP_ASYNC(connect);
IO_NETOP_PREP(accept);
IO_NETOP_PREP(socket);
+IO_NETOP_PREP(shutdown);
IO_NETOP_FN(send);
IO_NETOP_FN(recv);
#endif /* CONFIG_NET */
@@ -6905,10 +6862,6 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
if (!req->cqe.res) {
struct poll_table_struct pt = { ._key = req->apoll_events };
- unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED;
-
- if (unlikely(!io_assign_file(req, flags)))
- return -EBADF;
req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
}
@@ -6997,7 +6950,8 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
io_req_complete_failed(req, ret);
}
-static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events)
+static void __io_poll_execute(struct io_kiocb *req, int mask,
+ __poll_t __maybe_unused events)
{
req->cqe.res = mask;
/*
@@ -7006,7 +6960,6 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events)
* CPU. We want to avoid pulling in req->apoll->events for that
* case.
*/
- req->apoll_events = events;
if (req->opcode == IORING_OP_POLL_ADD)
req->io_task_work.func = io_poll_task_func;
else
@@ -7157,6 +7110,8 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
io_init_poll_iocb(poll, mask, io_poll_wake);
poll->file = req->file;
+ req->apoll_events = poll->events;
+
ipt->pt._key = mask;
ipt->req = req;
ipt->error = 0;
@@ -7187,8 +7142,11 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
if (mask) {
/* can't multishot if failed, just queue the event we've got */
- if (unlikely(ipt->error || !ipt->nr_entries))
+ if (unlikely(ipt->error || !ipt->nr_entries)) {
poll->events |= EPOLLONESHOT;
+ req->apoll_events |= EPOLLONESHOT;
+ ipt->error = 0;
+ }
__io_poll_execute(req, mask, poll->events);
return 0;
}
@@ -7250,6 +7208,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
mask |= EPOLLEXCLUSIVE;
if (req->flags & REQ_F_POLLED) {
apoll = req->apoll;
+ kfree(apoll->double_poll);
} else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
!list_empty(&ctx->apoll_cache)) {
apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
@@ -7390,7 +7349,7 @@ static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
}
-static int io_poll_update_prep(struct io_kiocb *req,
+static int io_poll_remove_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_poll_update *upd = &req->poll_update;
@@ -7435,7 +7394,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return -EINVAL;
io_req_set_refcount(req);
- req->apoll_events = poll->events = io_poll_parse_events(sqe, flags);
+ poll->events = io_poll_parse_events(sqe, flags);
return 0;
}
@@ -7448,13 +7407,15 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
ipt.pt._qproc = io_poll_queue_proc;
ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
+ if (!ret && ipt.error)
+ req_set_fail(req);
ret = ret ?: ipt.error;
if (ret)
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
-static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
+static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_cancel_data cd = { .data = req->poll_update.old_user_data, };
struct io_ring_ctx *ctx = req->ctx;
@@ -7698,8 +7659,9 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
-static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool is_timeout_link)
+static int __io_timeout_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe,
+ bool is_timeout_link)
{
struct io_timeout_data *data;
unsigned flags;
@@ -7754,6 +7716,18 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return 0;
}
+static int io_timeout_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ return __io_timeout_prep(req, sqe, false);
+}
+
+static int io_link_timeout_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ return __io_timeout_prep(req, sqe, true);
+}
+
static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -7970,7 +7944,7 @@ done:
return 0;
}
-static int io_rsrc_update_prep(struct io_kiocb *req,
+static int io_files_update_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
@@ -7986,6 +7960,41 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
return 0;
}
+static int io_files_update_with_index_alloc(struct io_kiocb *req,
+ unsigned int issue_flags)
+{
+ __s32 __user *fds = u64_to_user_ptr(req->rsrc_update.arg);
+ unsigned int done;
+ struct file *file;
+ int ret, fd;
+
+ for (done = 0; done < req->rsrc_update.nr_args; done++) {
+ if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ file = fget(fd);
+ if (!file) {
+ ret = -EBADF;
+ break;
+ }
+ ret = io_fixed_fd_install(req, issue_flags, file,
+ IORING_FILE_INDEX_ALLOC);
+ if (ret < 0)
+ break;
+ if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
+ __io_close_fixed(req, issue_flags, ret);
+ ret = -EFAULT;
+ break;
+ }
+ }
+
+ if (done)
+ return done;
+ return ret;
+}
+
static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -7999,10 +8008,14 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
up.resv = 0;
up.resv2 = 0;
- io_ring_submit_lock(ctx, issue_flags);
- ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
- &up, req->rsrc_update.nr_args);
- io_ring_submit_unlock(ctx, issue_flags);
+ if (req->rsrc_update.offset == IORING_FILE_INDEX_ALLOC) {
+ ret = io_files_update_with_index_alloc(req, issue_flags);
+ } else {
+ io_ring_submit_lock(ctx, issue_flags);
+ ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
+ &up, req->rsrc_update.nr_args);
+ io_ring_submit_unlock(ctx, issue_flags);
+ }
if (ret < 0)
req_set_fail(req);
@@ -8025,7 +8038,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
case IORING_OP_POLL_ADD:
return io_poll_add_prep(req, sqe);
case IORING_OP_POLL_REMOVE:
- return io_poll_update_prep(req, sqe);
+ return io_poll_remove_prep(req, sqe);
case IORING_OP_FSYNC:
return io_fsync_prep(req, sqe);
case IORING_OP_SYNC_FILE_RANGE:
@@ -8039,13 +8052,13 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
case IORING_OP_CONNECT:
return io_connect_prep(req, sqe);
case IORING_OP_TIMEOUT:
- return io_timeout_prep(req, sqe, false);
+ return io_timeout_prep(req, sqe);
case IORING_OP_TIMEOUT_REMOVE:
return io_timeout_remove_prep(req, sqe);
case IORING_OP_ASYNC_CANCEL:
return io_async_cancel_prep(req, sqe);
case IORING_OP_LINK_TIMEOUT:
- return io_timeout_prep(req, sqe, true);
+ return io_link_timeout_prep(req, sqe);
case IORING_OP_ACCEPT:
return io_accept_prep(req, sqe);
case IORING_OP_FALLOCATE:
@@ -8055,7 +8068,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
case IORING_OP_CLOSE:
return io_close_prep(req, sqe);
case IORING_OP_FILES_UPDATE:
- return io_rsrc_update_prep(req, sqe);
+ return io_files_update_prep(req, sqe);
case IORING_OP_STATX:
return io_statx_prep(req, sqe);
case IORING_OP_FADVISE:
@@ -8123,9 +8136,9 @@ static int io_req_prep_async(struct io_kiocb *req)
switch (req->opcode) {
case IORING_OP_READV:
- return io_rw_prep_async(req, READ);
+ return io_readv_prep_async(req);
case IORING_OP_WRITEV:
- return io_rw_prep_async(req, WRITE);
+ return io_writev_prep_async(req);
case IORING_OP_SENDMSG:
return io_sendmsg_prep_async(req);
case IORING_OP_RECVMSG:
@@ -8264,6 +8277,11 @@ static void io_clean_op(struct io_kiocb *req)
kfree(req->apoll);
req->apoll = NULL;
}
+ if (req->flags & REQ_F_INFLIGHT) {
+ struct io_uring_task *tctx = req->task->io_uring;
+
+ atomic_dec(&tctx->inflight_tracked);
+ }
if (req->flags & REQ_F_CREDS)
put_cred(req->creds);
if (req->flags & REQ_F_ASYNC_DATA) {
@@ -8288,6 +8306,7 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
+ const struct io_op_def *def = &io_op_defs[req->opcode];
const struct cred *creds = NULL;
int ret;
@@ -8297,7 +8316,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
creds = override_creds(req->creds);
- if (!io_op_defs[req->opcode].audit_skip)
+ if (!def->audit_skip)
audit_uring_entry(req->opcode);
switch (req->opcode) {
@@ -8321,7 +8340,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
ret = io_poll_add(req, issue_flags);
break;
case IORING_OP_POLL_REMOVE:
- ret = io_poll_update(req, issue_flags);
+ ret = io_poll_remove(req, issue_flags);
break;
case IORING_OP_SYNC_FILE_RANGE:
ret = io_sync_file_range(req, issue_flags);
@@ -8436,7 +8455,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
break;
}
- if (!io_op_defs[req->opcode].audit_skip)
+ if (!def->audit_skip)
audit_uring_exit(!ret, ret);
if (creds)
@@ -8569,19 +8588,6 @@ out:
return file;
}
-/*
- * Drop the file for requeue operations. Only used of req->file is the
- * io_uring descriptor itself.
- */
-static void io_drop_inflight_file(struct io_kiocb *req)
-{
- if (unlikely(req->flags & REQ_F_INFLIGHT)) {
- fput(req->file);
- req->file = NULL;
- req->flags &= ~REQ_F_INFLIGHT;
- }
-}
-
static struct file *io_file_get_normal(struct io_kiocb *req, int fd)
{
struct file *file = fget(fd);
@@ -8590,7 +8596,7 @@ static struct file *io_file_get_normal(struct io_kiocb *req, int fd)
/* we don't allow fixed io_uring files */
if (file && file->f_op == &io_uring_fops)
- req->flags |= REQ_F_INFLIGHT;
+ io_req_track_inflight(req);
return file;
}
@@ -8688,6 +8694,7 @@ static void io_queue_async(struct io_kiocb *req, int ret)
* Queued up for async execution, worker will release
* submit reference when the iocb is actually submitted.
*/
+ io_kbuf_recycle(req, 0);
io_queue_iowq(req, NULL);
break;
case IO_APOLL_OK:
@@ -8788,6 +8795,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
{
+ const struct io_op_def *def;
unsigned int sqe_flags;
int personality;
u8 opcode;
@@ -8805,12 +8813,13 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->opcode = 0;
return -EINVAL;
}
+ def = &io_op_defs[opcode];
if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
/* enforce forwards compatibility on users */
if (sqe_flags & ~SQE_VALID_FLAGS)
return -EINVAL;
if (sqe_flags & IOSQE_BUFFER_SELECT) {
- if (!io_op_defs[opcode].buffer_select)
+ if (!def->buffer_select)
return -EOPNOTSUPP;
req->buf_index = READ_ONCE(sqe->buf_group);
}
@@ -8836,12 +8845,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
}
}
- if (!io_op_defs[opcode].ioprio && sqe->ioprio)
+ if (!def->ioprio && sqe->ioprio)
return -EINVAL;
- if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
+ if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (io_op_defs[opcode].needs_file) {
+ if (def->needs_file) {
struct io_submit_state *state = &ctx->submit_state;
req->cqe.fd = READ_ONCE(sqe->fd);
@@ -8850,7 +8859,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
* Plug now if we have more than 2 IO left after this, and the
* target is potentially a read/write to block based storage.
*/
- if (state->need_plug && io_op_defs[opcode].plug) {
+ if (state->need_plug && def->plug) {
state->plug_started = true;
state->need_plug = false;
blk_start_plug_nr_ios(&state->plug, state->submit_nr);
@@ -9658,8 +9667,7 @@ static inline void io_file_bitmap_set(struct io_file_table *table, int bit)
{
WARN_ON_ONCE(test_bit(bit, table->bitmap));
__set_bit(bit, table->bitmap);
- if (bit == table->alloc_hint)
- table->alloc_hint++;
+ table->alloc_hint = bit + 1;
}
static inline void io_file_bitmap_clear(struct io_file_table *table, int bit)
@@ -9702,11 +9710,19 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
+ unsigned nr = ctx->nr_user_files;
int ret;
if (!ctx->file_data)
return -ENXIO;
+
+ /*
+ * Quiesce may unlock ->uring_lock, and while it's not held
+ * prevent new requests using the table.
+ */
+ ctx->nr_user_files = 0;
ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
+ ctx->nr_user_files = nr;
if (!ret)
__io_sqe_files_unregister(ctx);
return ret;
@@ -10113,21 +10129,19 @@ static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
unsigned int issue_flags, u32 slot_index)
+ __must_hold(&req->ctx->uring_lock)
{
struct io_ring_ctx *ctx = req->ctx;
bool needs_switch = false;
struct io_fixed_file *file_slot;
- int ret = -EBADF;
+ int ret;
- io_ring_submit_lock(ctx, issue_flags);
if (file->f_op == &io_uring_fops)
- goto err;
- ret = -ENXIO;
+ return -EBADF;
if (!ctx->file_data)
- goto err;
- ret = -EINVAL;
+ return -ENXIO;
if (slot_index >= ctx->nr_user_files)
- goto err;
+ return -EINVAL;
slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
@@ -10158,15 +10172,14 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
err:
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->file_data);
- io_ring_submit_unlock(ctx, issue_flags);
if (ret)
fput(file);
return ret;
}
-static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags,
+ unsigned int offset)
{
- unsigned int offset = req->close.file_slot - 1;
struct io_ring_ctx *ctx = req->ctx;
struct io_fixed_file *file_slot;
struct file *file;
@@ -10203,6 +10216,11 @@ out:
return ret;
}
+static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
+{
+ return __io_close_fixed(req, issue_flags, req->close.file_slot - 1);
+}
+
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update2 *up,
unsigned nr_args)
@@ -10351,6 +10369,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
atomic_set(&tctx->in_idle, 0);
+ atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
@@ -10601,12 +10620,19 @@ static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
+ unsigned nr = ctx->nr_user_bufs;
int ret;
if (!ctx->buf_data)
return -ENXIO;
+ /*
+ * Quiesce may unlock ->uring_lock, and while it's not held
+ * prevent new requests using the table.
+ */
+ ctx->nr_user_bufs = 0;
ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
+ ctx->nr_user_bufs = nr;
if (!ret)
__io_sqe_buffers_unregister(ctx);
return ret;
@@ -11046,6 +11072,7 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
xa_for_each(&ctx->io_bl_xa, index, bl) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
__io_remove_buffers(ctx, bl, -1U);
+ kfree(bl);
}
while (!list_empty(&ctx->io_buffers_pages)) {
@@ -11581,7 +11608,7 @@ static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
{
if (tracked)
- return 0;
+ return atomic_read(&tctx->inflight_tracked);
return percpu_counter_sum(&tctx->inflight);
}
@@ -11957,14 +11984,14 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
return -EINVAL;
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
f.file = tctx->registered_rings[fd];
- if (unlikely(!f.file))
- return -EBADF;
+ f.flags = 0;
} else {
f = fdget(fd);
- if (unlikely(!f.file))
- return -EBADF;
}
+ if (unlikely(!f.file))
+ return -EBADF;
+
ret = -EOPNOTSUPP;
if (unlikely(f.file->f_op != &io_uring_fops))
goto out_fput;
@@ -12062,8 +12089,7 @@ iopoll_locked:
out:
percpu_ref_put(&ctx->refs);
out_fput:
- if (!(flags & IORING_ENTER_REGISTERED_RING))
- fdput(f);
+ fdput(f);
return ret;
}
@@ -12913,6 +12939,10 @@ static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!is_power_of_2(reg.ring_entries))
return -EINVAL;
+ /* cannot disambiguate full vs empty due to head/tail size */
+ if (reg.ring_entries >= 65536)
+ return -EINVAL;
+
if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
int ret = io_init_bl_list(ctx);
if (ret)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e49bb0938376..e9c308ae475f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2114,7 +2114,7 @@ out:
/**
* jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
- * @page: to try and free
+ * @folio: Folio to detach data from.
*
* For all the buffers on this page,
* if they are fully written out ordered data, move them onto BUF_CLEAN
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 7e9abdb89712..acd32f05b519 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -43,9 +43,9 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
jffs2_dbg(1, "%s(): erase block %#08x (range %#08x-%#08x)\n",
__func__,
jeb->offset, jeb->offset, jeb->offset + c->sector_size);
- instr = kmalloc(sizeof(struct erase_info), GFP_KERNEL);
+ instr = kzalloc(sizeof(struct erase_info), GFP_KERNEL);
if (!instr) {
- pr_warn("kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n");
+ pr_warn("kzalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n");
mutex_lock(&c->erase_free_sem);
spin_lock(&c->erase_completion_lock);
list_move(&jeb->list, &c->erase_pending_list);
@@ -57,8 +57,6 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
return;
}
- memset(instr, 0, sizeof(*instr));
-
instr->addr = jeb->offset;
instr->len = c->sector_size;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 00a110f40e10..39cec28096a7 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -604,6 +604,7 @@ out_root:
jffs2_free_raw_node_refs(c);
kvfree(c->blocks);
jffs2_clear_xattr_subsystem(c);
+ jffs2_sum_exit(c);
out_inohash:
kfree(c->inocache_list);
out_wbuf:
diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile
index 285ec189ed5c..7156d2c218c7 100644
--- a/fs/jfs/Makefile
+++ b/fs/jfs/Makefile
@@ -13,5 +13,3 @@ jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \
resize.o xattr.o ioctl.o
jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o
-
-ccflags-y := -D_JFS_4K
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index a5dd7e53754a..259326556ada 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -224,18 +224,9 @@ int jfs_get_block(struct inode *ip, sector_t lblock,
* this as a hole
*/
goto unlock;
-#ifdef _JFS_4K
XADoffset(&xad, lblock64);
XADlength(&xad, xlen);
XADaddress(&xad, xaddr);
-#else /* _JFS_4K */
- /*
- * As long as block size = 4K, this isn't a problem.
- * We should mark the whole page not ABNR, but how
- * will we know to mark the other blocks BH_New?
- */
- BUG();
-#endif /* _JFS_4K */
rc = extRecord(ip, &xad);
if (rc)
goto unlock;
@@ -252,7 +243,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock,
/*
* Allocate a new block
*/
-#ifdef _JFS_4K
if ((rc = extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad)))
goto unlock;
rc = extAlloc(ip, xlen, lblock64, &xad, false);
@@ -263,14 +253,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock,
map_bh(bh_result, ip->i_sb, addressXAD(&xad));
bh_result->b_size = lengthXAD(&xad) << ip->i_blkbits;
-#else /* _JFS_4K */
- /*
- * We need to do whatever it takes to keep all but the last buffers
- * in 4K pages - see jfs_write.c
- */
- BUG();
-#endif /* _JFS_4K */
-
unlock:
/*
* Release lock on inode
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index d8502f4989d9..6b838d3ae7c2 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -385,7 +385,8 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
}
/* write the last buffer. */
- write_metapage(mp);
+ if (mp)
+ write_metapage(mp);
IREAD_UNLOCK(ipbmap);
@@ -868,74 +869,6 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
return (rc);
}
-#ifdef _NOTYET
-/*
- * NAME: dbAllocExact()
- *
- * FUNCTION: try to allocate the requested extent;
- *
- * PARAMETERS:
- * ip - pointer to in-core inode;
- * blkno - extent address;
- * nblocks - extent length;
- *
- * RETURN VALUES:
- * 0 - success
- * -ENOSPC - insufficient disk resources
- * -EIO - i/o error
- */
-int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
-{
- int rc;
- struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
- struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
- struct dmap *dp;
- s64 lblkno;
- struct metapage *mp;
-
- IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
-
- /*
- * validate extent request:
- *
- * note: defragfs policy:
- * max 64 blocks will be moved.
- * allocation request size must be satisfied from a single dmap.
- */
- if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) {
- IREAD_UNLOCK(ipbmap);
- return -EINVAL;
- }
-
- if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) {
- /* the free space is no longer available */
- IREAD_UNLOCK(ipbmap);
- return -ENOSPC;
- }
-
- /* read in the dmap covering the extent */
- lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
- mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
- if (mp == NULL) {
- IREAD_UNLOCK(ipbmap);
- return -EIO;
- }
- dp = (struct dmap *) mp->data;
-
- /* try to allocate the requested extent */
- rc = dbAllocNext(bmp, dp, blkno, nblocks);
-
- IREAD_UNLOCK(ipbmap);
-
- if (rc == 0)
- mark_metapage_dirty(mp);
-
- release_metapage(mp);
-
- return (rc);
-}
-#endif /* _NOTYET */
-
/*
* NAME: dbReAlloc()
*
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 837d42f61464..92b7c533407c 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -2423,304 +2423,6 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
return 0;
}
-#ifdef _NOTYET
-/*
- * NAME: dtRelocate()
- *
- * FUNCTION: relocate dtpage (internal or leaf) of directory;
- * This function is mainly used by defragfs utility.
- */
-int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
- s64 nxaddr)
-{
- int rc = 0;
- struct metapage *mp, *pmp, *lmp, *rmp;
- dtpage_t *p, *pp, *rp = 0, *lp= 0;
- s64 bn;
- int index;
- struct btstack btstack;
- pxd_t *pxd;
- s64 oxaddr, nextbn, prevbn;
- int xlen, xsize;
- struct tlock *tlck;
- struct dt_lock *dtlck;
- struct pxd_lock *pxdlock;
- s8 *stbl;
- struct lv *lv;
-
- oxaddr = addressPXD(opxd);
- xlen = lengthPXD(opxd);
-
- jfs_info("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d",
- (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr,
- xlen);
-
- /*
- * 1. get the internal parent dtpage covering
- * router entry for the tartget page to be relocated;
- */
- rc = dtSearchNode(ip, lmxaddr, opxd, &btstack);
- if (rc)
- return rc;
-
- /* retrieve search result */
- DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
- jfs_info("dtRelocate: parent router entry validated.");
-
- /*
- * 2. relocate the target dtpage
- */
- /* read in the target page from src extent */
- DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
- if (rc) {
- /* release the pinned parent page */
- DT_PUTPAGE(pmp);
- return rc;
- }
-
- /*
- * read in sibling pages if any to update sibling pointers;
- */
- rmp = NULL;
- if (p->header.next) {
- nextbn = le64_to_cpu(p->header.next);
- DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
- if (rc) {
- DT_PUTPAGE(mp);
- DT_PUTPAGE(pmp);
- return (rc);
- }
- }
-
- lmp = NULL;
- if (p->header.prev) {
- prevbn = le64_to_cpu(p->header.prev);
- DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
- if (rc) {
- DT_PUTPAGE(mp);
- DT_PUTPAGE(pmp);
- if (rmp)
- DT_PUTPAGE(rmp);
- return (rc);
- }
- }
-
- /* at this point, all xtpages to be updated are in memory */
-
- /*
- * update sibling pointers of sibling dtpages if any;
- */
- if (lmp) {
- tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK);
- dtlck = (struct dt_lock *) & tlck->lock;
- /* linelock header */
- ASSERT(dtlck->index == 0);
- lv = & dtlck->lv[0];
- lv->offset = 0;
- lv->length = 1;
- dtlck->index++;
-
- lp->header.next = cpu_to_le64(nxaddr);
- DT_PUTPAGE(lmp);
- }
-
- if (rmp) {
- tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK);
- dtlck = (struct dt_lock *) & tlck->lock;
- /* linelock header */
- ASSERT(dtlck->index == 0);
- lv = & dtlck->lv[0];
- lv->offset = 0;
- lv->length = 1;
- dtlck->index++;
-
- rp->header.prev = cpu_to_le64(nxaddr);
- DT_PUTPAGE(rmp);
- }
-
- /*
- * update the target dtpage to be relocated
- *
- * write LOG_REDOPAGE of LOG_NEW type for dst page
- * for the whole target page (logredo() will apply
- * after image and update bmap for allocation of the
- * dst extent), and update bmap for allocation of
- * the dst extent;
- */
- tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW);
- dtlck = (struct dt_lock *) & tlck->lock;
- /* linelock header */
- ASSERT(dtlck->index == 0);
- lv = & dtlck->lv[0];
-
- /* update the self address in the dtpage header */
- pxd = &p->header.self;
- PXDaddress(pxd, nxaddr);
-
- /* the dst page is the same as the src page, i.e.,
- * linelock for afterimage of the whole page;
- */
- lv->offset = 0;
- lv->length = p->header.maxslot;
- dtlck->index++;
-
- /* update the buffer extent descriptor of the dtpage */
- xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
-
- /* unpin the relocated page */
- DT_PUTPAGE(mp);
- jfs_info("dtRelocate: target dtpage relocated.");
-
- /* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec
- * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec
- * will also force a bmap update ).
- */
-
- /*
- * 3. acquire maplock for the source extent to be freed;
- */
- /* for dtpage relocation, write a LOG_NOREDOPAGE record
- * for the source dtpage (logredo() will init NoRedoPage
- * filter and will also update bmap for free of the source
- * dtpage), and upadte bmap for free of the source dtpage;
- */
- tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
- pxdlock = (struct pxd_lock *) & tlck->lock;
- pxdlock->flag = mlckFREEPXD;
- PXDaddress(&pxdlock->pxd, oxaddr);
- PXDlength(&pxdlock->pxd, xlen);
- pxdlock->index = 1;
-
- /*
- * 4. update the parent router entry for relocation;
- *
- * acquire tlck for the parent entry covering the target dtpage;
- * write LOG_REDOPAGE to apply after image only;
- */
- jfs_info("dtRelocate: update parent router entry.");
- tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
- dtlck = (struct dt_lock *) & tlck->lock;
- lv = & dtlck->lv[dtlck->index];
-
- /* update the PXD with the new address */
- stbl = DT_GETSTBL(pp);
- pxd = (pxd_t *) & pp->slot[stbl[index]];
- PXDaddress(pxd, nxaddr);
- lv->offset = stbl[index];
- lv->length = 1;
- dtlck->index++;
-
- /* unpin the parent dtpage */
- DT_PUTPAGE(pmp);
-
- return rc;
-}
-
-/*
- * NAME: dtSearchNode()
- *
- * FUNCTION: Search for an dtpage containing a specified address
- * This function is mainly used by defragfs utility.
- *
- * NOTE: Search result on stack, the found page is pinned at exit.
- * The result page must be an internal dtpage.
- * lmxaddr give the address of the left most page of the
- * dtree level, in which the required dtpage resides.
- */
-static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
- struct btstack * btstack)
-{
- int rc = 0;
- s64 bn;
- struct metapage *mp;
- dtpage_t *p;
- int psize = 288; /* initial in-line directory */
- s8 *stbl;
- int i;
- pxd_t *pxd;
- struct btframe *btsp;
-
- BT_CLR(btstack); /* reset stack */
-
- /*
- * descend tree to the level with specified leftmost page
- *
- * by convention, root bn = 0.
- */
- for (bn = 0;;) {
- /* get/pin the page to search */
- DT_GETPAGE(ip, bn, mp, psize, p, rc);
- if (rc)
- return rc;
-
- /* does the xaddr of leftmost page of the levevl
- * matches levevl search key ?
- */
- if (p->header.flag & BT_ROOT) {
- if (lmxaddr == 0)
- break;
- } else if (addressPXD(&p->header.self) == lmxaddr)
- break;
-
- /*
- * descend down to leftmost child page
- */
- if (p->header.flag & BT_LEAF) {
- DT_PUTPAGE(mp);
- return -ESTALE;
- }
-
- /* get the leftmost entry */
- stbl = DT_GETSTBL(p);
- pxd = (pxd_t *) & p->slot[stbl[0]];
-
- /* get the child page block address */
- bn = addressPXD(pxd);
- psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
- /* unpin the parent page */
- DT_PUTPAGE(mp);
- }
-
- /*
- * search each page at the current levevl
- */
- loop:
- stbl = DT_GETSTBL(p);
- for (i = 0; i < p->header.nextindex; i++) {
- pxd = (pxd_t *) & p->slot[stbl[i]];
-
- /* found the specified router entry */
- if (addressPXD(pxd) == addressPXD(kpxd) &&
- lengthPXD(pxd) == lengthPXD(kpxd)) {
- btsp = btstack->top;
- btsp->bn = bn;
- btsp->index = i;
- btsp->mp = mp;
-
- return 0;
- }
- }
-
- /* get the right sibling page if any */
- if (p->header.next)
- bn = le64_to_cpu(p->header.next);
- else {
- DT_PUTPAGE(mp);
- return -ESTALE;
- }
-
- /* unpin current page */
- DT_PUTPAGE(mp);
-
- /* get the right sibling page */
- DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
-
- goto loop;
-}
-#endif /* _NOTYET */
-
/*
* dtRelink()
*
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index bb4a342a193d..ae99a7e232ee 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -16,9 +16,6 @@
* forward references
*/
static int extBalloc(struct inode *, s64, s64 *, s64 *);
-#ifdef _NOTYET
-static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *);
-#endif
static s64 extRoundDown(s64 nb);
#define DPD(a) (printk("(a): %d\n",(a)))
@@ -177,162 +174,6 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
return (0);
}
-
-#ifdef _NOTYET
-/*
- * NAME: extRealloc()
- *
- * FUNCTION: extend the allocation of a file extent containing a
- * partial back last page.
- *
- * PARAMETERS:
- * ip - the inode of the file.
- * cp - cbuf for the partial backed last page.
- * xlen - request size of the resulting extent.
- * xp - pointer to an xad. on successful exit, the xad
- * describes the newly allocated extent.
- * abnr - bool indicating whether the newly allocated extent
- * should be marked as allocated but not recorded.
- *
- * RETURN VALUES:
- * 0 - success
- * -EIO - i/o error.
- * -ENOSPC - insufficient disk resources.
- */
-int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
-{
- struct super_block *sb = ip->i_sb;
- s64 xaddr, xlen, nxaddr, delta, xoff;
- s64 ntail, nextend, ninsert;
- int rc, nbperpage = JFS_SBI(sb)->nbperpage;
- int xflag;
-
- /* This blocks if we are low on resources */
- txBeginAnon(ip->i_sb);
-
- mutex_lock(&JFS_IP(ip)->commit_mutex);
- /* validate extent length */
- if (nxlen > MAXXLEN)
- nxlen = MAXXLEN;
-
- /* get the extend (partial) page's disk block address and
- * number of blocks.
- */
- xaddr = addressXAD(xp);
- xlen = lengthXAD(xp);
- xoff = offsetXAD(xp);
-
- /* if the extend page is abnr and if the request is for
- * the extent to be allocated and recorded,
- * make the page allocated and recorded.
- */
- if ((xp->flag & XAD_NOTRECORDED) && !abnr) {
- xp->flag = 0;
- if ((rc = xtUpdate(0, ip, xp)))
- goto exit;
- }
-
- /* try to allocated the request number of blocks for the
- * extent. dbRealloc() first tries to satisfy the request
- * by extending the allocation in place. otherwise, it will
- * try to allocate a new set of blocks large enough for the
- * request. in satisfying a request, dbReAlloc() may allocate
- * less than what was request but will always allocate enough
- * space as to satisfy the extend page.
- */
- if ((rc = extBrealloc(ip, xaddr, xlen, &nxlen, &nxaddr)))
- goto exit;
-
- /* Allocat blocks to quota. */
- rc = dquot_alloc_block(ip, nxlen);
- if (rc) {
- dbFree(ip, nxaddr, (s64) nxlen);
- mutex_unlock(&JFS_IP(ip)->commit_mutex);
- return rc;
- }
-
- delta = nxlen - xlen;
-
- /* check if the extend page is not abnr but the request is abnr
- * and the allocated disk space is for more than one page. if this
- * is the case, there is a miss match of abnr between the extend page
- * and the one or more pages following the extend page. as a result,
- * two extents will have to be manipulated. the first will be that
- * of the extent of the extend page and will be manipulated thru
- * an xtExtend() or an xtTailgate(), depending upon whether the
- * disk allocation occurred as an inplace extension. the second
- * extent will be manipulated (created) through an xtInsert() and
- * will be for the pages following the extend page.
- */
- if (abnr && (!(xp->flag & XAD_NOTRECORDED)) && (nxlen > nbperpage)) {
- ntail = nbperpage;
- nextend = ntail - xlen;
- ninsert = nxlen - nbperpage;
-
- xflag = XAD_NOTRECORDED;
- } else {
- ntail = nxlen;
- nextend = delta;
- ninsert = 0;
-
- xflag = xp->flag;
- }
-
- /* if we were able to extend the disk allocation in place,
- * extend the extent. otherwise, move the extent to a
- * new disk location.
- */
- if (xaddr == nxaddr) {
- /* extend the extent */
- if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
- dbFree(ip, xaddr + xlen, delta);
- dquot_free_block(ip, nxlen);
- goto exit;
- }
- } else {
- /*
- * move the extent to a new location:
- *
- * xtTailgate() accounts for relocated tail extent;
- */
- if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
- dbFree(ip, nxaddr, nxlen);
- dquot_free_block(ip, nxlen);
- goto exit;
- }
- }
-
-
- /* check if we need to also insert a new extent */
- if (ninsert) {
- /* perform the insert. if it fails, free the blocks
- * to be inserted and make it appear that we only did
- * the xtExtend() or xtTailgate() above.
- */
- xaddr = nxaddr + ntail;
- if (xtInsert (0, ip, xflag, xoff + ntail, (int) ninsert,
- &xaddr, 0)) {
- dbFree(ip, xaddr, (s64) ninsert);
- delta = nextend;
- nxlen = ntail;
- xflag = 0;
- }
- }
-
- /* set the return results */
- XADaddress(xp, nxaddr);
- XADlength(xp, nxlen);
- XADoffset(xp, xoff);
- xp->flag = xflag;
-
- mark_inode_dirty(ip);
-exit:
- mutex_unlock(&JFS_IP(ip)->commit_mutex);
- return (rc);
-}
-#endif /* _NOTYET */
-
-
/*
* NAME: extHint()
*
@@ -423,44 +264,6 @@ int extRecord(struct inode *ip, xad_t * xp)
return rc;
}
-
-#ifdef _NOTYET
-/*
- * NAME: extFill()
- *
- * FUNCTION: allocate disk space for a file page that represents
- * a file hole.
- *
- * PARAMETERS:
- * ip - the inode of the file.
- * cp - cbuf of the file page represent the hole.
- *
- * RETURN VALUES:
- * 0 - success
- * -EIO - i/o error.
- * -ENOSPC - insufficient disk resources.
- */
-int extFill(struct inode *ip, xad_t * xp)
-{
- int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage;
- s64 blkno = offsetXAD(xp) >> ip->i_blkbits;
-
-// assert(ISSPARSE(ip));
-
- /* initialize the extent allocation hint */
- XADaddress(xp, 0);
-
- /* allocate an extent to fill the hole */
- if ((rc = extAlloc(ip, nbperpage, blkno, xp, false)))
- return (rc);
-
- assert(lengthPXD(xp) == nbperpage);
-
- return (0);
-}
-#endif /* _NOTYET */
-
-
/*
* NAME: extBalloc()
*
@@ -550,64 +353,6 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
return (0);
}
-
-#ifdef _NOTYET
-/*
- * NAME: extBrealloc()
- *
- * FUNCTION: attempt to extend an extent's allocation.
- *
- * Initially, we will try to extend the extent's allocation
- * in place. If this fails, we'll try to move the extent
- * to a new set of blocks. If moving the extent, we initially
- * will try to allocate disk blocks for the requested size
- * (newnblks). if this fails (new contiguous free blocks not
- * available), we'll try to allocate a smaller number of
- * blocks (producing a smaller extent), with this smaller
- * number of blocks consisting of the requested number of
- * blocks rounded down to the next smaller power of 2
- * number (i.e. 16 -> 8). We'll continue to round down and
- * retry the allocation until the number of blocks to allocate
- * is smaller than the number of blocks per page.
- *
- * PARAMETERS:
- * ip - the inode of the file.
- * blkno - starting block number of the extents current allocation.
- * nblks - number of blocks within the extents current allocation.
- * newnblks - pointer to a s64 value. on entry, this value is the
- * new desired extent size (number of blocks). on
- * successful exit, this value is set to the extent's actual
- * new size (new number of blocks).
- * newblkno - the starting block number of the extents new allocation.
- *
- * RETURN VALUES:
- * 0 - success
- * -EIO - i/o error.
- * -ENOSPC - insufficient disk resources.
- */
-static int
-extBrealloc(struct inode *ip,
- s64 blkno, s64 nblks, s64 * newnblks, s64 * newblkno)
-{
- int rc;
-
- /* try to extend in place */
- if ((rc = dbExtend(ip, blkno, nblks, *newnblks - nblks)) == 0) {
- *newblkno = blkno;
- return (0);
- } else {
- if (rc != -ENOSPC)
- return (rc);
- }
-
- /* in place extension not possible.
- * try to move the extent to a new set of blocks.
- */
- return (extBalloc(ip, blkno, newnblks, newblkno));
-}
-#endif /* _NOTYET */
-
-
/*
* NAME: extRoundDown()
*
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 997c81fcea34..695415cbfe98 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -388,14 +388,6 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot;
linelock = (struct linelock *) & tlck->lock;
}
-#ifdef _JFS_WIP
- else if (tlck->flag & tlckINLINELOCK) {
-
- inlinelock = (struct inlinelock *) & tlck;
- p = (caddr_t) & inlinelock->pxd;
- linelock = (struct linelock *) & tlck;
- }
-#endif /* _JFS_WIP */
else {
jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck);
return 0; /* Probably should trap */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index aa4ff7bcaff2..48d1f70f786c 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -307,13 +307,11 @@ static int chkSuper(struct super_block *sb)
}
bsize = le32_to_cpu(j_sb->s_bsize);
-#ifdef _JFS_4K
if (bsize != PSIZE) {
- jfs_err("Currently only 4K block size supported!");
+ jfs_err("Only 4K block size supported!");
rc = -EINVAL;
goto out;
}
-#endif /* _JFS_4K */
jfs_info("superblock: flag:0x%08x state:0x%08x size:0x%Lx",
le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state),
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 042bbe6d8ac2..ffd4feece078 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1490,40 +1490,6 @@ static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd,
tlck->flag |= tlckWRITEPAGE;
} else
jfs_err("diLog: UFO type tlck:0x%p", tlck);
-#ifdef _JFS_WIP
- /*
- * alloc/free external EA extent
- *
- * a maplock for txUpdateMap() to update bPWMAP for alloc/free
- * of the extent has been formatted at txLock() time;
- */
- else {
- assert(tlck->type & tlckEA);
-
- /* log LOG_UPDATEMAP for logredo() to update bmap for
- * alloc of new (and free of old) external EA extent;
- */
- lrd->type = cpu_to_le16(LOG_UPDATEMAP);
- pxdlock = (struct pxd_lock *) & tlck->lock;
- nlock = pxdlock->index;
- for (i = 0; i < nlock; i++, pxdlock++) {
- if (pxdlock->flag & mlckALLOCPXD)
- lrd->log.updatemap.type =
- cpu_to_le16(LOG_ALLOCPXD);
- else
- lrd->log.updatemap.type =
- cpu_to_le16(LOG_FREEPXD);
- lrd->log.updatemap.nxd = cpu_to_le16(1);
- lrd->log.updatemap.pxd = pxdlock->pxd;
- lrd->backchain =
- cpu_to_le32(lmLog(log, tblk, lrd, NULL));
- }
-
- /* update bmap */
- tlck->flag |= tlckUPDATEMAP;
- }
-#endif /* _JFS_WIP */
-
return;
}
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 3148e9b35f3b..2d304cee884c 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -114,17 +114,6 @@ static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split,
static int xtSplitRoot(tid_t tid, struct inode *ip,
struct xtsplit * split, struct metapage ** rmpp);
-#ifdef _STILL_TO_PORT
-static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
- xtpage_t * fp, struct btstack * btstack);
-
-static int xtSearchNode(struct inode *ip,
- xad_t * xad,
- int *cmpp, struct btstack * btstack, int flag);
-
-static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp);
-#endif /* _STILL_TO_PORT */
-
/*
* xtLookup()
*
@@ -1493,189 +1482,6 @@ int xtExtend(tid_t tid, /* transaction id */
return rc;
}
-#ifdef _NOTYET
-/*
- * xtTailgate()
- *
- * function: split existing 'tail' extent
- * (split offset >= start offset of tail extent), and
- * relocate and extend the split tail half;
- *
- * note: existing extent may or may not have been committed.
- * caller is responsible for pager buffer cache update, and
- * working block allocation map update;
- * update pmap: free old split tail extent, alloc new extent;
- */
-int xtTailgate(tid_t tid, /* transaction id */
- struct inode *ip, s64 xoff, /* split/new extent offset */
- s32 xlen, /* new extent length */
- s64 xaddr, /* new extent address */
- int flag)
-{
- int rc = 0;
- int cmp;
- struct metapage *mp; /* meta-page buffer */
- xtpage_t *p; /* base B+-tree index page */
- s64 bn;
- int index, nextindex, llen, rlen;
- struct btstack btstack; /* traverse stack */
- struct xtsplit split; /* split information */
- xad_t *xad;
- struct tlock *tlck;
- struct xtlock *xtlck = 0;
- struct tlock *mtlck;
- struct maplock *pxdlock;
-
-/*
-printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
- (ulong)xoff, xlen, (ulong)xaddr);
-*/
-
- /* there must exist extent to be tailgated */
- if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, XT_INSERT)))
- return rc;
-
- /* retrieve search result */
- XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
-
- if (cmp != 0) {
- XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "couldn't find extent\n");
- return -EIO;
- }
-
- /* entry found must be last entry */
- nextindex = le16_to_cpu(p->header.nextindex);
- if (index != nextindex - 1) {
- XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "the entry found is not the last entry\n");
- return -EIO;
- }
-
- BT_MARK_DIRTY(mp, ip);
- /*
- * acquire tlock of the leaf page containing original entry
- */
- if (!test_cflag(COMMIT_Nolink, ip)) {
- tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
- xtlck = (struct xtlock *) & tlck->lock;
- }
-
- /* completely replace extent ? */
- xad = &p->xad[index];
-/*
-printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
- (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad));
-*/
- if ((llen = xoff - offsetXAD(xad)) == 0)
- goto updateOld;
-
- /*
- * partially replace extent: insert entry for new extent
- */
-//insertNew:
- /*
- * if the leaf page is full, insert the new entry and
- * propagate up the router entry for the new page from split
- *
- * The xtSplitUp() will insert the entry and unpin the leaf page.
- */
- if (nextindex == le16_to_cpu(p->header.maxentry)) {
- /* xtSpliUp() unpins leaf pages */
- split.mp = mp;
- split.index = index + 1;
- split.flag = XAD_NEW;
- split.off = xoff; /* split offset */
- split.len = xlen;
- split.addr = xaddr;
- split.pxdlist = NULL;
- if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
- return rc;
-
- /* get back old page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
- /*
- * if leaf root has been split, original root has been
- * copied to new child page, i.e., original entry now
- * resides on the new child page;
- */
- if (p->header.flag & BT_INTERNAL) {
- ASSERT(p->header.nextindex ==
- cpu_to_le16(XTENTRYSTART + 1));
- xad = &p->xad[XTENTRYSTART];
- bn = addressXAD(xad);
- XT_PUTPAGE(mp);
-
- /* get new child page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
-
- BT_MARK_DIRTY(mp, ip);
- if (!test_cflag(COMMIT_Nolink, ip)) {
- tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
- xtlck = (struct xtlock *) & tlck->lock;
- }
- }
- }
- /*
- * insert the new entry into the leaf page
- */
- else {
- /* insert the new entry: mark the entry NEW */
- xad = &p->xad[index + 1];
- XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
-
- /* advance next available entry index */
- le16_add_cpu(&p->header.nextindex, 1);
- }
-
- /* get back old XAD */
- xad = &p->xad[index];
-
- /*
- * truncate/relocate old extent at split offset
- */
- updateOld:
- /* update dmap for old/committed/truncated extent */
- rlen = lengthXAD(xad) - llen;
- if (!(xad->flag & XAD_NEW)) {
- /* free from PWMAP at commit */
- if (!test_cflag(COMMIT_Nolink, ip)) {
- mtlck = txMaplock(tid, ip, tlckMAP);
- pxdlock = (struct maplock *) & mtlck->lock;
- pxdlock->flag = mlckFREEPXD;
- PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen);
- PXDlength(&pxdlock->pxd, rlen);
- pxdlock->index = 1;
- }
- } else
- /* free from WMAP */
- dbFree(ip, addressXAD(xad) + llen, (s64) rlen);
-
- if (llen)
- /* truncate */
- XADlength(xad, llen);
- else
- /* replace */
- XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
-
- if (!test_cflag(COMMIT_Nolink, ip)) {
- xtlck->lwm.offset = (xtlck->lwm.offset) ?
- min(index, (int)xtlck->lwm.offset) : index;
- xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
- xtlck->lwm.offset;
- }
-
- /* unpin the leaf page */
- XT_PUTPAGE(mp);
-
- return rc;
-}
-#endif /* _NOTYET */
-
/*
* xtUpdate()
*
@@ -1753,32 +1559,12 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
newindex = index + 1;
nextindex = le16_to_cpu(p->header.nextindex);
-#ifdef _JFS_WIP_NOCOALESCE
- if (xoff < nxoff)
- goto updateRight;
-
- /*
- * replace XAD with nXAD
- */
- replace: /* (nxoff == xoff) */
- if (nxlen == xlen) {
- /* replace XAD with nXAD:recorded */
- *xad = *nxad;
- xad->flag = xflag & ~XAD_NOTRECORDED;
-
- goto out;
- } else /* (nxlen < xlen) */
- goto updateLeft;
-#endif /* _JFS_WIP_NOCOALESCE */
-
-/* #ifdef _JFS_WIP_COALESCE */
if (xoff < nxoff)
goto coalesceRight;
/*
* coalesce with left XAD
*/
-//coalesceLeft: /* (xoff == nxoff) */
/* is XAD first entry of page ? */
if (index == XTENTRYSTART)
goto replace;
@@ -1897,7 +1683,6 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
jfs_error(ip->i_sb, "xoff >= nxoff\n");
return -EIO;
}
-/* #endif _JFS_WIP_COALESCE */
/*
* split XAD into (lXAD, nXAD):
@@ -2305,752 +2090,6 @@ int xtAppend(tid_t tid, /* transaction id */
return rc;
}
-#ifdef _STILL_TO_PORT
-
-/* - TBD for defragmentaion/reorganization -
- *
- * xtDelete()
- *
- * function:
- * delete the entry with the specified key.
- *
- * N.B.: whole extent of the entry is assumed to be deleted.
- *
- * parameter:
- *
- * return:
- * ENOENT: if the entry is not found.
- *
- * exception:
- */
-int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
-{
- int rc = 0;
- struct btstack btstack;
- int cmp;
- s64 bn;
- struct metapage *mp;
- xtpage_t *p;
- int index, nextindex;
- struct tlock *tlck;
- struct xtlock *xtlck;
-
- /*
- * find the matching entry; xtSearch() pins the page
- */
- if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0)))
- return rc;
-
- XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
- if (cmp) {
- /* unpin the leaf page */
- XT_PUTPAGE(mp);
- return -ENOENT;
- }
-
- /*
- * delete the entry from the leaf page
- */
- nextindex = le16_to_cpu(p->header.nextindex);
- le16_add_cpu(&p->header.nextindex, -1);
-
- /*
- * if the leaf page bocome empty, free the page
- */
- if (p->header.nextindex == cpu_to_le16(XTENTRYSTART))
- return (xtDeleteUp(tid, ip, mp, p, &btstack));
-
- BT_MARK_DIRTY(mp, ip);
- /*
- * acquire a transaction lock on the leaf page;
- *
- * action:xad deletion;
- */
- tlck = txLock(tid, ip, mp, tlckXTREE);
- xtlck = (struct xtlock *) & tlck->lock;
- xtlck->lwm.offset =
- (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index;
-
- /* if delete from middle, shift left/compact the remaining entries */
- if (index < nextindex - 1)
- memmove(&p->xad[index], &p->xad[index + 1],
- (nextindex - index - 1) * sizeof(xad_t));
-
- XT_PUTPAGE(mp);
-
- return 0;
-}
-
-
-/* - TBD for defragmentaion/reorganization -
- *
- * xtDeleteUp()
- *
- * function:
- * free empty pages as propagating deletion up the tree
- *
- * parameter:
- *
- * return:
- */
-static int
-xtDeleteUp(tid_t tid, struct inode *ip,
- struct metapage * fmp, xtpage_t * fp, struct btstack * btstack)
-{
- int rc = 0;
- struct metapage *mp;
- xtpage_t *p;
- int index, nextindex;
- s64 xaddr;
- int xlen;
- struct btframe *parent;
- struct tlock *tlck;
- struct xtlock *xtlck;
-
- /*
- * keep root leaf page which has become empty
- */
- if (fp->header.flag & BT_ROOT) {
- /* keep the root page */
- fp->header.flag &= ~BT_INTERNAL;
- fp->header.flag |= BT_LEAF;
- fp->header.nextindex = cpu_to_le16(XTENTRYSTART);
-
- /* XT_PUTPAGE(fmp); */
-
- return 0;
- }
-
- /*
- * free non-root leaf page
- */
- if ((rc = xtRelink(tid, ip, fp))) {
- XT_PUTPAGE(fmp);
- return rc;
- }
-
- xaddr = addressPXD(&fp->header.self);
- xlen = lengthPXD(&fp->header.self);
- /* free the page extent */
- dbFree(ip, xaddr, (s64) xlen);
-
- /* free the buffer page */
- discard_metapage(fmp);
-
- /*
- * propagate page deletion up the index tree
- *
- * If the delete from the parent page makes it empty,
- * continue all the way up the tree.
- * stop if the root page is reached (which is never deleted) or
- * if the entry deletion does not empty the page.
- */
- while ((parent = BT_POP(btstack)) != NULL) {
- /* get/pin the parent page <sp> */
- XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
-
- index = parent->index;
-
- /* delete the entry for the freed child page from parent.
- */
- nextindex = le16_to_cpu(p->header.nextindex);
-
- /*
- * the parent has the single entry being deleted:
- * free the parent page which has become empty.
- */
- if (nextindex == 1) {
- if (p->header.flag & BT_ROOT) {
- /* keep the root page */
- p->header.flag &= ~BT_INTERNAL;
- p->header.flag |= BT_LEAF;
- p->header.nextindex =
- cpu_to_le16(XTENTRYSTART);
-
- /* XT_PUTPAGE(mp); */
-
- break;
- } else {
- /* free the parent page */
- if ((rc = xtRelink(tid, ip, p)))
- return rc;
-
- xaddr = addressPXD(&p->header.self);
- /* free the page extent */
- dbFree(ip, xaddr,
- (s64) JFS_SBI(ip->i_sb)->nbperpage);
-
- /* unpin/free the buffer page */
- discard_metapage(mp);
-
- /* propagate up */
- continue;
- }
- }
- /*
- * the parent has other entries remaining:
- * delete the router entry from the parent page.
- */
- else {
- BT_MARK_DIRTY(mp, ip);
- /*
- * acquire a transaction lock on the leaf page;
- *
- * action:xad deletion;
- */
- tlck = txLock(tid, ip, mp, tlckXTREE);
- xtlck = (struct xtlock *) & tlck->lock;
- xtlck->lwm.offset =
- (xtlck->lwm.offset) ? min(index,
- xtlck->lwm.
- offset) : index;
-
- /* if delete from middle,
- * shift left/compact the remaining entries in the page
- */
- if (index < nextindex - 1)
- memmove(&p->xad[index], &p->xad[index + 1],
- (nextindex - index -
- 1) << L2XTSLOTSIZE);
-
- le16_add_cpu(&p->header.nextindex, -1);
- jfs_info("xtDeleteUp(entry): 0x%lx[%d]",
- (ulong) parent->bn, index);
- }
-
- /* unpin the parent page */
- XT_PUTPAGE(mp);
-
- /* exit propagation up */
- break;
- }
-
- return 0;
-}
-
-
-/*
- * NAME: xtRelocate()
- *
- * FUNCTION: relocate xtpage or data extent of regular file;
- * This function is mainly used by defragfs utility.
- *
- * NOTE: This routine does not have the logic to handle
- * uncommitted allocated extent. The caller should call
- * txCommit() to commit all the allocation before call
- * this routine.
- */
-int
-xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
- s64 nxaddr, /* new xaddr */
- int xtype)
-{ /* extent type: XTPAGE or DATAEXT */
- int rc = 0;
- struct tblock *tblk;
- struct tlock *tlck;
- struct xtlock *xtlck;
- struct metapage *mp, *pmp, *lmp, *rmp; /* meta-page buffer */
- xtpage_t *p, *pp, *rp, *lp; /* base B+-tree index page */
- xad_t *xad;
- pxd_t *pxd;
- s64 xoff, xsize;
- int xlen;
- s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn;
- cbuf_t *cp;
- s64 offset, nbytes, nbrd, pno;
- int nb, npages, nblks;
- s64 bn;
- int cmp;
- int index;
- struct pxd_lock *pxdlock;
- struct btstack btstack; /* traverse stack */
-
- xtype = xtype & EXTENT_TYPE;
-
- xoff = offsetXAD(oxad);
- oxaddr = addressXAD(oxad);
- xlen = lengthXAD(oxad);
-
- /* validate extent offset */
- offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
- if (offset >= ip->i_size)
- return -ESTALE; /* stale extent */
-
- jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx",
- xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr);
-
- /*
- * 1. get and validate the parent xtpage/xad entry
- * covering the source extent to be relocated;
- */
- if (xtype == DATAEXT) {
- /* search in leaf entry */
- rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0);
- if (rc)
- return rc;
-
- /* retrieve search result */
- XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
-
- if (cmp) {
- XT_PUTPAGE(pmp);
- return -ESTALE;
- }
-
- /* validate for exact match with a single entry */
- xad = &pp->xad[index];
- if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) {
- XT_PUTPAGE(pmp);
- return -ESTALE;
- }
- } else { /* (xtype == XTPAGE) */
-
- /* search in internal entry */
- rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0);
- if (rc)
- return rc;
-
- /* retrieve search result */
- XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
-
- if (cmp) {
- XT_PUTPAGE(pmp);
- return -ESTALE;
- }
-
- /* xtSearchNode() validated for exact match with a single entry
- */
- xad = &pp->xad[index];
- }
- jfs_info("xtRelocate: parent xad entry validated.");
-
- /*
- * 2. relocate the extent
- */
- if (xtype == DATAEXT) {
- /* if the extent is allocated-but-not-recorded
- * there is no real data to be moved in this extent,
- */
- if (xad->flag & XAD_NOTRECORDED)
- goto out;
- else
- /* release xtpage for cmRead()/xtLookup() */
- XT_PUTPAGE(pmp);
-
- /*
- * cmRelocate()
- *
- * copy target data pages to be relocated;
- *
- * data extent must start at page boundary and
- * multiple of page size (except the last data extent);
- * read in each page of the source data extent into cbuf,
- * update the cbuf extent descriptor of the page to be
- * homeward bound to new dst data extent
- * copy the data from the old extent to new extent.
- * copy is essential for compressed files to avoid problems
- * that can arise if there was a change in compression
- * algorithms.
- * it is a good strategy because it may disrupt cache
- * policy to keep the pages in memory afterwards.
- */
- offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
- assert((offset & CM_OFFSET) == 0);
- nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize;
- pno = offset >> CM_L2BSIZE;
- npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE;
-/*
- npages = ((offset + nbytes - 1) >> CM_L2BSIZE) -
- (offset >> CM_L2BSIZE) + 1;
-*/
- sxaddr = oxaddr;
- dxaddr = nxaddr;
-
- /* process the request one cache buffer at a time */
- for (nbrd = 0; nbrd < nbytes; nbrd += nb,
- offset += nb, pno++, npages--) {
- /* compute page size */
- nb = min(nbytes - nbrd, CM_BSIZE);
-
- /* get the cache buffer of the page */
- if (rc = cmRead(ip, offset, npages, &cp))
- break;
-
- assert(addressPXD(&cp->cm_pxd) == sxaddr);
- assert(!cp->cm_modified);
-
- /* bind buffer with the new extent address */
- nblks = nb >> JFS_IP(ip->i_sb)->l2bsize;
- cmSetXD(ip, cp, pno, dxaddr, nblks);
-
- /* release the cbuf, mark it as modified */
- cmPut(cp, true);
-
- dxaddr += nblks;
- sxaddr += nblks;
- }
-
- /* get back parent page */
- if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0)))
- return rc;
-
- XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
- jfs_info("xtRelocate: target data extent relocated.");
- } else { /* (xtype == XTPAGE) */
-
- /*
- * read in the target xtpage from the source extent;
- */
- XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
- if (rc) {
- XT_PUTPAGE(pmp);
- return rc;
- }
-
- /*
- * read in sibling pages if any to update sibling pointers;
- */
- rmp = NULL;
- if (p->header.next) {
- nextbn = le64_to_cpu(p->header.next);
- XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
- if (rc) {
- XT_PUTPAGE(pmp);
- XT_PUTPAGE(mp);
- return (rc);
- }
- }
-
- lmp = NULL;
- if (p->header.prev) {
- prevbn = le64_to_cpu(p->header.prev);
- XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
- if (rc) {
- XT_PUTPAGE(pmp);
- XT_PUTPAGE(mp);
- if (rmp)
- XT_PUTPAGE(rmp);
- return (rc);
- }
- }
-
- /* at this point, all xtpages to be updated are in memory */
-
- /*
- * update sibling pointers of sibling xtpages if any;
- */
- if (lmp) {
- BT_MARK_DIRTY(lmp, ip);
- tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK);
- lp->header.next = cpu_to_le64(nxaddr);
- XT_PUTPAGE(lmp);
- }
-
- if (rmp) {
- BT_MARK_DIRTY(rmp, ip);
- tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK);
- rp->header.prev = cpu_to_le64(nxaddr);
- XT_PUTPAGE(rmp);
- }
-
- /*
- * update the target xtpage to be relocated
- *
- * update the self address of the target page
- * and write to destination extent;
- * redo image covers the whole xtpage since it is new page
- * to the destination extent;
- * update of bmap for the free of source extent
- * of the target xtpage itself:
- * update of bmap for the allocation of destination extent
- * of the target xtpage itself:
- * update of bmap for the extents covered by xad entries in
- * the target xtpage is not necessary since they are not
- * updated;
- * if not committed before this relocation,
- * target page may contain XAD_NEW entries which must
- * be scanned for bmap update (logredo() always
- * scan xtpage REDOPAGE image for bmap update);
- * if committed before this relocation (tlckRELOCATE),
- * scan may be skipped by commit() and logredo();
- */
- BT_MARK_DIRTY(mp, ip);
- /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */
- tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW);
- xtlck = (struct xtlock *) & tlck->lock;
-
- /* update the self address in the xtpage header */
- pxd = &p->header.self;
- PXDaddress(pxd, nxaddr);
-
- /* linelock for the after image of the whole page */
- xtlck->lwm.length =
- le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
-
- /* update the buffer extent descriptor of target xtpage */
- xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
- bmSetXD(mp, nxaddr, xsize);
-
- /* unpin the target page to new homeward bound */
- XT_PUTPAGE(mp);
- jfs_info("xtRelocate: target xtpage relocated.");
- }
-
- /*
- * 3. acquire maplock for the source extent to be freed;
- *
- * acquire a maplock saving the src relocated extent address;
- * to free of the extent at commit time;
- */
- out:
- /* if DATAEXT relocation, write a LOG_UPDATEMAP record for
- * free PXD of the source data extent (logredo() will update
- * bmap for free of source data extent), and update bmap for
- * free of the source data extent;
- */
- if (xtype == DATAEXT)
- tlck = txMaplock(tid, ip, tlckMAP);
- /* if XTPAGE relocation, write a LOG_NOREDOPAGE record
- * for the source xtpage (logredo() will init NoRedoPage
- * filter and will also update bmap for free of the source
- * xtpage), and update bmap for free of the source xtpage;
- * N.B. We use tlckMAP instead of tlkcXTREE because there
- * is no buffer associated with this lock since the buffer
- * has been redirected to the target location.
- */
- else /* (xtype == XTPAGE) */
- tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE);
-
- pxdlock = (struct pxd_lock *) & tlck->lock;
- pxdlock->flag = mlckFREEPXD;
- PXDaddress(&pxdlock->pxd, oxaddr);
- PXDlength(&pxdlock->pxd, xlen);
- pxdlock->index = 1;
-
- /*
- * 4. update the parent xad entry for relocation;
- *
- * acquire tlck for the parent entry with XAD_NEW as entry
- * update which will write LOG_REDOPAGE and update bmap for
- * allocation of XAD_NEW destination extent;
- */
- jfs_info("xtRelocate: update parent xad entry.");
- BT_MARK_DIRTY(pmp, ip);
- tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW);
- xtlck = (struct xtlock *) & tlck->lock;
-
- /* update the XAD with the new destination extent; */
- xad = &pp->xad[index];
- xad->flag |= XAD_NEW;
- XADaddress(xad, nxaddr);
-
- xtlck->lwm.offset = min(index, xtlck->lwm.offset);
- xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) -
- xtlck->lwm.offset;
-
- /* unpin the parent xtpage */
- XT_PUTPAGE(pmp);
-
- return rc;
-}
-
-
-/*
- * xtSearchNode()
- *
- * function: search for the internal xad entry covering specified extent.
- * This function is mainly used by defragfs utility.
- *
- * parameters:
- * ip - file object;
- * xad - extent to find;
- * cmpp - comparison result:
- * btstack - traverse stack;
- * flag - search process flag;
- *
- * returns:
- * btstack contains (bn, index) of search path traversed to the entry.
- * *cmpp is set to result of comparison with the entry returned.
- * the page containing the entry is pinned at exit.
- */
-static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
- int *cmpp, struct btstack * btstack, int flag)
-{
- int rc = 0;
- s64 xoff, xaddr;
- int xlen;
- int cmp = 1; /* init for empty page */
- s64 bn; /* block number */
- struct metapage *mp; /* meta-page buffer */
- xtpage_t *p; /* page */
- int base, index, lim;
- struct btframe *btsp;
- s64 t64;
-
- BT_CLR(btstack);
-
- xoff = offsetXAD(xad);
- xlen = lengthXAD(xad);
- xaddr = addressXAD(xad);
-
- /*
- * search down tree from root:
- *
- * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
- * internal page, child page Pi contains entry with k, Ki <= K < Kj.
- *
- * if entry with search key K is not found
- * internal page search find the entry with largest key Ki
- * less than K which point to the child page to search;
- * leaf page search find the entry with smallest key Kj
- * greater than K so that the returned index is the position of
- * the entry to be shifted right for insertion of new entry.
- * for empty tree, search key is greater than any key of the tree.
- *
- * by convention, root bn = 0.
- */
- for (bn = 0;;) {
- /* get/pin the page to search */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
- if (p->header.flag & BT_LEAF) {
- XT_PUTPAGE(mp);
- return -ESTALE;
- }
-
- lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
-
- /*
- * binary search with search key K on the current page
- */
- for (base = XTENTRYSTART; lim; lim >>= 1) {
- index = base + (lim >> 1);
-
- XT_CMP(cmp, xoff, &p->xad[index], t64);
- if (cmp == 0) {
- /*
- * search hit
- *
- * verify for exact match;
- */
- if (xaddr == addressXAD(&p->xad[index]) &&
- xoff == offsetXAD(&p->xad[index])) {
- *cmpp = cmp;
-
- /* save search result */
- btsp = btstack->top;
- btsp->bn = bn;
- btsp->index = index;
- btsp->mp = mp;
-
- return 0;
- }
-
- /* descend/search its child page */
- goto next;
- }
-
- if (cmp > 0) {
- base = index + 1;
- --lim;
- }
- }
-
- /*
- * search miss - non-leaf page:
- *
- * base is the smallest index with key (Kj) greater than
- * search key (K) and may be zero or maxentry index.
- * if base is non-zero, decrement base by one to get the parent
- * entry of the child page to search.
- */
- index = base ? base - 1 : base;
-
- /*
- * go down to child page
- */
- next:
- /* get the child page block number */
- bn = addressXAD(&p->xad[index]);
-
- /* unpin the parent page */
- XT_PUTPAGE(mp);
- }
-}
-
-
-/*
- * xtRelink()
- *
- * function:
- * link around a freed page.
- *
- * Parameter:
- * int tid,
- * struct inode *ip,
- * xtpage_t *p)
- *
- * returns:
- */
-static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p)
-{
- int rc = 0;
- struct metapage *mp;
- s64 nextbn, prevbn;
- struct tlock *tlck;
-
- nextbn = le64_to_cpu(p->header.next);
- prevbn = le64_to_cpu(p->header.prev);
-
- /* update prev pointer of the next page */
- if (nextbn != 0) {
- XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
-
- /*
- * acquire a transaction lock on the page;
- *
- * action: update prev pointer;
- */
- BT_MARK_DIRTY(mp, ip);
- tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
-
- /* the page may already have been tlock'd */
-
- p->header.prev = cpu_to_le64(prevbn);
-
- XT_PUTPAGE(mp);
- }
-
- /* update next pointer of the previous page */
- if (prevbn != 0) {
- XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
-
- /*
- * acquire a transaction lock on the page;
- *
- * action: update next pointer;
- */
- BT_MARK_DIRTY(mp, ip);
- tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
-
- /* the page may already have been tlock'd */
-
- p->header.next = le64_to_cpu(nextbn);
-
- XT_PUTPAGE(mp);
- }
-
- return 0;
-}
-#endif /* _STILL_TO_PORT */
-
/*
* xtInitRoot()
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 5f51be8596b3..142caafc73b1 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -95,10 +95,6 @@ extern int xtInsert(tid_t tid, struct inode *ip,
int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag);
extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen,
int flag);
-#ifdef _NOTYET
-extern int xtTailgate(tid_t tid, struct inode *ip,
- s64 xoff, int xlen, s64 xaddr, int flag);
-#endif
extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad);
extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen,
int flag);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index e205fde7163a..6eca72cfa1f2 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -18,7 +18,15 @@
#include "kernfs-internal.h"
static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */
-static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */
+/*
+ * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to
+ * call pr_cont() while holding rename_lock. Because sometimes pr_cont()
+ * will perform wakeups when releasing console_sem. Holding rename_lock
+ * will introduce deadlock if the scheduler reads the kernfs_name in the
+ * wakeup path.
+ */
+static DEFINE_SPINLOCK(kernfs_pr_cont_lock);
+static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */
static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */
#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
@@ -229,12 +237,12 @@ void pr_cont_kernfs_name(struct kernfs_node *kn)
{
unsigned long flags;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
+ spin_lock_irqsave(&kernfs_pr_cont_lock, flags);
- kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
+ kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
pr_cont("%s", kernfs_pr_cont_buf);
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags);
}
/**
@@ -248,10 +256,10 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
unsigned long flags;
int sz;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
+ spin_lock_irqsave(&kernfs_pr_cont_lock, flags);
- sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf,
- sizeof(kernfs_pr_cont_buf));
+ sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf,
+ sizeof(kernfs_pr_cont_buf));
if (sz < 0) {
pr_cont("(error)");
goto out;
@@ -265,7 +273,7 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
pr_cont("%s", kernfs_pr_cont_buf);
out:
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags);
}
/**
@@ -823,13 +831,12 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);
- /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
- spin_lock_irq(&kernfs_rename_lock);
+ spin_lock_irq(&kernfs_pr_cont_lock);
len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
if (len >= sizeof(kernfs_pr_cont_buf)) {
- spin_unlock_irq(&kernfs_rename_lock);
+ spin_unlock_irq(&kernfs_pr_cont_lock);
return NULL;
}
@@ -841,7 +848,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
parent = kernfs_find_ns(parent, name, ns);
}
- spin_unlock_irq(&kernfs_rename_lock);
+ spin_unlock_irq(&kernfs_pr_cont_lock);
return parent;
}
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 88423069407c..e3abfa843879 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -33,7 +33,6 @@ static DEFINE_SPINLOCK(kernfs_open_node_lock);
static DEFINE_MUTEX(kernfs_open_file_mutex);
struct kernfs_open_node {
- atomic_t refcnt;
atomic_t event;
wait_queue_head_t poll;
struct list_head files; /* goes through kernfs_open_file.list */
@@ -530,10 +529,8 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
}
on = kn->attr.open;
- if (on) {
- atomic_inc(&on->refcnt);
+ if (on)
list_add_tail(&of->list, &on->files);
- }
spin_unlock_irq(&kernfs_open_node_lock);
mutex_unlock(&kernfs_open_file_mutex);
@@ -548,7 +545,6 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
if (!new_on)
return -ENOMEM;
- atomic_set(&new_on->refcnt, 0);
atomic_set(&new_on->event, 1);
init_waitqueue_head(&new_on->poll);
INIT_LIST_HEAD(&new_on->files);
@@ -556,17 +552,19 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
}
/**
- * kernfs_put_open_node - put kernfs_open_node
- * @kn: target kernfs_nodet
+ * kernfs_unlink_open_file - Unlink @of from @kn.
+ *
+ * @kn: target kernfs_node
* @of: associated kernfs_open_file
*
- * Put @kn->attr.open and unlink @of from the files list. If
- * reference count reaches zero, disassociate and free it.
+ * Unlink @of from list of @kn's associated open files. If list of
+ * associated open files becomes empty, disassociate and free
+ * kernfs_open_node.
*
* LOCKING:
* None.
*/
-static void kernfs_put_open_node(struct kernfs_node *kn,
+static void kernfs_unlink_open_file(struct kernfs_node *kn,
struct kernfs_open_file *of)
{
struct kernfs_open_node *on = kn->attr.open;
@@ -578,7 +576,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn,
if (of)
list_del(&of->list);
- if (atomic_dec_and_test(&on->refcnt))
+ if (list_empty(&on->files))
kn->attr.open = NULL;
else
on = NULL;
@@ -706,7 +704,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
return 0;
err_put_node:
- kernfs_put_open_node(kn, of);
+ kernfs_unlink_open_file(kn, of);
err_seq_release:
seq_release(inode, file);
err_free:
@@ -752,7 +750,7 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
mutex_unlock(&kernfs_open_file_mutex);
}
- kernfs_put_open_node(kn, of);
+ kernfs_unlink_open_file(kn, of);
seq_release(inode, filp);
kfree(of->prealloc_buf);
kfree(of);
@@ -768,15 +766,24 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
return;
- spin_lock_irq(&kernfs_open_node_lock);
- on = kn->attr.open;
- if (on)
- atomic_inc(&on->refcnt);
- spin_unlock_irq(&kernfs_open_node_lock);
- if (!on)
+ /*
+ * lockless opportunistic check is safe below because no one is adding to
+ * ->attr.open at this point of time. This check allows early bail out
+ * if ->attr.open is already NULL. kernfs_unlink_open_file makes
+ * ->attr.open NULL only while holding kernfs_open_file_mutex so below
+ * check under kernfs_open_file_mutex will ensure bailing out if
+ * ->attr.open became NULL while waiting for the mutex.
+ */
+ if (!kn->attr.open)
return;
mutex_lock(&kernfs_open_file_mutex);
+ if (!kn->attr.open) {
+ mutex_unlock(&kernfs_open_file_mutex);
+ return;
+ }
+
+ on = kn->attr.open;
list_for_each_entry(of, &on->files, list) {
struct inode *inode = file_inode(of->file);
@@ -789,8 +796,6 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
}
mutex_unlock(&kernfs_open_file_mutex);
-
- kernfs_put_open_node(kn, NULL);
}
/*
diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c
index 208d2cff7bd3..e8f476c5f189 100644
--- a/fs/ksmbd/connection.c
+++ b/fs/ksmbd/connection.c
@@ -62,7 +62,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
atomic_set(&conn->req_running, 0);
atomic_set(&conn->r_count, 0);
conn->total_credits = 1;
- conn->outstanding_credits = 1;
+ conn->outstanding_credits = 0;
init_waitqueue_head(&conn->req_running_q);
INIT_LIST_HEAD(&conn->conns_list);
@@ -205,31 +205,31 @@ int ksmbd_conn_write(struct ksmbd_work *work)
return 0;
}
-int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf,
- unsigned int buflen, u32 remote_key, u64 remote_offset,
- u32 remote_len)
+int ksmbd_conn_rdma_read(struct ksmbd_conn *conn,
+ void *buf, unsigned int buflen,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len)
{
int ret = -EINVAL;
if (conn->transport->ops->rdma_read)
ret = conn->transport->ops->rdma_read(conn->transport,
buf, buflen,
- remote_key, remote_offset,
- remote_len);
+ desc, desc_len);
return ret;
}
-int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf,
- unsigned int buflen, u32 remote_key,
- u64 remote_offset, u32 remote_len)
+int ksmbd_conn_rdma_write(struct ksmbd_conn *conn,
+ void *buf, unsigned int buflen,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len)
{
int ret = -EINVAL;
if (conn->transport->ops->rdma_write)
ret = conn->transport->ops->rdma_write(conn->transport,
buf, buflen,
- remote_key, remote_offset,
- remote_len);
+ desc, desc_len);
return ret;
}
diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h
index 7a59aacb5daa..98c1cbe45ec9 100644
--- a/fs/ksmbd/connection.h
+++ b/fs/ksmbd/connection.h
@@ -122,11 +122,14 @@ struct ksmbd_transport_ops {
int (*writev)(struct ksmbd_transport *t, struct kvec *iovs, int niov,
int size, bool need_invalidate_rkey,
unsigned int remote_key);
- int (*rdma_read)(struct ksmbd_transport *t, void *buf, unsigned int len,
- u32 remote_key, u64 remote_offset, u32 remote_len);
- int (*rdma_write)(struct ksmbd_transport *t, void *buf,
- unsigned int len, u32 remote_key, u64 remote_offset,
- u32 remote_len);
+ int (*rdma_read)(struct ksmbd_transport *t,
+ void *buf, unsigned int len,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len);
+ int (*rdma_write)(struct ksmbd_transport *t,
+ void *buf, unsigned int len,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len);
};
struct ksmbd_transport {
@@ -148,12 +151,14 @@ struct ksmbd_conn *ksmbd_conn_alloc(void);
void ksmbd_conn_free(struct ksmbd_conn *conn);
bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c);
int ksmbd_conn_write(struct ksmbd_work *work);
-int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf,
- unsigned int buflen, u32 remote_key, u64 remote_offset,
- u32 remote_len);
-int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf,
- unsigned int buflen, u32 remote_key, u64 remote_offset,
- u32 remote_len);
+int ksmbd_conn_rdma_read(struct ksmbd_conn *conn,
+ void *buf, unsigned int buflen,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len);
+int ksmbd_conn_rdma_write(struct ksmbd_conn *conn,
+ void *buf, unsigned int buflen,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len);
void ksmbd_conn_enqueue_request(struct ksmbd_work *work);
int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work);
void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops);
diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h
index ebe6ca08467a..52aa0adeb951 100644
--- a/fs/ksmbd/ksmbd_netlink.h
+++ b/fs/ksmbd/ksmbd_netlink.h
@@ -104,7 +104,8 @@ struct ksmbd_startup_request {
*/
__u32 sub_auth[3]; /* Subauth value for Security ID */
__u32 smb2_max_credits; /* MAX credits */
- __u32 reserved[128]; /* Reserved room */
+ __u32 smbd_max_io_size; /* smbd read write size */
+ __u32 reserved[127]; /* Reserved room */
__u32 ifc_list_sz; /* interfaces list size */
__s8 ____payload[];
};
diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c
index 1e2076a53bed..df991107ad2c 100644
--- a/fs/ksmbd/misc.c
+++ b/fs/ksmbd/misc.c
@@ -20,7 +20,7 @@
* wildcard '*' and '?'
* TODO : implement consideration about DOS_DOT, DOS_QM and DOS_STAR
*
- * @string: string to compare with a pattern
+ * @str: string to compare with a pattern
* @len: string length
* @pattern: pattern string which might include wildcard '*' and '?'
*
@@ -152,8 +152,8 @@ out:
/**
* convert_to_nt_pathname() - extract and return windows path string
* whose share directory prefix was removed from file path
- * @filename : unix filename
- * @sharepath: share path string
+ * @share: ksmbd_share_config pointer
+ * @path: path to report
*
* Return : windows path string or error
*/
@@ -250,8 +250,8 @@ char *ksmbd_extract_sharename(char *treename)
/**
* convert_to_unix_name() - convert windows name to unix format
- * @path: name to be converted
- * @tid: tree id of mathing share
+ * @share: ksmbd_share_config pointer
+ * @name: file name that is relative to share
*
* Return: converted name on success, otherwise NULL
*/
diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c
index 4a9460153b59..f8f456377a51 100644
--- a/fs/ksmbd/smb2misc.c
+++ b/fs/ksmbd/smb2misc.c
@@ -338,7 +338,7 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn,
ret = 1;
}
- if ((u64)conn->outstanding_credits + credit_charge > conn->vals->max_credits) {
+ if ((u64)conn->outstanding_credits + credit_charge > conn->total_credits) {
ksmbd_debug(SMB, "Limits exceeding the maximum allowable outstanding requests, given : %u, pending : %u\n",
credit_charge, conn->outstanding_credits);
ret = 1;
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 16c803a9d996..e6f4ccc12f49 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -3938,6 +3938,12 @@ int smb2_query_dir(struct ksmbd_work *work)
set_ctx_actor(&dir_fp->readdir_data.ctx, __query_dir);
rc = iterate_dir(dir_fp->filp, &dir_fp->readdir_data.ctx);
+ /*
+ * req->OutputBufferLength is too small to contain even one entry.
+ * In this case, it immediately returns OutputBufferLength 0 to client.
+ */
+ if (!d_info.out_buf_len && !d_info.num_entry)
+ goto no_buf_len;
if (rc == 0)
restart_ctx(&dir_fp->readdir_data.ctx);
if (rc == -ENOSPC)
@@ -3964,10 +3970,12 @@ int smb2_query_dir(struct ksmbd_work *work)
rsp->Buffer[0] = 0;
inc_rfc1001_len(work->response_buf, 9);
} else {
+no_buf_len:
((struct file_directory_info *)
((char *)rsp->Buffer + d_info.last_entry_offset))
->NextEntryOffset = 0;
- d_info.data_count -= d_info.last_entry_off_align;
+ if (d_info.data_count >= d_info.last_entry_off_align)
+ d_info.data_count -= d_info.last_entry_off_align;
rsp->StructureSize = cpu_to_le16(9);
rsp->OutputBufferOffset = cpu_to_le16(72);
@@ -6116,7 +6124,6 @@ out:
static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work,
struct smb2_buffer_desc_v1 *desc,
__le32 Channel,
- __le16 ChannelInfoOffset,
__le16 ChannelInfoLength)
{
unsigned int i, ch_count;
@@ -6134,15 +6141,13 @@ static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work,
le32_to_cpu(desc[i].length));
}
}
- if (ch_count != 1) {
- ksmbd_debug(RDMA, "RDMA multiple buffer descriptors %d are not supported yet\n",
- ch_count);
+ if (!ch_count)
return -EINVAL;
- }
work->need_invalidate_rkey =
(Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE);
- work->remote_key = le32_to_cpu(desc->token);
+ if (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE)
+ work->remote_key = le32_to_cpu(desc->token);
return 0;
}
@@ -6150,14 +6155,12 @@ static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work,
struct smb2_read_req *req, void *data_buf,
size_t length)
{
- struct smb2_buffer_desc_v1 *desc =
- (struct smb2_buffer_desc_v1 *)&req->Buffer[0];
int err;
err = ksmbd_conn_rdma_write(work->conn, data_buf, length,
- le32_to_cpu(desc->token),
- le64_to_cpu(desc->offset),
- le32_to_cpu(desc->length));
+ (struct smb2_buffer_desc_v1 *)
+ ((char *)req + le16_to_cpu(req->ReadChannelInfoOffset)),
+ le16_to_cpu(req->ReadChannelInfoLength));
if (err)
return err;
@@ -6180,6 +6183,8 @@ int smb2_read(struct ksmbd_work *work)
size_t length, mincount;
ssize_t nbytes = 0, remain_bytes = 0;
int err = 0;
+ bool is_rdma_channel = false;
+ unsigned int max_read_size = conn->vals->max_read_size;
WORK_BUFFERS(work, req, rsp);
@@ -6191,6 +6196,11 @@ int smb2_read(struct ksmbd_work *work)
if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE ||
req->Channel == SMB2_CHANNEL_RDMA_V1) {
+ is_rdma_channel = true;
+ max_read_size = get_smbd_max_read_write_size();
+ }
+
+ if (is_rdma_channel == true) {
unsigned int ch_offset = le16_to_cpu(req->ReadChannelInfoOffset);
if (ch_offset < offsetof(struct smb2_read_req, Buffer)) {
@@ -6201,7 +6211,6 @@ int smb2_read(struct ksmbd_work *work)
(struct smb2_buffer_desc_v1 *)
((char *)req + ch_offset),
req->Channel,
- req->ReadChannelInfoOffset,
req->ReadChannelInfoLength);
if (err)
goto out;
@@ -6223,9 +6232,9 @@ int smb2_read(struct ksmbd_work *work)
length = le32_to_cpu(req->Length);
mincount = le32_to_cpu(req->MinimumCount);
- if (length > conn->vals->max_read_size) {
+ if (length > max_read_size) {
ksmbd_debug(SMB, "limiting read size to max size(%u)\n",
- conn->vals->max_read_size);
+ max_read_size);
err = -EINVAL;
goto out;
}
@@ -6257,8 +6266,7 @@ int smb2_read(struct ksmbd_work *work)
ksmbd_debug(SMB, "nbytes %zu, offset %lld mincount %zu\n",
nbytes, offset, mincount);
- if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE ||
- req->Channel == SMB2_CHANNEL_RDMA_V1) {
+ if (is_rdma_channel == true) {
/* write data to the client using rdma channel */
remain_bytes = smb2_read_rdma_channel(work, req,
work->aux_payload_buf,
@@ -6328,23 +6336,18 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work)
length = le32_to_cpu(req->Length);
id = req->VolatileFileId;
- if (le16_to_cpu(req->DataOffset) ==
- offsetof(struct smb2_write_req, Buffer)) {
- data_buf = (char *)&req->Buffer[0];
- } else {
- if ((u64)le16_to_cpu(req->DataOffset) + length >
- get_rfc1002_len(work->request_buf)) {
- pr_err("invalid write data offset %u, smb_len %u\n",
- le16_to_cpu(req->DataOffset),
- get_rfc1002_len(work->request_buf));
- err = -EINVAL;
- goto out;
- }
-
- data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
- le16_to_cpu(req->DataOffset));
+ if ((u64)le16_to_cpu(req->DataOffset) + length >
+ get_rfc1002_len(work->request_buf)) {
+ pr_err("invalid write data offset %u, smb_len %u\n",
+ le16_to_cpu(req->DataOffset),
+ get_rfc1002_len(work->request_buf));
+ err = -EINVAL;
+ goto out;
}
+ data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
+ le16_to_cpu(req->DataOffset));
+
rpc_resp = ksmbd_rpc_write(work->sess, id, data_buf, length);
if (rpc_resp) {
if (rpc_resp->flags == KSMBD_RPC_ENOTIMPLEMENTED) {
@@ -6384,21 +6387,18 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work,
struct ksmbd_file *fp,
loff_t offset, size_t length, bool sync)
{
- struct smb2_buffer_desc_v1 *desc;
char *data_buf;
int ret;
ssize_t nbytes;
- desc = (struct smb2_buffer_desc_v1 *)&req->Buffer[0];
-
data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO);
if (!data_buf)
return -ENOMEM;
ret = ksmbd_conn_rdma_read(work->conn, data_buf, length,
- le32_to_cpu(desc->token),
- le64_to_cpu(desc->offset),
- le32_to_cpu(desc->length));
+ (struct smb2_buffer_desc_v1 *)
+ ((char *)req + le16_to_cpu(req->WriteChannelInfoOffset)),
+ le16_to_cpu(req->WriteChannelInfoLength));
if (ret < 0) {
kvfree(data_buf);
return ret;
@@ -6427,8 +6427,9 @@ int smb2_write(struct ksmbd_work *work)
size_t length;
ssize_t nbytes;
char *data_buf;
- bool writethrough = false;
+ bool writethrough = false, is_rdma_channel = false;
int err = 0;
+ unsigned int max_write_size = work->conn->vals->max_write_size;
WORK_BUFFERS(work, req, rsp);
@@ -6437,8 +6438,17 @@ int smb2_write(struct ksmbd_work *work)
return smb2_write_pipe(work);
}
+ offset = le64_to_cpu(req->Offset);
+ length = le32_to_cpu(req->Length);
+
if (req->Channel == SMB2_CHANNEL_RDMA_V1 ||
req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) {
+ is_rdma_channel = true;
+ max_write_size = get_smbd_max_read_write_size();
+ length = le32_to_cpu(req->RemainingBytes);
+ }
+
+ if (is_rdma_channel == true) {
unsigned int ch_offset = le16_to_cpu(req->WriteChannelInfoOffset);
if (req->Length != 0 || req->DataOffset != 0 ||
@@ -6450,7 +6460,6 @@ int smb2_write(struct ksmbd_work *work)
(struct smb2_buffer_desc_v1 *)
((char *)req + ch_offset),
req->Channel,
- req->WriteChannelInfoOffset,
req->WriteChannelInfoLength);
if (err)
goto out;
@@ -6474,12 +6483,9 @@ int smb2_write(struct ksmbd_work *work)
goto out;
}
- offset = le64_to_cpu(req->Offset);
- length = le32_to_cpu(req->Length);
-
- if (length > work->conn->vals->max_write_size) {
+ if (length > max_write_size) {
ksmbd_debug(SMB, "limiting write size to max size(%u)\n",
- work->conn->vals->max_write_size);
+ max_write_size);
err = -EINVAL;
goto out;
}
@@ -6487,24 +6493,17 @@ int smb2_write(struct ksmbd_work *work)
if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH)
writethrough = true;
- if (req->Channel != SMB2_CHANNEL_RDMA_V1 &&
- req->Channel != SMB2_CHANNEL_RDMA_V1_INVALIDATE) {
- if (le16_to_cpu(req->DataOffset) ==
- offsetof(struct smb2_write_req, Buffer)) {
- data_buf = (char *)&req->Buffer[0];
- } else {
- if ((u64)le16_to_cpu(req->DataOffset) + length >
- get_rfc1002_len(work->request_buf)) {
- pr_err("invalid write data offset %u, smb_len %u\n",
- le16_to_cpu(req->DataOffset),
- get_rfc1002_len(work->request_buf));
- err = -EINVAL;
- goto out;
- }
-
- data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
- le16_to_cpu(req->DataOffset));
+ if (is_rdma_channel == false) {
+ if ((u64)le16_to_cpu(req->DataOffset) + length >
+ get_rfc1002_len(work->request_buf)) {
+ pr_err("invalid write data offset %u, smb_len %u\n",
+ le16_to_cpu(req->DataOffset),
+ get_rfc1002_len(work->request_buf));
+ err = -EINVAL;
+ goto out;
}
+ data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
+ le16_to_cpu(req->DataOffset));
ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags));
if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH)
@@ -6520,8 +6519,7 @@ int smb2_write(struct ksmbd_work *work)
/* read data from the client using rdma channel, and
* write the data.
*/
- nbytes = smb2_write_rdma_channel(work, req, fp, offset,
- le32_to_cpu(req->RemainingBytes),
+ nbytes = smb2_write_rdma_channel(work, req, fp, offset, length,
writethrough);
if (nbytes < 0) {
err = (int)nbytes;
diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c
index 9a7e211dbf4f..7f8ab14fb8ec 100644
--- a/fs/ksmbd/smb_common.c
+++ b/fs/ksmbd/smb_common.c
@@ -140,8 +140,10 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work)
hdr = work->request_buf;
if (*(__le32 *)hdr->Protocol == SMB1_PROTO_NUMBER &&
- hdr->Command == SMB_COM_NEGOTIATE)
+ hdr->Command == SMB_COM_NEGOTIATE) {
+ work->conn->outstanding_credits++;
return 0;
+ }
return -EINVAL;
}
diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c
index 6ecf55ea1fed..38f23bf981ac 100644
--- a/fs/ksmbd/smbacl.c
+++ b/fs/ksmbd/smbacl.c
@@ -1261,6 +1261,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
if (!access_bits)
access_bits =
SET_MINIMUM_RIGHTS;
+ posix_acl_release(posix_acls);
goto check_access_bits;
}
}
diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c
index 3ad6881e0f7e..7cb0eeb07c80 100644
--- a/fs/ksmbd/transport_ipc.c
+++ b/fs/ksmbd/transport_ipc.c
@@ -26,6 +26,7 @@
#include "mgmt/ksmbd_ida.h"
#include "connection.h"
#include "transport_tcp.h"
+#include "transport_rdma.h"
#define IPC_WAIT_TIMEOUT (2 * HZ)
@@ -303,6 +304,8 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req)
init_smb2_max_trans_size(req->smb2_max_trans);
if (req->smb2_max_credits)
init_smb2_max_credits(req->smb2_max_credits);
+ if (req->smbd_max_io_size)
+ init_smbd_max_io_size(req->smbd_max_io_size);
ret = ksmbd_set_netbios_name(req->netbios_name);
ret |= ksmbd_set_server_string(req->server_string);
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index e646d79554b8..d035e060c2f0 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -80,9 +80,7 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024;
/* The maximum single-message size which can be received */
static int smb_direct_max_receive_size = 8192;
-static int smb_direct_max_read_write_size = 524224;
-
-static int smb_direct_max_outstanding_rw_ops = 8;
+static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE;
static LIST_HEAD(smb_direct_device_list);
static DEFINE_RWLOCK(smb_direct_device_lock);
@@ -147,18 +145,18 @@ struct smb_direct_transport {
atomic_t send_credits;
spinlock_t lock_new_recv_credits;
int new_recv_credits;
- atomic_t rw_avail_ops;
+ int max_rw_credits;
+ int pages_per_rw_credit;
+ atomic_t rw_credits;
wait_queue_head_t wait_send_credits;
- wait_queue_head_t wait_rw_avail_ops;
+ wait_queue_head_t wait_rw_credits;
mempool_t *sendmsg_mempool;
struct kmem_cache *sendmsg_cache;
mempool_t *recvmsg_mempool;
struct kmem_cache *recvmsg_cache;
- wait_queue_head_t wait_send_payload_pending;
- atomic_t send_payload_pending;
wait_queue_head_t wait_send_pending;
atomic_t send_pending;
@@ -208,12 +206,25 @@ struct smb_direct_recvmsg {
struct smb_direct_rdma_rw_msg {
struct smb_direct_transport *t;
struct ib_cqe cqe;
+ int status;
struct completion *completion;
+ struct list_head list;
struct rdma_rw_ctx rw_ctx;
struct sg_table sgt;
struct scatterlist sg_list[];
};
+void init_smbd_max_io_size(unsigned int sz)
+{
+ sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE);
+ smb_direct_max_read_write_size = sz;
+}
+
+unsigned int get_smbd_max_read_write_size(void)
+{
+ return smb_direct_max_read_write_size;
+}
+
static inline int get_buf_page_count(void *buf, int size)
{
return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) -
@@ -377,7 +388,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
t->reassembly_queue_length = 0;
init_waitqueue_head(&t->wait_reassembly_queue);
init_waitqueue_head(&t->wait_send_credits);
- init_waitqueue_head(&t->wait_rw_avail_ops);
+ init_waitqueue_head(&t->wait_rw_credits);
spin_lock_init(&t->receive_credit_lock);
spin_lock_init(&t->recvmsg_queue_lock);
@@ -386,8 +397,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
spin_lock_init(&t->empty_recvmsg_queue_lock);
INIT_LIST_HEAD(&t->empty_recvmsg_queue);
- init_waitqueue_head(&t->wait_send_payload_pending);
- atomic_set(&t->send_payload_pending, 0);
init_waitqueue_head(&t->wait_send_pending);
atomic_set(&t->send_pending, 0);
@@ -417,8 +426,6 @@ static void free_transport(struct smb_direct_transport *t)
wake_up_interruptible(&t->wait_send_credits);
ksmbd_debug(RDMA, "wait for all send posted to IB to finish\n");
- wait_event(t->wait_send_payload_pending,
- atomic_read(&t->send_payload_pending) == 0);
wait_event(t->wait_send_pending,
atomic_read(&t->send_pending) == 0);
@@ -569,6 +576,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
}
t->negotiation_requested = true;
t->full_packet_received = true;
+ t->status = SMB_DIRECT_CS_CONNECTED;
enqueue_reassembly(t, recvmsg, 0);
wake_up_interruptible(&t->wait_status);
break;
@@ -873,13 +881,8 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
smb_direct_disconnect_rdma_connection(t);
}
- if (sendmsg->num_sge > 1) {
- if (atomic_dec_and_test(&t->send_payload_pending))
- wake_up(&t->wait_send_payload_pending);
- } else {
- if (atomic_dec_and_test(&t->send_pending))
- wake_up(&t->wait_send_pending);
- }
+ if (atomic_dec_and_test(&t->send_pending))
+ wake_up(&t->wait_send_pending);
/* iterate and free the list of messages in reverse. the list's head
* is invalid.
@@ -911,21 +914,12 @@ static int smb_direct_post_send(struct smb_direct_transport *t,
{
int ret;
- if (wr->num_sge > 1)
- atomic_inc(&t->send_payload_pending);
- else
- atomic_inc(&t->send_pending);
-
+ atomic_inc(&t->send_pending);
ret = ib_post_send(t->qp, wr, NULL);
if (ret) {
pr_err("failed to post send: %d\n", ret);
- if (wr->num_sge > 1) {
- if (atomic_dec_and_test(&t->send_payload_pending))
- wake_up(&t->wait_send_payload_pending);
- } else {
- if (atomic_dec_and_test(&t->send_pending))
- wake_up(&t->wait_send_pending);
- }
+ if (atomic_dec_and_test(&t->send_pending))
+ wake_up(&t->wait_send_pending);
smb_direct_disconnect_rdma_connection(t);
}
return ret;
@@ -983,18 +977,19 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t,
}
static int wait_for_credits(struct smb_direct_transport *t,
- wait_queue_head_t *waitq, atomic_t *credits)
+ wait_queue_head_t *waitq, atomic_t *total_credits,
+ int needed)
{
int ret;
do {
- if (atomic_dec_return(credits) >= 0)
+ if (atomic_sub_return(needed, total_credits) >= 0)
return 0;
- atomic_inc(credits);
+ atomic_add(needed, total_credits);
ret = wait_event_interruptible(*waitq,
- atomic_read(credits) > 0 ||
- t->status != SMB_DIRECT_CS_CONNECTED);
+ atomic_read(total_credits) >= needed ||
+ t->status != SMB_DIRECT_CS_CONNECTED);
if (t->status != SMB_DIRECT_CS_CONNECTED)
return -ENOTCONN;
@@ -1015,7 +1010,19 @@ static int wait_for_send_credits(struct smb_direct_transport *t,
return ret;
}
- return wait_for_credits(t, &t->wait_send_credits, &t->send_credits);
+ return wait_for_credits(t, &t->wait_send_credits, &t->send_credits, 1);
+}
+
+static int wait_for_rw_credits(struct smb_direct_transport *t, int credits)
+{
+ return wait_for_credits(t, &t->wait_rw_credits, &t->rw_credits, credits);
+}
+
+static int calc_rw_credits(struct smb_direct_transport *t,
+ char *buf, unsigned int len)
+{
+ return DIV_ROUND_UP(get_buf_page_count(buf, len),
+ t->pages_per_rw_credit);
}
static int smb_direct_create_header(struct smb_direct_transport *t,
@@ -1086,7 +1093,7 @@ static int get_sg_list(void *buf, int size, struct scatterlist *sg_list, int nen
int offset, len;
int i = 0;
- if (nentries < get_buf_page_count(buf, size))
+ if (size <= 0 || nentries < get_buf_page_count(buf, size))
return -EINVAL;
offset = offset_in_page(buf);
@@ -1118,7 +1125,7 @@ static int get_mapped_sg_list(struct ib_device *device, void *buf, int size,
int npages;
npages = get_sg_list(buf, size, sg_list, nentries);
- if (npages <= 0)
+ if (npages < 0)
return -EINVAL;
return ib_dma_map_sg(device, sg_list, npages, dir);
}
@@ -1313,11 +1320,21 @@ done:
* that means all the I/Os have been out and we are good to return
*/
- wait_event(st->wait_send_payload_pending,
- atomic_read(&st->send_payload_pending) == 0);
+ wait_event(st->wait_send_pending,
+ atomic_read(&st->send_pending) == 0);
return ret;
}
+static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t,
+ struct smb_direct_rdma_rw_msg *msg,
+ enum dma_data_direction dir)
+{
+ rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
+ msg->sgt.sgl, msg->sgt.nents, dir);
+ sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+ kfree(msg);
+}
+
static void read_write_done(struct ib_cq *cq, struct ib_wc *wc,
enum dma_data_direction dir)
{
@@ -1326,19 +1343,14 @@ static void read_write_done(struct ib_cq *cq, struct ib_wc *wc,
struct smb_direct_transport *t = msg->t;
if (wc->status != IB_WC_SUCCESS) {
+ msg->status = -EIO;
pr_err("read/write error. opcode = %d, status = %s(%d)\n",
wc->opcode, ib_wc_status_msg(wc->status), wc->status);
- smb_direct_disconnect_rdma_connection(t);
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ smb_direct_disconnect_rdma_connection(t);
}
- if (atomic_inc_return(&t->rw_avail_ops) > 0)
- wake_up(&t->wait_rw_avail_ops);
-
- rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
- msg->sg_list, msg->sgt.nents, dir);
- sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
complete(msg->completion);
- kfree(msg);
}
static void read_done(struct ib_cq *cq, struct ib_wc *wc)
@@ -1351,94 +1363,141 @@ static void write_done(struct ib_cq *cq, struct ib_wc *wc)
read_write_done(cq, wc, DMA_TO_DEVICE);
}
-static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf,
- int buf_len, u32 remote_key, u64 remote_offset,
- u32 remote_len, bool is_read)
+static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
+ void *buf, int buf_len,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len,
+ bool is_read)
{
- struct smb_direct_rdma_rw_msg *msg;
- int ret;
+ struct smb_direct_rdma_rw_msg *msg, *next_msg;
+ int i, ret;
DECLARE_COMPLETION_ONSTACK(completion);
- struct ib_send_wr *first_wr = NULL;
+ struct ib_send_wr *first_wr;
+ LIST_HEAD(msg_list);
+ char *desc_buf;
+ int credits_needed;
+ unsigned int desc_buf_len;
+ size_t total_length = 0;
+
+ if (t->status != SMB_DIRECT_CS_CONNECTED)
+ return -ENOTCONN;
- ret = wait_for_credits(t, &t->wait_rw_avail_ops, &t->rw_avail_ops);
+ /* calculate needed credits */
+ credits_needed = 0;
+ desc_buf = buf;
+ for (i = 0; i < desc_len / sizeof(*desc); i++) {
+ desc_buf_len = le32_to_cpu(desc[i].length);
+
+ credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len);
+ desc_buf += desc_buf_len;
+ total_length += desc_buf_len;
+ if (desc_buf_len == 0 || total_length > buf_len ||
+ total_length > t->max_rdma_rw_size)
+ return -EINVAL;
+ }
+
+ ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n",
+ is_read ? "read" : "write", buf_len, credits_needed);
+
+ ret = wait_for_rw_credits(t, credits_needed);
if (ret < 0)
return ret;
- /* TODO: mempool */
- msg = kmalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) +
- sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL);
- if (!msg) {
- atomic_inc(&t->rw_avail_ops);
- return -ENOMEM;
- }
+ /* build rdma_rw_ctx for each descriptor */
+ desc_buf = buf;
+ for (i = 0; i < desc_len / sizeof(*desc); i++) {
+ msg = kzalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) +
+ sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto out;
+ }
- msg->sgt.sgl = &msg->sg_list[0];
- ret = sg_alloc_table_chained(&msg->sgt,
- get_buf_page_count(buf, buf_len),
- msg->sg_list, SG_CHUNK_SIZE);
- if (ret) {
- atomic_inc(&t->rw_avail_ops);
- kfree(msg);
- return -ENOMEM;
- }
+ desc_buf_len = le32_to_cpu(desc[i].length);
- ret = get_sg_list(buf, buf_len, msg->sgt.sgl, msg->sgt.orig_nents);
- if (ret <= 0) {
- pr_err("failed to get pages\n");
- goto err;
- }
+ msg->t = t;
+ msg->cqe.done = is_read ? read_done : write_done;
+ msg->completion = &completion;
- ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port,
- msg->sg_list, get_buf_page_count(buf, buf_len),
- 0, remote_offset, remote_key,
- is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
- if (ret < 0) {
- pr_err("failed to init rdma_rw_ctx: %d\n", ret);
- goto err;
+ msg->sgt.sgl = &msg->sg_list[0];
+ ret = sg_alloc_table_chained(&msg->sgt,
+ get_buf_page_count(desc_buf, desc_buf_len),
+ msg->sg_list, SG_CHUNK_SIZE);
+ if (ret) {
+ kfree(msg);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = get_sg_list(desc_buf, desc_buf_len,
+ msg->sgt.sgl, msg->sgt.orig_nents);
+ if (ret < 0) {
+ sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+ kfree(msg);
+ goto out;
+ }
+
+ ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port,
+ msg->sgt.sgl,
+ get_buf_page_count(desc_buf, desc_buf_len),
+ 0,
+ le64_to_cpu(desc[i].offset),
+ le32_to_cpu(desc[i].token),
+ is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ if (ret < 0) {
+ pr_err("failed to init rdma_rw_ctx: %d\n", ret);
+ sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+ kfree(msg);
+ goto out;
+ }
+
+ list_add_tail(&msg->list, &msg_list);
+ desc_buf += desc_buf_len;
}
- msg->t = t;
- msg->cqe.done = is_read ? read_done : write_done;
- msg->completion = &completion;
- first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port,
- &msg->cqe, NULL);
+ /* concatenate work requests of rdma_rw_ctxs */
+ first_wr = NULL;
+ list_for_each_entry_reverse(msg, &msg_list, list) {
+ first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port,
+ &msg->cqe, first_wr);
+ }
ret = ib_post_send(t->qp, first_wr, NULL);
if (ret) {
- pr_err("failed to post send wr: %d\n", ret);
- goto err;
+ pr_err("failed to post send wr for RDMA R/W: %d\n", ret);
+ goto out;
}
+ msg = list_last_entry(&msg_list, struct smb_direct_rdma_rw_msg, list);
wait_for_completion(&completion);
- return 0;
-
-err:
- atomic_inc(&t->rw_avail_ops);
- if (first_wr)
- rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
- msg->sg_list, msg->sgt.nents,
- is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
- sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
- kfree(msg);
+ ret = msg->status;
+out:
+ list_for_each_entry_safe(msg, next_msg, &msg_list, list) {
+ list_del(&msg->list);
+ smb_direct_free_rdma_rw_msg(t, msg,
+ is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ }
+ atomic_add(credits_needed, &t->rw_credits);
+ wake_up(&t->wait_rw_credits);
return ret;
}
-static int smb_direct_rdma_write(struct ksmbd_transport *t, void *buf,
- unsigned int buflen, u32 remote_key,
- u64 remote_offset, u32 remote_len)
+static int smb_direct_rdma_write(struct ksmbd_transport *t,
+ void *buf, unsigned int buflen,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len)
{
return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen,
- remote_key, remote_offset,
- remote_len, false);
+ desc, desc_len, false);
}
-static int smb_direct_rdma_read(struct ksmbd_transport *t, void *buf,
- unsigned int buflen, u32 remote_key,
- u64 remote_offset, u32 remote_len)
+static int smb_direct_rdma_read(struct ksmbd_transport *t,
+ void *buf, unsigned int buflen,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len)
{
return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen,
- remote_key, remote_offset,
- remote_len, true);
+ desc, desc_len, true);
}
static void smb_direct_disconnect(struct ksmbd_transport *t)
@@ -1638,41 +1697,57 @@ out_err:
return ret;
}
+static unsigned int smb_direct_get_max_fr_pages(struct smb_direct_transport *t)
+{
+ return min_t(unsigned int,
+ t->cm_id->device->attrs.max_fast_reg_page_list_len,
+ 256);
+}
+
static int smb_direct_init_params(struct smb_direct_transport *t,
struct ib_qp_cap *cap)
{
struct ib_device *device = t->cm_id->device;
- int max_send_sges, max_pages, max_rw_wrs, max_send_wrs;
+ int max_send_sges, max_rw_wrs, max_send_wrs;
+ unsigned int max_sge_per_wr, wrs_per_credit;
- /* need 2 more sge. because a SMB_DIRECT header will be mapped,
- * and maybe a send buffer could be not page aligned.
+ /* need 3 more sge. because a SMB_DIRECT header, SMB2 header,
+ * SMB2 response could be mapped.
*/
t->max_send_size = smb_direct_max_send_size;
- max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 2;
+ max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 3;
if (max_send_sges > SMB_DIRECT_MAX_SEND_SGES) {
pr_err("max_send_size %d is too large\n", t->max_send_size);
return -EINVAL;
}
- /*
- * allow smb_direct_max_outstanding_rw_ops of in-flight RDMA
- * read/writes. HCA guarantees at least max_send_sge of sges for
- * a RDMA read/write work request, and if memory registration is used,
- * we need reg_mr, local_inv wrs for each read/write.
+ /* Calculate the number of work requests for RDMA R/W.
+ * The maximum number of pages which can be registered
+ * with one Memory region can be transferred with one
+ * R/W credit. And at least 4 work requests for each credit
+ * are needed for MR registration, RDMA R/W, local & remote
+ * MR invalidation.
*/
t->max_rdma_rw_size = smb_direct_max_read_write_size;
- max_pages = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1;
- max_rw_wrs = DIV_ROUND_UP(max_pages, SMB_DIRECT_MAX_SEND_SGES);
- max_rw_wrs += rdma_rw_mr_factor(device, t->cm_id->port_num,
- max_pages) * 2;
- max_rw_wrs *= smb_direct_max_outstanding_rw_ops;
+ t->pages_per_rw_credit = smb_direct_get_max_fr_pages(t);
+ t->max_rw_credits = DIV_ROUND_UP(t->max_rdma_rw_size,
+ (t->pages_per_rw_credit - 1) *
+ PAGE_SIZE);
+
+ max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge,
+ device->attrs.max_sge_rd);
+ max_sge_per_wr = max_t(unsigned int, max_sge_per_wr,
+ max_send_sges);
+ wrs_per_credit = max_t(unsigned int, 4,
+ DIV_ROUND_UP(t->pages_per_rw_credit,
+ max_sge_per_wr) + 1);
+ max_rw_wrs = t->max_rw_credits * wrs_per_credit;
max_send_wrs = smb_direct_send_credit_target + max_rw_wrs;
if (max_send_wrs > device->attrs.max_cqe ||
max_send_wrs > device->attrs.max_qp_wr) {
- pr_err("consider lowering send_credit_target = %d, or max_outstanding_rw_ops = %d\n",
- smb_direct_send_credit_target,
- smb_direct_max_outstanding_rw_ops);
+ pr_err("consider lowering send_credit_target = %d\n",
+ smb_direct_send_credit_target);
pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
device->attrs.max_cqe, device->attrs.max_qp_wr);
return -EINVAL;
@@ -1687,11 +1762,6 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
return -EINVAL;
}
- if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) {
- pr_err("warning: device max_send_sge = %d too small\n",
- device->attrs.max_send_sge);
- return -EINVAL;
- }
if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) {
pr_err("warning: device max_recv_sge = %d too small\n",
device->attrs.max_recv_sge);
@@ -1707,7 +1777,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
t->send_credit_target = smb_direct_send_credit_target;
atomic_set(&t->send_credits, 0);
- atomic_set(&t->rw_avail_ops, smb_direct_max_outstanding_rw_ops);
+ atomic_set(&t->rw_credits, t->max_rw_credits);
t->max_send_size = smb_direct_max_send_size;
t->max_recv_size = smb_direct_max_receive_size;
@@ -1715,12 +1785,10 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
cap->max_send_wr = max_send_wrs;
cap->max_recv_wr = t->recv_credit_max;
- cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES;
+ cap->max_send_sge = max_sge_per_wr;
cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES;
cap->max_inline_data = 0;
- cap->max_rdma_ctxs =
- rdma_rw_mr_factor(device, t->cm_id->port_num, max_pages) *
- smb_direct_max_outstanding_rw_ops;
+ cap->max_rdma_ctxs = t->max_rw_credits;
return 0;
}
@@ -1813,7 +1881,8 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
}
t->send_cq = ib_alloc_cq(t->cm_id->device, t,
- t->send_credit_target, 0, IB_POLL_WORKQUEUE);
+ smb_direct_send_credit_target + cap->max_rdma_ctxs,
+ 0, IB_POLL_WORKQUEUE);
if (IS_ERR(t->send_cq)) {
pr_err("Can't create RDMA send CQ\n");
ret = PTR_ERR(t->send_cq);
@@ -1822,8 +1891,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
}
t->recv_cq = ib_alloc_cq(t->cm_id->device, t,
- cap->max_send_wr + cap->max_rdma_ctxs,
- 0, IB_POLL_WORKQUEUE);
+ t->recv_credit_max, 0, IB_POLL_WORKQUEUE);
if (IS_ERR(t->recv_cq)) {
pr_err("Can't create RDMA recv CQ\n");
ret = PTR_ERR(t->recv_cq);
@@ -1852,17 +1920,12 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1;
if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) {
- int pages_per_mr, mr_count;
-
- pages_per_mr = min_t(int, pages_per_rw,
- t->cm_id->device->attrs.max_fast_reg_page_list_len);
- mr_count = DIV_ROUND_UP(pages_per_rw, pages_per_mr) *
- atomic_read(&t->rw_avail_ops);
- ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, mr_count,
- IB_MR_TYPE_MEM_REG, pages_per_mr, 0);
+ ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs,
+ t->max_rw_credits, IB_MR_TYPE_MEM_REG,
+ t->pages_per_rw_credit, 0);
if (ret) {
pr_err("failed to init mr pool count %d pages %d\n",
- mr_count, pages_per_mr);
+ t->max_rw_credits, t->pages_per_rw_credit);
goto err;
}
}
diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h
index 5567d93a6f96..77aee4e5c9dc 100644
--- a/fs/ksmbd/transport_rdma.h
+++ b/fs/ksmbd/transport_rdma.h
@@ -7,6 +7,10 @@
#ifndef __KSMBD_TRANSPORT_RDMA_H__
#define __KSMBD_TRANSPORT_RDMA_H__
+#define SMBD_DEFAULT_IOSIZE (8 * 1024 * 1024)
+#define SMBD_MIN_IOSIZE (512 * 1024)
+#define SMBD_MAX_IOSIZE (16 * 1024 * 1024)
+
/* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */
struct smb_direct_negotiate_req {
__le16 min_version;
@@ -52,10 +56,14 @@ struct smb_direct_data_transfer {
int ksmbd_rdma_init(void);
void ksmbd_rdma_destroy(void);
bool ksmbd_rdma_capable_netdev(struct net_device *netdev);
+void init_smbd_max_io_size(unsigned int sz);
+unsigned int get_smbd_max_read_write_size(void);
#else
static inline int ksmbd_rdma_init(void) { return 0; }
static inline int ksmbd_rdma_destroy(void) { return 0; }
static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; }
+static inline void init_smbd_max_io_size(unsigned int sz) { }
+static inline unsigned int get_smbd_max_read_write_size(void) { return 0; }
#endif
#endif /* __KSMBD_TRANSPORT_RDMA_H__ */
diff --git a/fs/locks.c b/fs/locks.c
index 8c6df10cd9ed..ca28e0e50e56 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -300,6 +300,34 @@ void locks_release_private(struct file_lock *fl)
}
EXPORT_SYMBOL_GPL(locks_release_private);
+/**
+ * locks_owner_has_blockers - Check for blocking lock requests
+ * @flctx: file lock context
+ * @owner: lock owner
+ *
+ * Return values:
+ * %true: @owner has at least one blocker
+ * %false: @owner has no blockers
+ */
+bool locks_owner_has_blockers(struct file_lock_context *flctx,
+ fl_owner_t owner)
+{
+ struct file_lock *fl;
+
+ spin_lock(&flctx->flc_lock);
+ list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
+ if (fl->fl_owner != owner)
+ continue;
+ if (!list_empty(&fl->fl_blocked_requests)) {
+ spin_unlock(&flctx->flc_lock);
+ return true;
+ }
+ }
+ spin_unlock(&flctx->flc_lock);
+ return false;
+}
+EXPORT_SYMBOL_GPL(locks_owner_has_blockers);
+
/* Free a lock which is not in use. */
void locks_free_lock(struct file_lock *fl)
{
@@ -874,6 +902,8 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
struct file_lock *cfl;
struct file_lock_context *ctx;
struct inode *inode = locks_inode(filp);
+ void *owner;
+ void (*func)(void);
ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx || list_empty_careful(&ctx->flc_posix)) {
@@ -881,12 +911,23 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
return;
}
+retry:
spin_lock(&ctx->flc_lock);
list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
- if (posix_locks_conflict(fl, cfl)) {
- locks_copy_conflock(fl, cfl);
- goto out;
+ if (!posix_locks_conflict(fl, cfl))
+ continue;
+ if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable
+ && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) {
+ owner = cfl->fl_lmops->lm_mod_owner;
+ func = cfl->fl_lmops->lm_expire_lock;
+ __module_get(owner);
+ spin_unlock(&ctx->flc_lock);
+ (*func)();
+ module_put(owner);
+ goto retry;
}
+ locks_copy_conflock(fl, cfl);
+ goto out;
}
fl->fl_type = F_UNLCK;
out:
@@ -1060,6 +1101,8 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
int error;
bool added = false;
LIST_HEAD(dispose);
+ void *owner;
+ void (*func)(void);
ctx = locks_get_lock_context(inode, request->fl_type);
if (!ctx)
@@ -1078,6 +1121,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
new_fl2 = locks_alloc_lock();
}
+retry:
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
/*
@@ -1089,6 +1133,17 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
if (!posix_locks_conflict(request, fl))
continue;
+ if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable
+ && (*fl->fl_lmops->lm_lock_expirable)(fl)) {
+ owner = fl->fl_lmops->lm_mod_owner;
+ func = fl->fl_lmops->lm_expire_lock;
+ __module_get(owner);
+ spin_unlock(&ctx->flc_lock);
+ percpu_up_read(&file_rwsem);
+ (*func)();
+ module_put(owner);
+ goto retry;
+ }
if (conflock)
locks_copy_conflock(conflock, fl);
error = -EAGAIN;
diff --git a/fs/namei.c b/fs/namei.c
index 896ade8b7400..1f28d3f463c3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -730,13 +730,6 @@ static bool legitimize_links(struct nameidata *nd)
static bool legitimize_root(struct nameidata *nd)
{
- /*
- * For scoped-lookups (where nd->root has been zeroed), we need to
- * restart the whole lookup from scratch -- because set_root() is wrong
- * for these lookups (nd->dfd is the root, not the filesystem root).
- */
- if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED))
- return false;
/* Nothing to do if nd->root is zero or is managed by the VFS user. */
if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
return true;
@@ -798,7 +791,7 @@ out:
* @seq: seq number to check @dentry against
* Returns: true on success, false on failure
*
- * Similar to to try_to_unlazy(), but here we have the next dentry already
+ * Similar to try_to_unlazy(), but here we have the next dentry already
* picked by rcu-walk and want to legitimize that in addition to the current
* nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context.
* Nothing should touch nameidata between try_to_unlazy_next() failure and
@@ -1032,7 +1025,7 @@ static struct ctl_table namei_sysctls[] = {
.procname = "protected_symlinks",
.data = &sysctl_protected_symlinks,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
@@ -1041,7 +1034,7 @@ static struct ctl_table namei_sysctls[] = {
.procname = "protected_hardlinks",
.data = &sysctl_protected_hardlinks,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
@@ -1050,7 +1043,7 @@ static struct ctl_table namei_sysctls[] = {
.procname = "protected_fifos",
.data = &sysctl_protected_fifos,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
@@ -1059,7 +1052,7 @@ static struct ctl_table namei_sysctls[] = {
.procname = "protected_regular",
.data = &sysctl_protected_regular,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
@@ -1755,7 +1748,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
// unlazy even if we fail to grab the link - cleanup needs it
bool grabbed_link = legitimize_path(nd, link, seq);
- if (!try_to_unlazy(nd) != 0 || !grabbed_link)
+ if (!try_to_unlazy(nd) || !grabbed_link)
return -ECHILD;
if (nd_alloc_stack(nd))
@@ -2769,7 +2762,8 @@ struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name,
EXPORT_SYMBOL(lookup_one);
/**
- * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * lookup_one_unlocked - filesystem helper to lookup single pathname component
+ * @mnt_userns: idmapping of the mount the lookup is performed from
* @name: pathname component to lookup
* @base: base directory to lookup from
* @len: maximum length @len should be interpreted to
@@ -2780,14 +2774,15 @@ EXPORT_SYMBOL(lookup_one);
* Unlike lookup_one_len, it should be called without the parent
* i_mutex held, and will take the i_mutex itself if necessary.
*/
-struct dentry *lookup_one_len_unlocked(const char *name,
- struct dentry *base, int len)
+struct dentry *lookup_one_unlocked(struct user_namespace *mnt_userns,
+ const char *name, struct dentry *base,
+ int len)
{
struct qstr this;
int err;
struct dentry *ret;
- err = lookup_one_common(&init_user_ns, name, base, len, &this);
+ err = lookup_one_common(mnt_userns, name, base, len, &this);
if (err)
return ERR_PTR(err);
@@ -2796,6 +2791,59 @@ struct dentry *lookup_one_len_unlocked(const char *name,
ret = lookup_slow(&this, base, 0);
return ret;
}
+EXPORT_SYMBOL(lookup_one_unlocked);
+
+/**
+ * lookup_one_positive_unlocked - filesystem helper to lookup single
+ * pathname component
+ * @mnt_userns: idmapping of the mount the lookup is performed from
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length @len should be interpreted to
+ *
+ * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
+ * known positive or ERR_PTR(). This is what most of the users want.
+ *
+ * Note that pinned negative with unlocked parent _can_ become positive at any
+ * time, so callers of lookup_one_unlocked() need to be very careful; pinned
+ * positives have >d_inode stable, so this one avoids such problems.
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * The helper should be called without i_mutex held.
+ */
+struct dentry *lookup_one_positive_unlocked(struct user_namespace *mnt_userns,
+ const char *name,
+ struct dentry *base, int len)
+{
+ struct dentry *ret = lookup_one_unlocked(mnt_userns, name, base, len);
+
+ if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
+ dput(ret);
+ ret = ERR_PTR(-ENOENT);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(lookup_one_positive_unlocked);
+
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+ struct dentry *base, int len)
+{
+ return lookup_one_unlocked(&init_user_ns, name, base, len);
+}
EXPORT_SYMBOL(lookup_one_len_unlocked);
/*
@@ -2809,12 +2857,7 @@ EXPORT_SYMBOL(lookup_one_len_unlocked);
struct dentry *lookup_positive_unlocked(const char *name,
struct dentry *base, int len)
{
- struct dentry *ret = lookup_one_len_unlocked(name, base, len);
- if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
- dput(ret);
- ret = ERR_PTR(-ENOENT);
- }
- return ret;
+ return lookup_one_positive_unlocked(&init_user_ns, name, base, len);
}
EXPORT_SYMBOL(lookup_positive_unlocked);
diff --git a/fs/namespace.c b/fs/namespace.c
index 41461f55c039..e6a7e769d25d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1760,7 +1760,7 @@ out_unlock:
/*
* Is the caller allowed to modify his namespace?
*/
-static inline bool may_mount(void)
+bool may_mount(void)
{
return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
}
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 8742d22dfd2b..42f892c5712e 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -155,7 +155,7 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq,
void netfs_readahead(struct readahead_control *ractl)
{
struct netfs_io_request *rreq;
- struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host);
+ struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
int ret;
_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
@@ -215,7 +215,7 @@ int netfs_read_folio(struct file *file, struct folio *folio)
{
struct address_space *mapping = folio_file_mapping(folio);
struct netfs_io_request *rreq;
- struct netfs_i_context *ctx = netfs_i_context(mapping->host);
+ struct netfs_inode *ctx = netfs_inode(mapping->host);
int ret;
_enter("%lx", folio_index(folio));
@@ -297,6 +297,7 @@ zero_out:
/**
* netfs_write_begin - Helper to prepare for writing
+ * @ctx: The netfs context
* @file: The file to read from
* @mapping: The mapping to read from
* @pos: File position at which the write will begin
@@ -326,12 +327,12 @@ zero_out:
*
* This is usable whether or not caching is enabled.
*/
-int netfs_write_begin(struct file *file, struct address_space *mapping,
+int netfs_write_begin(struct netfs_inode *ctx,
+ struct file *file, struct address_space *mapping,
loff_t pos, unsigned int len, struct folio **_folio,
void **_fsdata)
{
struct netfs_io_request *rreq;
- struct netfs_i_context *ctx = netfs_i_context(file_inode(file ));
struct folio *folio;
unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
pgoff_t index = pos >> PAGE_SHIFT;
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index b7b0e3d18d9e..43fac1b14e40 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -91,7 +91,7 @@ static inline void netfs_stat_d(atomic_t *stat)
/*
* Miscellaneous functions.
*/
-static inline bool netfs_is_cache_enabled(struct netfs_i_context *ctx)
+static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
{
#if IS_ENABLED(CONFIG_FSCACHE)
struct fscache_cookie *cookie = ctx->cache;
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index e86107b30ba4..e17cdf53f6a7 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -18,7 +18,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
{
static atomic_t debug_ids;
struct inode *inode = file ? file_inode(file) : mapping->host;
- struct netfs_i_context *ctx = netfs_i_context(inode);
+ struct netfs_inode *ctx = netfs_inode(inode);
struct netfs_io_request *rreq;
int ret;
@@ -75,10 +75,10 @@ static void netfs_free_request(struct work_struct *work)
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
- netfs_clear_subrequests(rreq, false);
- if (rreq->netfs_priv)
- rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+ netfs_clear_subrequests(rreq, false);
+ if (rreq->netfs_ops->free_request)
+ rreq->netfs_ops->free_request(rreq);
if (rreq->cache_resources.ops)
rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
kfree(rreq);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c8520284dda7..c1eda73254e1 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -288,6 +288,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
rv = NFS4_OK;
break;
case -ENOENT:
+ set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
/* Embrace your forgetfulness! */
rv = NFS4ERR_NOMATCHING_LAYOUT;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a8ecdd527662..0c4e8dd6aa96 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2124,6 +2124,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
}
goto out;
}
+ file->f_mode |= FMODE_CAN_ODIRECT;
err = nfs_finish_open(ctx, ctx->dentry, file, open_flags);
trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 11c566d8769f..4eb2a8380a28 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -153,28 +153,25 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq,
}
/**
- * nfs_direct_IO - NFS address space operation for direct I/O
+ * nfs_swap_rw - NFS address space operation for swap I/O
* @iocb: target I/O control block
* @iter: I/O buffer
*
- * The presence of this routine in the address space ops vector means
- * the NFS client supports direct I/O. However, for most direct IO, we
- * shunt off direct read and write requests before the VFS gets them,
- * so this method is only ever called for swap.
+ * Perform IO to the swap-file. This is much like direct IO.
*/
-ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
- struct inode *inode = iocb->ki_filp->f_mapping->host;
-
- /* we only support swap file calling nfs_direct_IO */
- if (!IS_SWAPFILE(inode))
- return 0;
+ ssize_t ret;
VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
if (iov_iter_rw(iter) == READ)
- return nfs_file_direct_read(iocb, iter, true);
- return nfs_file_direct_write(iocb, iter, true);
+ ret = nfs_file_direct_read(iocb, iter, true);
+ else
+ ret = nfs_file_direct_write(iocb, iter, true);
+ if (ret < 0)
+ return ret;
+ return 0;
}
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d764b3ce7905..2d72b1b7ed74 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -69,6 +69,8 @@ nfs_file_open(struct inode *inode, struct file *filp)
return res;
res = nfs_open(inode, filp);
+ if (res == 0)
+ filp->f_mode |= FMODE_CAN_ODIRECT;
return res;
}
@@ -204,15 +206,16 @@ static int
nfs_file_fsync_commit(struct file *file, int datasync)
{
struct inode *inode = file_inode(file);
- int ret;
+ int ret, ret2;
dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
ret = nfs_commit_inode(inode, FLUSH_SYNC);
- if (ret < 0)
- return ret;
- return file_check_and_advance_wb_err(file);
+ ret2 = file_check_and_advance_wb_err(file);
+ if (ret2 < 0)
+ return ret2;
+ return ret;
}
int
@@ -385,11 +388,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
return status;
NFS_I(mapping->host)->write_io += copied;
- if (nfs_ctx_key_to_expire(ctx, mapping->host)) {
- status = nfs_wb_all(mapping->host);
- if (status < 0)
- return status;
- }
+ if (nfs_ctx_key_to_expire(ctx, mapping->host))
+ nfs_wb_all(mapping->host);
return copied;
}
@@ -480,6 +480,7 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
{
unsigned long blocks;
long long isize;
+ int ret;
struct inode *inode = file_inode(file);
struct rpc_clnt *clnt = NFS_CLIENT(inode);
struct nfs_client *cl = NFS_SERVER(inode)->nfs_client;
@@ -493,13 +494,22 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
return -EINVAL;
}
- *span = sis->pages;
+ ret = rpc_clnt_swap_activate(clnt);
+ if (ret)
+ return ret;
+ ret = add_swap_extent(sis, 0, sis->max, 0);
+ if (ret < 0) {
+ rpc_clnt_swap_deactivate(clnt);
+ return ret;
+ }
+ *span = sis->pages;
if (cl->rpc_ops->enable_swap)
cl->rpc_ops->enable_swap(inode);
- return rpc_clnt_swap_activate(clnt);
+ sis->flags |= SWP_FS_OPS;
+ return ret;
}
static void nfs_swap_deactivate(struct file *file)
@@ -523,7 +533,6 @@ const struct address_space_operations nfs_file_aops = {
.write_end = nfs_write_end,
.invalidate_folio = nfs_invalidate_folio,
.release_folio = nfs_release_folio,
- .direct_IO = nfs_direct_IO,
#ifdef CONFIG_MIGRATION
.migratepage = nfs_migrate_page,
#endif
@@ -532,6 +541,7 @@ const struct address_space_operations nfs_file_aops = {
.error_remove_page = generic_error_remove_page,
.swap_activate = nfs_swap_activate,
.swap_deactivate = nfs_swap_deactivate,
+ .swap_rw = nfs_swap_rw,
};
/*
@@ -594,18 +604,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
.page_mkwrite = nfs_vm_page_mkwrite,
};
-static int nfs_need_check_write(struct file *filp, struct inode *inode,
- int error)
-{
- struct nfs_open_context *ctx;
-
- ctx = nfs_file_open_context(filp);
- if (nfs_error_is_fatal_on_server(error) ||
- nfs_ctx_key_to_expire(ctx, inode))
- return 1;
- return 0;
-}
-
ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
@@ -633,7 +631,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_APPEND || iocb->ki_pos > i_size_read(inode)) {
result = nfs_revalidate_file_size(inode, file);
if (result)
- goto out;
+ return result;
}
nfs_clear_invalid_mapping(file->f_mapping);
@@ -652,6 +650,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
written = result;
iocb->ki_pos += written;
+ nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
if (mntflags & NFS_MOUNT_WRITE_EAGER) {
result = filemap_fdatawrite_range(file->f_mapping,
@@ -669,17 +668,22 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
}
result = generic_write_sync(iocb, written);
if (result < 0)
- goto out;
+ return result;
+out:
/* Return error values */
error = filemap_check_wb_err(file->f_mapping, since);
- if (nfs_need_check_write(file, inode, error)) {
- int err = nfs_wb_all(inode);
- if (err < 0)
- result = err;
+ switch (error) {
+ default:
+ break;
+ case -EDQUOT:
+ case -EFBIG:
+ case -ENOSPC:
+ nfs_wb_all(inode);
+ error = file_check_and_advance_wb_err(file);
+ if (error < 0)
+ result = error;
}
- nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
-out:
return result;
out_swapfile:
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 76deddab0a8f..2b2661582bbe 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -839,7 +839,12 @@ fl_pnfs_update_layout(struct inode *ino,
lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode,
gfp_flags);
- if (IS_ERR_OR_NULL(lseg))
+ if (IS_ERR(lseg)) {
+ /* Fall back to MDS on recoverable errors */
+ if (!nfs_error_is_fatal_on_server(PTR_ERR(lseg)))
+ lseg = NULL;
+ goto out;
+ } else if (!lseg)
goto out;
lo = NFS_I(ino)->layout;
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index f73c09a9cf0a..e861d7bae305 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -231,11 +231,10 @@ void nfs_fscache_release_file(struct inode *inode, struct file *filp)
{
struct nfs_fscache_inode_auxdata auxdata;
struct fscache_cookie *cookie = nfs_i_fscache(inode);
+ loff_t i_size = i_size_read(inode);
- if (fscache_cookie_valid(cookie)) {
- nfs_fscache_update_auxdata(&auxdata, inode);
- fscache_unuse_cookie(cookie, &auxdata, NULL);
- }
+ nfs_fscache_update_auxdata(&auxdata, inode);
+ fscache_unuse_cookie(cookie, &auxdata, &i_size);
}
/*
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7eefa16ed381..8f8cd6e2d4db 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -841,6 +841,7 @@ static inline bool nfs_error_is_fatal_on_server(int err)
case 0:
case -ERESTARTSYS:
case -EINTR:
+ case -ENOMEM:
return false;
}
return nfs_error_is_fatal(err);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 7b861e4f0533..e88f6b18445e 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -93,6 +93,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
nfs_file_set_open_context(filp, ctx);
nfs_fscache_open_file(inode, filp);
err = 0;
+ filp->f_mode |= FMODE_CAN_ODIRECT;
out_put_ctx:
put_nfs_open_context(ctx);
@@ -328,7 +329,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
char *read_name = NULL;
int len, status = 0;
- server = NFS_SERVER(ss_mnt->mnt_root->d_inode);
+ server = NFS_SB(ss_mnt->mnt_sb);
if (!fattr)
return ERR_PTR(-ENOMEM);
@@ -346,7 +347,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
goto out;
snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++);
- r_ino = nfs_fhget(ss_mnt->mnt_root->d_inode->i_sb, src_fh, fattr);
+ r_ino = nfs_fhget(ss_mnt->mnt_sb, src_fh, fattr);
if (IS_ERR(r_ino)) {
res = ERR_CAST(r_ino);
goto out_free_name;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 3680c8da510c..f2dbf904c598 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -417,6 +417,9 @@ static int nfs_do_refmount(struct fs_context *fc, struct rpc_clnt *client)
fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
if (!fs_locations)
goto out_free;
+ fs_locations->fattr = nfs_alloc_fattr();
+ if (!fs_locations->fattr)
+ goto out_free_2;
/* Get locations */
dentry = ctx->clone_data.dentry;
@@ -427,14 +430,16 @@ static int nfs_do_refmount(struct fs_context *fc, struct rpc_clnt *client)
err = nfs4_proc_fs_locations(client, d_inode(parent), &dentry->d_name, fs_locations, page);
dput(parent);
if (err != 0)
- goto out_free_2;
+ goto out_free_3;
err = -ENOENT;
if (fs_locations->nlocations <= 0 ||
fs_locations->fs_path.ncomponents <= 0)
- goto out_free_2;
+ goto out_free_3;
err = nfs_follow_referral(fc, fs_locations);
+out_free_3:
+ kfree(fs_locations->fattr);
out_free_2:
kfree(fs_locations);
out_free:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a79f66432bd3..c0fdcf8c0032 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1162,7 +1162,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
{
unsigned short task_flags = 0;
- if (server->nfs_client->cl_minorversion)
+ if (server->caps & NFS_CAP_MOVEABLE)
task_flags = RPC_TASK_MOVEABLE;
return nfs4_do_call_sync(clnt, server, msg, args, res, task_flags);
}
@@ -2568,7 +2568,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data,
};
int status;
- if (server->nfs_client->cl_minorversion)
+ if (nfs_server_capable(dir, NFS_CAP_MOVEABLE))
task_setup_data.flags |= RPC_TASK_MOVEABLE;
kref_get(&data->kref);
@@ -3098,6 +3098,10 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
}
out:
+ if (opendata->lgp) {
+ nfs4_lgopen_release(opendata->lgp);
+ opendata->lgp = NULL;
+ }
if (!opendata->cancelled)
nfs4_sequence_free_slot(&opendata->o_res.seq_res);
return ret;
@@ -3733,7 +3737,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
};
int status = -ENOMEM;
- if (server->nfs_client->cl_minorversion)
+ if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE))
task_setup_data.flags |= RPC_TASK_MOVEABLE;
nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP,
@@ -4243,6 +4247,8 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir,
if (locations == NULL)
goto out;
+ locations->fattr = fattr;
+
status = nfs4_proc_fs_locations(client, dir, name, locations, page);
if (status != 0)
goto out;
@@ -4252,17 +4258,14 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir,
* referral. Cause us to drop into the exception handler, which
* will kick off migration recovery.
*/
- if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
+ if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &fattr->fsid)) {
dprintk("%s: server did not return a different fsid for"
" a referral at %s\n", __func__, name->name);
status = -NFS4ERR_MOVED;
goto out;
}
/* Fixup attributes for the nfs_lookup() call to nfs_fhget() */
- nfs_fixup_referral_attributes(&locations->fattr);
-
- /* replace the lookup nfs_fattr with the locations nfs_fattr */
- memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr));
+ nfs_fixup_referral_attributes(fattr);
memset(fhandle, 0, sizeof(struct nfs_fh));
out:
if (page)
@@ -4404,7 +4407,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
};
unsigned short task_flags = 0;
- if (server->nfs_client->cl_minorversion)
+ if (nfs_server_capable(dir, NFS_CAP_MOVEABLE))
task_flags = RPC_TASK_MOVEABLE;
/* Is this is an attribute revalidation, subject to softreval? */
@@ -5768,9 +5771,17 @@ static int nfs4_proc_renew(struct nfs_client *clp, const struct cred *cred)
return 0;
}
-static inline int nfs4_server_supports_acls(struct nfs_server *server)
+static bool nfs4_server_supports_acls(const struct nfs_server *server,
+ enum nfs4_acl_type type)
{
- return server->caps & NFS_CAP_ACLS;
+ switch (type) {
+ default:
+ return server->attr_bitmask[0] & FATTR4_WORD0_ACL;
+ case NFS4ACL_DACL:
+ return server->attr_bitmask[1] & FATTR4_WORD1_DACL;
+ case NFS4ACL_SACL:
+ return server->attr_bitmask[1] & FATTR4_WORD1_SACL;
+ }
}
/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that
@@ -5809,6 +5820,7 @@ unwind:
}
struct nfs4_cached_acl {
+ enum nfs4_acl_type type;
int cached;
size_t len;
char data[];
@@ -5829,7 +5841,8 @@ static void nfs4_zap_acl_attr(struct inode *inode)
nfs4_set_cached_acl(inode, NULL);
}
-static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_t buflen)
+static ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf,
+ size_t buflen, enum nfs4_acl_type type)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs4_cached_acl *acl;
@@ -5839,6 +5852,8 @@ static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_
acl = nfsi->nfs4_acl;
if (acl == NULL)
goto out;
+ if (acl->type != type)
+ goto out;
if (buf == NULL) /* user is just asking for length */
goto out_len;
if (acl->cached == 0)
@@ -5854,7 +5869,9 @@ out:
return ret;
}
-static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size_t pgbase, size_t acl_len)
+static void nfs4_write_cached_acl(struct inode *inode, struct page **pages,
+ size_t pgbase, size_t acl_len,
+ enum nfs4_acl_type type)
{
struct nfs4_cached_acl *acl;
size_t buflen = sizeof(*acl) + acl_len;
@@ -5871,6 +5888,7 @@ static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size
goto out;
acl->cached = 0;
}
+ acl->type = type;
acl->len = acl_len;
out:
nfs4_set_cached_acl(inode, acl);
@@ -5886,14 +5904,17 @@ out:
* length. The next getxattr call will then produce another round trip to
* the server, this time with the input buf of the required size.
*/
-static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
+static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf,
+ size_t buflen, enum nfs4_acl_type type)
{
struct page **pages;
struct nfs_getaclargs args = {
.fh = NFS_FH(inode),
+ .acl_type = type,
.acl_len = buflen,
};
struct nfs_getaclres res = {
+ .acl_type = type,
.acl_len = buflen,
};
struct rpc_message msg = {
@@ -5943,7 +5964,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
ret = -ERANGE;
goto out_free;
}
- nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len);
+ nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len,
+ type);
if (buf) {
if (res.acl_len > buflen) {
ret = -ERANGE;
@@ -5963,14 +5985,15 @@ out_free:
return ret;
}
-static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
+static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf,
+ size_t buflen, enum nfs4_acl_type type)
{
struct nfs4_exception exception = {
.interruptible = true,
};
ssize_t ret;
do {
- ret = __nfs4_get_acl_uncached(inode, buf, buflen);
+ ret = __nfs4_get_acl_uncached(inode, buf, buflen, type);
trace_nfs4_get_acl(inode, ret);
if (ret >= 0)
break;
@@ -5979,34 +6002,37 @@ static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bufl
return ret;
}
-static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
+static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen,
+ enum nfs4_acl_type type)
{
struct nfs_server *server = NFS_SERVER(inode);
int ret;
- if (!nfs4_server_supports_acls(server))
+ if (!nfs4_server_supports_acls(server, type))
return -EOPNOTSUPP;
ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
if (ret < 0)
return ret;
if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
nfs_zap_acl_cache(inode);
- ret = nfs4_read_cached_acl(inode, buf, buflen);
+ ret = nfs4_read_cached_acl(inode, buf, buflen, type);
if (ret != -ENOENT)
/* -ENOENT is returned if there is no ACL or if there is an ACL
* but no cached acl data, just the acl length */
return ret;
- return nfs4_get_acl_uncached(inode, buf, buflen);
+ return nfs4_get_acl_uncached(inode, buf, buflen, type);
}
-static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen)
+static int __nfs4_proc_set_acl(struct inode *inode, const void *buf,
+ size_t buflen, enum nfs4_acl_type type)
{
struct nfs_server *server = NFS_SERVER(inode);
struct page *pages[NFS4ACL_MAXPAGES];
struct nfs_setaclargs arg = {
- .fh = NFS_FH(inode),
- .acl_pages = pages,
- .acl_len = buflen,
+ .fh = NFS_FH(inode),
+ .acl_type = type,
+ .acl_len = buflen,
+ .acl_pages = pages,
};
struct nfs_setaclres res;
struct rpc_message msg = {
@@ -6020,7 +6046,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
/* You can't remove system.nfs4_acl: */
if (buflen == 0)
return -EINVAL;
- if (!nfs4_server_supports_acls(server))
+ if (!nfs4_server_supports_acls(server, type))
return -EOPNOTSUPP;
if (npages > ARRAY_SIZE(pages))
return -ERANGE;
@@ -6051,12 +6077,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
return ret;
}
-static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen)
+static int nfs4_proc_set_acl(struct inode *inode, const void *buf,
+ size_t buflen, enum nfs4_acl_type type)
{
struct nfs4_exception exception = { };
int err;
do {
- err = __nfs4_proc_set_acl(inode, buf, buflen);
+ err = __nfs4_proc_set_acl(inode, buf, buflen, type);
trace_nfs4_set_acl(inode, err);
if (err == -NFS4ERR_BADOWNER || err == -NFS4ERR_BADNAME) {
/*
@@ -6612,10 +6639,13 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
.rpc_client = server->client,
.rpc_message = &msg,
.callback_ops = &nfs4_delegreturn_ops,
- .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
};
int status = 0;
+ if (nfs_server_capable(inode, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (data == NULL)
return -ENOMEM;
@@ -6929,10 +6959,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
.workqueue = nfsiod_workqueue,
.flags = RPC_TASK_ASYNC,
};
- struct nfs_client *client =
- NFS_SERVER(lsp->ls_state->inode)->nfs_client;
- if (client->cl_minorversion)
+ if (nfs_server_capable(lsp->ls_state->inode, NFS_CAP_MOVEABLE))
task_setup_data.flags |= RPC_TASK_MOVEABLE;
nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client,
@@ -7203,9 +7231,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
.flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
int ret;
- struct nfs_client *client = NFS_SERVER(state->inode)->nfs_client;
- if (client->cl_minorversion)
+ if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE))
task_setup_data.flags |= RPC_TASK_MOVEABLE;
data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
@@ -7655,21 +7682,70 @@ static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
const char *key, const void *buf,
size_t buflen, int flags)
{
- return nfs4_proc_set_acl(inode, buf, buflen);
+ return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_ACL);
}
static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
struct dentry *unused, struct inode *inode,
const char *key, void *buf, size_t buflen)
{
- return nfs4_proc_get_acl(inode, buf, buflen);
+ return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_ACL);
}
static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
{
- return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry)));
+ return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_ACL);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+#define XATTR_NAME_NFSV4_DACL "system.nfs4_dacl"
+
+static int nfs4_xattr_set_nfs4_dacl(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
+{
+ return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_DACL);
+}
+
+static int nfs4_xattr_get_nfs4_dacl(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, void *buf, size_t buflen)
+{
+ return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_DACL);
+}
+
+static bool nfs4_xattr_list_nfs4_dacl(struct dentry *dentry)
+{
+ return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_DACL);
+}
+
+#define XATTR_NAME_NFSV4_SACL "system.nfs4_sacl"
+
+static int nfs4_xattr_set_nfs4_sacl(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
+{
+ return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_SACL);
+}
+
+static int nfs4_xattr_get_nfs4_sacl(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, void *buf, size_t buflen)
+{
+ return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_SACL);
}
+static bool nfs4_xattr_list_nfs4_sacl(struct dentry *dentry)
+{
+ return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_SACL);
+}
+
+#endif
+
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
@@ -7902,7 +7978,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
else
bitmask[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
- nfs_fattr_init(&fs_locations->fattr);
+ nfs_fattr_init(fs_locations->fattr);
fs_locations->server = server;
fs_locations->nlocations = 0;
status = nfs4_call_sync(client, server, &msg, &args.seq_args, &res.seq_res, 0);
@@ -7967,7 +8043,7 @@ static int _nfs40_proc_get_locations(struct nfs_server *server,
unsigned long now = jiffies;
int status;
- nfs_fattr_init(&locations->fattr);
+ nfs_fattr_init(locations->fattr);
locations->server = server;
locations->nlocations = 0;
@@ -8032,7 +8108,7 @@ static int _nfs41_proc_get_locations(struct nfs_server *server,
};
int status;
- nfs_fattr_init(&locations->fattr);
+ nfs_fattr_init(locations->fattr);
locations->server = server;
locations->nlocations = 0;
@@ -10391,7 +10467,8 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
| NFS_CAP_POSIX_LOCK
| NFS_CAP_STATEID_NFSV41
| NFS_CAP_ATOMIC_OPEN_V1
- | NFS_CAP_LGOPEN,
+ | NFS_CAP_LGOPEN
+ | NFS_CAP_MOVEABLE,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
@@ -10426,7 +10503,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_LAYOUTSTATS
| NFS_CAP_CLONE
| NFS_CAP_LAYOUTERROR
- | NFS_CAP_READ_PLUS,
+ | NFS_CAP_READ_PLUS
+ | NFS_CAP_MOVEABLE,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
@@ -10587,6 +10665,22 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
.set = nfs4_xattr_set_nfs4_acl,
};
+#if defined(CONFIG_NFS_V4_1)
+static const struct xattr_handler nfs4_xattr_nfs4_dacl_handler = {
+ .name = XATTR_NAME_NFSV4_DACL,
+ .list = nfs4_xattr_list_nfs4_dacl,
+ .get = nfs4_xattr_get_nfs4_dacl,
+ .set = nfs4_xattr_set_nfs4_dacl,
+};
+
+static const struct xattr_handler nfs4_xattr_nfs4_sacl_handler = {
+ .name = XATTR_NAME_NFSV4_SACL,
+ .list = nfs4_xattr_list_nfs4_sacl,
+ .get = nfs4_xattr_get_nfs4_sacl,
+ .set = nfs4_xattr_set_nfs4_sacl,
+};
+#endif
+
#ifdef CONFIG_NFS_V4_2
static const struct xattr_handler nfs4_xattr_nfs4_user_handler = {
.prefix = XATTR_USER_PREFIX,
@@ -10597,6 +10691,10 @@ static const struct xattr_handler nfs4_xattr_nfs4_user_handler = {
const struct xattr_handler *nfs4_xattr_handlers[] = {
&nfs4_xattr_nfs4_acl_handler,
+#if defined(CONFIG_NFS_V4_1)
+ &nfs4_xattr_nfs4_dacl_handler,
+ &nfs4_xattr_nfs4_sacl_handler,
+#endif
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
&nfs4_xattr_nfs4_label_handler,
#endif
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 9e1c987c81e7..2540b35ec187 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1602,7 +1602,8 @@ static inline void nfs42_complete_copies(struct nfs4_state_owner *sp,
#endif /* CONFIG_NFS_V4_2 */
static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_state *state,
- const struct nfs4_state_recovery_ops *ops)
+ const struct nfs4_state_recovery_ops *ops,
+ int *lost_locks)
{
struct nfs4_lock_state *lock;
int status;
@@ -1620,7 +1621,7 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st
list_for_each_entry(lock, &state->lock_states, ls_locks) {
trace_nfs4_state_lock_reclaim(state, lock);
if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
- pr_warn_ratelimited("NFS: %s: Lock reclaim failed!\n", __func__);
+ *lost_locks += 1;
}
spin_unlock(&state->state_lock);
}
@@ -1630,7 +1631,9 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st
return status;
}
-static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops)
+static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp,
+ const struct nfs4_state_recovery_ops *ops,
+ int *lost_locks)
{
struct nfs4_state *state;
unsigned int loop = 0;
@@ -1666,7 +1669,7 @@ restart:
#endif /* CONFIG_NFS_V4_2 */
refcount_inc(&state->count);
spin_unlock(&sp->so_lock);
- status = __nfs4_reclaim_open_state(sp, state, ops);
+ status = __nfs4_reclaim_open_state(sp, state, ops, lost_locks);
switch (status) {
default:
@@ -1909,6 +1912,7 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov
struct rb_node *pos;
LIST_HEAD(freeme);
int status = 0;
+ int lost_locks = 0;
restart:
rcu_read_lock();
@@ -1928,8 +1932,11 @@ restart:
spin_unlock(&clp->cl_lock);
rcu_read_unlock();
- status = nfs4_reclaim_open_state(sp, ops);
+ status = nfs4_reclaim_open_state(sp, ops, &lost_locks);
if (status < 0) {
+ if (lost_locks)
+ pr_warn("NFS: %s: lost %d locks\n",
+ clp->cl_hostname, lost_locks);
set_bit(ops->owner_flag_bit, &sp->so_flags);
nfs4_put_state_owner(sp);
status = nfs4_recovery_handle_error(clp, status);
@@ -1943,6 +1950,9 @@ restart:
}
rcu_read_unlock();
nfs4_free_state_owners(&freeme);
+ if (lost_locks)
+ pr_warn("NFS: %s: lost %d locks\n",
+ clp->cl_hostname, lost_locks);
return 0;
}
@@ -2106,6 +2116,11 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
dprintk("<-- %s: no memory\n", __func__);
goto out;
}
+ locations->fattr = nfs_alloc_fattr();
+ if (locations->fattr == NULL) {
+ dprintk("<-- %s: no memory\n", __func__);
+ goto out;
+ }
inode = d_inode(server->super->s_root);
result = nfs4_proc_get_locations(server, NFS_FH(inode), locations,
@@ -2120,7 +2135,7 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
if (!locations->nlocations)
goto out;
- if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) {
+ if (!(locations->fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS)) {
dprintk("<-- %s: No fs_locations data, migration skipped\n",
__func__);
goto out;
@@ -2145,6 +2160,8 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
out:
if (page != NULL)
__free_page(page);
+ if (locations != NULL)
+ kfree(locations->fattr);
kfree(locations);
if (result) {
pr_err("NFS: migration recovery failed (server %s)\n",
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 86a5f6516928..acfe5f4bda48 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1680,19 +1680,35 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr);
}
-static void
-encode_setacl(struct xdr_stream *xdr, const struct nfs_setaclargs *arg,
- struct compound_hdr *hdr)
+static void nfs4_acltype_to_bitmap(enum nfs4_acl_type type, __u32 bitmap[2])
{
- __be32 *p;
+ switch (type) {
+ default:
+ bitmap[0] = FATTR4_WORD0_ACL;
+ bitmap[1] = 0;
+ break;
+ case NFS4ACL_DACL:
+ bitmap[0] = 0;
+ bitmap[1] = FATTR4_WORD1_DACL;
+ break;
+ case NFS4ACL_SACL:
+ bitmap[0] = 0;
+ bitmap[1] = FATTR4_WORD1_SACL;
+ }
+}
+
+static void encode_setacl(struct xdr_stream *xdr,
+ const struct nfs_setaclargs *arg,
+ struct compound_hdr *hdr)
+{
+ __u32 bitmap[2];
+
+ nfs4_acltype_to_bitmap(arg->acl_type, bitmap);
encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr);
encode_nfs4_stateid(xdr, &zero_stateid);
- p = reserve_space(xdr, 2*4);
- *p++ = cpu_to_be32(1);
- *p = cpu_to_be32(FATTR4_WORD0_ACL);
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(arg->acl_len);
+ xdr_encode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap));
+ encode_uint32(xdr, arg->acl_len);
xdr_write_pages(xdr, arg->acl_pages, 0, arg->acl_len);
}
@@ -2587,11 +2603,11 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
};
- const __u32 nfs4_acl_bitmap[1] = {
- [0] = FATTR4_WORD0_ACL,
- };
+ __u32 nfs4_acl_bitmap[2];
uint32_t replen;
+ nfs4_acltype_to_bitmap(args->acl_type, nfs4_acl_bitmap);
+
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fh, &hdr);
@@ -5386,7 +5402,7 @@ decode_restorefh(struct xdr_stream *xdr)
}
static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
- struct nfs_getaclres *res)
+ struct nfs_getaclres *res, enum nfs4_acl_type type)
{
unsigned int savep;
uint32_t attrlen,
@@ -5404,26 +5420,39 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
goto out;
- if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))
- return -EIO;
- if (likely(bitmap[0] & FATTR4_WORD0_ACL)) {
-
- /* The bitmap (xdr len + bitmaps) and the attr xdr len words
- * are stored with the acl data to handle the problem of
- * variable length bitmaps.*/
- res->acl_data_offset = xdr_page_pos(xdr);
- res->acl_len = attrlen;
-
- /* Check for receive buffer overflow */
- if (res->acl_len > xdr_stream_remaining(xdr) ||
- res->acl_len + res->acl_data_offset > xdr->buf->page_len) {
- res->acl_flags |= NFS4_ACL_TRUNC;
- dprintk("NFS: acl reply: attrlen %u > page_len %zu\n",
- attrlen, xdr_stream_remaining(xdr));
- }
- } else
- status = -EOPNOTSUPP;
+ switch (type) {
+ default:
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))
+ return -EIO;
+ if (!(bitmap[0] & FATTR4_WORD0_ACL))
+ return -EOPNOTSUPP;
+ break;
+ case NFS4ACL_DACL:
+ if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_DACL - 1U)))
+ return -EIO;
+ if (!(bitmap[1] & FATTR4_WORD1_DACL))
+ return -EOPNOTSUPP;
+ break;
+ case NFS4ACL_SACL:
+ if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_SACL - 1U)))
+ return -EIO;
+ if (!(bitmap[1] & FATTR4_WORD1_SACL))
+ return -EOPNOTSUPP;
+ }
+ /* The bitmap (xdr len + bitmaps) and the attr xdr len words
+ * are stored with the acl data to handle the problem of
+ * variable length bitmaps.*/
+ res->acl_data_offset = xdr_page_pos(xdr);
+ res->acl_len = attrlen;
+
+ /* Check for receive buffer overflow */
+ if (res->acl_len > xdr_stream_remaining(xdr) ||
+ res->acl_len + res->acl_data_offset > xdr->buf->page_len) {
+ res->acl_flags |= NFS4_ACL_TRUNC;
+ dprintk("NFS: acl reply: attrlen %u > page_len %zu\n",
+ attrlen, xdr_stream_remaining(xdr));
+ }
out:
return status;
}
@@ -6486,7 +6515,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_putfh(xdr);
if (status)
goto out;
- status = decode_getacl(xdr, rqstp, res);
+ status = decode_getacl(xdr, rqstp, res, res->acl_type);
out:
return status;
@@ -7051,7 +7080,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
if (res->migration) {
xdr_enter_page(xdr, PAGE_SIZE);
status = decode_getfattr_generic(xdr,
- &res->fs_locations->fattr,
+ res->fs_locations->fattr,
NULL, res->fs_locations,
res->fs_locations->server);
if (status)
@@ -7064,7 +7093,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
goto out;
xdr_enter_page(xdr, PAGE_SIZE);
status = decode_getfattr_generic(xdr,
- &res->fs_locations->fattr,
+ res->fs_locations->fattr,
NULL, res->fs_locations,
res->fs_locations->server);
}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 9157dd19b8b4..317cedfa52bf 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -767,6 +767,9 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
.flags = RPC_TASK_ASYNC | flags,
};
+ if (nfs_server_capable(hdr->inode, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
dprintk("NFS: initiated pgio call "
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 856c962273c7..41a9b6b58fb9 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -469,6 +469,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
pnfs_clear_lseg_state(lseg, lseg_list);
pnfs_clear_layoutreturn_info(lo);
pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
+ set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
!test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
pnfs_clear_layoutreturn_waitbit(lo);
@@ -1917,8 +1918,9 @@ static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
static void nfs_layoutget_end(struct pnfs_layout_hdr *lo)
{
- if (atomic_dec_and_test(&lo->plh_outstanding))
- wake_up_var(&lo->plh_outstanding);
+ if (atomic_dec_and_test(&lo->plh_outstanding) &&
+ test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags))
+ wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN);
}
static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo)
@@ -2000,6 +2002,7 @@ lookup_again:
lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
if (lo == NULL) {
spin_unlock(&ino->i_lock);
+ lseg = ERR_PTR(-ENOMEM);
trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_NOMEM);
goto out;
@@ -2024,11 +2027,11 @@ lookup_again:
* If the layout segment list is empty, but there are outstanding
* layoutget calls, then they might be subject to a layoutrecall.
*/
- if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo)) &&
+ if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
atomic_read(&lo->plh_outstanding) != 0) {
spin_unlock(&ino->i_lock);
- lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
- !atomic_read(&lo->plh_outstanding)));
+ lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN,
+ TASK_KILLABLE));
if (IS_ERR(lseg))
goto out_put_layout_hdr;
pnfs_put_layout_hdr(lo);
@@ -2128,6 +2131,7 @@ lookup_again:
lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags);
if (!lgp) {
+ lseg = ERR_PTR(-ENOMEM);
trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL,
PNFS_UPDATE_LAYOUT_NOMEM);
nfs_layoutget_end(lo);
@@ -2150,6 +2154,12 @@ lookup_again:
case -ERECALLCONFLICT:
case -EAGAIN:
break;
+ case -ENODATA:
+ /* The server returned NFS4ERR_LAYOUTUNAVAILABLE */
+ pnfs_layout_set_fail_bit(
+ lo, pnfs_iomode_to_fail_bit(iomode));
+ lseg = NULL;
+ goto out_put_layout_hdr;
default:
if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
@@ -2405,7 +2415,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
goto out_forget;
}
- if (!pnfs_layout_is_valid(lo) && !pnfs_is_first_layoutget(lo))
+ if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
+ !pnfs_is_first_layoutget(lo))
goto out_forget;
if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 07f11489e4e9..f331f067691b 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -105,6 +105,7 @@ enum {
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */
NFS_LAYOUT_HASHED, /* The layout visible */
+ NFS_LAYOUT_DRAIN,
};
enum layoutdriver_policy_flags {
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 6f325e10056c..9697cd5d2561 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -102,6 +102,10 @@ static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data)
};
struct rpc_task *task;
struct inode *dir = d_inode(data->dentry->d_parent);
+
+ if (nfs_server_capable(inode, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
nfs_sb_active(dir->i_sb);
data->args.fh = NFS_FH(dir);
nfs_fattr_init(data->res.dir_attr);
@@ -344,6 +348,10 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
.flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
+ if (nfs_server_capable(old_dir, NFS_CAP_MOVEABLE) &&
+ nfs_server_capable(new_dir, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (data == NULL)
return ERR_PTR(-ENOMEM);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f00d45cf80ef..1c706465d090 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -603,8 +603,9 @@ static void nfs_write_error(struct nfs_page *req, int error)
* Find an associated nfs write request, and prepare to flush it out
* May return an error if the user signalled nfs_wait_on_request().
*/
-static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
- struct page *page)
+static int nfs_page_async_flush(struct page *page,
+ struct writeback_control *wbc,
+ struct nfs_pageio_descriptor *pgio)
{
struct nfs_page *req;
int ret = 0;
@@ -630,11 +631,11 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
/*
* Remove the problematic req upon fatal errors on the server
*/
- if (nfs_error_is_fatal(ret)) {
- if (nfs_error_is_fatal_on_server(ret))
- goto out_launder;
- } else
- ret = -EAGAIN;
+ if (nfs_error_is_fatal_on_server(ret))
+ goto out_launder;
+ if (wbc->sync_mode == WB_SYNC_NONE)
+ ret = AOP_WRITEPAGE_ACTIVATE;
+ redirty_page_for_writepage(wbc, page);
nfs_redirty_request(req);
pgio->pg_error = 0;
} else
@@ -650,15 +651,8 @@ out_launder:
static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
struct nfs_pageio_descriptor *pgio)
{
- int ret;
-
nfs_pageio_cond_complete(pgio, page_index(page));
- ret = nfs_page_async_flush(pgio, page);
- if (ret == -EAGAIN) {
- redirty_page_for_writepage(wbc, page);
- ret = AOP_WRITEPAGE_ACTIVATE;
- }
- return ret;
+ return nfs_page_async_flush(page, wbc, pgio);
}
/*
@@ -681,11 +675,7 @@ static int nfs_writepage_locked(struct page *page,
err = nfs_do_writepage(page, wbc, &pgio);
pgio.pg_error = 0;
nfs_pageio_complete(&pgio);
- if (err < 0)
- return err;
- if (nfs_error_is_fatal(pgio.pg_error))
- return pgio.pg_error;
- return 0;
+ return err;
}
int nfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -737,19 +727,19 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
priority = wb_priority(wbc);
}
- nfs_pageio_init_write(&pgio, inode, priority, false,
- &nfs_async_write_completion_ops);
- pgio.pg_io_completion = ioc;
- err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
- pgio.pg_error = 0;
- nfs_pageio_complete(&pgio);
+ do {
+ nfs_pageio_init_write(&pgio, inode, priority, false,
+ &nfs_async_write_completion_ops);
+ pgio.pg_io_completion = ioc;
+ err = write_cache_pages(mapping, wbc, nfs_writepages_callback,
+ &pgio);
+ pgio.pg_error = 0;
+ nfs_pageio_complete(&pgio);
+ } while (err < 0 && !nfs_error_is_fatal(err));
nfs_io_completion_put(ioc);
if (err < 0)
goto out_err;
- err = pgio.pg_error;
- if (nfs_error_is_fatal(err))
- goto out_err;
return 0;
out_err:
return err;
@@ -1444,7 +1434,7 @@ static void nfs_async_write_error(struct list_head *head, int error)
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
- if (nfs_error_is_fatal(error))
+ if (nfs_error_is_fatal_on_server(error))
nfs_write_error(req, error);
else
nfs_redirty_request(req);
@@ -1719,6 +1709,10 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
.flags = RPC_TASK_ASYNC | flags,
.priority = priority,
};
+
+ if (nfs_server_capable(data->inode, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
/* Set up the initial task struct. */
nfs_ops->commit_setup(data, &msg, &task_setup_data.rpc_client);
trace_nfs_initiate_commit(data);
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 2c1b027774d4..9cb2d590c036 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -119,14 +119,14 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf)
struct inode *inode = nf->nf_inode;
do {
- mutex_lock(&nfsd_file_fsnotify_group->mark_mutex);
+ fsnotify_group_lock(nfsd_file_fsnotify_group);
mark = fsnotify_find_mark(&inode->i_fsnotify_marks,
- nfsd_file_fsnotify_group);
+ nfsd_file_fsnotify_group);
if (mark) {
nfm = nfsd_file_mark_get(container_of(mark,
struct nfsd_file_mark,
nfm_mark));
- mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+ fsnotify_group_unlock(nfsd_file_fsnotify_group);
if (nfm) {
fsnotify_put_mark(mark);
break;
@@ -134,8 +134,9 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf)
/* Avoid soft lockup race with nfsd_file_mark_put() */
fsnotify_destroy_mark(mark, nfsd_file_fsnotify_group);
fsnotify_put_mark(mark);
- } else
- mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+ } else {
+ fsnotify_group_unlock(nfsd_file_fsnotify_group);
+ }
/* allocate a new nfm */
new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL);
@@ -302,15 +303,18 @@ nfsd_file_put_noref(struct nfsd_file *nf)
void
nfsd_file_put(struct nfsd_file *nf)
{
+ might_sleep();
+
set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
nfsd_file_flush(nf);
nfsd_file_put_noref(nf);
- } else {
+ } else if (nf->nf_file) {
nfsd_file_put_noref(nf);
- if (nf->nf_file)
- nfsd_file_schedule_laundrette();
- }
+ nfsd_file_schedule_laundrette();
+ } else
+ nfsd_file_put_noref(nf);
+
if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT)
nfsd_file_gc();
}
@@ -678,7 +682,8 @@ nfsd_file_cache_init(void)
goto out_shrinker;
}
- nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops);
+ nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops,
+ FSNOTIFY_GROUP_NOFS);
if (IS_ERR(nfsd_file_fsnotify_group)) {
pr_err("nfsd: unable to create fsnotify group: %ld\n",
PTR_ERR(nfsd_file_fsnotify_group));
@@ -897,9 +902,9 @@ nfsd_file_is_cached(struct inode *inode)
return ret;
}
-__be32
-nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
- unsigned int may_flags, struct nfsd_file **pnf)
+static __be32
+nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf, bool open)
{
__be32 status;
struct net *net = SVC_NET(rqstp);
@@ -994,10 +999,14 @@ open_file:
nfsd_file_gc();
nf->nf_mark = nfsd_file_mark_find_or_create(nf);
- if (nf->nf_mark)
- status = nfsd_open_verified(rqstp, fhp, S_IFREG,
- may_flags, &nf->nf_file);
- else
+ if (nf->nf_mark) {
+ if (open) {
+ status = nfsd_open_verified(rqstp, fhp, may_flags,
+ &nf->nf_file);
+ trace_nfsd_file_open(nf, status);
+ } else
+ status = nfs_ok;
+ } else
status = nfserr_jukebox;
/*
* If construction failed, or we raced with a call to unlink()
@@ -1017,6 +1026,40 @@ open_file:
goto out;
}
+/**
+ * nfsd_file_acquire - Get a struct nfsd_file with an open file
+ * @rqstp: the RPC transaction being executed
+ * @fhp: the NFS filehandle of the file to be opened
+ * @may_flags: NFSD_MAY_ settings for the file
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
+ * network byte order is returned.
+ */
+__be32
+nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf)
+{
+ return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, true);
+}
+
+/**
+ * nfsd_file_create - Get a struct nfsd_file, do not open
+ * @rqstp: the RPC transaction being executed
+ * @fhp: the NFS filehandle of the file just created
+ * @may_flags: NFSD_MAY_ settings for the file
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
+ * network byte order is returned.
+ */
+__be32
+nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf)
+{
+ return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, false);
+}
+
/*
* Note that fields may be added, removed or reordered in the future. Programs
* scraping this file for info should test the labels to ensure they're
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 435ceab27897..1da0c79a5580 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -59,5 +59,7 @@ void nfsd_file_close_inode_sync(struct inode *inode);
bool nfsd_file_is_cached(struct inode *inode);
__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **nfp);
+__be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **nfp);
int nfsd_file_cache_stats_open(struct inode *, struct file *);
#endif /* _FS_NFSD_FILECACHE_H */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 936eebd4c56d..981a3a7a6e16 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/ext2_fs.h>
#include <linux/magic.h>
+#include <linux/namei.h>
#include "cache.h"
#include "xdr3.h"
@@ -220,17 +221,132 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
}
/*
- * With NFSv3, CREATE processing is a lot easier than with NFSv2.
- * At least in theory; we'll see how it fares in practice when the
- * first reports about SunOS compatibility problems start to pour in...
+ * Implement NFSv3's unchecked, guarded, and exclusive CREATE
+ * semantics for regular files. Except for the created file,
+ * this operation is stateless on the server.
+ *
+ * Upon return, caller must release @fhp and @resfhp.
*/
static __be32
+nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct svc_fh *resfhp, struct nfsd3_createargs *argp)
+{
+ struct iattr *iap = &argp->attrs;
+ struct dentry *parent, *child;
+ __u32 v_mtime, v_atime;
+ struct inode *inode;
+ __be32 status;
+ int host_err;
+
+ if (isdotent(argp->name, argp->len))
+ return nfserr_exist;
+ if (!(iap->ia_valid & ATTR_MODE))
+ iap->ia_mode = 0;
+
+ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
+ if (status != nfs_ok)
+ return status;
+
+ parent = fhp->fh_dentry;
+ inode = d_inode(parent);
+
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ return nfserrno(host_err);
+
+ fh_lock_nested(fhp, I_MUTEX_PARENT);
+
+ child = lookup_one_len(argp->name, parent, argp->len);
+ if (IS_ERR(child)) {
+ status = nfserrno(PTR_ERR(child));
+ goto out;
+ }
+
+ if (d_really_is_negative(child)) {
+ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+ if (status != nfs_ok)
+ goto out;
+ }
+
+ status = fh_compose(resfhp, fhp->fh_export, child, fhp);
+ if (status != nfs_ok)
+ goto out;
+
+ v_mtime = 0;
+ v_atime = 0;
+ if (argp->createmode == NFS3_CREATE_EXCLUSIVE) {
+ u32 *verifier = (u32 *)argp->verf;
+
+ /*
+ * Solaris 7 gets confused (bugid 4218508) if these have
+ * the high bit set, as do xfs filesystems without the
+ * "bigtime" feature. So just clear the high bits.
+ */
+ v_mtime = verifier[0] & 0x7fffffff;
+ v_atime = verifier[1] & 0x7fffffff;
+ }
+
+ if (d_really_is_positive(child)) {
+ status = nfs_ok;
+
+ switch (argp->createmode) {
+ case NFS3_CREATE_UNCHECKED:
+ if (!d_is_reg(child))
+ break;
+ iap->ia_valid &= ATTR_SIZE;
+ goto set_attr;
+ case NFS3_CREATE_GUARDED:
+ status = nfserr_exist;
+ break;
+ case NFS3_CREATE_EXCLUSIVE:
+ if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
+ d_inode(child)->i_atime.tv_sec == v_atime &&
+ d_inode(child)->i_size == 0) {
+ break;
+ }
+ status = nfserr_exist;
+ }
+ goto out;
+ }
+
+ if (!IS_POSIXACL(inode))
+ iap->ia_mode &= ~current_umask();
+
+ host_err = vfs_create(&init_user_ns, inode, child, iap->ia_mode, true);
+ if (host_err < 0) {
+ status = nfserrno(host_err);
+ goto out;
+ }
+
+ /* A newly created file already has a file size of zero. */
+ if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
+ iap->ia_valid &= ~ATTR_SIZE;
+ if (argp->createmode == NFS3_CREATE_EXCLUSIVE) {
+ iap->ia_valid = ATTR_MTIME | ATTR_ATIME |
+ ATTR_MTIME_SET | ATTR_ATIME_SET;
+ iap->ia_mtime.tv_sec = v_mtime;
+ iap->ia_atime.tv_sec = v_atime;
+ iap->ia_mtime.tv_nsec = 0;
+ iap->ia_atime.tv_nsec = 0;
+ }
+
+set_attr:
+ status = nfsd_create_setattr(rqstp, fhp, resfhp, iap);
+
+out:
+ fh_unlock(fhp);
+ if (child && !IS_ERR(child))
+ dput(child);
+ fh_drop_write(fhp);
+ return status;
+}
+
+static __be32
nfsd3_proc_create(struct svc_rqst *rqstp)
{
struct nfsd3_createargs *argp = rqstp->rq_argp;
struct nfsd3_diropres *resp = rqstp->rq_resp;
- svc_fh *dirfhp, *newfhp = NULL;
- struct iattr *attr;
+ svc_fh *dirfhp, *newfhp;
dprintk("nfsd: CREATE(3) %s %.*s\n",
SVCFH_fmt(&argp->fh),
@@ -239,21 +355,8 @@ nfsd3_proc_create(struct svc_rqst *rqstp)
dirfhp = fh_copy(&resp->dirfh, &argp->fh);
newfhp = fh_init(&resp->fh, NFS3_FHSIZE);
- attr = &argp->attrs;
-
- /* Unfudge the mode bits */
- attr->ia_mode &= ~S_IFMT;
- if (!(attr->ia_valid & ATTR_MODE)) {
- attr->ia_valid |= ATTR_MODE;
- attr->ia_mode = S_IFREG;
- } else {
- attr->ia_mode = (attr->ia_mode & ~S_IFMT) | S_IFREG;
- }
- /* Now create the file and set attributes */
- resp->status = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
- attr, newfhp, argp->createmode,
- (u32 *)argp->verf, NULL, NULL);
+ resp->status = nfsd3_create_file(rqstp, dirfhp, newfhp, argp);
return rpc_success;
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index b207c76a873f..3895eb52d2b1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -37,6 +37,8 @@
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/kthread.h>
+#include <linux/namei.h>
+
#include <linux/sunrpc/addr.h>
#include <linux/nfs_ssc.h>
@@ -235,6 +237,183 @@ static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate
&resfh->fh_handle);
}
+static inline bool nfsd4_create_is_exclusive(int createmode)
+{
+ return createmode == NFS4_CREATE_EXCLUSIVE ||
+ createmode == NFS4_CREATE_EXCLUSIVE4_1;
+}
+
+static __be32
+nfsd4_vfs_create(struct svc_fh *fhp, struct dentry *child,
+ struct nfsd4_open *open)
+{
+ struct file *filp;
+ struct path path;
+ int oflags;
+
+ oflags = O_CREAT | O_LARGEFILE;
+ switch (open->op_share_access & NFS4_SHARE_ACCESS_BOTH) {
+ case NFS4_SHARE_ACCESS_WRITE:
+ oflags |= O_WRONLY;
+ break;
+ case NFS4_SHARE_ACCESS_BOTH:
+ oflags |= O_RDWR;
+ break;
+ default:
+ oflags |= O_RDONLY;
+ }
+
+ path.mnt = fhp->fh_export->ex_path.mnt;
+ path.dentry = child;
+ filp = dentry_create(&path, oflags, open->op_iattr.ia_mode,
+ current_cred());
+ if (IS_ERR(filp))
+ return nfserrno(PTR_ERR(filp));
+
+ open->op_filp = filp;
+ return nfs_ok;
+}
+
+/*
+ * Implement NFSv4's unchecked, guarded, and exclusive create
+ * semantics for regular files. Open state for this new file is
+ * subsequently fabricated in nfsd4_process_open2().
+ *
+ * Upon return, caller must release @fhp and @resfhp.
+ */
+static __be32
+nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct svc_fh *resfhp, struct nfsd4_open *open)
+{
+ struct iattr *iap = &open->op_iattr;
+ struct dentry *parent, *child;
+ __u32 v_mtime, v_atime;
+ struct inode *inode;
+ __be32 status;
+ int host_err;
+
+ if (isdotent(open->op_fname, open->op_fnamelen))
+ return nfserr_exist;
+ if (!(iap->ia_valid & ATTR_MODE))
+ iap->ia_mode = 0;
+
+ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
+ if (status != nfs_ok)
+ return status;
+ parent = fhp->fh_dentry;
+ inode = d_inode(parent);
+
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ return nfserrno(host_err);
+
+ fh_lock_nested(fhp, I_MUTEX_PARENT);
+
+ child = lookup_one_len(open->op_fname, parent, open->op_fnamelen);
+ if (IS_ERR(child)) {
+ status = nfserrno(PTR_ERR(child));
+ goto out;
+ }
+
+ if (d_really_is_negative(child)) {
+ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+ if (status != nfs_ok)
+ goto out;
+ }
+
+ status = fh_compose(resfhp, fhp->fh_export, child, fhp);
+ if (status != nfs_ok)
+ goto out;
+
+ v_mtime = 0;
+ v_atime = 0;
+ if (nfsd4_create_is_exclusive(open->op_createmode)) {
+ u32 *verifier = (u32 *)open->op_verf.data;
+
+ /*
+ * Solaris 7 gets confused (bugid 4218508) if these have
+ * the high bit set, as do xfs filesystems without the
+ * "bigtime" feature. So just clear the high bits. If this
+ * is ever changed to use different attrs for storing the
+ * verifier, then do_open_lookup() will also need to be
+ * fixed accordingly.
+ */
+ v_mtime = verifier[0] & 0x7fffffff;
+ v_atime = verifier[1] & 0x7fffffff;
+ }
+
+ if (d_really_is_positive(child)) {
+ status = nfs_ok;
+
+ switch (open->op_createmode) {
+ case NFS4_CREATE_UNCHECKED:
+ if (!d_is_reg(child))
+ break;
+
+ /*
+ * In NFSv4, we don't want to truncate the file
+ * now. This would be wrong if the OPEN fails for
+ * some other reason. Furthermore, if the size is
+ * nonzero, we should ignore it according to spec!
+ */
+ open->op_truncate = (iap->ia_valid & ATTR_SIZE) &&
+ !iap->ia_size;
+ break;
+ case NFS4_CREATE_GUARDED:
+ status = nfserr_exist;
+ break;
+ case NFS4_CREATE_EXCLUSIVE:
+ if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
+ d_inode(child)->i_atime.tv_sec == v_atime &&
+ d_inode(child)->i_size == 0) {
+ open->op_created = true;
+ break; /* subtle */
+ }
+ status = nfserr_exist;
+ break;
+ case NFS4_CREATE_EXCLUSIVE4_1:
+ if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
+ d_inode(child)->i_atime.tv_sec == v_atime &&
+ d_inode(child)->i_size == 0) {
+ open->op_created = true;
+ goto set_attr; /* subtle */
+ }
+ status = nfserr_exist;
+ }
+ goto out;
+ }
+
+ if (!IS_POSIXACL(inode))
+ iap->ia_mode &= ~current_umask();
+
+ status = nfsd4_vfs_create(fhp, child, open);
+ if (status != nfs_ok)
+ goto out;
+ open->op_created = true;
+
+ /* A newly created file already has a file size of zero. */
+ if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
+ iap->ia_valid &= ~ATTR_SIZE;
+ if (nfsd4_create_is_exclusive(open->op_createmode)) {
+ iap->ia_valid = ATTR_MTIME | ATTR_ATIME |
+ ATTR_MTIME_SET|ATTR_ATIME_SET;
+ iap->ia_mtime.tv_sec = v_mtime;
+ iap->ia_atime.tv_sec = v_atime;
+ iap->ia_mtime.tv_nsec = 0;
+ iap->ia_atime.tv_nsec = 0;
+ }
+
+set_attr:
+ status = nfsd_create_setattr(rqstp, fhp, resfhp, iap);
+
+out:
+ fh_unlock(fhp);
+ if (child && !IS_ERR(child))
+ dput(child);
+ fh_drop_write(fhp);
+ return status;
+}
+
static __be32
do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh)
{
@@ -264,16 +443,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
* yes | yes | GUARDED4 | GUARDED4
*/
- /*
- * Note: create modes (UNCHECKED,GUARDED...) are the same
- * in NFSv4 as in v3 except EXCLUSIVE4_1.
- */
current->fs->umask = open->op_umask;
- status = do_nfsd_create(rqstp, current_fh, open->op_fname,
- open->op_fnamelen, &open->op_iattr,
- *resfh, open->op_createmode,
- (u32 *)open->op_verf.data,
- &open->op_truncate, &open->op_created);
+ status = nfsd4_create_file(rqstp, current_fh, *resfh, open);
current->fs->umask = 0;
if (!status && open->op_label.len)
@@ -284,7 +455,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
* use the returned bitmask to indicate which attributes
* we used to store the verifier:
*/
- if (nfsd_create_is_exclusive(open->op_createmode) && status == 0)
+ if (nfsd4_create_is_exclusive(open->op_createmode) && status == 0)
open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
FATTR4_WORD1_TIME_MODIFY);
} else
@@ -375,6 +546,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
(int)open->op_fnamelen, open->op_fname,
open->op_openowner);
+ open->op_filp = NULL;
+
/* This check required by spec. */
if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
return nfserr_inval;
@@ -427,43 +600,35 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
switch (open->op_claim_type) {
- case NFS4_OPEN_CLAIM_DELEGATE_CUR:
- case NFS4_OPEN_CLAIM_NULL:
- status = do_open_lookup(rqstp, cstate, open, &resfh);
- if (status)
- goto out;
- break;
- case NFS4_OPEN_CLAIM_PREVIOUS:
- status = nfs4_check_open_reclaim(cstate->clp);
- if (status)
- goto out;
- open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
- reclaim = true;
- fallthrough;
- case NFS4_OPEN_CLAIM_FH:
- case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
- status = do_open_fhandle(rqstp, cstate, open);
- if (status)
- goto out;
- resfh = &cstate->current_fh;
- break;
- case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
- case NFS4_OPEN_CLAIM_DELEGATE_PREV:
- dprintk("NFSD: unsupported OPEN claim type %d\n",
- open->op_claim_type);
- status = nfserr_notsupp;
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_NULL:
+ status = do_open_lookup(rqstp, cstate, open, &resfh);
+ if (status)
goto out;
- default:
- dprintk("NFSD: Invalid OPEN claim type %d\n",
- open->op_claim_type);
- status = nfserr_inval;
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ status = nfs4_check_open_reclaim(cstate->clp);
+ if (status)
+ goto out;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+ reclaim = true;
+ fallthrough;
+ case NFS4_OPEN_CLAIM_FH:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ status = do_open_fhandle(rqstp, cstate, open);
+ if (status)
goto out;
+ resfh = &cstate->current_fh;
+ break;
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+ status = nfserr_notsupp;
+ goto out;
+ default:
+ status = nfserr_inval;
+ goto out;
}
- /*
- * nfsd4_process_open2() does the actual opening of the file. If
- * successful, it (1) truncates the file if open->op_truncate was
- * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
- */
+
status = nfsd4_process_open2(rqstp, resfh, open);
WARN(status && open->op_created,
"nfsd4_process_open2 failed to open newly-created file! status=%u\n",
@@ -471,6 +636,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (reclaim && !status)
nn->somebody_reclaimed = true;
out:
+ if (open->op_filp) {
+ fput(open->op_filp);
+ open->op_filp = NULL;
+ }
if (resfh && resfh != &cstate->current_fh) {
fh_dup2(&cstate->current_fh, resfh);
fh_put(resfh);
@@ -801,7 +970,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* the client wants us to do more in this compound:
*/
if (!nfsd4_last_compound_op(rqstp))
- clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
+ __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
/* check stateid */
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
@@ -2481,11 +2650,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
cstate->minorversion = args->minorversion;
fh_init(current_fh, NFS4_FHSIZE);
fh_init(save_fh, NFS4_FHSIZE);
+
/*
* Don't use the deferral mechanism for NFSv4; compounds make it
* too hard to avoid non-idempotency problems.
*/
- clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+ __clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
/*
* According to RFC3010, this takes precedence over all other errors.
@@ -2600,7 +2770,7 @@ encode_op:
out:
cstate->status = status;
/* Reset deferral mechanism for RPC deferrals */
- set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+ __set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
return rpc_success;
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 234e852fcdfa..9409a0dc1b76 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -125,6 +125,23 @@ static void free_session(struct nfsd4_session *);
static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops;
+static struct workqueue_struct *laundry_wq;
+
+int nfsd4_create_laundry_wq(void)
+{
+ int rc = 0;
+
+ laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4");
+ if (laundry_wq == NULL)
+ rc = -ENOMEM;
+ return rc;
+}
+
+void nfsd4_destroy_laundry_wq(void)
+{
+ destroy_workqueue(laundry_wq);
+}
+
static bool is_session_dead(struct nfsd4_session *ses)
{
return ses->se_flags & NFS4_SESSION_DEAD;
@@ -152,6 +169,7 @@ static __be32 get_client_locked(struct nfs4_client *clp)
if (is_client_expired(clp))
return nfserr_expired;
atomic_inc(&clp->cl_rpc_users);
+ clp->cl_state = NFSD4_ACTIVE;
return nfs_ok;
}
@@ -172,6 +190,7 @@ renew_client_locked(struct nfs4_client *clp)
list_move_tail(&clp->cl_lru, &nn->client_lru);
clp->cl_time = ktime_get_boottime_seconds();
+ clp->cl_state = NFSD4_ACTIVE;
}
static void put_client_renew_locked(struct nfs4_client *clp)
@@ -690,6 +709,57 @@ static unsigned int file_hashval(struct svc_fh *fh)
static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
+/*
+ * Check if courtesy clients have conflicting access and resolve it if possible
+ *
+ * access: is op_share_access if share_access is true.
+ * Check if access mode, op_share_access, would conflict with
+ * the current deny mode of the file 'fp'.
+ * access: is op_share_deny if share_access is false.
+ * Check if the deny mode, op_share_deny, would conflict with
+ * current access of the file 'fp'.
+ * stp: skip checking this entry.
+ * new_stp: normal open, not open upgrade.
+ *
+ * Function returns:
+ * false - access/deny mode conflict with normal client.
+ * true - no conflict or conflict with courtesy client(s) is resolved.
+ */
+static bool
+nfs4_resolve_deny_conflicts_locked(struct nfs4_file *fp, bool new_stp,
+ struct nfs4_ol_stateid *stp, u32 access, bool share_access)
+{
+ struct nfs4_ol_stateid *st;
+ bool resolvable = true;
+ unsigned char bmap;
+ struct nfsd_net *nn;
+ struct nfs4_client *clp;
+
+ lockdep_assert_held(&fp->fi_lock);
+ list_for_each_entry(st, &fp->fi_stateids, st_perfile) {
+ /* ignore lock stateid */
+ if (st->st_openstp)
+ continue;
+ if (st == stp && new_stp)
+ continue;
+ /* check file access against deny mode or vice versa */
+ bmap = share_access ? st->st_deny_bmap : st->st_access_bmap;
+ if (!(access & bmap_to_share_mode(bmap)))
+ continue;
+ clp = st->st_stid.sc_client;
+ if (try_to_expire_client(clp))
+ continue;
+ resolvable = false;
+ break;
+ }
+ if (resolvable) {
+ clp = stp->st_stid.sc_client;
+ nn = net_generic(clp->net, nfsd_net_id);
+ mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
+ }
+ return resolvable;
+}
+
static void
__nfs4_file_get_access(struct nfs4_file *fp, u32 access)
{
@@ -1090,6 +1160,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
get_clnt_odstate(odstate);
dp->dl_type = NFS4_OPEN_DELEGATE_READ;
dp->dl_retries = 1;
+ dp->dl_recalled = false;
nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
&nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
get_nfs4_file(fp);
@@ -2004,6 +2075,8 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
idr_init(&clp->cl_stateids);
atomic_set(&clp->cl_rpc_users, 0);
clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+ clp->cl_state = NFSD4_ACTIVE;
+ atomic_set(&clp->cl_delegs_in_recall, 0);
INIT_LIST_HEAD(&clp->cl_idhash);
INIT_LIST_HEAD(&clp->cl_openowners);
INIT_LIST_HEAD(&clp->cl_delegations);
@@ -2408,10 +2481,17 @@ static int client_info_show(struct seq_file *m, void *v)
memcpy(&clid, &clp->cl_clientid, sizeof(clid));
seq_printf(m, "clientid: 0x%llx\n", clid);
seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr);
- if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
+
+ if (clp->cl_state == NFSD4_COURTESY)
+ seq_puts(m, "status: courtesy\n");
+ else if (clp->cl_state == NFSD4_EXPIRABLE)
+ seq_puts(m, "status: expirable\n");
+ else if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
seq_puts(m, "status: confirmed\n");
else
seq_puts(m, "status: unconfirmed\n");
+ seq_printf(m, "seconds from last renew: %lld\n",
+ ktime_get_boottime_seconds() - clp->cl_time);
seq_printf(m, "name: ");
seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len);
seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion);
@@ -4694,9 +4774,18 @@ nfsd_break_deleg_cb(struct file_lock *fl)
bool ret = false;
struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
struct nfs4_file *fp = dp->dl_stid.sc_file;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
+ struct nfsd_net *nn;
trace_nfsd_cb_recall(&dp->dl_stid);
+ dp->dl_recalled = true;
+ atomic_inc(&clp->cl_delegs_in_recall);
+ if (try_to_expire_client(clp)) {
+ nn = net_generic(clp->net, nfsd_net_id);
+ mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
+ }
+
/*
* We don't want the locks code to timeout the lease for us;
* we'll remove it ourself if a delegation isn't returned
@@ -4739,9 +4828,14 @@ static int
nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
struct list_head *dispose)
{
- if (arg & F_UNLCK)
+ struct nfs4_delegation *dp = (struct nfs4_delegation *)onlist->fl_owner;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
+
+ if (arg & F_UNLCK) {
+ if (dp->dl_recalled)
+ atomic_dec(&clp->cl_delegs_in_recall);
return lease_modify(onlist, arg, dispose);
- else
+ } else
return -EAGAIN;
}
@@ -4947,7 +5041,7 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
- struct nfsd4_open *open)
+ struct nfsd4_open *open, bool new_stp)
{
struct nfsd_file *nf = NULL;
__be32 status;
@@ -4963,6 +5057,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
*/
status = nfs4_file_check_deny(fp, open->op_share_deny);
if (status != nfs_ok) {
+ if (status != nfserr_share_denied) {
+ spin_unlock(&fp->fi_lock);
+ goto out;
+ }
+ if (nfs4_resolve_deny_conflicts_locked(fp, new_stp,
+ stp, open->op_share_deny, false))
+ status = nfserr_jukebox;
spin_unlock(&fp->fi_lock);
goto out;
}
@@ -4970,6 +5071,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
/* set access to the file */
status = nfs4_file_get_access(fp, open->op_share_access);
if (status != nfs_ok) {
+ if (status != nfserr_share_denied) {
+ spin_unlock(&fp->fi_lock);
+ goto out;
+ }
+ if (nfs4_resolve_deny_conflicts_locked(fp, new_stp,
+ stp, open->op_share_access, true))
+ status = nfserr_jukebox;
spin_unlock(&fp->fi_lock);
goto out;
}
@@ -4985,9 +5093,19 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
if (!fp->fi_fds[oflag]) {
spin_unlock(&fp->fi_lock);
- status = nfsd_file_acquire(rqstp, cur_fh, access, &nf);
- if (status)
- goto out_put_access;
+
+ if (!open->op_filp) {
+ status = nfsd_file_acquire(rqstp, cur_fh, access, &nf);
+ if (status != nfs_ok)
+ goto out_put_access;
+ } else {
+ status = nfsd_file_create(rqstp, cur_fh, access, &nf);
+ if (status != nfs_ok)
+ goto out_put_access;
+ nf->nf_file = open->op_filp;
+ open->op_filp = NULL;
+ }
+
spin_lock(&fp->fi_lock);
if (!fp->fi_fds[oflag]) {
fp->fi_fds[oflag] = nf;
@@ -5016,21 +5134,29 @@ out_put_access:
}
static __be32
-nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
+nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp,
+ struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
+ struct nfsd4_open *open)
{
__be32 status;
unsigned char old_deny_bmap = stp->st_deny_bmap;
if (!test_access(open->op_share_access, stp))
- return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open);
+ return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open, false);
/* test and set deny mode */
spin_lock(&fp->fi_lock);
status = nfs4_file_check_deny(fp, open->op_share_deny);
if (status == nfs_ok) {
- set_deny(open->op_share_deny, stp);
- fp->fi_share_deny |=
+ if (status != nfserr_share_denied) {
+ set_deny(open->op_share_deny, stp);
+ fp->fi_share_deny |=
(open->op_share_deny & NFS4_SHARE_DENY_BOTH);
+ } else {
+ if (nfs4_resolve_deny_conflicts_locked(fp, false,
+ stp, open->op_share_deny, false))
+ status = nfserr_jukebox;
+ }
}
spin_unlock(&fp->fi_lock);
@@ -5322,6 +5448,18 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
*/
}
+/**
+ * nfsd4_process_open2 - finish open processing
+ * @rqstp: the RPC transaction being executed
+ * @current_fh: NFSv4 COMPOUND's current filehandle
+ * @open: OPEN arguments
+ *
+ * If successful, (1) truncate the file if open->op_truncate was
+ * set, (2) set open->op_stateid, (3) set open->op_delegation.
+ *
+ * Returns %nfs_ok on success; otherwise an nfs4stat value in
+ * network byte order is returned.
+ */
__be32
nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
@@ -5371,7 +5509,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
goto out;
}
} else {
- status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
+ status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true);
if (status) {
stp->st_stid.sc_type = NFS4_CLOSED_STID;
release_open_stateid(stp);
@@ -5605,6 +5743,81 @@ static void nfsd4_ssc_expire_umount(struct nfsd_net *nn)
}
#endif
+/* Check if any lock belonging to this lockowner has any blockers */
+static bool
+nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo)
+{
+ struct file_lock_context *ctx;
+ struct nfs4_ol_stateid *stp;
+ struct nfs4_file *nf;
+
+ list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) {
+ nf = stp->st_stid.sc_file;
+ ctx = nf->fi_inode->i_flctx;
+ if (!ctx)
+ continue;
+ if (locks_owner_has_blockers(ctx, lo))
+ return true;
+ }
+ return false;
+}
+
+static bool
+nfs4_anylock_blockers(struct nfs4_client *clp)
+{
+ int i;
+ struct nfs4_stateowner *so;
+ struct nfs4_lockowner *lo;
+
+ if (atomic_read(&clp->cl_delegs_in_recall))
+ return true;
+ spin_lock(&clp->cl_lock);
+ for (i = 0; i < OWNER_HASH_SIZE; i++) {
+ list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[i],
+ so_strhash) {
+ if (so->so_is_open_owner)
+ continue;
+ lo = lockowner(so);
+ if (nfs4_lockowner_has_blockers(lo)) {
+ spin_unlock(&clp->cl_lock);
+ return true;
+ }
+ }
+ }
+ spin_unlock(&clp->cl_lock);
+ return false;
+}
+
+static void
+nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist,
+ struct laundry_time *lt)
+{
+ struct list_head *pos, *next;
+ struct nfs4_client *clp;
+
+ INIT_LIST_HEAD(reaplist);
+ spin_lock(&nn->client_lock);
+ list_for_each_safe(pos, next, &nn->client_lru) {
+ clp = list_entry(pos, struct nfs4_client, cl_lru);
+ if (clp->cl_state == NFSD4_EXPIRABLE)
+ goto exp_client;
+ if (!state_expired(lt, clp->cl_time))
+ break;
+ if (!atomic_read(&clp->cl_rpc_users))
+ clp->cl_state = NFSD4_COURTESY;
+ if (!client_has_state(clp) ||
+ ktime_get_boottime_seconds() >=
+ (clp->cl_time + NFSD_COURTESY_CLIENT_TIMEOUT))
+ goto exp_client;
+ if (nfs4_anylock_blockers(clp)) {
+exp_client:
+ if (!mark_client_expired_locked(clp))
+ list_add(&clp->cl_lru, reaplist);
+ }
+ }
+ spin_unlock(&nn->client_lock);
+}
+
static time64_t
nfs4_laundromat(struct nfsd_net *nn)
{
@@ -5627,7 +5840,6 @@ nfs4_laundromat(struct nfsd_net *nn)
goto out;
}
nfsd4_end_grace(nn);
- INIT_LIST_HEAD(&reaplist);
spin_lock(&nn->s2s_cp_lock);
idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) {
@@ -5637,17 +5849,7 @@ nfs4_laundromat(struct nfsd_net *nn)
_free_cpntf_state_locked(nn, cps);
}
spin_unlock(&nn->s2s_cp_lock);
-
- spin_lock(&nn->client_lock);
- list_for_each_safe(pos, next, &nn->client_lru) {
- clp = list_entry(pos, struct nfs4_client, cl_lru);
- if (!state_expired(&lt, clp->cl_time))
- break;
- if (mark_client_expired_locked(clp))
- continue;
- list_add(&clp->cl_lru, &reaplist);
- }
- spin_unlock(&nn->client_lock);
+ nfs4_get_client_reaplist(nn, &reaplist, &lt);
list_for_each_safe(pos, next, &reaplist) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
trace_nfsd_clid_purged(&clp->cl_clientid);
@@ -5722,7 +5924,6 @@ out:
return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
}
-static struct workqueue_struct *laundry_wq;
static void laundromat_main(struct work_struct *);
static void
@@ -6551,6 +6752,29 @@ nfsd4_lm_put_owner(fl_owner_t owner)
nfs4_put_stateowner(&lo->lo_owner);
}
+/* return pointer to struct nfs4_client if client is expirable */
+static bool
+nfsd4_lm_lock_expirable(struct file_lock *cfl)
+{
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)cfl->fl_owner;
+ struct nfs4_client *clp = lo->lo_owner.so_client;
+ struct nfsd_net *nn;
+
+ if (try_to_expire_client(clp)) {
+ nn = net_generic(clp->net, nfsd_net_id);
+ mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
+ return true;
+ }
+ return false;
+}
+
+/* schedule laundromat to run immediately and wait for it to complete */
+static void
+nfsd4_lm_expire_lock(void)
+{
+ flush_workqueue(laundry_wq);
+}
+
static void
nfsd4_lm_notify(struct file_lock *fl)
{
@@ -6577,9 +6801,12 @@ nfsd4_lm_notify(struct file_lock *fl)
}
static const struct lock_manager_operations nfsd_posix_mng_ops = {
+ .lm_mod_owner = THIS_MODULE,
.lm_notify = nfsd4_lm_notify,
.lm_get_owner = nfsd4_lm_get_owner,
.lm_put_owner = nfsd4_lm_put_owner,
+ .lm_lock_expirable = nfsd4_lm_lock_expirable,
+ .lm_expire_lock = nfsd4_lm_expire_lock,
};
static inline void
@@ -7297,22 +7524,36 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
return status;
}
+/**
+ * nfsd4_release_lockowner - process NFSv4.0 RELEASE_LOCKOWNER operations
+ * @rqstp: RPC transaction
+ * @cstate: NFSv4 COMPOUND state
+ * @u: RELEASE_LOCKOWNER arguments
+ *
+ * The lockowner's so_count is bumped when a lock record is added
+ * or when copying a conflicting lock. The latter case is brief,
+ * but can lead to fleeting false positives when looking for
+ * locks-in-use.
+ *
+ * Return values:
+ * %nfs_ok: lockowner released or not found
+ * %nfserr_locks_held: lockowner still in use
+ * %nfserr_stale_clientid: clientid no longer active
+ * %nfserr_expired: clientid not recognized
+ */
__be32
nfsd4_release_lockowner(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
clientid_t *clid = &rlockowner->rl_clientid;
- struct nfs4_stateowner *sop;
- struct nfs4_lockowner *lo = NULL;
struct nfs4_ol_stateid *stp;
- struct xdr_netobj *owner = &rlockowner->rl_owner;
- unsigned int hashval = ownerstr_hashval(owner);
- __be32 status;
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct nfs4_lockowner *lo;
struct nfs4_client *clp;
- LIST_HEAD (reaplist);
+ LIST_HEAD(reaplist);
+ __be32 status;
dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
clid->cl_boot, clid->cl_id);
@@ -7320,34 +7561,19 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
status = set_client(clid, cstate, nn);
if (status)
return status;
-
clp = cstate->clp;
- /* Find the matching lock stateowner */
- spin_lock(&clp->cl_lock);
- list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval],
- so_strhash) {
-
- if (sop->so_is_open_owner || !same_owner_str(sop, owner))
- continue;
- /* see if there are still any locks associated with it */
- lo = lockowner(sop);
- list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) {
- if (check_for_locks(stp->st_stid.sc_file, lo)) {
- status = nfserr_locks_held;
- spin_unlock(&clp->cl_lock);
- return status;
- }
- }
-
- nfs4_get_stateowner(sop);
- break;
- }
+ spin_lock(&clp->cl_lock);
+ lo = find_lockowner_str_locked(clp, &rlockowner->rl_owner);
if (!lo) {
spin_unlock(&clp->cl_lock);
- return status;
+ return nfs_ok;
+ }
+ if (atomic_read(&lo->lo_owner.so_count) != 2) {
+ spin_unlock(&clp->cl_lock);
+ nfs4_put_stateowner(&lo->lo_owner);
+ return nfserr_locks_held;
}
-
unhash_lockowner_locked(lo);
while (!list_empty(&lo->lo_owner.so_stateids)) {
stp = list_first_entry(&lo->lo_owner.so_stateids,
@@ -7357,11 +7583,11 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
put_ol_stateid_locked(stp, &reaplist);
}
spin_unlock(&clp->cl_lock);
+
free_ol_stateid_reaplist(&reaplist);
remove_blocked_locks(lo);
nfs4_put_stateowner(&lo->lo_owner);
-
- return status;
+ return nfs_ok;
}
static inline struct nfs4_client_reclaim *
@@ -7602,22 +7828,12 @@ nfs4_state_start(void)
{
int ret;
- laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4");
- if (laundry_wq == NULL) {
- ret = -ENOMEM;
- goto out;
- }
ret = nfsd4_create_callback_queue();
if (ret)
- goto out_free_laundry;
+ return ret;
set_max_delegations();
return 0;
-
-out_free_laundry:
- destroy_workqueue(laundry_wq);
-out:
- return ret;
}
void
@@ -7654,7 +7870,6 @@ nfs4_state_shutdown_net(struct net *net)
void
nfs4_state_shutdown(void)
{
- destroy_workqueue(laundry_wq);
nfsd4_destroy_callback_queue();
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index da92e7d2ab6a..61b2aae81abb 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2411,7 +2411,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE;
if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
- clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
+ __clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
return true;
}
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 0b3f12aa37ff..7da88bdc0d6c 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -206,7 +206,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
struct svc_cacherep *rp;
unsigned int i;
- nfsd_reply_cache_stats_destroy(nn);
unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
for (i = 0; i < nn->drc_hashsize; i++) {
@@ -217,6 +216,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
rp, nn);
}
}
+ nfsd_reply_cache_stats_destroy(nn);
kvfree(nn->drc_hashtbl);
nn->drc_hashtbl = NULL;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 16920e4512bd..0621c2faf242 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1535,20 +1535,25 @@ static int __init init_nfsd(void)
retval = create_proc_exports_entry();
if (retval)
goto out_free_lockd;
- retval = register_filesystem(&nfsd_fs_type);
- if (retval)
- goto out_free_exports;
retval = register_pernet_subsys(&nfsd_net_ops);
if (retval < 0)
- goto out_free_filesystem;
+ goto out_free_exports;
retval = register_cld_notifier();
if (retval)
+ goto out_free_subsys;
+ retval = nfsd4_create_laundry_wq();
+ if (retval)
+ goto out_free_cld;
+ retval = register_filesystem(&nfsd_fs_type);
+ if (retval)
goto out_free_all;
return 0;
out_free_all:
+ nfsd4_destroy_laundry_wq();
+out_free_cld:
+ unregister_cld_notifier();
+out_free_subsys:
unregister_pernet_subsys(&nfsd_net_ops);
-out_free_filesystem:
- unregister_filesystem(&nfsd_fs_type);
out_free_exports:
remove_proc_entry("fs/nfs/exports", NULL);
remove_proc_entry("fs/nfs", NULL);
@@ -1566,6 +1571,8 @@ out_free_slabs:
static void __exit exit_nfsd(void)
{
+ unregister_filesystem(&nfsd_fs_type);
+ nfsd4_destroy_laundry_wq();
unregister_cld_notifier();
unregister_pernet_subsys(&nfsd_net_ops);
nfsd_drc_slab_free();
@@ -1575,7 +1582,6 @@ static void __exit exit_nfsd(void)
nfsd_lockd_shutdown();
nfsd4_free_slabs();
nfsd4_exit_pnfs();
- unregister_filesystem(&nfsd_fs_type);
}
MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 4fc1fd639527..847b482155ae 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -162,6 +162,8 @@ void nfs4_state_shutdown_net(struct net *net);
int nfs4_reset_recoverydir(char *recdir);
char * nfs4_recoverydir(void);
bool nfsd4_spo_must_allow(struct svc_rqst *rqstp);
+int nfsd4_create_laundry_wq(void);
+void nfsd4_destroy_laundry_wq(void);
#else
static inline int nfsd4_init_slabs(void) { return 0; }
static inline void nfsd4_free_slabs(void) { }
@@ -175,6 +177,8 @@ static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
{
return false;
}
+static inline int nfsd4_create_laundry_wq(void) { return 0; };
+static inline void nfsd4_destroy_laundry_wq(void) {};
#endif
/*
@@ -336,6 +340,7 @@ void nfsd_lockd_shutdown(void);
#define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */
#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */
+#define NFSD_COURTESY_CLIENT_TIMEOUT (24 * 60 * 60) /* seconds */
/*
* The following attributes are currently not supported by the NFSv4 server:
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 95457cfd37fc..f3d6313914ed 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -149,6 +149,7 @@ struct nfs4_delegation {
/* For recall: */
int dl_retries;
struct nfsd4_callback dl_recall;
+ bool dl_recalled;
};
#define cb_to_delegation(cb) \
@@ -283,6 +284,28 @@ struct nfsd4_sessionid {
#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */
/*
+ * State Meaning Where set
+ * --------------------------------------------------------------------------
+ * | NFSD4_ACTIVE | Confirmed, active | Default |
+ * |------------------- ----------------------------------------------------|
+ * | NFSD4_COURTESY | Courtesy state. | nfs4_get_client_reaplist |
+ * | | Lease/lock/share | |
+ * | | reservation conflict | |
+ * | | can cause Courtesy | |
+ * | | client to be expired | |
+ * |------------------------------------------------------------------------|
+ * | NFSD4_EXPIRABLE | Courtesy client to be| nfs4_laundromat |
+ * | | expired by Laundromat| try_to_expire_client |
+ * | | due to conflict | |
+ * |------------------------------------------------------------------------|
+ */
+enum {
+ NFSD4_ACTIVE = 0,
+ NFSD4_COURTESY,
+ NFSD4_EXPIRABLE,
+};
+
+/*
* struct nfs4_client - one per client. Clientids live here.
*
* The initial object created by an NFS client using SETCLIENTID (for NFSv4.0)
@@ -385,6 +408,9 @@ struct nfs4_client {
struct list_head async_copies; /* list of async copies */
spinlock_t async_lock; /* lock for async copies */
atomic_t cl_cb_inflight; /* Outstanding callbacks */
+
+ unsigned int cl_state;
+ atomic_t cl_delegs_in_recall;
};
/* struct nfs4_client_reset
@@ -702,4 +728,9 @@ extern void nfsd4_client_record_remove(struct nfs4_client *clp);
extern int nfsd4_client_record_check(struct nfs4_client *clp);
extern void nfsd4_record_grace_done(struct nfsd_net *nn);
+static inline bool try_to_expire_client(struct nfs4_client *clp)
+{
+ cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE);
+ return clp->cl_state == NFSD4_EXPIRABLE;
+}
#endif /* NFSD4_STATE_H */
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 242fa123e0e9..a60ead3b227a 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -692,12 +692,6 @@ DEFINE_CLID_EVENT(confirmed_r);
/*
* from fs/nfsd/filecache.h
*/
-TRACE_DEFINE_ENUM(NFSD_FILE_HASHED);
-TRACE_DEFINE_ENUM(NFSD_FILE_PENDING);
-TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_READ);
-TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_WRITE);
-TRACE_DEFINE_ENUM(NFSD_FILE_REFERENCED);
-
#define show_nf_flags(val) \
__print_flags(val, "|", \
{ 1 << NFSD_FILE_HASHED, "HASHED" }, \
@@ -784,6 +778,34 @@ TRACE_EVENT(nfsd_file_acquire,
__entry->nf_file, __entry->status)
);
+TRACE_EVENT(nfsd_file_open,
+ TP_PROTO(struct nfsd_file *nf, __be32 status),
+ TP_ARGS(nf, status),
+ TP_STRUCT__entry(
+ __field(unsigned int, nf_hashval)
+ __field(void *, nf_inode) /* cannot be dereferenced */
+ __field(int, nf_ref)
+ __field(unsigned long, nf_flags)
+ __field(unsigned long, nf_may)
+ __field(void *, nf_file) /* cannot be dereferenced */
+ ),
+ TP_fast_assign(
+ __entry->nf_hashval = nf->nf_hashval;
+ __entry->nf_inode = nf->nf_inode;
+ __entry->nf_ref = refcount_read(&nf->nf_ref);
+ __entry->nf_flags = nf->nf_flags;
+ __entry->nf_may = nf->nf_may;
+ __entry->nf_file = nf->nf_file;
+ ),
+ TP_printk("hash=0x%x inode=%p ref=%d flags=%s may=%s file=%p",
+ __entry->nf_hashval,
+ __entry->nf_inode,
+ __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+ show_nfsd_may_flags(__entry->nf_may),
+ __entry->nf_file)
+)
+
DECLARE_EVENT_CLASS(nfsd_file_search_class,
TP_PROTO(struct inode *inode, unsigned int hash, int found),
TP_ARGS(inode, hash, found),
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c22ad0532e8e..840e3af63a6f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -827,14 +827,23 @@ retry:
return err;
}
+/**
+ * nfsd_open_verified - Open a regular file for the filecache
+ * @rqstp: RPC request
+ * @fhp: NFS filehandle of the file to open
+ * @may_flags: internal permission flags
+ * @filp: OUT: open "struct file *"
+ *
+ * Returns an nfsstat value in network byte order.
+ */
__be32
-nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
- int may_flags, struct file **filp)
+nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
+ struct file **filp)
{
__be32 err;
validate_process_creds();
- err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
+ err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp);
validate_process_creds();
return err;
}
@@ -849,17 +858,11 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct splice_desc *sd)
{
struct svc_rqst *rqstp = sd->u.data;
- struct page **pp = rqstp->rq_next_page;
- struct page *page = buf->page;
- if (rqstp->rq_res.page_len == 0) {
- svc_rqst_replace_page(rqstp, page);
+ svc_rqst_replace_page(rqstp, buf->page);
+ if (rqstp->rq_res.page_len == 0)
rqstp->rq_res.page_base = buf->offset;
- } else if (page != pp[-1]) {
- svc_rqst_replace_page(rqstp, page);
- }
rqstp->rq_res.page_len += sd->len;
-
return sd->len;
}
@@ -1187,14 +1190,26 @@ out:
return err;
}
-static __be32
-nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
- struct iattr *iap)
+/**
+ * nfsd_create_setattr - Set a created file's attributes
+ * @rqstp: RPC transaction being executed
+ * @fhp: NFS filehandle of parent directory
+ * @resfhp: NFS filehandle of new object
+ * @iap: requested attributes of new object
+ *
+ * Returns nfs_ok on success, or an nfsstat in network byte order.
+ */
+__be32
+nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct svc_fh *resfhp, struct iattr *iap)
{
+ __be32 status;
+
/*
- * Mode has already been set earlier in create:
+ * Mode has already been set by file creation.
*/
iap->ia_valid &= ~ATTR_MODE;
+
/*
* Setting uid/gid works only for root. Irix appears to
* send along the gid on create when it tries to implement
@@ -1202,10 +1217,31 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
*/
if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID))
iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
+
+ /*
+ * Callers expect new file metadata to be committed even
+ * if the attributes have not changed.
+ */
if (iap->ia_valid)
- return nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0);
- /* Callers expect file metadata to be committed here */
- return nfserrno(commit_metadata(resfhp));
+ status = nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0);
+ else
+ status = nfserrno(commit_metadata(resfhp));
+
+ /*
+ * Transactional filesystems had a chance to commit changes
+ * for both parent and child simultaneously making the
+ * following commit_metadata a noop in many cases.
+ */
+ if (!status)
+ status = nfserrno(commit_metadata(fhp));
+
+ /*
+ * Update the new filehandle to pick up the new attributes.
+ */
+ if (!status)
+ status = fh_update(resfhp);
+
+ return status;
}
/* HPUX client sometimes creates a file in mode 000, and sets size to 0.
@@ -1232,7 +1268,6 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct dentry *dentry, *dchild;
struct inode *dirp;
__be32 err;
- __be32 err2;
int host_err;
dentry = fhp->fh_dentry;
@@ -1305,22 +1340,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (host_err < 0)
goto out_nfserr;
- err = nfsd_create_setattr(rqstp, resfhp, iap);
+ err = nfsd_create_setattr(rqstp, fhp, resfhp, iap);
- /*
- * nfsd_create_setattr already committed the child. Transactional
- * filesystems had a chance to commit changes for both parent and
- * child simultaneously making the following commit_metadata a
- * noop.
- */
- err2 = nfserrno(commit_metadata(fhp));
- if (err2)
- err = err2;
- /*
- * Update the file handle to get the new inode info.
- */
- if (!err)
- err = fh_update(resfhp);
out:
dput(dchild);
return err;
@@ -1376,172 +1397,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
/*
- * NFSv3 and NFSv4 version of nfsd_create
- */
-__be32
-do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
- char *fname, int flen, struct iattr *iap,
- struct svc_fh *resfhp, int createmode, u32 *verifier,
- bool *truncp, bool *created)
-{
- struct dentry *dentry, *dchild = NULL;
- struct inode *dirp;
- __be32 err;
- int host_err;
- __u32 v_mtime=0, v_atime=0;
-
- err = nfserr_perm;
- if (!flen)
- goto out;
- err = nfserr_exist;
- if (isdotent(fname, flen))
- goto out;
- if (!(iap->ia_valid & ATTR_MODE))
- iap->ia_mode = 0;
- err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
- if (err)
- goto out;
-
- dentry = fhp->fh_dentry;
- dirp = d_inode(dentry);
-
- host_err = fh_want_write(fhp);
- if (host_err)
- goto out_nfserr;
-
- fh_lock_nested(fhp, I_MUTEX_PARENT);
-
- /*
- * Compose the response file handle.
- */
- dchild = lookup_one_len(fname, dentry, flen);
- host_err = PTR_ERR(dchild);
- if (IS_ERR(dchild))
- goto out_nfserr;
-
- /* If file doesn't exist, check for permissions to create one */
- if (d_really_is_negative(dchild)) {
- err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
- if (err)
- goto out;
- }
-
- err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
- if (err)
- goto out;
-
- if (nfsd_create_is_exclusive(createmode)) {
- /* solaris7 gets confused (bugid 4218508) if these have
- * the high bit set, as do xfs filesystems without the
- * "bigtime" feature. So just clear the high bits. If this is
- * ever changed to use different attrs for storing the
- * verifier, then do_open_lookup() will also need to be fixed
- * accordingly.
- */
- v_mtime = verifier[0]&0x7fffffff;
- v_atime = verifier[1]&0x7fffffff;
- }
-
- if (d_really_is_positive(dchild)) {
- err = 0;
-
- switch (createmode) {
- case NFS3_CREATE_UNCHECKED:
- if (! d_is_reg(dchild))
- goto out;
- else if (truncp) {
- /* in nfsv4, we need to treat this case a little
- * differently. we don't want to truncate the
- * file now; this would be wrong if the OPEN
- * fails for some other reason. furthermore,
- * if the size is nonzero, we should ignore it
- * according to spec!
- */
- *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size;
- }
- else {
- iap->ia_valid &= ATTR_SIZE;
- goto set_attr;
- }
- break;
- case NFS3_CREATE_EXCLUSIVE:
- if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime
- && d_inode(dchild)->i_atime.tv_sec == v_atime
- && d_inode(dchild)->i_size == 0 ) {
- if (created)
- *created = true;
- break;
- }
- fallthrough;
- case NFS4_CREATE_EXCLUSIVE4_1:
- if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime
- && d_inode(dchild)->i_atime.tv_sec == v_atime
- && d_inode(dchild)->i_size == 0 ) {
- if (created)
- *created = true;
- goto set_attr;
- }
- fallthrough;
- case NFS3_CREATE_GUARDED:
- err = nfserr_exist;
- }
- fh_drop_write(fhp);
- goto out;
- }
-
- if (!IS_POSIXACL(dirp))
- iap->ia_mode &= ~current_umask();
-
- host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true);
- if (host_err < 0) {
- fh_drop_write(fhp);
- goto out_nfserr;
- }
- if (created)
- *created = true;
-
- nfsd_check_ignore_resizing(iap);
-
- if (nfsd_create_is_exclusive(createmode)) {
- /* Cram the verifier into atime/mtime */
- iap->ia_valid = ATTR_MTIME|ATTR_ATIME
- | ATTR_MTIME_SET|ATTR_ATIME_SET;
- /* XXX someone who knows this better please fix it for nsec */
- iap->ia_mtime.tv_sec = v_mtime;
- iap->ia_atime.tv_sec = v_atime;
- iap->ia_mtime.tv_nsec = 0;
- iap->ia_atime.tv_nsec = 0;
- }
-
- set_attr:
- err = nfsd_create_setattr(rqstp, resfhp, iap);
-
- /*
- * nfsd_create_setattr already committed the child
- * (and possibly also the parent).
- */
- if (!err)
- err = nfserrno(commit_metadata(fhp));
-
- /*
- * Update the filehandle to get the new inode info.
- */
- if (!err)
- err = fh_update(resfhp);
-
- out:
- fh_unlock(fhp);
- if (dchild && !IS_ERR(dchild))
- dput(dchild);
- fh_drop_write(fhp);
- return err;
-
- out_nfserr:
- err = nfserrno(host_err);
- goto out;
-}
-
-/*
* Read a symlink. On entry, *lenp must contain the maximum path length that
* fits into the buffer. On return, it contains the true length.
* N.B. After this call fhp needs an fh_put
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index ccb87b2864f6..26347d76f44a 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -69,10 +69,8 @@ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
int type, dev_t rdev, struct svc_fh *res);
__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
-__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
- char *name, int len, struct iattr *attrs,
- struct svc_fh *res, int createmode,
- u32 *verifier, bool *truncp, bool *created);
+__be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct svc_fh *resfhp, struct iattr *iap);
__be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp,
u64 offset, u32 count, __be32 *verf);
#ifdef CONFIG_NFSD_V4
@@ -88,7 +86,7 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
int nfsd_open_break_lease(struct inode *, int);
__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
int, struct file **);
-__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t,
+__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *,
int, struct file **);
__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
@@ -159,10 +157,4 @@ static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat)
AT_STATX_SYNC_AS_STAT));
}
-static inline int nfsd_create_is_exclusive(int createmode)
-{
- return createmode == NFS3_CREATE_EXCLUSIVE
- || createmode == NFS4_CREATE_EXCLUSIVE4_1;
-}
-
#endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 846ab6df9d48..7b744011f2d3 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -273,6 +273,7 @@ struct nfsd4_open {
bool op_truncate; /* used during processing */
bool op_created; /* used during processing */
struct nfs4_openowner *op_openowner; /* used during processing */
+ struct file *op_filp; /* used during processing */
struct nfs4_file *op_file; /* used during processing */
struct nfs4_ol_stateid *op_stp; /* used during processing */
struct nfs4_clnt_odstate *op_odstate; /* used during processing */
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 829dd4a61b66..190aa717fa32 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -168,7 +168,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
return;
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
- mutex_lock(&dnotify_group->mark_mutex);
+ fsnotify_group_lock(dnotify_group);
spin_lock(&fsn_mark->lock);
prev = &dn_mark->dn;
@@ -191,7 +191,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
free = true;
}
- mutex_unlock(&dnotify_group->mark_mutex);
+ fsnotify_group_unlock(dnotify_group);
if (free)
fsnotify_free_mark(fsn_mark);
@@ -324,7 +324,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
new_dn_mark->dn = NULL;
/* this is needed to prevent the fcntl/close race described below */
- mutex_lock(&dnotify_group->mark_mutex);
+ fsnotify_group_lock(dnotify_group);
/* add the new_fsn_mark or find an old one. */
fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
@@ -334,7 +334,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
} else {
error = fsnotify_add_inode_mark_locked(new_fsn_mark, inode, 0);
if (error) {
- mutex_unlock(&dnotify_group->mark_mutex);
+ fsnotify_group_unlock(dnotify_group);
goto out_err;
}
spin_lock(&new_fsn_mark->lock);
@@ -383,7 +383,7 @@ out:
if (destroy)
fsnotify_detach_mark(fsn_mark);
- mutex_unlock(&dnotify_group->mark_mutex);
+ fsnotify_group_unlock(dnotify_group);
if (destroy)
fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
@@ -401,7 +401,8 @@ static int __init dnotify_init(void)
SLAB_PANIC|SLAB_ACCOUNT);
dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT);
- dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
+ dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops,
+ FSNOTIFY_GROUP_NOFS);
if (IS_ERR(dnotify_group))
panic("unable to allocate fsnotify group for dnotify\n");
dnotify_sysctl_init();
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 985e995d2a39..4f897e109547 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -319,12 +319,8 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
return 0;
}
- fsnotify_foreach_iter_type(type) {
- if (!fsnotify_iter_should_report_type(iter_info, type))
- continue;
- mark = iter_info->marks[type];
-
- /* Apply ignore mask regardless of ISDIR and ON_CHILD flags */
+ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
+ /* Apply ignore mask regardless of mark's ISDIR flag */
marks_ignored_mask |= mark->ignored_mask;
/*
@@ -334,14 +330,6 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR))
continue;
- /*
- * If the event is on a child and this mark is on a parent not
- * watching children, don't send it!
- */
- if (type == FSNOTIFY_ITER_TYPE_PARENT &&
- !(mark->mask & FS_EVENT_ON_CHILD))
- continue;
-
marks_mask |= mark->mask;
/* Record the mark types of this group that matched the event */
@@ -849,16 +837,14 @@ out:
*/
static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
{
+ struct fsnotify_mark *mark;
int type;
__kernel_fsid_t fsid = {};
- fsnotify_foreach_iter_type(type) {
+ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
struct fsnotify_mark_connector *conn;
- if (!fsnotify_iter_should_report_type(iter_info, type))
- continue;
-
- conn = READ_ONCE(iter_info->marks[type]->connector);
+ conn = READ_ONCE(mark->connector);
/* Mark is just getting destroyed or created? */
if (!conn)
continue;
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index a3d5b751cac5..80e0ec95b113 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -490,3 +490,15 @@ static inline unsigned int fanotify_event_hash_bucket(
{
return event->hash & FANOTIFY_HTABLE_MASK;
}
+
+static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
+{
+ unsigned int mflags = 0;
+
+ if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
+ mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
+ if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)
+ mflags |= FAN_MARK_EVICTABLE;
+
+ return mflags;
+}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index a792e21c5309..c2255b440df9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -264,7 +264,7 @@ static int create_fd(struct fsnotify_group *group, struct path *path,
* originally opened O_WRONLY.
*/
new_file = dentry_open(path,
- group->fanotify_data.f_flags | FMODE_NONOTIFY,
+ group->fanotify_data.f_flags | __FMODE_NONOTIFY,
current_cred());
if (IS_ERR(new_file)) {
/*
@@ -1035,10 +1035,10 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
__u32 removed;
int destroy_mark;
- mutex_lock(&group->mark_mutex);
+ fsnotify_group_lock(group);
fsn_mark = fsnotify_find_mark(connp, group);
if (!fsn_mark) {
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
return -ENOENT;
}
@@ -1048,7 +1048,7 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
fsnotify_recalc_mask(fsn_mark->connector);
if (destroy_mark)
fsnotify_detach_mark(fsn_mark);
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
if (destroy_mark)
fsnotify_free_mark(fsn_mark);
@@ -1081,47 +1081,63 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
flags, umask);
}
-static void fanotify_mark_add_ignored_mask(struct fsnotify_mark *fsn_mark,
- __u32 mask, unsigned int flags,
- __u32 *removed)
+static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
+ unsigned int fan_flags)
{
- fsn_mark->ignored_mask |= mask;
+ bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE);
+ bool recalc = false;
/*
* Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
* the removal of the FS_MODIFY bit in calculated mask if it was set
* because of an ignored mask that is now going to survive FS_MODIFY.
*/
- if ((flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+ if ((fan_flags & FAN_MARK_IGNORED_MASK) &&
+ (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
!(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
if (!(fsn_mark->mask & FS_MODIFY))
- *removed = FS_MODIFY;
+ recalc = true;
}
+
+ if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE ||
+ want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
+ return recalc;
+
+ /*
+ * NO_IREF may be removed from a mark, but not added.
+ * When removed, fsnotify_recalc_mask() will take the inode ref.
+ */
+ WARN_ON_ONCE(!want_iref);
+ fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF;
+
+ return true;
}
-static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
- __u32 mask, unsigned int flags,
- __u32 *removed)
+static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
+ __u32 mask, unsigned int fan_flags)
{
- __u32 oldmask, newmask;
+ bool recalc;
spin_lock(&fsn_mark->lock);
- oldmask = fsnotify_calc_mask(fsn_mark);
- if (!(flags & FAN_MARK_IGNORED_MASK)) {
+ if (!(fan_flags & FAN_MARK_IGNORED_MASK))
fsn_mark->mask |= mask;
- } else {
- fanotify_mark_add_ignored_mask(fsn_mark, mask, flags, removed);
- }
- newmask = fsnotify_calc_mask(fsn_mark);
+ else
+ fsn_mark->ignored_mask |= mask;
+
+ recalc = fsnotify_calc_mask(fsn_mark) &
+ ~fsnotify_conn_mask(fsn_mark->connector);
+
+ recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags);
spin_unlock(&fsn_mark->lock);
- return newmask & ~oldmask;
+ return recalc;
}
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp,
unsigned int obj_type,
+ unsigned int fan_flags,
__kernel_fsid_t *fsid)
{
struct ucounts *ucounts = group->fanotify_data.ucounts;
@@ -1144,6 +1160,9 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
}
fsnotify_init_mark(mark, group);
+ if (fan_flags & FAN_MARK_EVICTABLE)
+ mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
+
ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid);
if (ret) {
fsnotify_put_mark(mark);
@@ -1170,39 +1189,49 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group)
static int fanotify_add_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp, unsigned int obj_type,
- __u32 mask, unsigned int flags,
+ __u32 mask, unsigned int fan_flags,
__kernel_fsid_t *fsid)
{
struct fsnotify_mark *fsn_mark;
- __u32 added, removed = 0;
+ bool recalc;
int ret = 0;
- mutex_lock(&group->mark_mutex);
+ fsnotify_group_lock(group);
fsn_mark = fsnotify_find_mark(connp, group);
if (!fsn_mark) {
- fsn_mark = fanotify_add_new_mark(group, connp, obj_type, fsid);
+ fsn_mark = fanotify_add_new_mark(group, connp, obj_type,
+ fan_flags, fsid);
if (IS_ERR(fsn_mark)) {
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
return PTR_ERR(fsn_mark);
}
}
/*
+ * Non evictable mark cannot be downgraded to evictable mark.
+ */
+ if (fan_flags & FAN_MARK_EVICTABLE &&
+ !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ /*
* Error events are pre-allocated per group, only if strictly
* needed (i.e. FAN_FS_ERROR was requested).
*/
- if (!(flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) {
+ if (!(fan_flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) {
ret = fanotify_group_init_error_pool(group);
if (ret)
goto out;
}
- added = fanotify_mark_add_to_mask(fsn_mark, mask, flags, &removed);
- if (removed || (added & ~fsnotify_conn_mask(fsn_mark->connector)))
+ recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags);
+ if (recalc)
fsnotify_recalc_mask(fsn_mark->connector);
out:
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
fsnotify_put_mark(fsn_mark);
return ret;
@@ -1348,14 +1377,15 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
(!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
return -EINVAL;
- f_flags = O_RDWR | FMODE_NONOTIFY;
+ f_flags = O_RDWR | __FMODE_NONOTIFY;
if (flags & FAN_CLOEXEC)
f_flags |= O_CLOEXEC;
if (flags & FAN_NONBLOCK)
f_flags |= O_NONBLOCK;
/* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
- group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
+ group = fsnotify_alloc_group(&fanotify_fsnotify_ops,
+ FSNOTIFY_GROUP_USER | FSNOTIFY_GROUP_NOFS);
if (IS_ERR(group)) {
return PTR_ERR(group);
}
@@ -1598,6 +1628,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
goto fput_and_out;
/*
+ * Evictable is only relevant for inode marks, because only inode object
+ * can be evicted on memory pressure.
+ */
+ if (flags & FAN_MARK_EVICTABLE &&
+ mark_type != FAN_MARK_INODE)
+ goto fput_and_out;
+
+ /*
* Events that do not carry enough information to report
* event->fd require a group that supports reporting fid. Those
* events are not supported on a mount mark, because they do not
@@ -1762,7 +1800,7 @@ static int __init fanotify_user_setup(void)
BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 10);
fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
SLAB_PANIC|SLAB_ACCOUNT);
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 57f0d5d9f934..59fb40abe33d 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -14,6 +14,7 @@
#include <linux/exportfs.h>
#include "inotify/inotify.h"
+#include "fanotify/fanotify.h"
#include "fdinfo.h"
#include "fsnotify.h"
@@ -28,13 +29,13 @@ static void show_fdinfo(struct seq_file *m, struct file *f,
struct fsnotify_group *group = f->private_data;
struct fsnotify_mark *mark;
- mutex_lock(&group->mark_mutex);
+ fsnotify_group_lock(group);
list_for_each_entry(mark, &group->marks_list, g_list) {
show(m, mark);
if (seq_has_overflowed(m))
break;
}
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
}
#if defined(CONFIG_EXPORTFS)
@@ -83,16 +84,9 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
inode = igrab(fsnotify_conn_inode(mark->connector));
if (inode) {
- /*
- * IN_ALL_EVENTS represents all of the mask bits
- * that we expose to userspace. There is at
- * least one bit (FS_EVENT_ON_CHILD) which is
- * used only internally to the kernel.
- */
- u32 mask = mark->mask & IN_ALL_EVENTS;
- seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
+ seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ",
inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
- mask, mark->ignored_mask);
+ inotify_mark_user_mask(mark));
show_mark_fhandle(m, inode);
seq_putc(m, '\n');
iput(inode);
@@ -110,12 +104,9 @@ void inotify_show_fdinfo(struct seq_file *m, struct file *f)
static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
{
- unsigned int mflags = 0;
+ unsigned int mflags = fanotify_mark_user_flags(mark);
struct inode *inode;
- if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
- mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
-
if (mark->connector->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = igrab(fsnotify_conn_inode(mark->connector));
if (!inode)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 70a8516b78bc..0b3e74935cb4 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -253,7 +253,7 @@ static int fsnotify_handle_inode_event(struct fsnotify_group *group,
if (WARN_ON_ONCE(!inode && !dir))
return 0;
- if ((inode_mark->mask & FS_EXCL_UNLINK) &&
+ if ((inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) &&
path && d_unlinked(path->dentry))
return 0;
@@ -290,22 +290,15 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask,
}
if (parent_mark) {
- /*
- * parent_mark indicates that the parent inode is watching
- * children and interested in this event, which is an event
- * possible on child. But is *this mark* watching children and
- * interested in this event?
- */
- if (parent_mark->mask & FS_EVENT_ON_CHILD) {
- ret = fsnotify_handle_inode_event(group, parent_mark, mask,
- data, data_type, dir, name, 0);
- if (ret)
- return ret;
- }
- if (!inode_mark)
- return 0;
+ ret = fsnotify_handle_inode_event(group, parent_mark, mask,
+ data, data_type, dir, name, 0);
+ if (ret)
+ return ret;
}
+ if (!inode_mark)
+ return 0;
+
if (mask & FS_EVENT_ON_CHILD) {
/*
* Some events can be sent on both parent dir and child marks
@@ -335,31 +328,23 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
struct fsnotify_mark *mark;
int type;
- if (WARN_ON(!iter_info->report_mask))
+ if (!iter_info->report_mask)
return 0;
/* clear ignored on inode modification */
if (mask & FS_MODIFY) {
- fsnotify_foreach_iter_type(type) {
- if (!fsnotify_iter_should_report_type(iter_info, type))
- continue;
- mark = iter_info->marks[type];
- if (mark &&
- !(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
+ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
+ if (!(mark->flags &
+ FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
mark->ignored_mask = 0;
}
}
- fsnotify_foreach_iter_type(type) {
- if (!fsnotify_iter_should_report_type(iter_info, type))
- continue;
- mark = iter_info->marks[type];
- /* does the object mark tell us to do something? */
- if (mark) {
- group = mark->group;
- marks_mask |= mark->mask;
- marks_ignored_mask |= mark->ignored_mask;
- }
+ /* Are any of the group marks interested in this event? */
+ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
+ group = mark->group;
+ marks_mask |= mark->mask;
+ marks_ignored_mask |= mark->ignored_mask;
}
pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
@@ -403,11 +388,11 @@ static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark)
/*
* iter_info is a multi head priority queue of marks.
- * Pick a subset of marks from queue heads, all with the
- * same group and set the report_mask for selected subset.
- * Returns the report_mask of the selected subset.
+ * Pick a subset of marks from queue heads, all with the same group
+ * and set the report_mask to a subset of the selected marks.
+ * Returns false if there are no more groups to iterate.
*/
-static unsigned int fsnotify_iter_select_report_types(
+static bool fsnotify_iter_select_report_types(
struct fsnotify_iter_info *iter_info)
{
struct fsnotify_group *max_prio_group = NULL;
@@ -423,30 +408,48 @@ static unsigned int fsnotify_iter_select_report_types(
}
if (!max_prio_group)
- return 0;
+ return false;
/* Set the report mask for marks from same group as max prio group */
+ iter_info->current_group = max_prio_group;
iter_info->report_mask = 0;
fsnotify_foreach_iter_type(type) {
mark = iter_info->marks[type];
- if (mark &&
- fsnotify_compare_groups(max_prio_group, mark->group) == 0)
+ if (mark && mark->group == iter_info->current_group) {
+ /*
+ * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode
+ * is watching children and interested in this event,
+ * which is an event possible on child.
+ * But is *this mark* watching children?
+ */
+ if (type == FSNOTIFY_ITER_TYPE_PARENT &&
+ !(mark->mask & FS_EVENT_ON_CHILD))
+ continue;
+
fsnotify_iter_set_report_type(iter_info, type);
+ }
}
- return iter_info->report_mask;
+ return true;
}
/*
- * Pop from iter_info multi head queue, the marks that were iterated in the
+ * Pop from iter_info multi head queue, the marks that belong to the group of
* current iteration step.
*/
static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
{
+ struct fsnotify_mark *mark;
int type;
+ /*
+ * We cannot use fsnotify_foreach_iter_mark_type() here because we
+ * may need to advance a mark of type X that belongs to current_group
+ * but was not selected for reporting.
+ */
fsnotify_foreach_iter_type(type) {
- if (fsnotify_iter_should_report_type(iter_info, type))
+ mark = iter_info->marks[type];
+ if (mark && mark->group == iter_info->current_group)
iter_info->marks[type] =
fsnotify_next_mark(iter_info->marks[type]);
}
@@ -581,7 +584,7 @@ static __init int fsnotify_init(void)
{
int ret;
- BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23);
ret = init_srcu_struct(&fsnotify_mark_srcu);
if (ret)
diff --git a/fs/notify/group.c b/fs/notify/group.c
index b7d4d64f87c2..1de6631a3925 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -112,8 +112,10 @@ void fsnotify_put_group(struct fsnotify_group *group)
EXPORT_SYMBOL_GPL(fsnotify_put_group);
static struct fsnotify_group *__fsnotify_alloc_group(
- const struct fsnotify_ops *ops, gfp_t gfp)
+ const struct fsnotify_ops *ops,
+ int flags, gfp_t gfp)
{
+ static struct lock_class_key nofs_marks_lock;
struct fsnotify_group *group;
group = kzalloc(sizeof(struct fsnotify_group), gfp);
@@ -133,6 +135,17 @@ static struct fsnotify_group *__fsnotify_alloc_group(
INIT_LIST_HEAD(&group->marks_list);
group->ops = ops;
+ group->flags = flags;
+ /*
+ * For most backends, eviction of inode with a mark is not expected,
+ * because marks hold a refcount on the inode against eviction.
+ *
+ * Use a different lockdep class for groups that support evictable
+ * inode marks, because with evictable marks, mark_mutex is NOT
+ * fs-reclaim safe - the mutex is taken when evicting inodes.
+ */
+ if (flags & FSNOTIFY_GROUP_NOFS)
+ lockdep_set_class(&group->mark_mutex, &nofs_marks_lock);
return group;
}
@@ -140,20 +153,15 @@ static struct fsnotify_group *__fsnotify_alloc_group(
/*
* Create a new fsnotify_group and hold a reference for the group returned.
*/
-struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
+struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops,
+ int flags)
{
- return __fsnotify_alloc_group(ops, GFP_KERNEL);
-}
-EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
+ gfp_t gfp = (flags & FSNOTIFY_GROUP_USER) ? GFP_KERNEL_ACCOUNT :
+ GFP_KERNEL;
-/*
- * Create a new fsnotify_group and hold a reference for the group returned.
- */
-struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops)
-{
- return __fsnotify_alloc_group(ops, GFP_KERNEL_ACCOUNT);
+ return __fsnotify_alloc_group(ops, flags, gfp);
}
-EXPORT_SYMBOL_GPL(fsnotify_alloc_user_group);
+EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
int fsnotify_fasync(int fd, struct file *file, int on)
{
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 2007e3711916..7d5df7a21539 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -22,6 +22,25 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
return container_of(fse, struct inotify_event_info, fse);
}
+/*
+ * INOTIFY_USER_FLAGS represents all of the mask bits that we expose to
+ * userspace. There is at least one bit (FS_EVENT_ON_CHILD) which is
+ * used only internally to the kernel.
+ */
+#define INOTIFY_USER_MASK (IN_ALL_EVENTS)
+
+static inline __u32 inotify_mark_user_mask(struct fsnotify_mark *fsn_mark)
+{
+ __u32 mask = fsn_mark->mask & INOTIFY_USER_MASK;
+
+ if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK)
+ mask |= IN_EXCL_UNLINK;
+ if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT)
+ mask |= IN_ONESHOT;
+
+ return mask;
+}
+
extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
struct fsnotify_group *group);
extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark,
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index d92d7b0adc9a..49cfe2ae6d23 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -122,7 +122,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
fsnotify_destroy_event(group, fsn_event);
}
- if (inode_mark->mask & IN_ONESHOT)
+ if (inode_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT)
fsnotify_destroy_mark(inode_mark, group);
return 0;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 54583f62dc44..ed42a189faa2 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -110,11 +110,26 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg)
mask |= FS_EVENT_ON_CHILD;
/* mask off the flags used to open the fd */
- mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK));
+ mask |= (arg & INOTIFY_USER_MASK);
return mask;
}
+#define INOTIFY_MARK_FLAGS \
+ (FSNOTIFY_MARK_FLAG_EXCL_UNLINK | FSNOTIFY_MARK_FLAG_IN_ONESHOT)
+
+static inline unsigned int inotify_arg_to_flags(u32 arg)
+{
+ unsigned int flags = 0;
+
+ if (arg & IN_EXCL_UNLINK)
+ flags |= FSNOTIFY_MARK_FLAG_EXCL_UNLINK;
+ if (arg & IN_ONESHOT)
+ flags |= FSNOTIFY_MARK_FLAG_IN_ONESHOT;
+
+ return flags;
+}
+
static inline u32 inotify_mask_to_arg(__u32 mask)
{
return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED |
@@ -526,13 +541,10 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
struct fsnotify_mark *fsn_mark;
struct inotify_inode_mark *i_mark;
__u32 old_mask, new_mask;
- __u32 mask;
- int add = (arg & IN_MASK_ADD);
+ int replace = !(arg & IN_MASK_ADD);
int create = (arg & IN_MASK_CREATE);
int ret;
- mask = inotify_arg_to_mask(inode, arg);
-
fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
if (!fsn_mark)
return -ENOENT;
@@ -545,10 +557,12 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
spin_lock(&fsn_mark->lock);
old_mask = fsn_mark->mask;
- if (add)
- fsn_mark->mask |= mask;
- else
- fsn_mark->mask = mask;
+ if (replace) {
+ fsn_mark->mask = 0;
+ fsn_mark->flags &= ~INOTIFY_MARK_FLAGS;
+ }
+ fsn_mark->mask |= inotify_arg_to_mask(inode, arg);
+ fsn_mark->flags |= inotify_arg_to_flags(arg);
new_mask = fsn_mark->mask;
spin_unlock(&fsn_mark->lock);
@@ -579,19 +593,17 @@ static int inotify_new_watch(struct fsnotify_group *group,
u32 arg)
{
struct inotify_inode_mark *tmp_i_mark;
- __u32 mask;
int ret;
struct idr *idr = &group->inotify_data.idr;
spinlock_t *idr_lock = &group->inotify_data.idr_lock;
- mask = inotify_arg_to_mask(inode, arg);
-
tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
if (unlikely(!tmp_i_mark))
return -ENOMEM;
fsnotify_init_mark(&tmp_i_mark->fsn_mark, group);
- tmp_i_mark->fsn_mark.mask = mask;
+ tmp_i_mark->fsn_mark.mask = inotify_arg_to_mask(inode, arg);
+ tmp_i_mark->fsn_mark.flags = inotify_arg_to_flags(arg);
tmp_i_mark->wd = -1;
ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
@@ -628,13 +640,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
{
int ret = 0;
- mutex_lock(&group->mark_mutex);
+ fsnotify_group_lock(group);
/* try to update and existing watch with the new arg */
ret = inotify_update_existing_watch(group, inode, arg);
/* no mark present, try to add a new one */
if (ret == -ENOENT)
ret = inotify_new_watch(group, inode, arg);
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
return ret;
}
@@ -644,7 +656,8 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
struct fsnotify_group *group;
struct inotify_event_info *oevent;
- group = fsnotify_alloc_user_group(&inotify_fsnotify_ops);
+ group = fsnotify_alloc_group(&inotify_fsnotify_ops,
+ FSNOTIFY_GROUP_USER);
if (IS_ERR(group))
return group;
@@ -845,9 +858,7 @@ static int __init inotify_user_setup(void)
BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
- BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
- BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
BUILD_BUG_ON(HWEIGHT32(ALL_INOTIFY_BITS) != 22);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 4853184f7dde..c74ef947447d 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -116,20 +116,64 @@ __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn)
return *fsnotify_conn_mask_p(conn);
}
-static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
+static void fsnotify_get_inode_ref(struct inode *inode)
+{
+ ihold(inode);
+ atomic_long_inc(&inode->i_sb->s_fsnotify_connectors);
+}
+
+/*
+ * Grab or drop inode reference for the connector if needed.
+ *
+ * When it's time to drop the reference, we only clear the HAS_IREF flag and
+ * return the inode object. fsnotify_drop_object() will be resonsible for doing
+ * iput() outside of spinlocks. This happens when last mark that wanted iref is
+ * detached.
+ */
+static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
+ bool want_iref)
+{
+ bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
+ struct inode *inode = NULL;
+
+ if (conn->type != FSNOTIFY_OBJ_TYPE_INODE ||
+ want_iref == has_iref)
+ return NULL;
+
+ if (want_iref) {
+ /* Pin inode if any mark wants inode refcount held */
+ fsnotify_get_inode_ref(fsnotify_conn_inode(conn));
+ conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF;
+ } else {
+ /* Unpin inode after detach of last mark that wanted iref */
+ inode = fsnotify_conn_inode(conn);
+ conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF;
+ }
+
+ return inode;
+}
+
+static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
u32 new_mask = 0;
+ bool want_iref = false;
struct fsnotify_mark *mark;
assert_spin_locked(&conn->lock);
/* We can get detached connector here when inode is getting unlinked. */
if (!fsnotify_valid_obj_type(conn->type))
- return;
+ return NULL;
hlist_for_each_entry(mark, &conn->list, obj_list) {
- if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)
- new_mask |= fsnotify_calc_mask(mark);
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED))
+ continue;
+ new_mask |= fsnotify_calc_mask(mark);
+ if (conn->type == FSNOTIFY_OBJ_TYPE_INODE &&
+ !(mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
+ want_iref = true;
}
*fsnotify_conn_mask_p(conn) = new_mask;
+
+ return fsnotify_update_iref(conn, want_iref);
}
/*
@@ -169,12 +213,6 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
}
}
-static void fsnotify_get_inode_ref(struct inode *inode)
-{
- ihold(inode);
- atomic_long_inc(&inode->i_sb->s_fsnotify_connectors);
-}
-
static void fsnotify_put_inode_ref(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
@@ -213,6 +251,10 @@ static void *fsnotify_detach_connector_from_object(
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = fsnotify_conn_inode(conn);
inode->i_fsnotify_mask = 0;
+
+ /* Unpin inode when detaching from connector */
+ if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF))
+ inode = NULL;
} else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
} else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
@@ -274,7 +316,8 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
objp = fsnotify_detach_connector_from_object(conn, &type);
free_conn = true;
} else {
- __fsnotify_recalc_mask(conn);
+ objp = __fsnotify_recalc_mask(conn);
+ type = conn->type;
}
WRITE_ONCE(mark->connector, NULL);
spin_unlock(&conn->lock);
@@ -398,9 +441,7 @@ void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info)
*/
void fsnotify_detach_mark(struct fsnotify_mark *mark)
{
- struct fsnotify_group *group = mark->group;
-
- WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex));
+ fsnotify_group_assert_locked(mark->group);
WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) &&
refcount_read(&mark->refcnt) < 1 +
!!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED));
@@ -452,9 +493,9 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group)
{
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ fsnotify_group_lock(group);
fsnotify_detach_mark(mark);
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
fsnotify_free_mark(mark);
}
EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);
@@ -499,7 +540,6 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
unsigned int obj_type,
__kernel_fsid_t *fsid)
{
- struct inode *inode = NULL;
struct fsnotify_mark_connector *conn;
conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL);
@@ -507,6 +547,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
return -ENOMEM;
spin_lock_init(&conn->lock);
INIT_HLIST_HEAD(&conn->list);
+ conn->flags = 0;
conn->type = obj_type;
conn->obj = connp;
/* Cache fsid of filesystem containing the object */
@@ -517,10 +558,6 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
conn->fsid.val[0] = conn->fsid.val[1] = 0;
conn->flags = 0;
}
- if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
- inode = fsnotify_conn_inode(conn);
- fsnotify_get_inode_ref(inode);
- }
fsnotify_get_sb_connectors(conn);
/*
@@ -529,8 +566,6 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
*/
if (cmpxchg(connp, NULL, conn)) {
/* Someone else created list structure for us */
- if (inode)
- fsnotify_put_inode_ref(inode);
fsnotify_put_sb_connectors(conn);
kmem_cache_free(fsnotify_mark_connector_cachep, conn);
}
@@ -574,7 +609,7 @@ out:
static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
fsnotify_connp_t *connp,
unsigned int obj_type,
- int allow_dups, __kernel_fsid_t *fsid)
+ int add_flags, __kernel_fsid_t *fsid)
{
struct fsnotify_mark *lmark, *last = NULL;
struct fsnotify_mark_connector *conn;
@@ -633,7 +668,7 @@ restart:
if ((lmark->group == mark->group) &&
(lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) &&
- !allow_dups) {
+ !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) {
err = -EEXIST;
goto out_err;
}
@@ -668,12 +703,12 @@ out_err:
*/
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int obj_type,
- int allow_dups, __kernel_fsid_t *fsid)
+ int add_flags, __kernel_fsid_t *fsid)
{
struct fsnotify_group *group = mark->group;
int ret = 0;
- BUG_ON(!mutex_is_locked(&group->mark_mutex));
+ fsnotify_group_assert_locked(group);
/*
* LOCKING ORDER!!!!
@@ -688,12 +723,11 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_get_mark(mark); /* for g_list */
spin_unlock(&mark->lock);
- ret = fsnotify_add_mark_list(mark, connp, obj_type, allow_dups, fsid);
+ ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags, fsid);
if (ret)
goto err;
- if (mark->mask || mark->ignored_mask)
- fsnotify_recalc_mask(mark->connector);
+ fsnotify_recalc_mask(mark->connector);
return ret;
err:
@@ -708,15 +742,15 @@ err:
}
int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
- unsigned int obj_type, int allow_dups,
+ unsigned int obj_type, int add_flags,
__kernel_fsid_t *fsid)
{
int ret;
struct fsnotify_group *group = mark->group;
- mutex_lock(&group->mark_mutex);
- ret = fsnotify_add_mark_locked(mark, connp, obj_type, allow_dups, fsid);
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_lock(group);
+ ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags, fsid);
+ fsnotify_group_unlock(group);
return ret;
}
EXPORT_SYMBOL_GPL(fsnotify_add_mark);
@@ -770,24 +804,24 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
* move marks to free to to_free list in one go and then free marks in
* to_free list one by one.
*/
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ fsnotify_group_lock(group);
list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
if (mark->connector->type == obj_type)
list_move(&mark->g_list, &to_free);
}
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
clear:
while (1) {
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ fsnotify_group_lock(group);
if (list_empty(head)) {
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
break;
}
mark = list_first_entry(head, struct fsnotify_mark, g_list);
fsnotify_get_mark(mark);
fsnotify_detach_mark(mark);
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
fsnotify_free_mark(mark);
fsnotify_put_mark(mark);
}
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index e1392a9b8ceb..a8abe2296514 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1772,11 +1772,11 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
last_vcn = -1;
do {
VCN vcn;
- pgoff_t idx, start_idx;
+ pgoff_t start_idx;
unsigned ofs, do_pages, u;
size_t copied;
- start_idx = idx = pos >> PAGE_SHIFT;
+ start_idx = pos >> PAGE_SHIFT;
ofs = pos & ~PAGE_MASK;
bytes = PAGE_SIZE - ofs;
do_pages = 1;
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index a4fcdc7927ca..8e9d2b35175f 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -492,7 +492,7 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size)
down_write(&ni->file.run_lock);
err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size,
- &new_valid, true, NULL);
+ &new_valid, ni->mi.sbi->options->prealloc, NULL);
up_write(&ni->file.run_lock);
if (new_valid < ni->i_valid)
@@ -659,7 +659,13 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
/*
* Normal file: Allocate clusters, do not change 'valid' size.
*/
- err = ntfs_set_size(inode, max(end, i_size));
+ loff_t new_size = max(end, i_size);
+
+ err = inode_newsize_ok(inode, new_size);
+ if (err)
+ goto out;
+
+ err = ntfs_set_size(inode, new_size);
if (err)
goto out;
@@ -759,7 +765,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
inode_dio_wait(inode);
- if (attr->ia_size < oldsize)
+ if (attr->ia_size <= oldsize)
err = ntfs_truncate(inode, attr->ia_size);
else if (attr->ia_size > oldsize)
err = ntfs_extend(inode, attr->ia_size, 0, NULL);
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 6f47a9c17f89..18842998c8fa 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -1964,10 +1964,8 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo,
vcn += clen;
- if (vbo + bytes >= end) {
+ if (vbo + bytes >= end)
bytes = end - vbo;
- flags |= FIEMAP_EXTENT_LAST;
- }
if (vbo + bytes <= valid) {
;
@@ -1977,6 +1975,9 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo,
/* vbo < valid && valid < vbo + bytes */
u64 dlen = valid - vbo;
+ if (vbo + dlen >= end)
+ flags |= FIEMAP_EXTENT_LAST;
+
err = fiemap_fill_next_extent(fieinfo, vbo, lbo, dlen,
flags);
if (err < 0)
@@ -1995,6 +1996,9 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo,
flags |= FIEMAP_EXTENT_UNWRITTEN;
}
+ if (vbo + bytes >= end)
+ flags |= FIEMAP_EXTENT_LAST;
+
err = fiemap_fill_next_extent(fieinfo, vbo, lbo, bytes, flags);
if (err < 0)
break;
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index 06492f088d60..49b7df616778 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -1185,8 +1185,6 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first,
if (!r_page)
return -ENOMEM;
- memset(info, 0, sizeof(struct restart_info));
-
/* Determine which restart area we are looking for. */
if (first) {
vbo = 0;
@@ -3791,10 +3789,11 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
if (!log)
return -ENOMEM;
+ memset(&rst_info, 0, sizeof(struct restart_info));
+
log->ni = ni;
log->l_size = l_size;
log->one_page_buf = kmalloc(page_size, GFP_NOFS);
-
if (!log->one_page_buf) {
err = -ENOMEM;
goto out;
@@ -3842,6 +3841,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
if (rst_info.vbo)
goto check_restart_area;
+ memset(&rst_info2, 0, sizeof(struct restart_info));
err = log_read_rst(log, l_size, false, &rst_info2);
/* Determine which restart area to use. */
@@ -4085,8 +4085,10 @@ process_log:
if (client == LFS_NO_CLIENT_LE) {
/* Insert "NTFS" client LogFile. */
client = ra->client_idx[0];
- if (client == LFS_NO_CLIENT_LE)
- return -EINVAL;
+ if (client == LFS_NO_CLIENT_LE) {
+ err = -EINVAL;
+ goto out;
+ }
t16 = le16_to_cpu(client);
cr = ca + t16;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 74f60c457f28..be4ebdd8048b 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -758,6 +758,7 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
loff_t vbo = iocb->ki_pos;
loff_t end;
int wr = iov_iter_rw(iter) & WRITE;
+ size_t iter_count = iov_iter_count(iter);
loff_t valid;
ssize_t ret;
@@ -771,10 +772,13 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
wr ? ntfs_get_block_direct_IO_W
: ntfs_get_block_direct_IO_R);
- if (ret <= 0)
+ if (ret > 0)
+ end = vbo + ret;
+ else if (wr && ret == -EIOCBQUEUED)
+ end = vbo + iter_count;
+ else
goto out;
- end = vbo + ret;
valid = ni->i_valid;
if (wr) {
if (end > valid && !S_ISBLK(inode->i_mode)) {
@@ -1950,6 +1954,7 @@ const struct address_space_operations ntfs_aops = {
.direct_IO = ntfs_direct_IO,
.bmap = ntfs_bmap,
.dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
};
const struct address_space_operations ntfs_aops_cmpr = {
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 5781b9e8e3d8..0c6de6287737 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -668,9 +668,11 @@ static u32 format_size_gb(const u64 bytes, u32 *mb)
static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot)
{
- return boot->sectors_per_clusters <= 0x80
- ? boot->sectors_per_clusters
- : (1u << (0 - boot->sectors_per_clusters));
+ if (boot->sectors_per_clusters <= 0x80)
+ return boot->sectors_per_clusters;
+ if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */
+ return 1U << (0 - boot->sectors_per_clusters);
+ return -EINVAL;
}
/*
@@ -713,6 +715,8 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
/* cluster size: 512, 1K, 2K, 4K, ... 2M */
sct_per_clst = true_sectors_per_clst(boot);
+ if ((int)sct_per_clst < 0)
+ goto out;
if (!is_power_of_2(sct_per_clst))
goto out;
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index afd0ddad826f..5e0e0280e70d 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -112,7 +112,7 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
return -ENOMEM;
if (!size) {
- ;
+ /* EA info persists, but xattr is empty. Looks like EA problem. */
} else if (attr_ea->non_res) {
struct runs_tree run;
@@ -259,7 +259,7 @@ out:
static noinline int ntfs_set_ea(struct inode *inode, const char *name,
size_t name_len, const void *value,
- size_t val_size, int flags)
+ size_t val_size, int flags, bool locked)
{
struct ntfs_inode *ni = ntfs_i(inode);
struct ntfs_sb_info *sbi = ni->mi.sbi;
@@ -278,7 +278,8 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
u64 new_sz;
void *p;
- ni_lock(ni);
+ if (!locked)
+ ni_lock(ni);
run_init(&ea_run);
@@ -467,7 +468,8 @@ update_ea:
mark_inode_dirty(&ni->vfs_inode);
out:
- ni_unlock(ni);
+ if (!locked)
+ ni_unlock(ni);
run_close(&ea_run);
kfree(ea_all);
@@ -541,7 +543,7 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu)
static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
struct inode *inode, struct posix_acl *acl,
- int type)
+ int type, bool init_acl)
{
const char *name;
size_t size, name_len;
@@ -554,8 +556,9 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
switch (type) {
case ACL_TYPE_ACCESS:
- if (acl) {
- umode_t mode = inode->i_mode;
+ /* Do not change i_mode if we are in init_acl */
+ if (acl && !init_acl) {
+ umode_t mode;
err = posix_acl_update_mode(mnt_userns, inode, &mode,
&acl);
@@ -598,7 +601,7 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
flags = 0;
}
- err = ntfs_set_ea(inode, name, name_len, value, size, flags);
+ err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0);
if (err == -ENODATA && !size)
err = 0; /* Removing non existed xattr. */
if (!err)
@@ -616,7 +619,68 @@ out:
int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type)
{
- return ntfs_set_acl_ex(mnt_userns, inode, acl, type);
+ return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false);
+}
+
+static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns,
+ struct inode *inode, int type, void *buffer,
+ size_t size)
+{
+ struct posix_acl *acl;
+ int err;
+
+ if (!(inode->i_sb->s_flags & SB_POSIXACL)) {
+ ntfs_inode_warn(inode, "add mount option \"acl\" to use acl");
+ return -EOPNOTSUPP;
+ }
+
+ acl = ntfs_get_acl(inode, type, false);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+
+ if (!acl)
+ return -ENODATA;
+
+ err = posix_acl_to_xattr(mnt_userns, acl, buffer, size);
+ posix_acl_release(acl);
+
+ return err;
+}
+
+static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns,
+ struct inode *inode, int type, const void *value,
+ size_t size)
+{
+ struct posix_acl *acl;
+ int err;
+
+ if (!(inode->i_sb->s_flags & SB_POSIXACL)) {
+ ntfs_inode_warn(inode, "add mount option \"acl\" to use acl");
+ return -EOPNOTSUPP;
+ }
+
+ if (!inode_owner_or_capable(mnt_userns, inode))
+ return -EPERM;
+
+ if (!value) {
+ acl = NULL;
+ } else {
+ acl = posix_acl_from_xattr(mnt_userns, value, size);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+
+ if (acl) {
+ err = posix_acl_valid(mnt_userns, acl);
+ if (err)
+ goto release_and_out;
+ }
+ }
+
+ err = ntfs_set_acl(mnt_userns, inode, acl, type);
+
+release_and_out:
+ posix_acl_release(acl);
+ return err;
}
/*
@@ -636,7 +700,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
if (default_acl) {
err = ntfs_set_acl_ex(mnt_userns, inode, default_acl,
- ACL_TYPE_DEFAULT);
+ ACL_TYPE_DEFAULT, true);
posix_acl_release(default_acl);
} else {
inode->i_default_acl = NULL;
@@ -647,7 +711,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
else {
if (!err)
err = ntfs_set_acl_ex(mnt_userns, inode, acl,
- ACL_TYPE_ACCESS);
+ ACL_TYPE_ACCESS, true);
posix_acl_release(acl);
}
@@ -785,6 +849,23 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
goto out;
}
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+ if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 &&
+ !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS,
+ sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) ||
+ (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 &&
+ !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
+ sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) {
+ /* TODO: init_user_ns? */
+ err = ntfs_xattr_get_acl(
+ &init_user_ns, inode,
+ name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1
+ ? ACL_TYPE_ACCESS
+ : ACL_TYPE_DEFAULT,
+ buffer, size);
+ goto out;
+ }
+#endif
/* Deal with NTFS extended attribute. */
err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL);
@@ -897,10 +978,29 @@ set_new_fa:
goto out;
}
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+ if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 &&
+ !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS,
+ sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) ||
+ (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 &&
+ !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
+ sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) {
+ err = ntfs_xattr_set_acl(
+ mnt_userns, inode,
+ name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1
+ ? ACL_TYPE_ACCESS
+ : ACL_TYPE_DEFAULT,
+ value, size);
+ goto out;
+ }
+#endif
/* Deal with NTFS extended attribute. */
- err = ntfs_set_ea(inode, name, name_len, value, size, flags);
+ err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0);
out:
+ inode->i_ctime = current_time(inode);
+ mark_inode_dirty(inode);
+
return err;
}
@@ -913,35 +1013,37 @@ int ntfs_save_wsl_perm(struct inode *inode)
{
int err;
__le32 value;
+ struct ntfs_inode *ni = ntfs_i(inode);
- /* TODO: refactor this, so we don't lock 4 times in ntfs_set_ea */
+ ni_lock(ni);
value = cpu_to_le32(i_uid_read(inode));
err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value,
- sizeof(value), 0);
+ sizeof(value), 0, true); /* true == already locked. */
if (err)
goto out;
value = cpu_to_le32(i_gid_read(inode));
err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value,
- sizeof(value), 0);
+ sizeof(value), 0, true);
if (err)
goto out;
value = cpu_to_le32(inode->i_mode);
err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value,
- sizeof(value), 0);
+ sizeof(value), 0, true);
if (err)
goto out;
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
value = cpu_to_le32(inode->i_rdev);
err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value,
- sizeof(value), 0);
+ sizeof(value), 0, true);
if (err)
goto out;
}
out:
+ ni_unlock(ni);
/* In case of error should we delete all WSL xattr? */
return err;
}
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index d442cf5dda8a..be5e9ed7da8d 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -541,7 +541,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
struct debug_lockres *dl = m->private;
struct dlm_ctxt *dlm = dl->dl_ctxt;
struct dlm_lock_resource *oldres = dl->dl_res;
- struct dlm_lock_resource *res = NULL;
+ struct dlm_lock_resource *res = NULL, *iter;
struct list_head *track_list;
spin_lock(&dlm->track_lock);
@@ -556,11 +556,11 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
}
}
- list_for_each_entry(res, track_list, tracking) {
- if (&res->tracking == &dlm->tracking_list)
- res = NULL;
- else
- dlm_lockres_get(res);
+ list_for_each_entry(iter, track_list, tracking) {
+ if (&iter->tracking != &dlm->tracking_list) {
+ dlm_lockres_get(iter);
+ res = iter;
+ }
break;
}
spin_unlock(&dlm->track_lock);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 61103b2d69fb..7318e4794ef9 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -392,9 +392,9 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
struct dlm_ctxt *dlm = data;
struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
- struct dlm_lock *lock = NULL;
+ struct dlm_lock *lock = NULL, *iter;
enum dlm_status status = DLM_NORMAL;
- int found = 0, i;
+ int i;
struct dlm_lockstatus *lksb = NULL;
int ignore;
u32 flags;
@@ -437,7 +437,6 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
}
queue=&res->granted;
- found = 0;
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
spin_unlock(&res->spinlock);
@@ -461,21 +460,21 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
}
for (i=0; i<3; i++) {
- list_for_each_entry(lock, queue, list) {
- if (lock->ml.cookie == unlock->cookie &&
- lock->ml.node == unlock->node_idx) {
- dlm_lock_get(lock);
- found = 1;
+ list_for_each_entry(iter, queue, list) {
+ if (iter->ml.cookie == unlock->cookie &&
+ iter->ml.node == unlock->node_idx) {
+ dlm_lock_get(iter);
+ lock = iter;
break;
}
}
- if (found)
+ if (lock)
break;
/* scan granted -> converting -> blocked queues */
queue++;
}
spin_unlock(&res->spinlock);
- if (!found) {
+ if (!lock) {
status = DLM_IVLOCKID;
goto not_found;
}
@@ -505,7 +504,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
dlm_kick_thread(dlm, res);
not_found:
- if (!found)
+ if (!lock)
mlog(ML_ERROR, "failed to find lock to unlock! "
"cookie=%u:%llu\n",
dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)),
diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 29f183a15798..617c92e7b925 100644
--- a/fs/ocfs2/dlmfs/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -433,6 +433,11 @@ again:
}
spin_lock(&lockres->l_lock);
+ if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+ spin_unlock(&lockres->l_lock);
+ status = -EAGAIN;
+ goto bail;
+ }
/* We only compare against the currently granted level
* here. If the lock is blocked waiting on a downconvert,
@@ -595,7 +600,7 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
spin_lock(&lockres->l_lock);
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
spin_unlock(&lockres->l_lock);
- return 0;
+ goto bail;
}
lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
@@ -609,22 +614,30 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
}
if (lockres->l_ro_holders || lockres->l_ex_holders) {
+ lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN;
spin_unlock(&lockres->l_lock);
goto bail;
}
status = 0;
if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
+ /*
+ * lock is never requested, leave USER_LOCK_IN_TEARDOWN set
+ * to avoid new lock request coming in.
+ */
spin_unlock(&lockres->l_lock);
goto bail;
}
- lockres->l_flags &= ~USER_LOCK_ATTACHED;
lockres->l_flags |= USER_LOCK_BUSY;
spin_unlock(&lockres->l_lock);
status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
if (status) {
+ spin_lock(&lockres->l_lock);
+ lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN;
+ lockres->l_flags &= ~USER_LOCK_BUSY;
+ spin_unlock(&lockres->l_lock);
user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
goto bail;
}
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 5739dc301569..bb116c39b581 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -125,6 +125,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
+ journal_t *journal = osb->journal->j_journal;
trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
sysfile_type);
@@ -171,11 +172,10 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
* part of the transaction - the inode could have been reclaimed and
* now it is reread from disk.
*/
- if (osb->journal) {
+ if (journal) {
transaction_t *transaction;
tid_t tid;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- journal_t *journal = osb->journal->j_journal;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction)
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 1887a2708709..fa87d89cf754 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -810,22 +810,20 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
write_unlock(&journal->j_state_lock);
}
-int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
+/*
+ * alloc & initialize skeleton for journal structure.
+ * ocfs2_journal_init() will make fs have journal ability.
+ */
+int ocfs2_journal_alloc(struct ocfs2_super *osb)
{
- int status = -1;
- struct inode *inode = NULL; /* the journal inode */
- journal_t *j_journal = NULL;
- struct ocfs2_journal *journal = NULL;
- struct ocfs2_dinode *di = NULL;
- struct buffer_head *bh = NULL;
- int inode_lock = 0;
+ int status = 0;
+ struct ocfs2_journal *journal;
- /* initialize our journal structure */
journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
if (!journal) {
mlog(ML_ERROR, "unable to alloc journal\n");
status = -ENOMEM;
- goto done;
+ goto bail;
}
osb->journal = journal;
journal->j_osb = osb;
@@ -839,6 +837,21 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
journal->j_state = OCFS2_JOURNAL_FREE;
+bail:
+ return status;
+}
+
+int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
+{
+ int status = -1;
+ struct inode *inode = NULL; /* the journal inode */
+ journal_t *j_journal = NULL;
+ struct ocfs2_journal *journal = osb->journal;
+ struct ocfs2_dinode *di = NULL;
+ struct buffer_head *bh = NULL;
+ int inode_lock = 0;
+
+ BUG_ON(!journal);
/* already have the inode for our journal */
inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
osb->slot_num);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 8dcb2f2cadbc..969d0aa28718 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -154,6 +154,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
* Journal Control:
* Initialize, Load, Shutdown, Wipe a journal.
*
+ * ocfs2_journal_alloc - Initialize skeleton for journal structure.
* ocfs2_journal_init - Initialize journal structures in the OSB.
* ocfs2_journal_load - Load the given journal off disk. Replay it if
* there's transactions still in there.
@@ -167,6 +168,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
* ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
*/
void ocfs2_set_journal_params(struct ocfs2_super *osb);
+int ocfs2_journal_alloc(struct ocfs2_super *osb);
int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty);
void ocfs2_journal_shutdown(struct ocfs2_super *osb);
int ocfs2_journal_wipe(struct ocfs2_journal *journal,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index b1a8b046f4c2..5022b3e9bfcd 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -921,19 +921,19 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
{
struct mem_dqinfo *info = sb_dqinfo(sb, type);
struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
- struct ocfs2_quota_chunk *chunk;
+ struct ocfs2_quota_chunk *chunk = NULL, *iter;
struct ocfs2_local_disk_chunk *dchunk;
int found = 0, len;
- list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+ list_for_each_entry(iter, &oinfo->dqi_chunk, qc_chunk) {
dchunk = (struct ocfs2_local_disk_chunk *)
- chunk->qc_headerbh->b_data;
+ iter->qc_headerbh->b_data;
if (le32_to_cpu(dchunk->dqc_free) > 0) {
- found = 1;
+ chunk = iter;
break;
}
}
- if (!found)
+ if (!chunk)
return NULL;
if (chunk->qc_num < oinfo->dqi_chunks - 1) {
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 769e466887b0..a9d1296d736d 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -198,7 +198,7 @@ void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
resv->r_flags |= flags;
}
-int ocfs2_resmap_init(struct ocfs2_super *osb,
+void ocfs2_resmap_init(struct ocfs2_super *osb,
struct ocfs2_reservation_map *resmap)
{
memset(resmap, 0, sizeof(*resmap));
@@ -207,8 +207,6 @@ int ocfs2_resmap_init(struct ocfs2_super *osb,
resmap->m_reservations = RB_ROOT;
/* m_bitmap_len is initialized to zero by the above memset. */
INIT_LIST_HEAD(&resmap->m_lru);
-
- return 0;
}
static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
index 677c50663595..ec8101ef5717 100644
--- a/fs/ocfs2/reservations.h
+++ b/fs/ocfs2/reservations.h
@@ -73,15 +73,10 @@ void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
/**
* ocfs2_resmap_init() - Initialize fields of a reservations bitmap
+ * @osb: struct ocfs2_super to be saved in resmap
* @resmap: struct ocfs2_reservation_map to initialize
- * @obj: unused for now
- * @ops: unused for now
- * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize)
- *
- * Only possible return value other than '0' is -ENOMEM for failure to
- * allocation mirror bitmap.
*/
-int ocfs2_resmap_init(struct ocfs2_super *osb,
+void ocfs2_resmap_init(struct ocfs2_super *osb,
struct ocfs2_reservation_map *resmap);
/**
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 477cdf94122e..f7298816d8d9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -989,28 +989,27 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
status = -EINVAL;
- goto read_super_error;
+ goto out;
}
/* probe for superblock */
status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats);
if (status < 0) {
mlog(ML_ERROR, "superblock probe failed!\n");
- goto read_super_error;
+ goto out;
}
status = ocfs2_initialize_super(sb, bh, sector_size, &stats);
- osb = OCFS2_SB(sb);
- if (status < 0) {
- mlog_errno(status);
- goto read_super_error;
- }
brelse(bh);
bh = NULL;
+ if (status < 0)
+ goto out;
+
+ osb = OCFS2_SB(sb);
if (!ocfs2_check_set_options(sb, &parsed_options)) {
status = -EINVAL;
- goto read_super_error;
+ goto out_super;
}
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
@@ -1027,7 +1026,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = ocfs2_verify_userspace_stack(osb, &parsed_options);
if (status)
- goto read_super_error;
+ goto out_super;
sb->s_magic = OCFS2_SUPER_MAGIC;
@@ -1041,7 +1040,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = -EACCES;
mlog(ML_ERROR, "Readonly device detected but readonly "
"mount was not specified.\n");
- goto read_super_error;
+ goto out_super;
}
/* You should not be able to start a local heartbeat
@@ -1050,7 +1049,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = -EROFS;
mlog(ML_ERROR, "Local heartbeat specified on readonly "
"device.\n");
- goto read_super_error;
+ goto out_super;
}
status = ocfs2_check_journals_nolocks(osb);
@@ -1059,9 +1058,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
mlog(ML_ERROR, "Recovery required on readonly "
"file system, but write access is "
"unavailable.\n");
- else
- mlog_errno(status);
- goto read_super_error;
+ goto out_super;
}
ocfs2_set_ro_flag(osb, 1);
@@ -1077,10 +1074,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
}
status = ocfs2_verify_heartbeat(osb);
- if (status < 0) {
- mlog_errno(status);
- goto read_super_error;
- }
+ if (status < 0)
+ goto out_super;
osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
ocfs2_debugfs_root);
@@ -1094,15 +1089,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = ocfs2_mount_volume(sb);
if (status < 0)
- goto read_super_error;
+ goto out_debugfs;
if (osb->root_inode)
inode = igrab(osb->root_inode);
if (!inode) {
status = -EIO;
- mlog_errno(status);
- goto read_super_error;
+ goto out_dismount;
}
osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL,
@@ -1110,7 +1104,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
if (!osb->osb_dev_kset) {
status = -ENOMEM;
mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id);
- goto read_super_error;
+ goto out_dismount;
}
/* Create filecheck sysfs related directories/files at
@@ -1119,14 +1113,13 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = -ENOMEM;
mlog(ML_ERROR, "Unable to create filecheck sysfs directory at "
"/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id);
- goto read_super_error;
+ goto out_dismount;
}
root = d_make_root(inode);
if (!root) {
status = -ENOMEM;
- mlog_errno(status);
- goto read_super_error;
+ goto out_dismount;
}
sb->s_root = root;
@@ -1178,17 +1171,21 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
return status;
-read_super_error:
- brelse(bh);
-
- if (status)
- mlog_errno(status);
+out_dismount:
+ atomic_set(&osb->vol_state, VOLUME_DISABLED);
+ wake_up(&osb->osb_mount_event);
+ ocfs2_dismount_volume(sb, 1);
+ goto out;
- if (osb) {
- atomic_set(&osb->vol_state, VOLUME_DISABLED);
- wake_up(&osb->osb_mount_event);
- ocfs2_dismount_volume(sb, 1);
- }
+out_debugfs:
+ debugfs_remove_recursive(osb->osb_debug_root);
+out_super:
+ ocfs2_release_system_inodes(osb);
+ kfree(osb->recovery_map);
+ ocfs2_delete_osb(osb);
+ kfree(osb);
+out:
+ mlog_errno(status);
return status;
}
@@ -1803,11 +1800,10 @@ static int ocfs2_get_sector(struct super_block *sb,
static int ocfs2_mount_volume(struct super_block *sb)
{
int status = 0;
- int unlock_super = 0;
struct ocfs2_super *osb = OCFS2_SB(sb);
if (ocfs2_is_hard_readonly(osb))
- goto leave;
+ goto out;
mutex_init(&osb->obs_trim_fs_mutex);
@@ -1817,44 +1813,56 @@ static int ocfs2_mount_volume(struct super_block *sb)
if (status == -EBADR && ocfs2_userspace_stack(osb))
mlog(ML_ERROR, "couldn't mount because cluster name on"
" disk does not match the running cluster name.\n");
- goto leave;
+ goto out;
}
status = ocfs2_super_lock(osb, 1);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_dlm;
}
- unlock_super = 1;
/* This will load up the node map and add ourselves to it. */
status = ocfs2_find_slot(osb);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_super_lock;
}
/* load all node-local system inodes */
status = ocfs2_init_local_system_inodes(osb);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_super_lock;
}
status = ocfs2_check_volume(osb);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_system_inodes;
}
status = ocfs2_truncate_log_init(osb);
- if (status < 0)
+ if (status < 0) {
mlog_errno(status);
+ goto out_system_inodes;
+ }
-leave:
- if (unlock_super)
- ocfs2_super_unlock(osb, 1);
+ ocfs2_super_unlock(osb, 1);
+ return 0;
+out_system_inodes:
+ if (osb->local_alloc_state == OCFS2_LA_ENABLED)
+ ocfs2_shutdown_local_alloc(osb);
+ ocfs2_release_system_inodes(osb);
+ /* before journal shutdown, we should release slot_info */
+ ocfs2_free_slot_info(osb);
+ ocfs2_journal_shutdown(osb);
+out_super_lock:
+ ocfs2_super_unlock(osb, 1);
+out_dlm:
+ ocfs2_dlm_shutdown(osb, 0);
+out:
return status;
}
@@ -2022,7 +2030,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (!osb) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out;
}
sb->s_fs_info = osb;
@@ -2083,7 +2091,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
osb->max_slots);
status = -EINVAL;
- goto bail;
+ goto out;
}
ocfs2_orphan_scan_init(osb);
@@ -2092,7 +2100,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (status) {
mlog(ML_ERROR, "Unable to initialize recovery state\n");
mlog_errno(status);
- goto bail;
+ goto out;
}
init_waitqueue_head(&osb->checkpoint_event);
@@ -2110,17 +2118,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
init_waitqueue_head(&osb->osb_mount_event);
- status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_resmap_init(osb, &osb->osb_la_resmap);
osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
if (!osb->vol_label) {
mlog(ML_ERROR, "unable to alloc vol label\n");
status = -ENOMEM;
- goto bail;
+ goto out_recovery_map;
}
osb->slot_recovery_generations =
@@ -2129,7 +2133,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (!osb->slot_recovery_generations) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out_vol_label;
}
init_waitqueue_head(&osb->osb_wipe_event);
@@ -2139,7 +2143,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (!osb->osb_orphan_wipes) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out_slot_recovery_gen;
}
osb->osb_rf_lock_tree = RB_ROOT;
@@ -2155,13 +2159,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog(ML_ERROR, "couldn't mount because of unsupported "
"optional features (%x).\n", i);
status = -EINVAL;
- goto bail;
+ goto out_orphan_wipes;
}
if (!sb_rdonly(osb->sb) && (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
mlog(ML_ERROR, "couldn't mount RDWR because of "
"unsupported optional features (%x).\n", i);
status = -EINVAL;
- goto bail;
+ goto out_orphan_wipes;
}
if (ocfs2_clusterinfo_valid(osb)) {
@@ -2182,7 +2186,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
"cluster stack label (%s) \n",
osb->osb_cluster_stack);
status = -EINVAL;
- goto bail;
+ goto out_orphan_wipes;
}
memcpy(osb->osb_cluster_name,
OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
@@ -2195,6 +2199,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
get_random_bytes(&osb->s_next_generation, sizeof(u32));
+ /*
+ * FIXME
+ * This should be done in ocfs2_journal_init(), but any inode
+ * writes back operation will cause the filesystem to crash.
+ */
+ status = ocfs2_journal_alloc(osb);
+ if (status < 0)
+ goto out_orphan_wipes;
+
INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
init_llist_head(&osb->dquot_drop_list);
@@ -2208,7 +2221,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n",
osb->s_clustersize);
status = -EINVAL;
- goto bail;
+ goto out_journal;
}
total_blocks = ocfs2_clusters_to_blocks(osb->sb,
@@ -2220,14 +2233,14 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog(ML_ERROR, "Volume too large "
"to mount safely on this system");
status = -EFBIG;
- goto bail;
+ goto out_journal;
}
if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid,
sizeof(di->id2.i_super.s_uuid))) {
mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n");
status = -ENOMEM;
- goto bail;
+ goto out_journal;
}
strlcpy(osb->vol_label, di->id2.i_super.s_label,
@@ -2247,7 +2260,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (!osb->osb_dlm_debug) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out_uuid_str;
}
atomic_set(&osb->vol_state, VOLUME_INIT);
@@ -2256,7 +2269,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
status = ocfs2_init_global_system_inodes(osb);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto out_dlm_out;
}
/*
@@ -2267,7 +2280,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (!inode) {
status = -EINVAL;
mlog_errno(status);
- goto bail;
+ goto out_system_inodes;
}
osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
@@ -2280,16 +2293,39 @@ static int ocfs2_initialize_super(struct super_block *sb,
status = ocfs2_init_slot_info(osb);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto out_system_inodes;
}
osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM);
if (!osb->ocfs2_wq) {
status = -ENOMEM;
mlog_errno(status);
+ goto out_slot_info;
}
-bail:
+ return status;
+
+out_slot_info:
+ ocfs2_free_slot_info(osb);
+out_system_inodes:
+ ocfs2_release_system_inodes(osb);
+out_dlm_out:
+ ocfs2_put_dlm_debug(osb->osb_dlm_debug);
+out_uuid_str:
+ kfree(osb->uuid_str);
+out_journal:
+ kfree(osb->journal);
+out_orphan_wipes:
+ kfree(osb->osb_orphan_wipes);
+out_slot_recovery_gen:
+ kfree(osb->slot_recovery_generations);
+out_vol_label:
+ kfree(osb->vol_label);
+out_recovery_map:
+ kfree(osb->recovery_map);
+out:
+ kfree(osb);
+ sb->s_fs_info = NULL;
return status;
}
@@ -2483,6 +2519,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
kfree(osb->osb_orphan_wipes);
kfree(osb->slot_recovery_generations);
+ /* FIXME
+ * This belongs in journal shutdown, but because we have to
+ * allocate osb->journal at the middle of ocfs2_initialize_super(),
+ * we free it here.
+ */
+ kfree(osb->journal);
kfree(osb->local_alloc_copy);
kfree(osb->uuid_str);
kfree(osb->vol_label);
diff --git a/fs/open.c b/fs/open.c
index 1315253e0247..1d57fbde2feb 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -224,6 +224,21 @@ SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
}
#endif /* BITS_PER_LONG == 32 */
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64)
+COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname,
+ compat_arg_u64_dual(length))
+{
+ return ksys_truncate(pathname, compat_arg_u64_glue(length));
+}
+#endif
+
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64)
+COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd,
+ compat_arg_u64_dual(length))
+{
+ return ksys_ftruncate(fd, compat_arg_u64_glue(length));
+}
+#endif
int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
@@ -339,6 +354,15 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
return ksys_fallocate(fd, mode, offset, len);
}
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE)
+COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset),
+ compat_arg_u64_dual(len))
+{
+ return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset),
+ compat_arg_u64_glue(len));
+}
+#endif
+
/*
* access() needs to use the real uid/gid, not the effective uid/gid.
* We do this by temporarily clearing all FS-related capabilities and
@@ -834,16 +858,15 @@ static int do_dentry_open(struct file *f,
if ((f->f_mode & FMODE_WRITE) &&
likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE;
+ if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
+ f->f_mode |= FMODE_CAN_ODIRECT;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
- /* NB: we're sure to have correct a_ops only after f_op->open */
- if (f->f_flags & O_DIRECT) {
- if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
- return -EINVAL;
- }
+ if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
/*
* XXX: Huge page cache doesn't support writing yet. Drop all page
@@ -981,6 +1004,48 @@ struct file *dentry_open(const struct path *path, int flags,
}
EXPORT_SYMBOL(dentry_open);
+/**
+ * dentry_create - Create and open a file
+ * @path: path to create
+ * @flags: O_ flags
+ * @mode: mode bits for new file
+ * @cred: credentials to use
+ *
+ * Caller must hold the parent directory's lock, and have prepared
+ * a negative dentry, placed in @path->dentry, for the new file.
+ *
+ * Caller sets @path->mnt to the vfsmount of the filesystem where
+ * the new file is to be created. The parent directory and the
+ * negative dentry must reside on the same filesystem instance.
+ *
+ * On success, returns a "struct file *". Otherwise a ERR_PTR
+ * is returned.
+ */
+struct file *dentry_create(const struct path *path, int flags, umode_t mode,
+ const struct cred *cred)
+{
+ struct file *f;
+ int error;
+
+ validate_creds(cred);
+ f = alloc_empty_file(flags, cred);
+ if (IS_ERR(f))
+ return f;
+
+ error = vfs_create(mnt_user_ns(path->mnt),
+ d_inode(path->dentry->d_parent),
+ path->dentry, mode, true);
+ if (!error)
+ error = vfs_open(path, f);
+
+ if (unlikely(error)) {
+ fput(f);
+ return ERR_PTR(error);
+ }
+ return f;
+}
+EXPORT_SYMBOL(dentry_create);
+
struct file *open_with_fake_path(const struct path *path, int flags,
struct inode *inode, const struct cred *cred)
{
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index e040970408d4..714ec569d25b 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -44,9 +44,9 @@ static bool ovl_must_copy_xattr(const char *name)
!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
}
-int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
- struct dentry *new)
+int ovl_copy_xattr(struct super_block *sb, struct path *oldpath, struct dentry *new)
{
+ struct dentry *old = oldpath->dentry;
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
int error = 0;
@@ -94,9 +94,9 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
continue; /* Discard */
}
retry:
- size = vfs_getxattr(&init_user_ns, old, name, value, value_size);
+ size = ovl_do_getxattr(oldpath, name, value, value_size);
if (size == -ERANGE)
- size = vfs_getxattr(&init_user_ns, old, name, NULL, 0);
+ size = ovl_do_getxattr(oldpath, name, NULL, 0);
if (size < 0) {
error = size;
@@ -117,7 +117,7 @@ retry:
goto retry;
}
- error = vfs_setxattr(&init_user_ns, new, name, value, size, 0);
+ error = ovl_do_setxattr(OVL_FS(sb), new, name, value, size, 0);
if (error) {
if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name))
break;
@@ -292,17 +292,19 @@ out_fput:
return error;
}
-static int ovl_set_size(struct dentry *upperdentry, struct kstat *stat)
+static int ovl_set_size(struct ovl_fs *ofs,
+ struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid = ATTR_SIZE,
.ia_size = stat->size,
};
- return notify_change(&init_user_ns, upperdentry, &attr, NULL);
+ return ovl_do_notify_change(ofs, upperdentry, &attr);
}
-static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
+static int ovl_set_timestamps(struct ovl_fs *ofs, struct dentry *upperdentry,
+ struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
@@ -311,10 +313,11 @@ static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
.ia_mtime = stat->mtime,
};
- return notify_change(&init_user_ns, upperdentry, &attr, NULL);
+ return ovl_do_notify_change(ofs, upperdentry, &attr);
}
-int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
+int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry,
+ struct kstat *stat)
{
int err = 0;
@@ -323,7 +326,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
- err = notify_change(&init_user_ns, upperdentry, &attr, NULL);
+ err = ovl_do_notify_change(ofs, upperdentry, &attr);
}
if (!err) {
struct iattr attr = {
@@ -331,10 +334,10 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
- err = notify_change(&init_user_ns, upperdentry, &attr, NULL);
+ err = ovl_do_notify_change(ofs, upperdentry, &attr);
}
if (!err)
- ovl_set_timestamps(upperdentry, stat);
+ ovl_set_timestamps(ofs, upperdentry, stat);
return err;
}
@@ -433,7 +436,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper,
if (IS_ERR(fh))
return PTR_ERR(fh);
- err = ovl_do_setxattr(ofs, index, OVL_XATTR_UPPER, fh->buf, fh->fb.len);
+ err = ovl_setxattr(ofs, index, OVL_XATTR_UPPER, fh->buf, fh->fb.len);
kfree(fh);
return err;
@@ -474,7 +477,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
if (err)
return err;
- temp = ovl_create_temp(indexdir, OVL_CATTR(S_IFDIR | 0));
+ temp = ovl_create_temp(ofs, indexdir, OVL_CATTR(S_IFDIR | 0));
err = PTR_ERR(temp);
if (IS_ERR(temp))
goto free_name;
@@ -483,16 +486,16 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
if (err)
goto out;
- index = lookup_one_len(name.name, indexdir, name.len);
+ index = ovl_lookup_upper(ofs, name.name, indexdir, name.len);
if (IS_ERR(index)) {
err = PTR_ERR(index);
} else {
- err = ovl_do_rename(dir, temp, dir, index, 0);
+ err = ovl_do_rename(ofs, dir, temp, dir, index, 0);
dput(index);
}
out:
if (err)
- ovl_cleanup(dir, temp);
+ ovl_cleanup(ofs, dir, temp);
dput(temp);
free_name:
kfree(name.name);
@@ -519,6 +522,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
int err;
struct dentry *upper;
struct dentry *upperdir = ovl_dentry_upper(c->parent);
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *udir = d_inode(upperdir);
/* Mark parent "impure" because it may now contain non-pure upper */
@@ -531,16 +535,16 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
return err;
inode_lock_nested(udir, I_MUTEX_PARENT);
- upper = lookup_one_len(c->dentry->d_name.name, upperdir,
- c->dentry->d_name.len);
+ upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir,
+ c->dentry->d_name.len);
err = PTR_ERR(upper);
if (!IS_ERR(upper)) {
- err = ovl_do_link(ovl_dentry_upper(c->dentry), udir, upper);
+ err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper);
dput(upper);
if (!err) {
/* Restore timestamps on parent (best effort) */
- ovl_set_timestamps(upperdir, &c->pstat);
+ ovl_set_timestamps(ofs, upperdir, &c->pstat);
ovl_dentry_set_upper_alias(c->dentry);
}
}
@@ -578,7 +582,7 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
return err;
}
- err = ovl_copy_xattr(c->dentry->d_sb, c->lowerpath.dentry, temp);
+ err = ovl_copy_xattr(c->dentry->d_sb, &c->lowerpath, temp);
if (err)
return err;
@@ -614,9 +618,9 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
inode_lock(temp->d_inode);
if (S_ISREG(c->stat.mode))
- err = ovl_set_size(temp, &c->stat);
+ err = ovl_set_size(ofs, temp, &c->stat);
if (!err)
- err = ovl_set_attr(temp, &c->stat);
+ err = ovl_set_attr(ofs, temp, &c->stat);
inode_unlock(temp->d_inode);
return err;
@@ -656,6 +660,7 @@ static void ovl_revert_cu_creds(struct ovl_cu_creds *cc)
*/
static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
{
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *inode;
struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir);
struct dentry *temp, *upper;
@@ -677,7 +682,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
if (err)
goto unlock;
- temp = ovl_create_temp(c->workdir, &cattr);
+ temp = ovl_create_temp(ofs, c->workdir, &cattr);
ovl_revert_cu_creds(&cc);
err = PTR_ERR(temp);
@@ -694,12 +699,13 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
goto cleanup;
}
- upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
+ upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir,
+ c->destname.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto cleanup;
- err = ovl_do_rename(wdir, temp, udir, upper, 0);
+ err = ovl_do_rename(ofs, wdir, temp, udir, upper, 0);
dput(upper);
if (err)
goto cleanup;
@@ -716,7 +722,7 @@ unlock:
return err;
cleanup:
- ovl_cleanup(wdir, temp);
+ ovl_cleanup(ofs, wdir, temp);
dput(temp);
goto unlock;
}
@@ -724,6 +730,7 @@ cleanup:
/* Copyup using O_TMPFILE which does not require cross dir locking */
static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
{
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *udir = d_inode(c->destdir);
struct dentry *temp, *upper;
struct ovl_cu_creds cc;
@@ -733,7 +740,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
if (err)
return err;
- temp = ovl_do_tmpfile(c->workdir, c->stat.mode);
+ temp = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode);
ovl_revert_cu_creds(&cc);
if (IS_ERR(temp))
@@ -745,10 +752,11 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
inode_lock_nested(udir, I_MUTEX_PARENT);
- upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
+ upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir,
+ c->destname.len);
err = PTR_ERR(upper);
if (!IS_ERR(upper)) {
- err = ovl_do_link(temp, udir, upper);
+ err = ovl_do_link(ofs, temp, udir, upper);
dput(upper);
}
inode_unlock(udir);
@@ -836,7 +844,7 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
/* Restore timestamps on parent (best effort) */
inode_lock(udir);
- ovl_set_timestamps(c->destdir, &c->pstat);
+ ovl_set_timestamps(ofs, c->destdir, &c->pstat);
inode_unlock(udir);
ovl_dentry_set_upper_alias(c->dentry);
@@ -865,12 +873,12 @@ static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode,
return true;
}
-static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value)
+static ssize_t ovl_getxattr_value(struct path *path, char *name, char **value)
{
ssize_t res;
char *buf;
- res = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0);
+ res = ovl_do_getxattr(path, name, NULL, 0);
if (res == -ENODATA || res == -EOPNOTSUPP)
res = 0;
@@ -879,7 +887,7 @@ static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value)
if (!buf)
return -ENOMEM;
- res = vfs_getxattr(&init_user_ns, dentry, name, buf, res);
+ res = ovl_do_getxattr(path, name, buf, res);
if (res < 0)
kfree(buf);
else
@@ -906,8 +914,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
return -EIO;
if (c->stat.size) {
- err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS,
- &capability);
+ err = cap_size = ovl_getxattr_value(&upperpath, XATTR_NAME_CAPS,
+ &capability);
if (cap_size < 0)
goto out;
}
@@ -921,14 +929,14 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
* don't want that to happen for normal copy-up operation.
*/
if (capability) {
- err = vfs_setxattr(&init_user_ns, upperpath.dentry,
- XATTR_NAME_CAPS, capability, cap_size, 0);
+ err = ovl_do_setxattr(ofs, upperpath.dentry, XATTR_NAME_CAPS,
+ capability, cap_size, 0);
if (err)
goto out_free;
}
- err = ovl_do_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY);
+ err = ovl_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY);
if (err)
goto out_free;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index f18490813170..6b03457f72bb 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -23,15 +23,15 @@ MODULE_PARM_DESC(redirect_max,
static int ovl_set_redirect(struct dentry *dentry, bool samedir);
-int ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
+int ovl_cleanup(struct ovl_fs *ofs, struct inode *wdir, struct dentry *wdentry)
{
int err;
dget(wdentry);
if (d_is_dir(wdentry))
- err = ovl_do_rmdir(wdir, wdentry);
+ err = ovl_do_rmdir(ofs, wdir, wdentry);
else
- err = ovl_do_unlink(wdir, wdentry);
+ err = ovl_do_unlink(ofs, wdir, wdentry);
dput(wdentry);
if (err) {
@@ -42,7 +42,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
return err;
}
-struct dentry *ovl_lookup_temp(struct dentry *workdir)
+struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir)
{
struct dentry *temp;
char name[20];
@@ -51,7 +51,7 @@ struct dentry *ovl_lookup_temp(struct dentry *workdir)
/* counter is allowed to wrap, since temp dentries are ephemeral */
snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id));
- temp = lookup_one_len(name, workdir, strlen(name));
+ temp = ovl_lookup_upper(ofs, name, workdir, strlen(name));
if (!IS_ERR(temp) && temp->d_inode) {
pr_err("workdir/%s already exists\n", name);
dput(temp);
@@ -70,11 +70,11 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs)
struct inode *wdir = workdir->d_inode;
if (!ofs->whiteout) {
- whiteout = ovl_lookup_temp(workdir);
+ whiteout = ovl_lookup_temp(ofs, workdir);
if (IS_ERR(whiteout))
goto out;
- err = ovl_do_whiteout(wdir, whiteout);
+ err = ovl_do_whiteout(ofs, wdir, whiteout);
if (err) {
dput(whiteout);
whiteout = ERR_PTR(err);
@@ -84,11 +84,11 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs)
}
if (ofs->share_whiteout) {
- whiteout = ovl_lookup_temp(workdir);
+ whiteout = ovl_lookup_temp(ofs, workdir);
if (IS_ERR(whiteout))
goto out;
- err = ovl_do_link(ofs->whiteout, wdir, whiteout);
+ err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout);
if (!err)
goto out;
@@ -122,27 +122,28 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
if (d_is_dir(dentry))
flags = RENAME_EXCHANGE;
- err = ovl_do_rename(wdir, whiteout, dir, dentry, flags);
+ err = ovl_do_rename(ofs, wdir, whiteout, dir, dentry, flags);
if (err)
goto kill_whiteout;
if (flags)
- ovl_cleanup(wdir, dentry);
+ ovl_cleanup(ofs, wdir, dentry);
out:
dput(whiteout);
return err;
kill_whiteout:
- ovl_cleanup(wdir, whiteout);
+ ovl_cleanup(ofs, wdir, whiteout);
goto out;
}
-int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode)
+int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir,
+ struct dentry **newdentry, umode_t mode)
{
int err;
struct dentry *d, *dentry = *newdentry;
- err = ovl_do_mkdir(dir, dentry, mode);
+ err = ovl_do_mkdir(ofs, dir, dentry, mode);
if (err)
return err;
@@ -154,8 +155,8 @@ int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode)
* to it unhashed and negative. If that happens, try to
* lookup a new hashed and positive dentry.
*/
- d = lookup_one_len(dentry->d_name.name, dentry->d_parent,
- dentry->d_name.len);
+ d = ovl_lookup_upper(ofs, dentry->d_name.name, dentry->d_parent,
+ dentry->d_name.len);
if (IS_ERR(d)) {
pr_warn("failed lookup after mkdir (%pd2, err=%i).\n",
dentry, err);
@@ -167,8 +168,8 @@ int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode)
return 0;
}
-struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
- struct ovl_cattr *attr)
+struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir,
+ struct dentry *newdentry, struct ovl_cattr *attr)
{
int err;
@@ -180,28 +181,28 @@ struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
goto out;
if (attr->hardlink) {
- err = ovl_do_link(attr->hardlink, dir, newdentry);
+ err = ovl_do_link(ofs, attr->hardlink, dir, newdentry);
} else {
switch (attr->mode & S_IFMT) {
case S_IFREG:
- err = ovl_do_create(dir, newdentry, attr->mode);
+ err = ovl_do_create(ofs, dir, newdentry, attr->mode);
break;
case S_IFDIR:
/* mkdir is special... */
- err = ovl_mkdir_real(dir, &newdentry, attr->mode);
+ err = ovl_mkdir_real(ofs, dir, &newdentry, attr->mode);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
- err = ovl_do_mknod(dir, newdentry, attr->mode,
+ err = ovl_do_mknod(ofs, dir, newdentry, attr->mode,
attr->rdev);
break;
case S_IFLNK:
- err = ovl_do_symlink(dir, newdentry, attr->link);
+ err = ovl_do_symlink(ofs, dir, newdentry, attr->link);
break;
default:
@@ -223,10 +224,11 @@ out:
return newdentry;
}
-struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr)
+struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
+ struct ovl_cattr *attr)
{
- return ovl_create_real(d_inode(workdir), ovl_lookup_temp(workdir),
- attr);
+ return ovl_create_real(ofs, d_inode(workdir),
+ ovl_lookup_temp(ofs, workdir), attr);
}
static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper,
@@ -330,10 +332,9 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
attr->mode &= ~current_umask();
inode_lock_nested(udir, I_MUTEX_PARENT);
- newdentry = ovl_create_real(udir,
- lookup_one_len(dentry->d_name.name,
- upperdir,
- dentry->d_name.len),
+ newdentry = ovl_create_real(ofs, udir,
+ ovl_lookup_upper(ofs, dentry->d_name.name,
+ upperdir, dentry->d_name.len),
attr);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
@@ -353,7 +354,7 @@ out_unlock:
return err;
out_cleanup:
- ovl_cleanup(udir, newdentry);
+ ovl_cleanup(ofs, udir, newdentry);
dput(newdentry);
goto out_unlock;
}
@@ -361,6 +362,7 @@ out_cleanup:
static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct list_head *list)
{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
@@ -391,12 +393,12 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
if (upper->d_parent->d_inode != udir)
goto out_unlock;
- opaquedir = ovl_create_temp(workdir, OVL_CATTR(stat.mode));
+ opaquedir = ovl_create_temp(ofs, workdir, OVL_CATTR(stat.mode));
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out_unlock;
- err = ovl_copy_xattr(dentry->d_sb, upper, opaquedir);
+ err = ovl_copy_xattr(dentry->d_sb, &upperpath, opaquedir);
if (err)
goto out_cleanup;
@@ -405,17 +407,17 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
goto out_cleanup;
inode_lock(opaquedir->d_inode);
- err = ovl_set_attr(opaquedir, &stat);
+ err = ovl_set_attr(ofs, opaquedir, &stat);
inode_unlock(opaquedir->d_inode);
if (err)
goto out_cleanup;
- err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
+ err = ovl_do_rename(ofs, wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
if (err)
goto out_cleanup;
- ovl_cleanup_whiteouts(upper, list);
- ovl_cleanup(wdir, upper);
+ ovl_cleanup_whiteouts(ofs, upper, list);
+ ovl_cleanup(ofs, wdir, upper);
unlock_rename(workdir, upperdir);
/* dentry's upper doesn't match now, get rid of it */
@@ -424,7 +426,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
return opaquedir;
out_cleanup:
- ovl_cleanup(wdir, opaquedir);
+ ovl_cleanup(ofs, wdir, opaquedir);
dput(opaquedir);
out_unlock:
unlock_rename(workdir, upperdir);
@@ -432,8 +434,8 @@ out:
return ERR_PTR(err);
}
-static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name,
- const struct posix_acl *acl)
+static int ovl_set_upper_acl(struct ovl_fs *ofs, struct dentry *upperdentry,
+ const char *name, const struct posix_acl *acl)
{
void *buffer;
size_t size;
@@ -451,7 +453,7 @@ static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name,
if (err < 0)
goto out_free;
- err = vfs_setxattr(&init_user_ns, upperdentry, name, buffer, size, XATTR_CREATE);
+ err = ovl_do_setxattr(ofs, upperdentry, name, buffer, size, XATTR_CREATE);
out_free:
kfree(buffer);
return err;
@@ -460,6 +462,7 @@ out_free:
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct ovl_cattr *cattr)
{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
@@ -484,8 +487,8 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out;
- upper = lookup_one_len(dentry->d_name.name, upperdir,
- dentry->d_name.len);
+ upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir,
+ dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
@@ -494,7 +497,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper)))
goto out_dput;
- newdentry = ovl_create_temp(workdir, cattr);
+ newdentry = ovl_create_temp(ofs, workdir, cattr);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_dput;
@@ -510,19 +513,19 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
.ia_mode = cattr->mode,
};
inode_lock(newdentry->d_inode);
- err = notify_change(&init_user_ns, newdentry, &attr, NULL);
+ err = ovl_do_notify_change(ofs, newdentry, &attr);
inode_unlock(newdentry->d_inode);
if (err)
goto out_cleanup;
}
if (!hardlink) {
- err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_ACCESS,
- acl);
+ err = ovl_set_upper_acl(ofs, newdentry,
+ XATTR_NAME_POSIX_ACL_ACCESS, acl);
if (err)
goto out_cleanup;
- err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_DEFAULT,
- default_acl);
+ err = ovl_set_upper_acl(ofs, newdentry,
+ XATTR_NAME_POSIX_ACL_DEFAULT, default_acl);
if (err)
goto out_cleanup;
}
@@ -532,20 +535,20 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out_cleanup;
- err = ovl_do_rename(wdir, newdentry, udir, upper,
+ err = ovl_do_rename(ofs, wdir, newdentry, udir, upper,
RENAME_EXCHANGE);
if (err)
goto out_cleanup;
- ovl_cleanup(wdir, upper);
+ ovl_cleanup(ofs, wdir, upper);
} else {
- err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
+ err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
}
err = ovl_instantiate(dentry, inode, newdentry, hardlink);
if (err) {
- ovl_cleanup(udir, newdentry);
+ ovl_cleanup(ofs, udir, newdentry);
dput(newdentry);
}
out_dput:
@@ -560,7 +563,7 @@ out:
return err;
out_cleanup:
- ovl_cleanup(wdir, newdentry);
+ ovl_cleanup(ofs, wdir, newdentry);
dput(newdentry);
goto out_dput;
}
@@ -767,8 +770,8 @@ static int ovl_remove_and_whiteout(struct dentry *dentry,
if (err)
goto out_dput;
- upper = lookup_one_len(dentry->d_name.name, upperdir,
- dentry->d_name.len);
+ upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir,
+ dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
@@ -800,6 +803,7 @@ out:
static int ovl_remove_upper(struct dentry *dentry, bool is_dir,
struct list_head *list)
{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
struct dentry *upper;
@@ -814,8 +818,8 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir,
}
inode_lock_nested(dir, I_MUTEX_PARENT);
- upper = lookup_one_len(dentry->d_name.name, upperdir,
- dentry->d_name.len);
+ upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir,
+ dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
@@ -826,9 +830,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir,
goto out_dput_upper;
if (is_dir)
- err = vfs_rmdir(&init_user_ns, dir, upper);
+ err = ovl_do_rmdir(ofs, dir, upper);
else
- err = vfs_unlink(&init_user_ns, dir, upper, NULL);
+ err = ovl_do_unlink(ofs, dir, upper);
ovl_dir_modified(dentry->d_parent, ovl_type_origin(dentry));
/*
@@ -880,7 +884,6 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
int err;
const struct cred *old_cred;
- struct dentry *upperdentry;
bool lower_positive = ovl_lower_positive(dentry);
LIST_HEAD(list);
@@ -923,9 +926,8 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
* Note: we fail to update ctime if there was no copy-up, only a
* whiteout
*/
- upperdentry = ovl_dentry_upper(dentry);
- if (upperdentry)
- ovl_copyattr(d_inode(upperdentry), d_inode(dentry));
+ if (ovl_dentry_upper(dentry))
+ ovl_copyattr(d_inode(dentry));
out_drop_write:
ovl_drop_write(dentry);
@@ -1095,6 +1097,7 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
bool samedir = olddir == newdir;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
+ struct ovl_fs *ofs = OVL_FS(old->d_sb);
LIST_HEAD(list);
err = -EINVAL;
@@ -1189,8 +1192,8 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
trap = lock_rename(new_upperdir, old_upperdir);
- olddentry = lookup_one_len(old->d_name.name, old_upperdir,
- old->d_name.len);
+ olddentry = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir,
+ old->d_name.len);
err = PTR_ERR(olddentry);
if (IS_ERR(olddentry))
goto out_unlock;
@@ -1199,8 +1202,8 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
if (!ovl_matches_upper(old, olddentry))
goto out_dput_old;
- newdentry = lookup_one_len(new->d_name.name, new_upperdir,
- new->d_name.len);
+ newdentry = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir,
+ new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_dput_old;
@@ -1251,13 +1254,13 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
if (err)
goto out_dput;
- err = ovl_do_rename(old_upperdir->d_inode, olddentry,
+ err = ovl_do_rename(ofs, old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry, flags);
if (err)
goto out_dput;
if (cleanup_whiteout)
- ovl_cleanup(old_upperdir->d_inode, newdentry);
+ ovl_cleanup(ofs, old_upperdir->d_inode, newdentry);
if (overwrite && d_inode(new)) {
if (new_is_dir)
@@ -1272,9 +1275,9 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
(d_inode(new) && ovl_type_origin(new)));
/* copy ctime: */
- ovl_copyattr(d_inode(olddentry), d_inode(old));
+ ovl_copyattr(d_inode(old));
if (d_inode(new) && ovl_dentry_upper(new))
- ovl_copyattr(d_inode(newdentry), d_inode(new));
+ ovl_copyattr(d_inode(new));
out_dput:
dput(newdentry);
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index ebde05c9cf62..2eada97bbd23 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -391,6 +391,11 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected,
* pointer because we hold no lock on the real dentry.
*/
take_dentry_name_snapshot(&name, real);
+ /*
+ * No mnt_userns handling here: it's an internal lookup. Could skip
+ * permission checking altogether, but for now just use non-mnt_userns
+ * transformed ids.
+ */
this = lookup_one_len(name.name.name, connected, name.name.len);
release_dentry_name_snapshot(&name);
err = PTR_ERR(this);
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index fa125feed0ff..daff601b5c41 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -38,9 +38,11 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY)
static struct file *ovl_open_realfile(const struct file *file,
- struct inode *realinode)
+ struct path *realpath)
{
+ struct inode *realinode = d_inode(realpath->dentry);
struct inode *inode = file_inode(file);
+ struct user_namespace *real_mnt_userns;
struct file *realfile;
const struct cred *old_cred;
int flags = file->f_flags | OVL_OPEN_FLAGS;
@@ -51,11 +53,12 @@ static struct file *ovl_open_realfile(const struct file *file,
acc_mode |= MAY_APPEND;
old_cred = ovl_override_creds(inode->i_sb);
- err = inode_permission(&init_user_ns, realinode, MAY_OPEN | acc_mode);
+ real_mnt_userns = mnt_user_ns(realpath->mnt);
+ err = inode_permission(real_mnt_userns, realinode, MAY_OPEN | acc_mode);
if (err) {
realfile = ERR_PTR(err);
} else {
- if (!inode_owner_or_capable(&init_user_ns, realinode))
+ if (!inode_owner_or_capable(real_mnt_userns, realinode))
flags &= ~O_NOATIME;
realfile = open_with_fake_path(&file->f_path, flags, realinode,
@@ -82,11 +85,8 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode))
return -EPERM;
- if (flags & O_DIRECT) {
- if (!file->f_mapping->a_ops ||
- !file->f_mapping->a_ops->direct_IO)
- return -EINVAL;
- }
+ if ((flags & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
if (file->f_op->check_flags) {
err = file->f_op->check_flags(flags);
@@ -104,21 +104,21 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
bool allow_meta)
{
- struct inode *inode = file_inode(file);
- struct inode *realinode;
+ struct dentry *dentry = file_dentry(file);
+ struct path realpath;
real->flags = 0;
real->file = file->private_data;
if (allow_meta)
- realinode = ovl_inode_real(inode);
+ ovl_path_real(dentry, &realpath);
else
- realinode = ovl_inode_realdata(inode);
+ ovl_path_realdata(dentry, &realpath);
/* Has it been copied up since we'd opened it? */
- if (unlikely(file_inode(real->file) != realinode)) {
+ if (unlikely(file_inode(real->file) != d_inode(realpath.dentry))) {
real->flags = FDPUT_FPUT;
- real->file = ovl_open_realfile(file, realinode);
+ real->file = ovl_open_realfile(file, &realpath);
return PTR_ERR_OR_ZERO(real->file);
}
@@ -144,17 +144,20 @@ static int ovl_real_fdget(const struct file *file, struct fd *real)
static int ovl_open(struct inode *inode, struct file *file)
{
+ struct dentry *dentry = file_dentry(file);
struct file *realfile;
+ struct path realpath;
int err;
- err = ovl_maybe_copy_up(file_dentry(file), file->f_flags);
+ err = ovl_maybe_copy_up(dentry, file->f_flags);
if (err)
return err;
/* No longer need these flags, so don't pass them on to underlying fs */
file->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
- realfile = ovl_open_realfile(file, ovl_inode_realdata(inode));
+ ovl_path_realdata(dentry, &realpath);
+ realfile = ovl_open_realfile(file, &realpath);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
@@ -273,7 +276,7 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
__sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb,
SB_FREEZE_WRITE);
file_end_write(iocb->ki_filp);
- ovl_copyattr(ovl_inode_real(inode), inode);
+ ovl_copyattr(inode);
}
orig_iocb->ki_pos = iocb->ki_pos;
@@ -306,8 +309,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ret = -EINVAL;
if (iocb->ki_flags & IOCB_DIRECT &&
- (!real.file->f_mapping->a_ops ||
- !real.file->f_mapping->a_ops->direct_IO))
+ !(real.file->f_mode & FMODE_CAN_ODIRECT))
goto out_fdput;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
@@ -356,7 +358,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
inode_lock(inode);
/* Update mode */
- ovl_copyattr(ovl_inode_real(inode), inode);
+ ovl_copyattr(inode);
ret = file_remove_privs(file);
if (ret)
goto out_unlock;
@@ -367,8 +369,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
ret = -EINVAL;
if (iocb->ki_flags & IOCB_DIRECT &&
- (!real.file->f_mapping->a_ops ||
- !real.file->f_mapping->a_ops->direct_IO))
+ !(real.file->f_mode & FMODE_CAN_ODIRECT))
goto out_fdput;
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
@@ -381,7 +382,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
ovl_iocb_to_rwf(ifl));
file_end_write(real.file);
/* Update size */
- ovl_copyattr(ovl_inode_real(inode), inode);
+ ovl_copyattr(inode);
} else {
struct ovl_aio_req *aio_req;
@@ -431,12 +432,11 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
struct fd real;
const struct cred *old_cred;
struct inode *inode = file_inode(out);
- struct inode *realinode = ovl_inode_real(inode);
ssize_t ret;
inode_lock(inode);
/* Update mode */
- ovl_copyattr(realinode, inode);
+ ovl_copyattr(inode);
ret = file_remove_privs(out);
if (ret)
goto out_unlock;
@@ -452,7 +452,7 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
file_end_write(real.file);
/* Update size */
- ovl_copyattr(realinode, inode);
+ ovl_copyattr(inode);
revert_creds(old_cred);
fdput(real);
@@ -526,7 +526,7 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
revert_creds(old_cred);
/* Update size */
- ovl_copyattr(ovl_inode_real(inode), inode);
+ ovl_copyattr(inode);
fdput(real);
@@ -598,7 +598,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
revert_creds(old_cred);
/* Update size */
- ovl_copyattr(ovl_inode_real(inode_out), inode_out);
+ ovl_copyattr(inode_out);
fdput(real_in);
fdput(real_out);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 1f36158c7dbe..492eddeb481f 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -21,6 +21,7 @@ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *attr)
{
int err;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
bool full_copy_up = false;
struct dentry *upperdentry;
const struct cred *old_cred;
@@ -77,10 +78,10 @@ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
inode_lock(upperdentry->d_inode);
old_cred = ovl_override_creds(dentry->d_sb);
- err = notify_change(&init_user_ns, upperdentry, attr, NULL);
+ err = ovl_do_notify_change(ofs, upperdentry, attr);
revert_creds(old_cred);
if (!err)
- ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
+ ovl_copyattr(dentry->d_inode);
inode_unlock(upperdentry->d_inode);
if (winode)
@@ -279,12 +280,14 @@ int ovl_permission(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
struct inode *upperinode = ovl_inode_upper(inode);
- struct inode *realinode = upperinode ?: ovl_inode_lower(inode);
+ struct inode *realinode;
+ struct path realpath;
const struct cred *old_cred;
int err;
/* Careful in RCU walk mode */
- if (!realinode) {
+ ovl_i_path_real(inode, &realpath);
+ if (!realpath.dentry) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
return -ECHILD;
}
@@ -297,6 +300,7 @@ int ovl_permission(struct user_namespace *mnt_userns,
if (err)
return err;
+ realinode = d_inode(realpath.dentry);
old_cred = ovl_override_creds(inode->i_sb);
if (!upperinode &&
!special_file(realinode->i_mode) && mask & MAY_WRITE) {
@@ -304,7 +308,7 @@ int ovl_permission(struct user_namespace *mnt_userns,
/* Make sure mounter can read file for copy up later */
mask |= MAY_READ;
}
- err = inode_permission(&init_user_ns, realinode, mask);
+ err = inode_permission(mnt_user_ns(realpath.mnt), realinode, mask);
revert_creds(old_cred);
return err;
@@ -342,8 +346,10 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
int err;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *upperdentry = ovl_i_dentry_upper(inode);
struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
+ struct path realpath;
const struct cred *old_cred;
err = ovl_want_write(dentry);
@@ -351,8 +357,9 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
goto out;
if (!value && !upperdentry) {
+ ovl_path_lower(dentry, &realpath);
old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_getxattr(&init_user_ns, realdentry, name, NULL, 0);
+ err = vfs_getxattr(mnt_user_ns(realpath.mnt), realdentry, name, NULL, 0);
revert_creds(old_cred);
if (err < 0)
goto out_drop_write;
@@ -367,17 +374,17 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
}
old_cred = ovl_override_creds(dentry->d_sb);
- if (value)
- err = vfs_setxattr(&init_user_ns, realdentry, name, value, size,
- flags);
- else {
+ if (value) {
+ err = ovl_do_setxattr(ofs, realdentry, name, value, size,
+ flags);
+ } else {
WARN_ON(flags != XATTR_REPLACE);
- err = vfs_removexattr(&init_user_ns, realdentry, name);
+ err = ovl_do_removexattr(ofs, realdentry, name);
}
revert_creds(old_cred);
/* copy c/mtime */
- ovl_copyattr(d_inode(realdentry), inode);
+ ovl_copyattr(inode);
out_drop_write:
ovl_drop_write(dentry);
@@ -390,11 +397,11 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
{
ssize_t res;
const struct cred *old_cred;
- struct dentry *realdentry =
- ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry);
+ struct path realpath;
+ ovl_i_path_real(inode, &realpath);
old_cred = ovl_override_creds(dentry->d_sb);
- res = vfs_getxattr(&init_user_ns, realdentry, name, value, size);
+ res = vfs_getxattr(mnt_user_ns(realpath.mnt), realpath.dentry, name, value, size);
revert_creds(old_cred);
return res;
}
@@ -535,7 +542,7 @@ int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa)
if (err)
return err;
- return vfs_fileattr_set(&init_user_ns, realpath->dentry, fa);
+ return vfs_fileattr_set(mnt_user_ns(realpath->mnt), realpath->dentry, fa);
}
int ovl_fileattr_set(struct user_namespace *mnt_userns,
@@ -579,7 +586,7 @@ int ovl_fileattr_set(struct user_namespace *mnt_userns,
inode_set_flags(inode, flags, OVL_COPY_I_FLAGS_MASK);
/* Update ctime */
- ovl_copyattr(ovl_inode_real(inode), inode);
+ ovl_copyattr(inode);
}
ovl_drop_write(dentry);
out:
@@ -777,16 +784,19 @@ void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
unsigned long ino, int fsid)
{
struct inode *realinode;
+ struct ovl_inode *oi = OVL_I(inode);
if (oip->upperdentry)
- OVL_I(inode)->__upperdentry = oip->upperdentry;
- if (oip->lowerpath && oip->lowerpath->dentry)
- OVL_I(inode)->lower = igrab(d_inode(oip->lowerpath->dentry));
+ oi->__upperdentry = oip->upperdentry;
+ if (oip->lowerpath && oip->lowerpath->dentry) {
+ oi->lowerpath.dentry = dget(oip->lowerpath->dentry);
+ oi->lowerpath.layer = oip->lowerpath->layer;
+ }
if (oip->lowerdata)
- OVL_I(inode)->lowerdata = igrab(d_inode(oip->lowerdata));
+ oi->lowerdata = igrab(d_inode(oip->lowerdata));
realinode = ovl_inode_real(inode);
- ovl_copyattr(realinode, inode);
+ ovl_copyattr(inode);
ovl_copyflags(realinode, inode);
ovl_map_ino(inode, ino, fsid);
}
@@ -871,8 +881,8 @@ static int ovl_set_nlink_common(struct dentry *dentry,
if (WARN_ON(len >= sizeof(buf)))
return -EIO;
- return ovl_do_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry),
- OVL_XATTR_NLINK, buf, len);
+ return ovl_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry),
+ OVL_XATTR_NLINK, buf, len);
}
int ovl_set_nlink_upper(struct dentry *dentry)
@@ -897,8 +907,8 @@ unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1)
return fallback;
- err = ovl_do_getxattr(ofs, upperdentry, OVL_XATTR_NLINK,
- &buf, sizeof(buf) - 1);
+ err = ovl_getxattr_upper(ofs, upperdentry, OVL_XATTR_NLINK,
+ &buf, sizeof(buf) - 1);
if (err < 0)
goto fail;
@@ -1102,6 +1112,10 @@ struct inode *ovl_get_inode(struct super_block *sb,
struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
struct inode *inode;
struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
+ struct path realpath = {
+ .dentry = upperdentry ?: lowerdentry,
+ .mnt = upperdentry ? ovl_upper_mnt(ofs) : lowerpath->layer->mnt,
+ };
bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
oip->index);
int fsid = bylower ? lowerpath->layer->fsid : 0;
@@ -1175,7 +1189,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
/* Check for non-merge dir that may have whiteouts */
if (is_dir) {
if (((upperdentry && lowerdentry) || oip->numlower > 1) ||
- ovl_check_origin_xattr(ofs, upperdentry ?: lowerdentry)) {
+ ovl_path_check_origin_xattr(ofs, &realpath)) {
ovl_set_flag(OVL_WHITEOUTS, inode);
}
}
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 1a9b515fc45d..65c4346a5b43 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -16,6 +16,7 @@
struct ovl_lookup_data {
struct super_block *sb;
+ struct vfsmount *mnt;
struct qstr name;
bool is_dir;
bool opaque;
@@ -25,14 +26,14 @@ struct ovl_lookup_data {
bool metacopy;
};
-static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d,
+static int ovl_check_redirect(struct path *path, struct ovl_lookup_data *d,
size_t prelen, const char *post)
{
int res;
char *buf;
struct ovl_fs *ofs = OVL_FS(d->sb);
- buf = ovl_get_redirect_xattr(ofs, dentry, prelen + strlen(post));
+ buf = ovl_get_redirect_xattr(ofs, path, prelen + strlen(post));
if (IS_ERR_OR_NULL(buf))
return PTR_ERR(buf);
@@ -105,13 +106,13 @@ int ovl_check_fb_len(struct ovl_fb *fb, int fb_len)
return 0;
}
-static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *dentry,
+static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *upperdentry,
enum ovl_xattr ox)
{
int res, err;
struct ovl_fh *fh = NULL;
- res = ovl_do_getxattr(ofs, dentry, ox, NULL, 0);
+ res = ovl_getxattr_upper(ofs, upperdentry, ox, NULL, 0);
if (res < 0) {
if (res == -ENODATA || res == -EOPNOTSUPP)
return NULL;
@@ -125,7 +126,7 @@ static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *dentry,
if (!fh)
return ERR_PTR(-ENOMEM);
- res = ovl_do_getxattr(ofs, dentry, ox, fh->buf, res);
+ res = ovl_getxattr_upper(ofs, upperdentry, ox, fh->buf, res);
if (res < 0)
goto fail;
@@ -193,16 +194,17 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
return real;
}
-static bool ovl_is_opaquedir(struct super_block *sb, struct dentry *dentry)
+static bool ovl_is_opaquedir(struct ovl_fs *ofs, struct path *path)
{
- return ovl_check_dir_xattr(sb, dentry, OVL_XATTR_OPAQUE);
+ return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE);
}
-static struct dentry *ovl_lookup_positive_unlocked(const char *name,
+static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d,
+ const char *name,
struct dentry *base, int len,
bool drop_negative)
{
- struct dentry *ret = lookup_one_len_unlocked(name, base, len);
+ struct dentry *ret = lookup_one_unlocked(mnt_user_ns(d->mnt), name, base, len);
if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
if (drop_negative && ret->d_lockref.count == 1) {
@@ -224,10 +226,11 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
struct dentry **ret, bool drop_negative)
{
struct dentry *this;
+ struct path path;
int err;
bool last_element = !post[0];
- this = ovl_lookup_positive_unlocked(name, base, namelen, drop_negative);
+ this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative);
if (IS_ERR(this)) {
err = PTR_ERR(this);
this = NULL;
@@ -253,12 +256,15 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
d->stop = true;
goto put_and_out;
}
+
+ path.dentry = this;
+ path.mnt = d->mnt;
if (!d_can_lookup(this)) {
if (d->is_dir || !last_element) {
d->stop = true;
goto put_and_out;
}
- err = ovl_check_metacopy_xattr(OVL_FS(d->sb), this);
+ err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path);
if (err < 0)
goto out_err;
@@ -278,14 +284,14 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
if (d->last)
goto out;
- if (ovl_is_opaquedir(d->sb, this)) {
+ if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) {
d->stop = true;
if (last_element)
d->opaque = true;
goto out;
}
}
- err = ovl_check_redirect(this, d, prelen, post);
+ err = ovl_check_redirect(&path, d, prelen, post);
if (err)
goto out_err;
out:
@@ -464,7 +470,7 @@ int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
err = ovl_verify_fh(ofs, dentry, ox, fh);
if (set && err == -ENODATA)
- err = ovl_do_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len);
+ err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len);
if (err)
goto fail;
@@ -704,7 +710,8 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
if (err)
return ERR_PTR(err);
- index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len);
+ index = lookup_one_positive_unlocked(ovl_upper_mnt_userns(ofs), name.name,
+ ofs->indexdir, name.len);
if (IS_ERR(index)) {
err = PTR_ERR(index);
if (err == -ENOENT) {
@@ -856,6 +863,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
old_cred = ovl_override_creds(dentry->d_sb);
upperdir = ovl_dentry_upper(dentry->d_parent);
if (upperdir) {
+ d.mnt = ovl_upper_mnt(ofs);
err = ovl_lookup_layer(upperdir, &d, &upperdentry, true);
if (err)
goto out;
@@ -911,6 +919,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
else
d.last = lower.layer->idx == roe->numlower;
+ d.mnt = lower.layer->mnt;
err = ovl_lookup_layer(lower.dentry, &d, &this, false);
if (err)
goto out_put;
@@ -1071,14 +1080,18 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (upperdentry)
ovl_dentry_set_upper_alias(dentry);
else if (index) {
- upperdentry = dget(index);
- upperredirect = ovl_get_redirect_xattr(ofs, upperdentry, 0);
+ struct path upperpath = {
+ .dentry = upperdentry = dget(index),
+ .mnt = ovl_upper_mnt(ofs),
+ };
+
+ upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0);
if (IS_ERR(upperredirect)) {
err = PTR_ERR(upperredirect);
upperredirect = NULL;
goto out_free_oe;
}
- err = ovl_check_metacopy_xattr(ofs, upperdentry);
+ err = ovl_check_metacopy_xattr(ofs, &upperpath);
if (err < 0)
goto out_free_oe;
uppermetacopy = err;
@@ -1163,8 +1176,8 @@ bool ovl_lower_positive(struct dentry *dentry)
struct dentry *this;
struct dentry *lowerdir = poe->lowerstack[i].dentry;
- this = lookup_positive_unlocked(name->name, lowerdir,
- name->len);
+ this = lookup_one_positive_unlocked(mnt_user_ns(poe->lowerstack[i].layer->mnt),
+ name->name, lowerdir, name->len);
if (IS_ERR(this)) {
switch (PTR_ERR(this)) {
case -ENOENT:
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 2cd5741c873b..4f34b7e02eee 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -7,6 +7,7 @@
#include <linux/kernel.h>
#include <linux/uuid.h>
#include <linux/fs.h>
+#include <linux/namei.h>
#include "ovl_entry.h"
#undef pr_fmt
@@ -122,109 +123,180 @@ static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox)
return ovl_xattr_table[ox][ofs->config.userxattr];
}
-static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
+/*
+ * When changing ownership of an upper object map the intended ownership
+ * according to the upper layer's idmapping. When an upper mount idmaps files
+ * that are stored on-disk as owned by id 1001 to id 1000 this means stat on
+ * this object will report it as being owned by id 1000 when calling stat via
+ * the upper mount.
+ * In order to change ownership of an object so stat reports id 1000 when
+ * called on an idmapped upper mount the value written to disk - i.e., the
+ * value stored in ia_*id - must 1001. The mount mapping helper will thus take
+ * care to map 1000 to 1001.
+ * The mnt idmapping helpers are nops if the upper layer isn't idmapped.
+ */
+static inline int ovl_do_notify_change(struct ovl_fs *ofs,
+ struct dentry *upperdentry,
+ struct iattr *attr)
+{
+ struct user_namespace *upper_mnt_userns = ovl_upper_mnt_userns(ofs);
+ struct user_namespace *fs_userns = i_user_ns(d_inode(upperdentry));
+
+ if (attr->ia_valid & ATTR_UID)
+ attr->ia_uid = mapped_kuid_user(upper_mnt_userns,
+ fs_userns, attr->ia_uid);
+ if (attr->ia_valid & ATTR_GID)
+ attr->ia_gid = mapped_kgid_user(upper_mnt_userns,
+ fs_userns, attr->ia_gid);
+
+ return notify_change(upper_mnt_userns, upperdentry, attr, NULL);
+}
+
+static inline int ovl_do_rmdir(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry)
{
- int err = vfs_rmdir(&init_user_ns, dir, dentry);
+ int err = vfs_rmdir(ovl_upper_mnt_userns(ofs), dir, dentry);
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
}
-static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
+static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir,
+ struct dentry *dentry)
{
- int err = vfs_unlink(&init_user_ns, dir, dentry, NULL);
+ int err = vfs_unlink(ovl_upper_mnt_userns(ofs), dir, dentry, NULL);
pr_debug("unlink(%pd2) = %i\n", dentry, err);
return err;
}
-static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
- struct dentry *new_dentry)
+static inline int ovl_do_link(struct ovl_fs *ofs, struct dentry *old_dentry,
+ struct inode *dir, struct dentry *new_dentry)
{
- int err = vfs_link(old_dentry, &init_user_ns, dir, new_dentry, NULL);
+ int err = vfs_link(old_dentry, ovl_upper_mnt_userns(ofs), dir, new_dentry, NULL);
pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err);
return err;
}
-static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
+static inline int ovl_do_create(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry,
umode_t mode)
{
- int err = vfs_create(&init_user_ns, dir, dentry, mode, true);
+ int err = vfs_create(ovl_upper_mnt_userns(ofs), dir, dentry, mode, true);
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
-static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
+static inline int ovl_do_mkdir(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry,
umode_t mode)
{
- int err = vfs_mkdir(&init_user_ns, dir, dentry, mode);
+ int err = vfs_mkdir(ovl_upper_mnt_userns(ofs), dir, dentry, mode);
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
-static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
+static inline int ovl_do_mknod(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t dev)
{
- int err = vfs_mknod(&init_user_ns, dir, dentry, mode, dev);
+ int err = vfs_mknod(ovl_upper_mnt_userns(ofs), dir, dentry, mode, dev);
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err);
return err;
}
-static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
+static inline int ovl_do_symlink(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry,
const char *oldname)
{
- int err = vfs_symlink(&init_user_ns, dir, dentry, oldname);
+ int err = vfs_symlink(ovl_upper_mnt_userns(ofs), dir, dentry, oldname);
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
}
-static inline ssize_t ovl_do_getxattr(struct ovl_fs *ofs, struct dentry *dentry,
- enum ovl_xattr ox, void *value,
- size_t size)
+static inline ssize_t ovl_do_getxattr(struct path *path, const char *name,
+ void *value, size_t size)
{
- const char *name = ovl_xattr(ofs, ox);
- int err = vfs_getxattr(&init_user_ns, dentry, name, value, size);
- int len = (value && err > 0) ? err : 0;
+ int err, len;
+
+ WARN_ON(path->dentry->d_sb != path->mnt->mnt_sb);
+
+ err = vfs_getxattr(mnt_user_ns(path->mnt), path->dentry,
+ name, value, size);
+ len = (value && err > 0) ? err : 0;
pr_debug("getxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n",
- dentry, name, min(len, 48), value, size, err);
+ path->dentry, name, min(len, 48), value, size, err);
return err;
}
+static inline ssize_t ovl_getxattr_upper(struct ovl_fs *ofs,
+ struct dentry *upperdentry,
+ enum ovl_xattr ox, void *value,
+ size_t size)
+{
+ struct path upperpath = {
+ .dentry = upperdentry,
+ .mnt = ovl_upper_mnt(ofs),
+ };
+
+ return ovl_do_getxattr(&upperpath, ovl_xattr(ofs, ox), value, size);
+}
+
+static inline ssize_t ovl_path_getxattr(struct ovl_fs *ofs,
+ struct path *path,
+ enum ovl_xattr ox, void *value,
+ size_t size)
+{
+ return ovl_do_getxattr(path, ovl_xattr(ofs, ox), value, size);
+}
+
static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
- enum ovl_xattr ox, const void *value,
- size_t size)
+ const char *name, const void *value,
+ size_t size, int flags)
{
- const char *name = ovl_xattr(ofs, ox);
- int err = vfs_setxattr(&init_user_ns, dentry, name, value, size, 0);
- pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n",
- dentry, name, min((int)size, 48), value, size, err);
+ int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, value, size, flags);
+
+ pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n",
+ dentry, name, min((int)size, 48), value, size, flags, err);
return err;
}
+static inline int ovl_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, const void *value,
+ size_t size)
+{
+ return ovl_do_setxattr(ofs, dentry, ovl_xattr(ofs, ox), value, size, 0);
+}
+
static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry,
- enum ovl_xattr ox)
+ const char *name)
{
- const char *name = ovl_xattr(ofs, ox);
- int err = vfs_removexattr(&init_user_ns, dentry, name);
+ int err = vfs_removexattr(ovl_upper_mnt_userns(ofs), dentry, name);
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
return err;
}
-static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
- struct inode *newdir, struct dentry *newdentry,
- unsigned int flags)
+static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox)
+{
+ return ovl_do_removexattr(ofs, dentry, ovl_xattr(ofs, ox));
+}
+
+static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir,
+ struct dentry *olddentry, struct inode *newdir,
+ struct dentry *newdentry, unsigned int flags)
{
int err;
struct renamedata rd = {
- .old_mnt_userns = &init_user_ns,
+ .old_mnt_userns = ovl_upper_mnt_userns(ofs),
.old_dir = olddir,
.old_dentry = olddentry,
- .new_mnt_userns = &init_user_ns,
+ .new_mnt_userns = ovl_upper_mnt_userns(ofs),
.new_dir = newdir,
.new_dentry = newdentry,
.flags = flags,
@@ -239,22 +311,31 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
return err;
}
-static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
+static inline int ovl_do_whiteout(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry)
{
- int err = vfs_whiteout(&init_user_ns, dir, dentry);
+ int err = vfs_whiteout(ovl_upper_mnt_userns(ofs), dir, dentry);
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
return err;
}
-static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode)
+static inline struct dentry *ovl_do_tmpfile(struct ovl_fs *ofs,
+ struct dentry *dentry, umode_t mode)
{
- struct dentry *ret = vfs_tmpfile(&init_user_ns, dentry, mode, 0);
+ struct dentry *ret = vfs_tmpfile(ovl_upper_mnt_userns(ofs), dentry, mode, 0);
int err = PTR_ERR_OR_ZERO(ret);
pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err);
return ret;
}
+static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs,
+ const char *name,
+ struct dentry *base, int len)
+{
+ return lookup_one(ovl_upper_mnt_userns(ofs), name, base, len);
+}
+
static inline bool ovl_open_flags_need_copy_up(int flags)
{
if (!flags)
@@ -293,10 +374,13 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
void ovl_path_lowerdata(struct dentry *dentry, struct path *path);
+void ovl_i_path_real(struct inode *inode, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
+enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_lowerdata(struct dentry *dentry);
+const struct ovl_layer *ovl_i_layer_lower(struct inode *inode);
const struct ovl_layer *ovl_layer_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_i_dentry_upper(struct inode *inode);
@@ -330,9 +414,20 @@ struct file *ovl_path_open(struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry, int flags);
void ovl_copy_up_end(struct dentry *dentry);
bool ovl_already_copied_up(struct dentry *dentry, int flags);
-bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry);
-bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry,
- enum ovl_xattr ox);
+bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, struct path *path,
+ enum ovl_xattr ox);
+bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path);
+
+static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs,
+ struct dentry *upperdentry)
+{
+ struct path upperpath = {
+ .dentry = upperdentry,
+ .mnt = ovl_upper_mnt(ofs),
+ };
+ return ovl_path_check_origin_xattr(ofs, &upperpath);
+}
+
int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
enum ovl_xattr ox, const void *value, size_t size,
int xerr);
@@ -344,10 +439,9 @@ bool ovl_need_index(struct dentry *dentry);
int ovl_nlink_start(struct dentry *dentry);
void ovl_nlink_end(struct dentry *dentry);
int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
-int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry);
+int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct path *path);
bool ovl_is_metacopy_dentry(struct dentry *dentry);
-char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry,
- int padding);
+char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct path *path, int padding);
int ovl_sync_status(struct ovl_fs *ofs);
static inline void ovl_set_flag(unsigned long flag, struct inode *inode)
@@ -366,9 +460,15 @@ static inline bool ovl_test_flag(unsigned long flag, struct inode *inode)
}
static inline bool ovl_is_impuredir(struct super_block *sb,
- struct dentry *dentry)
+ struct dentry *upperdentry)
{
- return ovl_check_dir_xattr(sb, dentry, OVL_XATTR_IMPURE);
+ struct ovl_fs *ofs = OVL_FS(sb);
+ struct path upperpath = {
+ .dentry = upperdentry,
+ .mnt = ovl_upper_mnt(ofs),
+ };
+
+ return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE);
}
/*
@@ -461,12 +561,13 @@ static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index,
extern const struct file_operations ovl_dir_operations;
struct file *ovl_dir_real_file(const struct file *file, bool want_upper);
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
-void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
+void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
+ struct list_head *list);
void ovl_cache_free(struct list_head *list);
void ovl_dir_cache_free(struct inode *inode);
int ovl_check_d_type_supported(struct path *realpath);
-int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
- struct dentry *dentry, int level);
+int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
+ struct vfsmount *mnt, struct dentry *dentry, int level);
int ovl_indexdir_cleanup(struct ovl_fs *ofs);
/*
@@ -520,16 +621,7 @@ bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir);
struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir);
struct inode *ovl_get_inode(struct super_block *sb,
struct ovl_inode_params *oip);
-static inline void ovl_copyattr(struct inode *from, struct inode *to)
-{
- to->i_uid = from->i_uid;
- to->i_gid = from->i_gid;
- to->i_mode = from->i_mode;
- to->i_atime = from->i_atime;
- to->i_mtime = from->i_mtime;
- to->i_ctime = from->i_ctime;
- i_size_write(to, i_size_read(from));
-}
+void ovl_copyattr(struct inode *to);
/* vfs inode flags copied from real to ovl inode */
#define OVL_COPY_I_FLAGS_MASK (S_SYNC | S_NOATIME | S_APPEND | S_IMMUTABLE)
@@ -570,12 +662,15 @@ struct ovl_cattr {
#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })
-int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode);
-struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
+int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir,
+ struct dentry **newdentry, umode_t mode);
+struct dentry *ovl_create_real(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *newdentry,
+ struct ovl_cattr *attr);
+int ovl_cleanup(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry);
+struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir);
+struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
struct ovl_cattr *attr);
-int ovl_cleanup(struct inode *dir, struct dentry *dentry);
-struct dentry *ovl_lookup_temp(struct dentry *workdir);
-struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr);
/* file.c */
extern const struct file_operations ovl_file_operations;
@@ -591,9 +686,8 @@ int ovl_fileattr_set(struct user_namespace *mnt_userns,
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_with_data(struct dentry *dentry);
int ovl_maybe_copy_up(struct dentry *dentry, int flags);
-int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
- struct dentry *new);
-int ovl_set_attr(struct dentry *upper, struct kstat *stat);
+int ovl_copy_xattr(struct super_block *sb, struct path *path, struct dentry *new);
+int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat);
struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
bool is_upper);
int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower,
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 63efee554f69..e1af8f660698 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -90,6 +90,11 @@ static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs)
return ofs->layers[0].mnt;
}
+static inline struct user_namespace *ovl_upper_mnt_userns(struct ovl_fs *ofs)
+{
+ return mnt_user_ns(ovl_upper_mnt(ofs));
+}
+
static inline struct ovl_fs *OVL_FS(struct super_block *sb)
{
return (struct ovl_fs *)sb->s_fs_info;
@@ -129,7 +134,7 @@ struct ovl_inode {
unsigned long flags;
struct inode vfs_inode;
struct dentry *__upperdentry;
- struct inode *lower;
+ struct ovl_path lowerpath;
/* synchronize copy up and more */
struct mutex lock;
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 150fdf3bc68d..78f62cc1797b 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -264,11 +264,11 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name,
return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);
}
-static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
+static int ovl_check_whiteouts(struct path *path, struct ovl_readdir_data *rdd)
{
int err;
struct ovl_cache_entry *p;
- struct dentry *dentry;
+ struct dentry *dentry, *dir = path->dentry;
const struct cred *old_cred;
old_cred = ovl_override_creds(rdd->dentry->d_sb);
@@ -278,7 +278,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
while (rdd->first_maybe_whiteout) {
p = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
- dentry = lookup_one_len(p->name, dir, p->len);
+ dentry = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
@@ -312,7 +312,7 @@ static inline int ovl_dir_read(struct path *realpath,
} while (!err && rdd->count);
if (!err && rdd->first_maybe_whiteout && rdd->dentry)
- err = ovl_check_whiteouts(realpath->dentry, rdd);
+ err = ovl_check_whiteouts(realpath, rdd);
fput(realfile);
@@ -479,7 +479,7 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p)
goto get;
}
}
- this = lookup_one_len(p->name, dir, p->len);
+ this = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len);
if (IS_ERR_OR_NULL(this) || !this->d_inode) {
/* Mark a stale entry */
p->is_whiteout = true;
@@ -623,8 +623,8 @@ static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path)
* Removing the "impure" xattr is best effort.
*/
if (!ovl_want_write(dentry)) {
- ovl_do_removexattr(ofs, ovl_dentry_upper(dentry),
- OVL_XATTR_IMPURE);
+ ovl_removexattr(ofs, ovl_dentry_upper(dentry),
+ OVL_XATTR_IMPURE);
ovl_drop_write(dentry);
}
ovl_clear_flag(OVL_IMPURE, d_inode(dentry));
@@ -1001,7 +1001,8 @@ del_entry:
return err;
}
-void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
+void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
+ struct list_head *list)
{
struct ovl_cache_entry *p;
@@ -1012,7 +1013,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
if (WARN_ON(!p->is_whiteout || !p->is_upper))
continue;
- dentry = lookup_one_len(p->name, upper, p->len);
+ dentry = ovl_lookup_upper(ofs, p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
@@ -1020,7 +1021,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
continue;
}
if (dentry->d_inode)
- ovl_cleanup(upper->d_inode, dentry);
+ ovl_cleanup(ofs, upper->d_inode, dentry);
dput(dentry);
}
inode_unlock(upper->d_inode);
@@ -1064,7 +1065,8 @@ int ovl_check_d_type_supported(struct path *realpath)
#define OVL_INCOMPATDIR_NAME "incompat"
-static int ovl_workdir_cleanup_recurse(struct path *path, int level)
+static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, struct path *path,
+ int level)
{
int err;
struct inode *dir = path->dentry->d_inode;
@@ -1111,11 +1113,11 @@ static int ovl_workdir_cleanup_recurse(struct path *path, int level)
err = -EINVAL;
break;
}
- dentry = lookup_one_len(p->name, path->dentry, p->len);
+ dentry = ovl_lookup_upper(ofs, p->name, path->dentry, p->len);
if (IS_ERR(dentry))
continue;
if (dentry->d_inode)
- err = ovl_workdir_cleanup(dir, path->mnt, dentry, level);
+ err = ovl_workdir_cleanup(ofs, dir, path->mnt, dentry, level);
dput(dentry);
if (err)
break;
@@ -1126,24 +1128,24 @@ out:
return err;
}
-int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
- struct dentry *dentry, int level)
+int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
+ struct vfsmount *mnt, struct dentry *dentry, int level)
{
int err;
if (!d_is_dir(dentry) || level > 1) {
- return ovl_cleanup(dir, dentry);
+ return ovl_cleanup(ofs, dir, dentry);
}
- err = ovl_do_rmdir(dir, dentry);
+ err = ovl_do_rmdir(ofs, dir, dentry);
if (err) {
struct path path = { .mnt = mnt, .dentry = dentry };
inode_unlock(dir);
- err = ovl_workdir_cleanup_recurse(&path, level + 1);
+ err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1);
inode_lock_nested(dir, I_MUTEX_PARENT);
if (!err)
- err = ovl_cleanup(dir, dentry);
+ err = ovl_cleanup(ofs, dir, dentry);
}
return err;
@@ -1179,7 +1181,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
if (p->len == 2 && p->name[1] == '.')
continue;
}
- index = lookup_one_len(p->name, indexdir, p->len);
+ index = ovl_lookup_upper(ofs, p->name, indexdir, p->len);
if (IS_ERR(index)) {
err = PTR_ERR(index);
index = NULL;
@@ -1187,7 +1189,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
}
/* Cleanup leftover from index create/cleanup attempt */
if (index->d_name.name[0] == '#') {
- err = ovl_workdir_cleanup(dir, path.mnt, index, 1);
+ err = ovl_workdir_cleanup(ofs, dir, path.mnt, index, 1);
if (err)
break;
goto next;
@@ -1197,7 +1199,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
goto next;
} else if (err == -ESTALE) {
/* Cleanup stale index entries */
- err = ovl_cleanup(dir, index);
+ err = ovl_cleanup(ofs, dir, index);
} else if (err != -ENOENT) {
/*
* Abort mount to avoid corrupting the index if
@@ -1213,7 +1215,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
err = ovl_cleanup_and_whiteout(ofs, dir, index);
} else {
/* Cleanup orphan index entries */
- err = ovl_cleanup(dir, index);
+ err = ovl_cleanup(ofs, dir, index);
}
if (err)
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 001cdbb8f015..e0a2e0468ee7 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -184,7 +184,8 @@ static struct inode *ovl_alloc_inode(struct super_block *sb)
oi->version = 0;
oi->flags = 0;
oi->__upperdentry = NULL;
- oi->lower = NULL;
+ oi->lowerpath.dentry = NULL;
+ oi->lowerpath.layer = NULL;
oi->lowerdata = NULL;
mutex_init(&oi->lock);
@@ -205,7 +206,7 @@ static void ovl_destroy_inode(struct inode *inode)
struct ovl_inode *oi = OVL_I(inode);
dput(oi->__upperdentry);
- iput(oi->lower);
+ dput(oi->lowerpath.dentry);
if (S_ISDIR(inode->i_mode))
ovl_dir_cache_free(inode);
else
@@ -761,7 +762,7 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs,
inode_lock_nested(dir, I_MUTEX_PARENT);
retry:
- work = lookup_one_len(name, ofs->workbasedir, strlen(name));
+ work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name));
if (!IS_ERR(work)) {
struct iattr attr = {
@@ -778,7 +779,7 @@ retry:
goto out_unlock;
retried = true;
- err = ovl_workdir_cleanup(dir, mnt, work, 0);
+ err = ovl_workdir_cleanup(ofs, dir, mnt, work, 0);
dput(work);
if (err == -EINVAL) {
work = ERR_PTR(err);
@@ -787,7 +788,7 @@ retry:
goto retry;
}
- err = ovl_mkdir_real(dir, &work, attr.ia_mode);
+ err = ovl_mkdir_real(ofs, dir, &work, attr.ia_mode);
if (err)
goto out_dput;
@@ -809,19 +810,19 @@ retry:
* allowed as upper are limited to "normal" ones, where checking
* for the above two errors is sufficient.
*/
- err = vfs_removexattr(&init_user_ns, work,
- XATTR_NAME_POSIX_ACL_DEFAULT);
+ err = ovl_do_removexattr(ofs, work,
+ XATTR_NAME_POSIX_ACL_DEFAULT);
if (err && err != -ENODATA && err != -EOPNOTSUPP)
goto out_dput;
- err = vfs_removexattr(&init_user_ns, work,
- XATTR_NAME_POSIX_ACL_ACCESS);
+ err = ovl_do_removexattr(ofs, work,
+ XATTR_NAME_POSIX_ACL_ACCESS);
if (err && err != -ENODATA && err != -EOPNOTSUPP)
goto out_dput;
/* Clear any inherited mode bits */
inode_lock(work->d_inode);
- err = notify_change(&init_user_ns, work, &attr, NULL);
+ err = ovl_do_notify_change(ofs, work, &attr);
inode_unlock(work->d_inode);
if (err)
goto out_dput;
@@ -873,10 +874,6 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path)
pr_err("filesystem on '%s' not supported\n", name);
goto out_put;
}
- if (is_idmapped_mnt(path->mnt)) {
- pr_err("idmapped layers are currently not supported\n");
- goto out_put;
- }
if (!d_is_dir(path->dentry)) {
pr_err("'%s' not a directory\n", name);
goto out_put;
@@ -1256,8 +1253,9 @@ out:
* Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported and
* negative values if error is encountered.
*/
-static int ovl_check_rename_whiteout(struct dentry *workdir)
+static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
{
+ struct dentry *workdir = ofs->workdir;
struct inode *dir = d_inode(workdir);
struct dentry *temp;
struct dentry *dest;
@@ -1267,12 +1265,12 @@ static int ovl_check_rename_whiteout(struct dentry *workdir)
inode_lock_nested(dir, I_MUTEX_PARENT);
- temp = ovl_create_temp(workdir, OVL_CATTR(S_IFREG | 0));
+ temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0));
err = PTR_ERR(temp);
if (IS_ERR(temp))
goto out_unlock;
- dest = ovl_lookup_temp(workdir);
+ dest = ovl_lookup_temp(ofs, workdir);
err = PTR_ERR(dest);
if (IS_ERR(dest)) {
dput(temp);
@@ -1281,14 +1279,14 @@ static int ovl_check_rename_whiteout(struct dentry *workdir)
/* Name is inline and stable - using snapshot as a copy helper */
take_dentry_name_snapshot(&name, temp);
- err = ovl_do_rename(dir, temp, dir, dest, RENAME_WHITEOUT);
+ err = ovl_do_rename(ofs, dir, temp, dir, dest, RENAME_WHITEOUT);
if (err) {
if (err == -EINVAL)
err = 0;
goto cleanup_temp;
}
- whiteout = lookup_one_len(name.name.name, workdir, name.name.len);
+ whiteout = ovl_lookup_upper(ofs, name.name.name, workdir, name.name.len);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
goto cleanup_temp;
@@ -1297,11 +1295,11 @@ static int ovl_check_rename_whiteout(struct dentry *workdir)
/* Best effort cleanup of whiteout and temp file */
if (err)
- ovl_cleanup(dir, whiteout);
+ ovl_cleanup(ofs, dir, whiteout);
dput(whiteout);
cleanup_temp:
- ovl_cleanup(dir, temp);
+ ovl_cleanup(ofs, dir, temp);
release_dentry_name_snapshot(&name);
dput(temp);
dput(dest);
@@ -1312,16 +1310,17 @@ out_unlock:
return err;
}
-static struct dentry *ovl_lookup_or_create(struct dentry *parent,
+static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs,
+ struct dentry *parent,
const char *name, umode_t mode)
{
size_t len = strlen(name);
struct dentry *child;
inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
- child = lookup_one_len(name, parent, len);
+ child = ovl_lookup_upper(ofs, name, parent, len);
if (!IS_ERR(child) && !child->d_inode)
- child = ovl_create_real(parent->d_inode, child,
+ child = ovl_create_real(ofs, parent->d_inode, child,
OVL_CATTR(mode));
inode_unlock(parent->d_inode);
dput(parent);
@@ -1343,7 +1342,7 @@ static int ovl_create_volatile_dirty(struct ovl_fs *ofs)
const char *const *name = volatile_path;
for (ctr = ARRAY_SIZE(volatile_path); ctr; ctr--, name++) {
- d = ovl_lookup_or_create(d, *name, ctr > 1 ? S_IFDIR : S_IFREG);
+ d = ovl_lookup_or_create(ofs, d, *name, ctr > 1 ? S_IFDIR : S_IFREG);
if (IS_ERR(d))
return PTR_ERR(d);
}
@@ -1391,7 +1390,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
pr_warn("upper fs needs to support d_type.\n");
/* Check if upper/work fs supports O_TMPFILE */
- temp = ovl_do_tmpfile(ofs->workdir, S_IFREG | 0);
+ temp = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0);
ofs->tmpfile = !IS_ERR(temp);
if (ofs->tmpfile)
dput(temp);
@@ -1400,7 +1399,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
/* Check if upper/work fs supports RENAME_WHITEOUT */
- err = ovl_check_rename_whiteout(ofs->workdir);
+ err = ovl_check_rename_whiteout(ofs);
if (err < 0)
goto out;
@@ -1411,7 +1410,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
/*
* Check if upper/work fs supports (trusted|user).overlay.* xattr
*/
- err = ovl_do_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1);
+ err = ovl_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1);
if (err) {
ofs->noxattr = true;
if (ofs->config.index || ofs->config.metacopy) {
@@ -1429,7 +1428,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
}
err = 0;
} else {
- ovl_do_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE);
+ ovl_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE);
}
/*
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index f48284a2a896..87f811c089e4 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -194,6 +194,20 @@ enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
return type;
}
+enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path)
+{
+ enum ovl_path_type type = ovl_path_type(dentry);
+
+ WARN_ON_ONCE(d_is_dir(dentry));
+
+ if (!OVL_TYPE_UPPER(type) || OVL_TYPE_MERGE(type))
+ ovl_path_lowerdata(dentry, path);
+ else
+ ovl_path_upper(dentry, path);
+
+ return type;
+}
+
struct dentry *ovl_dentry_upper(struct dentry *dentry)
{
return ovl_upperdentry_dereference(OVL_I(d_inode(dentry)));
@@ -236,6 +250,17 @@ struct dentry *ovl_i_dentry_upper(struct inode *inode)
return ovl_upperdentry_dereference(OVL_I(inode));
}
+void ovl_i_path_real(struct inode *inode, struct path *path)
+{
+ path->dentry = ovl_i_dentry_upper(inode);
+ if (!path->dentry) {
+ path->dentry = OVL_I(inode)->lowerpath.dentry;
+ path->mnt = OVL_I(inode)->lowerpath.layer->mnt;
+ } else {
+ path->mnt = ovl_upper_mnt(OVL_FS(inode->i_sb));
+ }
+}
+
struct inode *ovl_inode_upper(struct inode *inode)
{
struct dentry *upperdentry = ovl_i_dentry_upper(inode);
@@ -245,7 +270,9 @@ struct inode *ovl_inode_upper(struct inode *inode)
struct inode *ovl_inode_lower(struct inode *inode)
{
- return OVL_I(inode)->lower;
+ struct dentry *lowerdentry = OVL_I(inode)->lowerpath.dentry;
+
+ return lowerdentry ? d_inode(lowerdentry) : NULL;
}
struct inode *ovl_inode_real(struct inode *inode)
@@ -443,7 +470,7 @@ static void ovl_dir_version_inc(struct dentry *dentry, bool impurity)
void ovl_dir_modified(struct dentry *dentry, bool impurity)
{
/* Copy mtime/ctime */
- ovl_copyattr(d_inode(ovl_dentry_upper(dentry)), d_inode(dentry));
+ ovl_copyattr(d_inode(dentry));
ovl_dir_version_inc(dentry, impurity);
}
@@ -466,6 +493,7 @@ bool ovl_is_whiteout(struct dentry *dentry)
struct file *ovl_path_open(struct path *path, int flags)
{
struct inode *inode = d_inode(path->dentry);
+ struct user_namespace *real_mnt_userns = mnt_user_ns(path->mnt);
int err, acc_mode;
if (flags & ~(O_ACCMODE | O_LARGEFILE))
@@ -482,12 +510,12 @@ struct file *ovl_path_open(struct path *path, int flags)
BUG();
}
- err = inode_permission(&init_user_ns, inode, acc_mode | MAY_OPEN);
+ err = inode_permission(real_mnt_userns, inode, acc_mode | MAY_OPEN);
if (err)
return ERR_PTR(err);
/* O_NOATIME is an optimization, don't fail if not permitted */
- if (inode_owner_or_capable(&init_user_ns, inode))
+ if (inode_owner_or_capable(real_mnt_userns, inode))
flags |= O_NOATIME;
return dentry_open(path, flags, current_cred());
@@ -550,11 +578,11 @@ void ovl_copy_up_end(struct dentry *dentry)
ovl_inode_unlock(d_inode(dentry));
}
-bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry)
+bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path)
{
int res;
- res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_ORIGIN, NULL, 0);
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_ORIGIN, NULL, 0);
/* Zero size value means "copied up but origin unknown" */
if (res >= 0)
@@ -563,16 +591,16 @@ bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry)
return false;
}
-bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry,
- enum ovl_xattr ox)
+bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, struct path *path,
+ enum ovl_xattr ox)
{
int res;
char val;
- if (!d_is_dir(dentry))
+ if (!d_is_dir(path->dentry))
return false;
- res = ovl_do_getxattr(OVL_FS(sb), dentry, ox, &val, 1);
+ res = ovl_path_getxattr(ofs, path, ox, &val, 1);
if (res == 1 && val == 'y')
return true;
@@ -612,7 +640,7 @@ int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
if (ofs->noxattr)
return xerr;
- err = ovl_do_setxattr(ofs, upperdentry, ox, value, size);
+ err = ovl_setxattr(ofs, upperdentry, ox, value, size);
if (err == -EOPNOTSUPP) {
pr_warn("cannot set %s xattr on upper\n", ovl_xattr(ofs, ox));
@@ -652,8 +680,8 @@ void ovl_check_protattr(struct inode *inode, struct dentry *upper)
char buf[OVL_PROTATTR_MAX+1];
int res, n;
- res = ovl_do_getxattr(ofs, upper, OVL_XATTR_PROTATTR, buf,
- OVL_PROTATTR_MAX);
+ res = ovl_getxattr_upper(ofs, upper, OVL_XATTR_PROTATTR, buf,
+ OVL_PROTATTR_MAX);
if (res < 0)
return;
@@ -708,7 +736,7 @@ int ovl_set_protattr(struct inode *inode, struct dentry *upper,
err = ovl_check_setxattr(ofs, upper, OVL_XATTR_PROTATTR,
buf, len, -EPERM);
} else if (inode->i_flags & OVL_PROT_I_FLAGS_MASK) {
- err = ovl_do_removexattr(ofs, upper, OVL_XATTR_PROTATTR);
+ err = ovl_removexattr(ofs, upper, OVL_XATTR_PROTATTR);
if (err == -EOPNOTSUPP || err == -ENODATA)
err = 0;
}
@@ -824,7 +852,7 @@ static void ovl_cleanup_index(struct dentry *dentry)
}
inode_lock_nested(dir, I_MUTEX_PARENT);
- index = lookup_one_len(name.name, indexdir, name.len);
+ index = ovl_lookup_upper(ofs, name.name, indexdir, name.len);
err = PTR_ERR(index);
if (IS_ERR(index)) {
index = NULL;
@@ -834,7 +862,7 @@ static void ovl_cleanup_index(struct dentry *dentry)
dir, index);
} else {
/* Cleanup orphan index entries */
- err = ovl_cleanup(dir, index);
+ err = ovl_cleanup(ofs, dir, index);
}
inode_unlock(dir);
@@ -943,15 +971,15 @@ err:
}
/* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */
-int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry)
+int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct path *path)
{
int res;
/* Only regular files can have metacopy xattr */
- if (!S_ISREG(d_inode(dentry)->i_mode))
+ if (!S_ISREG(d_inode(path->dentry)->i_mode))
return 0;
- res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_METACOPY, NULL, 0);
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_METACOPY, NULL, 0);
if (res < 0) {
if (res == -ENODATA || res == -EOPNOTSUPP)
return 0;
@@ -987,13 +1015,12 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry)
return (oe->numlower > 1);
}
-char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry,
- int padding)
+char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct path *path, int padding)
{
int res;
char *s, *next, *buf = NULL;
- res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_REDIRECT, NULL, 0);
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, NULL, 0);
if (res == -ENODATA || res == -EOPNOTSUPP)
return NULL;
if (res < 0)
@@ -1005,7 +1032,7 @@ char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry,
if (!buf)
return ERR_PTR(-ENOMEM);
- res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_REDIRECT, buf, res);
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, buf, res);
if (res < 0)
goto fail;
if (res == 0)
@@ -1060,3 +1087,33 @@ int ovl_sync_status(struct ovl_fs *ofs)
return errseq_check(&mnt->mnt_sb->s_wb_err, ofs->errseq);
}
+
+/*
+ * ovl_copyattr() - copy inode attributes from layer to ovl inode
+ *
+ * When overlay copies inode information from an upper or lower layer to the
+ * relevant overlay inode it will apply the idmapping of the upper or lower
+ * layer when doing so ensuring that the ovl inode ownership will correctly
+ * reflect the ownership of the idmapped upper or lower layer. For example, an
+ * idmapped upper or lower layer mapping id 1001 to id 1000 will take care to
+ * map any lower or upper inode owned by id 1001 to id 1000. These mapping
+ * helpers are nops when the relevant layer isn't idmapped.
+ */
+void ovl_copyattr(struct inode *inode)
+{
+ struct path realpath;
+ struct inode *realinode;
+ struct user_namespace *real_mnt_userns;
+
+ ovl_i_path_real(inode, &realpath);
+ realinode = d_inode(realpath.dentry);
+ real_mnt_userns = mnt_user_ns(realpath.mnt);
+
+ inode->i_uid = i_uid_into_mnt(real_mnt_userns, realinode);
+ inode->i_gid = i_gid_into_mnt(real_mnt_userns, realinode);
+ inode->i_mode = realinode->i_mode;
+ inode->i_atime = realinode->i_atime;
+ inode->i_mtime = realinode->i_mtime;
+ inode->i_ctime = realinode->i_ctime;
+ i_size_write(inode, i_size_read(realinode));
+}
diff --git a/fs/pipe.c b/fs/pipe.c
index e140ea150bbb..74ae9fafd25a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -653,7 +653,7 @@ pipe_poll(struct file *filp, poll_table *wait)
unsigned int head, tail;
/* Epoll has some historical nasty semantics, this enables them */
- pipe->poll_usage = 1;
+ WRITE_ONCE(pipe->poll_usage, true);
/*
* Reading pipe state only -- no need for acquiring the semaphore.
@@ -1245,30 +1245,33 @@ unsigned int round_pipe_size(unsigned long size)
/*
* Resize the pipe ring to a number of slots.
+ *
+ * Note the pipe can be reduced in capacity, but only if the current
+ * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
+ * returned instead.
*/
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
struct pipe_buffer *bufs;
unsigned int head, tail, mask, n;
- /*
- * We can shrink the pipe, if arg is greater than the ring occupancy.
- * Since we don't expect a lot of shrink+grow operations, just free and
- * allocate again like we would do for growing. If the pipe currently
- * contains more buffers than arg, then return busy.
- */
- mask = pipe->ring_size - 1;
- head = pipe->head;
- tail = pipe->tail;
- n = pipe_occupancy(pipe->head, pipe->tail);
- if (nr_slots < n)
- return -EBUSY;
-
bufs = kcalloc(nr_slots, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (unlikely(!bufs))
return -ENOMEM;
+ spin_lock_irq(&pipe->rd_wait.lock);
+ mask = pipe->ring_size - 1;
+ head = pipe->head;
+ tail = pipe->tail;
+
+ n = pipe_occupancy(head, tail);
+ if (nr_slots < n) {
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ kfree(bufs);
+ return -EBUSY;
+ }
+
/*
* The pipe array wraps around, so just start the new one at zero
* and adjust the indices.
@@ -1300,6 +1303,8 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
pipe->tail = tail;
pipe->head = head;
+ spin_unlock_irq(&pipe->rd_wait.lock);
+
/* This might have made more room for writers */
wake_up_interruptible(&pipe->wr_wait);
return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c1031843cc6a..8dfa36a99c74 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3154,6 +3154,22 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
}
#endif /* CONFIG_LIVEPATCH */
+#ifdef CONFIG_KSM
+static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct mm_struct *mm;
+
+ mm = get_task_mm(task);
+ if (mm) {
+ seq_printf(m, "%lu\n", mm->ksm_merging_pages);
+ mmput(mm);
+ }
+
+ return 0;
+}
+#endif /* CONFIG_KSM */
+
#ifdef CONFIG_STACKLEAK_METRICS
static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
@@ -3285,6 +3301,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
+#ifdef CONFIG_KSM
+ ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3618,6 +3637,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
+#ifdef CONFIG_KSM
+ ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+#endif
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index f2132407e133..587b91d9d998 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -448,6 +448,9 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
proc_set_user(ent, (*parent)->uid, (*parent)->gid);
ent->proc_dops = &proc_misc_dentry_ops;
+ /* Revalidate everything under /proc/${pid}/net */
+ if ((*parent)->proc_dops == &proc_net_dentry_ops)
+ pde_force_lookup(ent);
out:
return ent;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 982e694aae77..dff921f7ca33 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -479,10 +479,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
* the previous entry, search for a matching entry.
*/
if (!m || start < m->addr || start >= m->addr + m->size) {
- list_for_each_entry(m, &kclist_head, list) {
- if (start >= m->addr &&
- start < m->addr + m->size)
+ struct kcore_list *iter;
+
+ m = NULL;
+ list_for_each_entry(iter, &kclist_head, list) {
+ if (start >= iter->addr &&
+ start < iter->addr + iter->size) {
+ m = iter;
break;
+ }
}
}
@@ -492,12 +497,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
page_offline_freeze();
}
- if (&m->list == &kclist_head) {
+ if (!m) {
if (clear_user(buffer, tsz)) {
ret = -EFAULT;
goto out;
}
- m = NULL; /* skip the list anchor */
goto skip;
}
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 6fa761c9cc78..6e89f0e2fd20 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -86,6 +86,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SwapTotal: ", i.totalswap);
show_val_kb(m, "SwapFree: ", i.freeswap);
+#ifdef CONFIG_ZSWAP
+ seq_printf(m, "Zswap: %8lu kB\n",
+ (unsigned long)(zswap_pool_total_size >> 10));
+ seq_printf(m, "Zswapped: %8lu kB\n",
+ (unsigned long)atomic_read(&zswap_stored_pages) <<
+ (PAGE_SHIFT - 10));
+#endif
show_val_kb(m, "Dirty: ",
global_node_page_state(NR_FILE_DIRTY));
show_val_kb(m, "Writeback: ",
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index e1cfeda397f3..913e5acefbb6 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -376,6 +376,9 @@ static __net_init int proc_net_ns_init(struct net *net)
proc_set_user(netd, uid, gid);
+ /* Seed dentry revalidation for /proc/${pid}/net */
+ pde_force_lookup(netd);
+
err = -EEXIST;
net_statd = proc_net_mkdir(net, "stat", netd);
if (!net_statd)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 7d9cfc730bd4..021e83fe831f 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -19,6 +19,9 @@
#include <linux/kmemleak.h>
#include "internal.h"
+#define list_for_each_table_entry(entry, table) \
+ for ((entry) = (table); (entry)->procname; (entry)++)
+
static const struct dentry_operations proc_sys_dentry_operations;
static const struct file_operations proc_sys_file_operations;
static const struct inode_operations proc_sys_inode_operations;
@@ -26,7 +29,7 @@ static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
/* shared constants to be used in various sysctls */
-const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX, 65535 };
+const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 };
EXPORT_SYMBOL(sysctl_vals);
const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX };
@@ -215,15 +218,19 @@ static void init_header(struct ctl_table_header *head,
INIT_HLIST_HEAD(&head->inodes);
if (node) {
struct ctl_table *entry;
- for (entry = table; entry->procname; entry++, node++)
+
+ list_for_each_table_entry(entry, table) {
node->header = head;
+ node++;
+ }
}
}
static void erase_header(struct ctl_table_header *head)
{
struct ctl_table *entry;
- for (entry = head->ctl_table; entry->procname; entry++)
+
+ list_for_each_table_entry(entry, head->ctl_table)
erase_entry(head, entry);
}
@@ -248,7 +255,7 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
err = insert_links(header);
if (err)
goto fail_links;
- for (entry = header->ctl_table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header->ctl_table) {
err = insert_entry(header, entry);
if (err)
goto fail;
@@ -978,7 +985,6 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set,
table = (struct ctl_table *)(node + 1);
new_name = (char *)(table + 2);
memcpy(new_name, name, namelen);
- new_name[namelen] = '\0';
table[0].procname = new_name;
table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
init_header(&new->header, set->dir.header.root, set, node, table);
@@ -1130,35 +1136,36 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
static int sysctl_check_table(const char *path, struct ctl_table *table)
{
+ struct ctl_table *entry;
int err = 0;
- for (; table->procname; table++) {
- if (table->child)
- err |= sysctl_err(path, table, "Not a file");
-
- if ((table->proc_handler == proc_dostring) ||
- (table->proc_handler == proc_dointvec) ||
- (table->proc_handler == proc_douintvec) ||
- (table->proc_handler == proc_douintvec_minmax) ||
- (table->proc_handler == proc_dointvec_minmax) ||
- (table->proc_handler == proc_dou8vec_minmax) ||
- (table->proc_handler == proc_dointvec_jiffies) ||
- (table->proc_handler == proc_dointvec_userhz_jiffies) ||
- (table->proc_handler == proc_dointvec_ms_jiffies) ||
- (table->proc_handler == proc_doulongvec_minmax) ||
- (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
- if (!table->data)
- err |= sysctl_err(path, table, "No data");
- if (!table->maxlen)
- err |= sysctl_err(path, table, "No maxlen");
+ list_for_each_table_entry(entry, table) {
+ if (entry->child)
+ err |= sysctl_err(path, entry, "Not a file");
+
+ if ((entry->proc_handler == proc_dostring) ||
+ (entry->proc_handler == proc_dointvec) ||
+ (entry->proc_handler == proc_douintvec) ||
+ (entry->proc_handler == proc_douintvec_minmax) ||
+ (entry->proc_handler == proc_dointvec_minmax) ||
+ (entry->proc_handler == proc_dou8vec_minmax) ||
+ (entry->proc_handler == proc_dointvec_jiffies) ||
+ (entry->proc_handler == proc_dointvec_userhz_jiffies) ||
+ (entry->proc_handler == proc_dointvec_ms_jiffies) ||
+ (entry->proc_handler == proc_doulongvec_minmax) ||
+ (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
+ if (!entry->data)
+ err |= sysctl_err(path, entry, "No data");
+ if (!entry->maxlen)
+ err |= sysctl_err(path, entry, "No maxlen");
else
- err |= sysctl_check_table_array(path, table);
+ err |= sysctl_check_table_array(path, entry);
}
- if (!table->proc_handler)
- err |= sysctl_err(path, table, "No proc_handler");
+ if (!entry->proc_handler)
+ err |= sysctl_err(path, entry, "No proc_handler");
- if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
- err |= sysctl_err(path, table, "bogus .mode 0%o",
- table->mode);
+ if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode)
+ err |= sysctl_err(path, entry, "bogus .mode 0%o",
+ entry->mode);
}
return err;
}
@@ -1174,7 +1181,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table
name_bytes = 0;
nr_entries = 0;
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
nr_entries++;
name_bytes += strlen(entry->procname) + 1;
}
@@ -1191,14 +1198,16 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table
node = (struct ctl_node *)(links + 1);
link_table = (struct ctl_table *)(node + nr_entries);
link_name = (char *)&link_table[nr_entries + 1];
+ link = link_table;
- for (link = link_table, entry = table; entry->procname; link++, entry++) {
+ list_for_each_table_entry(entry, table) {
int len = strlen(entry->procname) + 1;
memcpy(link_name, entry->procname, len);
link->procname = link_name;
link->mode = S_IFLNK|S_IRWXUGO;
link->data = link_root;
link_name += len;
+ link++;
}
init_header(links, dir->header.root, dir->header.set, node, link_table);
links->nreg = nr_entries;
@@ -1213,7 +1222,7 @@ static bool get_links(struct ctl_dir *dir,
struct ctl_table *entry, *link;
/* Are there links available for every entry in table? */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
const char *procname = entry->procname;
link = find_entry(&head, dir, procname, strlen(procname));
if (!link)
@@ -1226,7 +1235,7 @@ static bool get_links(struct ctl_dir *dir,
}
/* The checks passed. Increase the registration count on the links */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
const char *procname = entry->procname;
link = find_entry(&head, dir, procname, strlen(procname));
head->nreg++;
@@ -1329,11 +1338,11 @@ struct ctl_table_header *__register_sysctl_table(
struct ctl_node *node;
int nr_entries = 0;
- for (entry = table; entry->procname; entry++)
+ list_for_each_table_entry(entry, table)
nr_entries++;
header = kzalloc(sizeof(struct ctl_table_header) +
- sizeof(struct ctl_node)*nr_entries, GFP_KERNEL);
+ sizeof(struct ctl_node)*nr_entries, GFP_KERNEL_ACCOUNT);
if (!header)
return NULL;
@@ -1456,7 +1465,7 @@ static int count_subheaders(struct ctl_table *table)
if (!table || !table->procname)
return 1;
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
if (entry->child)
nr_subheaders += count_subheaders(entry->child);
else
@@ -1475,7 +1484,7 @@ static int register_leaf_sysctl_tables(const char *path, char *pos,
int nr_dirs = 0;
int err = -ENOMEM;
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
if (entry->child)
nr_dirs++;
else
@@ -1492,7 +1501,9 @@ static int register_leaf_sysctl_tables(const char *path, char *pos,
goto out;
ctl_table_arg = files;
- for (new = files, entry = table; entry->procname; entry++) {
+ new = files;
+
+ list_for_each_table_entry(entry, table) {
if (entry->child)
continue;
*new = *entry;
@@ -1516,7 +1527,7 @@ static int register_leaf_sysctl_tables(const char *path, char *pos,
}
/* Recurse into the subdirectories. */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
char *child_pos;
if (!entry->child)
@@ -1670,7 +1681,7 @@ static void put_links(struct ctl_table_header *header)
if (IS_ERR(core_parent))
return;
- for (entry = header->ctl_table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header->ctl_table) {
struct ctl_table_header *link_head;
struct ctl_table *link;
const char *name = entry->procname;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f46060eb91b5..2d04e3470d4c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1421,6 +1421,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
migration = is_migration_entry(entry);
if (is_pfn_swap_entry(entry))
page = pfn_swap_entry_to_page(entry);
+ if (pte_marker_entry_uffd_wp(entry))
+ flags |= PM_UFFD_WP;
}
if (page && !PageAnon(page))
@@ -1556,10 +1558,15 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
if (page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
+ if (huge_pte_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
+
flags |= PM_PRESENT;
if (pm->show_pfn)
frame = pte_pfn(pte) +
((addr & ~hmask) >> PAGE_SHIFT);
+ } else if (pte_swp_uffd_wp_any(pte)) {
+ flags |= PM_UFFD_WP;
}
for (; addr != end; addr += PAGE_SIZE) {
@@ -1873,8 +1880,6 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
return 0;
page = pte_page(huge_pte);
- if (!page)
- return 0;
md = walk->private;
gather_stats(page, md, pte_dirty(huge_pte), 1);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 6f1b8ddc6f7a..4eaeb645e759 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -26,6 +26,7 @@
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <linux/cc_platform.h>
#include <asm/io.h>
#include "internal.h"
@@ -128,9 +129,8 @@ static int open_vmcore(struct inode *inode, struct file *file)
}
/* Reads a page from the oldmem device from given offset. */
-ssize_t read_from_oldmem(char *buf, size_t count,
- u64 *ppos, int userbuf,
- bool encrypted)
+ssize_t read_from_oldmem(struct iov_iter *iter, size_t count,
+ u64 *ppos, bool encrypted)
{
unsigned long pfn, offset;
size_t nr_bytes;
@@ -152,29 +152,23 @@ ssize_t read_from_oldmem(char *buf, size_t count,
/* If pfn is not ram, return zeros for sparse dump files */
if (!pfn_is_ram(pfn)) {
- tmp = 0;
- if (!userbuf)
- memset(buf, 0, nr_bytes);
- else if (clear_user(buf, nr_bytes))
- tmp = -EFAULT;
+ tmp = iov_iter_zero(nr_bytes, iter);
} else {
if (encrypted)
- tmp = copy_oldmem_page_encrypted(pfn, buf,
+ tmp = copy_oldmem_page_encrypted(iter, pfn,
nr_bytes,
- offset,
- userbuf);
+ offset);
else
- tmp = copy_oldmem_page(pfn, buf, nr_bytes,
- offset, userbuf);
+ tmp = copy_oldmem_page(iter, pfn, nr_bytes,
+ offset);
}
- if (tmp < 0) {
+ if (tmp < nr_bytes) {
srcu_read_unlock(&vmcore_cb_srcu, idx);
- return tmp;
+ return -EFAULT;
}
*ppos += nr_bytes;
count -= nr_bytes;
- buf += nr_bytes;
read += nr_bytes;
++pfn;
offset = 0;
@@ -203,7 +197,12 @@ void __weak elfcorehdr_free(unsigned long long addr)
*/
ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0, false);
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, READ, &kvec, 1, count);
+
+ return read_from_oldmem(&iter, count, ppos, false);
}
/*
@@ -211,7 +210,13 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
*/
ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, READ, &kvec, 1, count);
+
+ return read_from_oldmem(&iter, count, ppos,
+ cc_platform_has(CC_ATTR_MEM_ENCRYPT));
}
/*
@@ -228,29 +233,14 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
/*
* Architectures which support memory encryption override this.
*/
-ssize_t __weak
-copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
- unsigned long offset, int userbuf)
+ssize_t __weak copy_oldmem_page_encrypted(struct iov_iter *iter,
+ unsigned long pfn, size_t csize, unsigned long offset)
{
- return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
-}
-
-/*
- * Copy to either kernel or user space
- */
-static int copy_to(void *target, void *src, size_t size, int userbuf)
-{
- if (userbuf) {
- if (copy_to_user((char __user *) target, src, size))
- return -EFAULT;
- } else {
- memcpy(target, src, size);
- }
- return 0;
+ return copy_oldmem_page(iter, pfn, csize, offset);
}
#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
-static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf)
+static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size)
{
struct vmcoredd_node *dump;
u64 offset = 0;
@@ -263,14 +253,13 @@ static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf)
if (start < offset + dump->size) {
tsz = min(offset + (u64)dump->size - start, (u64)size);
buf = dump->buf + start - offset;
- if (copy_to(dst, buf, tsz, userbuf)) {
+ if (copy_to_iter(buf, tsz, iter) < tsz) {
ret = -EFAULT;
goto out_unlock;
}
size -= tsz;
start += tsz;
- dst += tsz;
/* Leave now if buffer filled already */
if (!size)
@@ -326,33 +315,28 @@ out_unlock:
/* Read from the ELF header and then the crash dump. On error, negative value is
* returned otherwise number of bytes read are returned.
*/
-static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
- int userbuf)
+static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos)
{
ssize_t acc = 0, tmp;
size_t tsz;
u64 start;
struct vmcore *m = NULL;
- if (buflen == 0 || *fpos >= vmcore_size)
+ if (!iov_iter_count(iter) || *fpos >= vmcore_size)
return 0;
- /* trim buflen to not go beyond EOF */
- if (buflen > vmcore_size - *fpos)
- buflen = vmcore_size - *fpos;
+ iov_iter_truncate(iter, vmcore_size - *fpos);
/* Read ELF core header */
if (*fpos < elfcorebuf_sz) {
- tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
- if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf))
+ tsz = min(elfcorebuf_sz - (size_t)*fpos, iov_iter_count(iter));
+ if (copy_to_iter(elfcorebuf + *fpos, tsz, iter) < tsz)
return -EFAULT;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (buflen == 0)
+ if (!iov_iter_count(iter))
return acc;
}
@@ -373,35 +357,32 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
/* Read device dumps */
if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) {
tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
- (size_t)*fpos, buflen);
+ (size_t)*fpos, iov_iter_count(iter));
start = *fpos - elfcorebuf_sz;
- if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf))
+ if (vmcoredd_copy_dumps(iter, start, tsz))
return -EFAULT;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (!buflen)
+ if (!iov_iter_count(iter))
return acc;
}
#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
/* Read remaining elf notes */
- tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
+ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos,
+ iov_iter_count(iter));
kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz;
- if (copy_to(buffer, kaddr, tsz, userbuf))
+ if (copy_to_iter(kaddr, tsz, iter) < tsz)
return -EFAULT;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (buflen == 0)
+ if (!iov_iter_count(iter))
return acc;
}
@@ -409,19 +390,17 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
if (*fpos < m->offset + m->size) {
tsz = (size_t)min_t(unsigned long long,
m->offset + m->size - *fpos,
- buflen);
+ iov_iter_count(iter));
start = m->paddr + *fpos - m->offset;
- tmp = read_from_oldmem(buffer, tsz, &start,
- userbuf, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
+ tmp = read_from_oldmem(iter, tsz, &start,
+ cc_platform_has(CC_ATTR_MEM_ENCRYPT));
if (tmp < 0)
return tmp;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (buflen == 0)
+ if (!iov_iter_count(iter))
return acc;
}
}
@@ -429,15 +408,14 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
return acc;
}
-static ssize_t read_vmcore(struct file *file, char __user *buffer,
- size_t buflen, loff_t *fpos)
+static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter)
{
- return __read_vmcore((__force char *) buffer, buflen, fpos, 1);
+ return __read_vmcore(iter, &iocb->ki_pos);
}
/*
* The vmcore fault handler uses the page cache and fills data using the
- * standard __vmcore_read() function.
+ * standard __read_vmcore() function.
*
* On s390 the fault handler is used for memory regions that can't be mapped
* directly with remap_pfn_range().
@@ -447,9 +425,10 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
#ifdef CONFIG_S390
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
pgoff_t index = vmf->pgoff;
+ struct iov_iter iter;
+ struct kvec kvec;
struct page *page;
loff_t offset;
- char *buf;
int rc;
page = find_or_create_page(mapping, index, GFP_KERNEL);
@@ -457,8 +436,11 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
return VM_FAULT_OOM;
if (!PageUptodate(page)) {
offset = (loff_t) index << PAGE_SHIFT;
- buf = __va((page_to_pfn(page) << PAGE_SHIFT));
- rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0);
+ kvec.iov_base = page_address(page);
+ kvec.iov_len = PAGE_SIZE;
+ iov_iter_kvec(&iter, READ, &kvec, 1, PAGE_SIZE);
+
+ rc = __read_vmcore(&iter, &offset);
if (rc < 0) {
unlock_page(page);
put_page(page);
@@ -708,7 +690,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
static const struct proc_ops vmcore_proc_ops = {
.proc_open = open_vmcore,
- .proc_read = read_vmcore,
+ .proc_read_iter = read_vmcore,
.proc_lseek = default_llseek,
.proc_mmap = mmap_vmcore,
};
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a74aef99bd3d..09d1307959d0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -79,6 +79,7 @@
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
+#include <linux/sched/mm.h>
#include "../internal.h" /* ugh */
#include <linux/uaccess.h>
@@ -425,9 +426,11 @@ EXPORT_SYMBOL(mark_info_dirty);
int dquot_acquire(struct dquot *dquot)
{
int ret = 0, ret2 = 0;
+ unsigned int memalloc;
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
mutex_lock(&dquot->dq_lock);
+ memalloc = memalloc_nofs_save();
if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
if (ret < 0)
@@ -458,6 +461,7 @@ int dquot_acquire(struct dquot *dquot)
smp_mb__before_atomic();
set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_iolock:
+ memalloc_nofs_restore(memalloc);
mutex_unlock(&dquot->dq_lock);
return ret;
}
@@ -469,9 +473,11 @@ EXPORT_SYMBOL(dquot_acquire);
int dquot_commit(struct dquot *dquot)
{
int ret = 0;
+ unsigned int memalloc;
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
mutex_lock(&dquot->dq_lock);
+ memalloc = memalloc_nofs_save();
if (!clear_dquot_dirty(dquot))
goto out_lock;
/* Inactive dquot can be only if there was error during read/init
@@ -481,6 +487,7 @@ int dquot_commit(struct dquot *dquot)
else
ret = -EIO;
out_lock:
+ memalloc_nofs_restore(memalloc);
mutex_unlock(&dquot->dq_lock);
return ret;
}
@@ -492,9 +499,11 @@ EXPORT_SYMBOL(dquot_commit);
int dquot_release(struct dquot *dquot)
{
int ret = 0, ret2 = 0;
+ unsigned int memalloc;
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
mutex_lock(&dquot->dq_lock);
+ memalloc = memalloc_nofs_save();
/* Check whether we are not racing with some other dqget() */
if (dquot_is_busy(dquot))
goto out_dqlock;
@@ -510,6 +519,7 @@ int dquot_release(struct dquot *dquot)
}
clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_dqlock:
+ memalloc_nofs_restore(memalloc);
mutex_unlock(&dquot->dq_lock);
return ret;
}
diff --git a/fs/read_write.c b/fs/read_write.c
index e643aec2b0ef..b1b1cdfee9d3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -682,6 +682,14 @@ SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
return ksys_pread64(fd, buf, count, pos);
}
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64)
+COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
+ size_t, count, compat_arg_u64_dual(pos))
+{
+ return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos));
+}
+#endif
+
ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
size_t count, loff_t pos)
{
@@ -708,6 +716,14 @@ SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
return ksys_pwrite64(fd, buf, count, pos);
}
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64)
+COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf,
+ size_t, count, compat_arg_u64_dual(pos))
+{
+ return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos));
+}
+#endif
+
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
loff_t *ppos, int type, rwf_t flags)
{
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 7ab8a58c29b6..9456a2032224 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -931,6 +931,38 @@ struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
}
EXPORT_SYMBOL(seq_list_next);
+struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos)
+{
+ struct list_head *lh;
+
+ list_for_each_rcu(lh, head)
+ if (pos-- == 0)
+ return lh;
+
+ return NULL;
+}
+EXPORT_SYMBOL(seq_list_start_rcu);
+
+struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos)
+{
+ if (!pos)
+ return head;
+
+ return seq_list_start_rcu(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_list_start_head_rcu);
+
+struct list_head *seq_list_next_rcu(void *v, struct list_head *head,
+ loff_t *ppos)
+{
+ struct list_head *lh;
+
+ lh = list_next_rcu((struct list_head *)v);
+ ++*ppos;
+ return lh == head ? NULL : lh;
+}
+EXPORT_SYMBOL(seq_list_next_rcu);
+
/**
* seq_hlist_start - start an iteration of a hlist
* @head: the head of the hlist
diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h
index 0507aecfc669..2cab413fffee 100644
--- a/fs/smbfs_common/smb2pdu.h
+++ b/fs/smbfs_common/smb2pdu.h
@@ -1244,6 +1244,106 @@ struct file_zero_data_information {
__le64 BeyondFinalZero;
} __packed;
+/* See MS-FSCC 2.3.7 */
+struct duplicate_extents_to_file {
+ __u64 PersistentFileHandle; /* source file handle, opaque endianness */
+ __u64 VolatileFileHandle;
+ __le64 SourceFileOffset;
+ __le64 TargetFileOffset;
+ __le64 ByteCount; /* Bytes to be copied */
+} __packed;
+
+/* See MS-FSCC 2.3.8 */
+#define DUPLICATE_EXTENTS_DATA_EX_SOURCE_ATOMIC 0x00000001
+struct duplicate_extents_to_file_ex {
+ __u64 PersistentFileHandle; /* source file handle, opaque endianness */
+ __u64 VolatileFileHandle;
+ __le64 SourceFileOffset;
+ __le64 TargetFileOffset;
+ __le64 ByteCount; /* Bytes to be copied */
+ __le32 Flags;
+ __le32 Reserved;
+} __packed;
+
+
+/* See MS-FSCC 2.3.20 */
+struct fsctl_get_integrity_information_rsp {
+ __le16 ChecksumAlgorithm;
+ __le16 Reserved;
+ __le32 Flags;
+ __le32 ChecksumChunkSizeInBytes;
+ __le32 ClusterSizeInBytes;
+} __packed;
+
+/* See MS-FSCC 2.3.55 */
+struct fsctl_query_file_regions_req {
+ __le64 FileOffset;
+ __le64 Length;
+ __le32 DesiredUsage;
+ __le32 Reserved;
+} __packed;
+
+/* DesiredUsage flags see MS-FSCC 2.3.56.1 */
+#define FILE_USAGE_INVALID_RANGE 0x00000000
+#define FILE_USAGE_VALID_CACHED_DATA 0x00000001
+#define FILE_USAGE_NONCACHED_DATA 0x00000002
+
+struct file_region_info {
+ __le64 FileOffset;
+ __le64 Length;
+ __le32 DesiredUsage;
+ __le32 Reserved;
+} __packed;
+
+/* See MS-FSCC 2.3.56 */
+struct fsctl_query_file_region_rsp {
+ __le32 Flags;
+ __le32 TotalRegionEntryCount;
+ __le32 RegionEntryCount;
+ __u32 Reserved;
+ struct file_region_info Regions[];
+} __packed;
+
+/* See MS-FSCC 2.3.58 */
+struct fsctl_query_on_disk_vol_info_rsp {
+ __le64 DirectoryCount;
+ __le64 FileCount;
+ __le16 FsFormatMajVersion;
+ __le16 FsFormatMinVersion;
+ __u8 FsFormatName[24];
+ __le64 FormatTime;
+ __le64 LastUpdateTime;
+ __u8 CopyrightInfo[68];
+ __u8 AbstractInfo[68];
+ __u8 FormatImplInfo[68];
+ __u8 LastModifyImplInfo[68];
+} __packed;
+
+/* See MS-FSCC 2.3.73 */
+struct fsctl_set_integrity_information_req {
+ __le16 ChecksumAlgorithm;
+ __le16 Reserved;
+ __le32 Flags;
+} __packed;
+
+/* See MS-FSCC 2.3.75 */
+struct fsctl_set_integrity_info_ex_req {
+ __u8 EnableIntegrity;
+ __u8 KeepState;
+ __u16 Reserved;
+ __le32 Flags;
+ __u8 Version;
+ __u8 Reserved2[7];
+} __packed;
+
+/* Integrity ChecksumAlgorithm choices for above */
+#define CHECKSUM_TYPE_NONE 0x0000
+#define CHECKSUM_TYPE_CRC64 0x0002
+#define CHECKSUM_TYPE_UNCHANGED 0xFFFF /* set only */
+
+/* Integrity flags for above */
+#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001
+
/* Reparse structures - see MS-FSCC 2.1.2 */
/* struct fsctl_reparse_info_req is empty, only response structs (see below) */
@@ -1304,13 +1404,6 @@ struct validate_negotiate_info_rsp {
__le16 Dialect; /* Dialect in use for the connection */
} __packed;
-struct duplicate_extents_to_file {
- __u64 PersistentFileHandle; /* source file handle, opaque endianness */
- __u64 VolatileFileHandle;
- __le64 SourceFileOffset;
- __le64 TargetFileOffset;
- __le64 ByteCount; /* Bytes to be copied */
-} __packed;
/* Possible InfoType values */
#define SMB2_O_INFO_FILE 0x01
@@ -1419,6 +1512,7 @@ struct smb2_query_info_rsp {
* PDU query infolevel structure definitions
*/
+/* See MS-FSCC 2.3.52 */
struct file_allocated_range_buffer {
__le64 file_offset;
__le64 length;
diff --git a/fs/smbfs_common/smbfsctl.h b/fs/smbfs_common/smbfsctl.h
index d51939c43ad7..edd7fc2a7921 100644
--- a/fs/smbfs_common/smbfsctl.h
+++ b/fs/smbfs_common/smbfsctl.h
@@ -88,21 +88,27 @@
#define FSCTL_READ_RAW_ENCRYPTED 0x000900E3 /* BB add struct */
#define FSCTL_READ_FILE_USN_DATA 0x000900EB /* BB add struct */
#define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */
+#define FSCTL_MARK_HANDLE 0x000900FC /* BB add struct */
#define FSCTL_SIS_COPYFILE 0x00090100 /* BB add struct */
#define FSCTL_RECALL_FILE 0x00090117 /* BB add struct */
#define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */
+#define FSCTL_QUERY_ON_DISK_VOLUME_INFO 0x0009013C
#define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */
#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
#define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C
+#define FSCTL_QUERY_FILE_REGIONS 0x00090284
#define FSCTL_GET_REFS_VOLUME_DATA 0x000902D8 /* See MS-FSCC 2.3.24 */
#define FSCTL_SET_INTEGRITY_INFORMATION_EXT 0x00090380
#define FSCTL_GET_RETRIEVAL_POINTERS_AND_REFCOUNT 0x000903d3
#define FSCTL_GET_RETRIEVAL_POINTER_COUNT 0x0009042b
#define FSCTL_REFS_STREAM_SNAPSHOT_MANAGEMENT 0x00090440
#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF
+#define FSCTL_OFFLOAD_READ 0x00094264 /* BB add struct */
+#define FSCTL_OFFLOAD_WRITE 0x00098268 /* BB add struct */
#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */
#define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */
#define FSCTL_DUPLICATE_EXTENTS_TO_FILE 0x00098344
+#define FSCTL_DUPLICATE_EXTENTS_TO_FILE_EX 0x000983E8
#define FSCTL_SIS_LINK_FILES 0x0009C104
#define FSCTL_SET_INTEGRITY_INFORMATION 0x0009C280
#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */
diff --git a/fs/stat.c b/fs/stat.c
index 5c2c94464e8b..9ced8860e0f3 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -659,7 +659,7 @@ SYSCALL_DEFINE5(statx,
return ret;
}
-#ifdef CONFIG_COMPAT
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_STAT)
static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
{
struct compat_stat tmp;
diff --git a/fs/sync.c b/fs/sync.c
index c7690016453e..dc725914e1ed 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -373,6 +373,15 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
return ksys_sync_file_range(fd, offset, nbytes, flags);
}
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_SYNC_FILE_RANGE)
+COMPAT_SYSCALL_DEFINE6(sync_file_range, int, fd, compat_arg_u64_dual(offset),
+ compat_arg_u64_dual(nbytes), unsigned int, flags)
+{
+ return ksys_sync_file_range(fd, compat_arg_u64_glue(offset),
+ compat_arg_u64_glue(nbytes), flags);
+}
+#endif
+
/* It would be nice if people remember that not all the world's an i386
when they introduce new system calls */
SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index d1def0771a40..3365a30dc1e0 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -312,7 +312,9 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
sbi->s_firstinodezone = 2;
flavour_setup[sbi->s_type](sbi, &sb->s_max_links);
-
+ if (sbi->s_firstdatazone < sbi->s_firstinodezone)
+ return 0;
+
sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone;
sbi->s_inodes_per_block = bsize >> 6;
sbi->s_inodes_per_block_1 = (bsize >> 6)-1;
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index de7252715b12..81d26abf486f 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -553,7 +553,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
*
* Only one instances directory is allowed.
*
- * The instances directory is special as it allows for mkdir and rmdir to
+ * The instances directory is special as it allows for mkdir and rmdir
* to be done by userspace. When a mkdir or rmdir is performed, the inode
* locks are released and the methods passed in (@mkdir and @rmdir) are
* called without locks and with the name of the directory being created
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index c0b84e960b20..e8b9b756f0ac 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -65,7 +65,7 @@ static void shrink_liability(struct ubifs_info *c, int nr_to_write)
*/
static int run_gc(struct ubifs_info *c)
{
- int err, lnum;
+ int lnum;
/* Make some free space by garbage-collecting dirty space */
down_read(&c->commit_sem);
@@ -76,10 +76,7 @@ static int run_gc(struct ubifs_info *c)
/* GC freed one LEB, return it to lprops */
dbg_budg("GC freed LEB %d", lnum);
- err = ubifs_return_leb(c, lnum);
- if (err)
- return err;
- return 0;
+ return ubifs_return_leb(c, lnum);
}
/**
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index e4f193eae4b2..e4c4761aff7f 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -677,7 +677,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode,
int err;
err = security_inode_init_security(inode, dentry, qstr,
- &init_xattrs, 0);
+ &init_xattrs, NULL);
if (err) {
struct ubifs_info *c = dentry->i_sb->s_fs_info;
ubifs_err(c, "cannot initialize security for inode %lu, error %d",
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index aa0c47cb0d16..e943370107d0 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -29,6 +29,7 @@
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
+#include <linux/swapops.h>
int sysctl_unprivileged_userfaultfd __read_mostly;
@@ -249,9 +250,10 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
/*
* Lockless access: we're in a wait_event so it's ok if it
- * changes under us.
+ * changes under us. PTE markers should be handled the same as none
+ * ptes here.
*/
- if (huge_pte_none(pte))
+ if (huge_pte_none_mostly(pte))
ret = true;
if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
ret = true;
@@ -330,9 +332,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
pte = pte_offset_map(pmd, address);
/*
* Lockless access: we're in a wait_event so it's ok if it
- * changes under us.
+ * changes under us. PTE markers should be handled the same as none
+ * ptes here.
*/
- if (pte_none(*pte))
+ if (pte_none_mostly(*pte))
ret = true;
if (!pte_write(*pte) && (reason & VM_UFFD_WP))
ret = true;
@@ -1255,24 +1258,6 @@ static __always_inline int validate_range(struct mm_struct *mm,
return 0;
}
-static inline bool vma_can_userfault(struct vm_area_struct *vma,
- unsigned long vm_flags)
-{
- /* FIXME: add WP support to hugetlbfs and shmem */
- if (vm_flags & VM_UFFD_WP) {
- if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma))
- return false;
- }
-
- if (vm_flags & VM_UFFD_MINOR) {
- if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
- return false;
- }
-
- return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
- vma_is_shmem(vma);
-}
-
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
@@ -1954,6 +1939,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
+#ifndef CONFIG_PTE_MARKER_UFFD_WP
+ uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
+#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 04611a1068b4..b056cfc6398e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -102,6 +102,7 @@ xfs-y += xfs_log.o \
xfs_buf_item_recover.o \
xfs_dquot_item_recover.o \
xfs_extfree_item.o \
+ xfs_attr_item.o \
xfs_icreate_item.o \
xfs_inode_item.o \
xfs_inode_item_recover.o \
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 1e4ee042d52f..3e920cf1b454 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -173,7 +173,6 @@ __xfs_free_perag(
struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
- ASSERT(atomic_read(&pag->pag_ref) == 0);
kmem_free(pag);
}
@@ -192,7 +191,7 @@ xfs_free_perag(
pag = radix_tree_delete(&mp->m_perag_tree, agno);
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
- ASSERT(atomic_read(&pag->pag_ref) == 0);
+ XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
cancel_delayed_work_sync(&pag->pag_blockgc_work);
xfs_iunlink_destroy(pag);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index b52ed339727f..d3f2886fdc08 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2511,7 +2511,7 @@ __xfs_free_extent_later(
ASSERT(bno != NULLFSBLOCK);
ASSERT(len > 0);
- ASSERT(len <= MAXEXTLEN);
+ ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
ASSERT(!isnullstartblock(bno));
agno = XFS_FSB_TO_AGNO(mp, bno);
agbno = XFS_FSB_TO_AGBNO(mp, bno);
@@ -2777,7 +2777,7 @@ xfs_alloc_get_freelist(
xfs_agblock_t bno;
__be32 *agfl_bno;
int error;
- int logflags;
+ uint32_t logflags;
struct xfs_mount *mp = tp->t_mountp;
struct xfs_perag *pag;
@@ -2830,9 +2830,9 @@ xfs_alloc_get_freelist(
*/
void
xfs_alloc_log_agf(
- xfs_trans_t *tp, /* transaction pointer */
- struct xfs_buf *bp, /* buffer for a.g. freelist header */
- int fields) /* mask of fields to be logged (XFS_AGF_...) */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint32_t fields)
{
int first; /* first byte offset */
int last; /* last byte offset */
@@ -2902,7 +2902,7 @@ xfs_alloc_put_freelist(
struct xfs_perag *pag;
__be32 *blockp;
int error;
- int logflags;
+ uint32_t logflags;
__be32 *agfl_bno;
int startoff;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d4c057b764f9..84ca09b2223f 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -121,7 +121,7 @@ void
xfs_alloc_log_agf(
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *bp, /* buffer for a.g. freelist header */
- int fields);/* mask of fields to be logged (XFS_AGF_...) */
+ uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */
/*
* Interface for inode allocation to force the pag data to be initialized.
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 23523b802539..1824f61621a2 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -24,6 +24,10 @@
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
+#include "xfs_attr_item.h"
+#include "xfs_xattr.h"
+
+struct kmem_cache *xfs_attr_intent_cache;
/*
* xfs_attr.c
@@ -53,26 +57,22 @@ STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp);
*/
STATIC int xfs_attr_node_get(xfs_da_args_t *args);
STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args);
-STATIC int xfs_attr_node_addname(struct xfs_delattr_context *dac);
-STATIC int xfs_attr_node_addname_find_attr(struct xfs_delattr_context *dac);
-STATIC int xfs_attr_node_addname_clear_incomplete(
- struct xfs_delattr_context *dac);
-STATIC int xfs_attr_node_hasname(xfs_da_args_t *args,
- struct xfs_da_state **state);
-STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
-STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
-STATIC int xfs_attr_set_iter(struct xfs_delattr_context *dac,
- struct xfs_buf **leaf_bp);
-STATIC int xfs_attr_node_removename(struct xfs_da_args *args,
- struct xfs_da_state *state);
+static int xfs_attr_node_try_addname(struct xfs_attr_intent *attr);
+STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_intent *attr);
+STATIC int xfs_attr_node_remove_attr(struct xfs_attr_intent *attr);
+STATIC int xfs_attr_node_lookup(struct xfs_da_args *args,
+ struct xfs_da_state *state);
int
xfs_inode_hasattr(
struct xfs_inode *ip)
{
- if (!XFS_IFORK_Q(ip) ||
- (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_afp->if_nextents == 0))
+ if (!XFS_IFORK_Q(ip))
+ return 0;
+ if (!ip->i_afp)
+ return 0;
+ if (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_afp->if_nextents == 0)
return 0;
return 1;
}
@@ -97,6 +97,123 @@ xfs_attr_is_leaf(
return imap.br_startoff == 0 && imap.br_blockcount == 1;
}
+/*
+ * XXX (dchinner): name path state saving and refilling is an optimisation to
+ * avoid needing to look up name entries after rolling transactions removing
+ * remote xattr blocks between the name entry lookup and name entry removal.
+ * This optimisation got sidelined when combining the set and remove state
+ * machines, but the code has been left in place because it is worthwhile to
+ * restore the optimisation once the combined state machine paths have settled.
+ *
+ * This comment is a public service announcement to remind Future Dave that he
+ * still needs to restore this code to working order.
+ */
+#if 0
+/*
+ * Fill in the disk block numbers in the state structure for the buffers
+ * that are attached to the state structure.
+ * This is done so that we can quickly reattach ourselves to those buffers
+ * after some set of transaction commits have released these buffers.
+ */
+static int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level;
+
+ trace_xfs_attr_fillstate(state->args);
+
+ /*
+ * Roll down the "path" in the state structure, storing the on-disk
+ * block number for those buffers in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, storing the on-disk
+ * block number for those buffers in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Reattach the buffers to the state structure based on the disk block
+ * numbers stored in the state structure.
+ * This is done after some set of transaction commits have released those
+ * buffers from our grip.
+ */
+static int
+xfs_attr_refillstate(xfs_da_state_t *state)
+{
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level, error;
+
+ trace_xfs_attr_refillstate(state->args);
+
+ /*
+ * Roll down the "path" in the state structure, storing the on-disk
+ * block number for those buffers in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, storing the on-disk
+ * block number for those buffers in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ return 0;
+}
+#else
+static int xfs_attr_fillstate(xfs_da_state_t *state) { return 0; }
+#endif
+
/*========================================================================
* Overall external interface routines.
*========================================================================*/
@@ -166,7 +283,7 @@ xfs_attr_get(
/*
* Calculate how many blocks we need for the new attribute,
*/
-STATIC int
+int
xfs_attr_calc_size(
struct xfs_da_args *args,
int *local)
@@ -199,6 +316,33 @@ xfs_attr_calc_size(
return nblks;
}
+/* Initialize transaction reservation for attr operations */
+void
+xfs_init_attr_trans(
+ struct xfs_da_args *args,
+ struct xfs_trans_res *tres,
+ unsigned int *total)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+
+ if (args->value) {
+ tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres *
+ args->total;
+ tres->tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ tres->tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ *total = args->total;
+ } else {
+ *tres = M_RES(mp)->tr_attrrm;
+ *total = XFS_ATTRRM_SPACE_RES(mp);
+ }
+}
+
+/*
+ * Add an attr to a shortform fork. If there is no space,
+ * xfs_attr_shortform_addname() will convert to leaf format and return -ENOSPC.
+ * to use.
+ */
STATIC int
xfs_attr_try_sf_addname(
struct xfs_inode *dp,
@@ -230,411 +374,487 @@ xfs_attr_try_sf_addname(
return error;
}
-/*
- * Check to see if the attr should be upgraded from non-existent or shortform to
- * single-leaf-block attribute list.
- */
-static inline bool
-xfs_attr_is_shortform(
- struct xfs_inode *ip)
+static int
+xfs_attr_sf_addname(
+ struct xfs_attr_intent *attr)
{
- return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
- (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_afp->if_nextents == 0);
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ int error = 0;
+
+ error = xfs_attr_try_sf_addname(dp, args);
+ if (error != -ENOSPC) {
+ ASSERT(!error || error == -EEXIST);
+ attr->xattri_dela_state = XFS_DAS_DONE;
+ goto out;
+ }
+
+ /*
+ * It won't fit in the shortform, transform to a leaf block. GROT:
+ * another possible req'mt for a double-split btree op.
+ */
+ error = xfs_attr_shortform_to_leaf(args, &attr->xattri_leaf_bp);
+ if (error)
+ return error;
+
+ /*
+ * Prevent the leaf buffer from being unlocked so that a concurrent AIL
+ * push cannot grab the half-baked leaf buffer and run into problems
+ * with the write verifier.
+ */
+ xfs_trans_bhold(args->trans, attr->xattri_leaf_bp);
+ attr->xattri_dela_state = XFS_DAS_LEAF_ADD;
+out:
+ trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp);
+ return error;
}
/*
- * Checks to see if a delayed attribute transaction should be rolled. If so,
- * transaction is finished or rolled as needed.
+ * Handle the state change on completion of a multi-state attr operation.
+ *
+ * If the XFS_DA_OP_REPLACE flag is set, this means the operation was the first
+ * modification in a attr replace operation and we still have to do the second
+ * state, indicated by @replace_state.
+ *
+ * We consume the XFS_DA_OP_REPLACE flag so that when we are called again on
+ * completion of the second half of the attr replace operation we correctly
+ * signal that it is done.
*/
-STATIC int
-xfs_attr_trans_roll(
- struct xfs_delattr_context *dac)
+static enum xfs_delattr_state
+xfs_attr_complete_op(
+ struct xfs_attr_intent *attr,
+ enum xfs_delattr_state replace_state)
{
- struct xfs_da_args *args = dac->da_args;
- int error;
+ struct xfs_da_args *args = attr->xattri_da_args;
+ bool do_replace = args->op_flags & XFS_DA_OP_REPLACE;
+
+ args->op_flags &= ~XFS_DA_OP_REPLACE;
+ if (do_replace) {
+ args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ return replace_state;
+ }
+ return XFS_DAS_DONE;
+}
+
+static int
+xfs_attr_leaf_addname(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ ASSERT(xfs_attr_is_leaf(args->dp));
+
+ /*
+ * Use the leaf buffer we may already hold locked as a result of
+ * a sf-to-leaf conversion. The held buffer is no longer valid
+ * after this call, regardless of the result.
+ */
+ error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp);
+ attr->xattri_leaf_bp = NULL;
+
+ if (error == -ENOSPC) {
+ error = xfs_attr3_leaf_to_node(args);
+ if (error)
+ return error;
- if (dac->flags & XFS_DAC_DEFER_FINISH) {
/*
- * The caller wants us to finish all the deferred ops so that we
- * avoid pinning the log tail with a large number of deferred
- * ops.
+ * We're not in leaf format anymore, so roll the transaction and
+ * retry the add to the newly allocated node block.
*/
- dac->flags &= ~XFS_DAC_DEFER_FINISH;
- error = xfs_defer_finish(&args->trans);
- } else
- error = xfs_trans_roll_inode(&args->trans, args->dp);
+ attr->xattri_dela_state = XFS_DAS_NODE_ADD;
+ goto out;
+ }
+ if (error)
+ return error;
+ /*
+ * We need to commit and roll if we need to allocate remote xattr blocks
+ * or perform more xattr manipulations. Otherwise there is nothing more
+ * to do and we can return success.
+ */
+ if (args->rmtblkno)
+ attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT;
+ else
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_LEAF_REPLACE);
+out:
+ trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp);
return error;
}
/*
- * Set the attribute specified in @args.
+ * Add an entry to a node format attr tree.
+ *
+ * Note that we might still have a leaf here - xfs_attr_is_leaf() cannot tell
+ * the difference between leaf + remote attr blocks and a node format tree,
+ * so we may still end up having to convert from leaf to node format here.
*/
-int
-xfs_attr_set_args(
- struct xfs_da_args *args)
+static int
+xfs_attr_node_addname(
+ struct xfs_attr_intent *attr)
{
- struct xfs_buf *leaf_bp = NULL;
- int error = 0;
- struct xfs_delattr_context dac = {
- .da_args = args,
- };
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
- do {
- error = xfs_attr_set_iter(&dac, &leaf_bp);
- if (error != -EAGAIN)
- break;
+ ASSERT(!attr->xattri_leaf_bp);
+
+ error = xfs_attr_node_addname_find_attr(attr);
+ if (error)
+ return error;
- error = xfs_attr_trans_roll(&dac);
- if (error) {
- if (leaf_bp)
- xfs_trans_brelse(args->trans, leaf_bp);
+ error = xfs_attr_node_try_addname(attr);
+ if (error == -ENOSPC) {
+ error = xfs_attr3_leaf_to_node(args);
+ if (error)
return error;
- }
- } while (true);
+ /*
+ * No state change, we really are in node form now
+ * but we need the transaction rolled to continue.
+ */
+ goto out;
+ }
+ if (error)
+ return error;
+ if (args->rmtblkno)
+ attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT;
+ else
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_NODE_REPLACE);
+out:
+ trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp);
return error;
}
-STATIC int
-xfs_attr_sf_addname(
- struct xfs_delattr_context *dac,
- struct xfs_buf **leaf_bp)
+static int
+xfs_attr_rmtval_alloc(
+ struct xfs_attr_intent *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_inode *dp = args->dp;
+ struct xfs_da_args *args = attr->xattri_da_args;
int error = 0;
/*
- * Try to add the attr to the attribute list in the inode.
+ * If there was an out-of-line value, allocate the blocks we
+ * identified for its storage and copy the value. This is done
+ * after we create the attribute so that we don't overflow the
+ * maximum size of a transaction and/or hit a deadlock.
*/
- error = xfs_attr_try_sf_addname(dp, args);
+ if (attr->xattri_blkcnt > 0) {
+ error = xfs_attr_rmtval_set_blk(attr);
+ if (error)
+ return error;
+ /* Roll the transaction only if there is more to allocate. */
+ if (attr->xattri_blkcnt > 0)
+ goto out;
+ }
- /* Should only be 0, -EEXIST or -ENOSPC */
- if (error != -ENOSPC)
+ error = xfs_attr_rmtval_set_value(args);
+ if (error)
return error;
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ ++attr->xattri_dela_state);
/*
- * It won't fit in the shortform, transform to a leaf block. GROT:
- * another possible req'mt for a double-split btree op.
+ * If we are not doing a rename, we've finished the operation but still
+ * have to clear the incomplete flag protecting the new attr from
+ * exposing partially initialised state if we crash during creation.
*/
- error = xfs_attr_shortform_to_leaf(args, leaf_bp);
- if (error)
- return error;
+ if (attr->xattri_dela_state == XFS_DAS_DONE)
+ error = xfs_attr3_leaf_clearflag(args);
+out:
+ trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp);
+ return error;
+}
+
+/*
+ * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers
+ * for later deletion of the entry.
+ */
+static int
+xfs_attr_leaf_mark_incomplete(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ int error;
/*
- * Prevent the leaf buffer from being unlocked so that a concurrent AIL
- * push cannot grab the half-baked leaf buffer and run into problems
- * with the write verifier.
+ * Fill in disk block numbers in the state structure
+ * so that we can get the buffers back after we commit
+ * several transactions in the following calls.
*/
- xfs_trans_bhold(args->trans, *leaf_bp);
+ error = xfs_attr_fillstate(state);
+ if (error)
+ return error;
/*
- * We're still in XFS_DAS_UNINIT state here. We've converted
- * the attr fork to leaf format and will restart with the leaf
- * add.
+ * Mark the attribute as INCOMPLETE
*/
- trace_xfs_attr_sf_addname_return(XFS_DAS_UNINIT, args->dp);
- dac->flags |= XFS_DAC_DEFER_FINISH;
- return -EAGAIN;
+ return xfs_attr3_leaf_setflag(args);
+}
+
+/* Ensure the da state of an xattr deferred work item is ready to go. */
+static inline void
+xfs_attr_item_init_da_state(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+
+ if (!attr->xattri_da_state)
+ attr->xattri_da_state = xfs_da_state_alloc(args);
+ else
+ xfs_da_state_reset(attr->xattri_da_state, args);
}
/*
- * Set the attribute specified in @args.
- * This routine is meant to function as a delayed operation, and may return
- * -EAGAIN when the transaction needs to be rolled. Calling functions will need
- * to handle this, and recall the function until a successful error code is
- * returned.
+ * Initial setup for xfs_attr_node_removename. Make sure the attr is there and
+ * the blocks are valid. Attr keys with remote blocks will be marked
+ * incomplete.
*/
-int
-xfs_attr_set_iter(
- struct xfs_delattr_context *dac,
- struct xfs_buf **leaf_bp)
+static
+int xfs_attr_node_removename_setup(
+ struct xfs_attr_intent *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_inode *dp = args->dp;
- struct xfs_buf *bp = NULL;
- int forkoff, error = 0;
-
- /* State machine switch */
- switch (dac->dela_state) {
- case XFS_DAS_UNINIT:
- /*
- * If the fork is shortform, attempt to add the attr. If there
- * is no space, this converts to leaf format and returns
- * -EAGAIN with the leaf buffer held across the roll. The caller
- * will deal with a transaction roll error, but otherwise
- * release the hold once we return with a clean transaction.
- */
- if (xfs_attr_is_shortform(dp))
- return xfs_attr_sf_addname(dac, leaf_bp);
- if (*leaf_bp != NULL) {
- xfs_trans_bhold_release(args->trans, *leaf_bp);
- *leaf_bp = NULL;
- }
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_da_state *state;
+ int error;
- if (xfs_attr_is_leaf(dp)) {
- error = xfs_attr_leaf_try_add(args, *leaf_bp);
- if (error == -ENOSPC) {
- error = xfs_attr3_leaf_to_node(args);
- if (error)
- return error;
-
- /*
- * Finish any deferred work items and roll the
- * transaction once more. The goal here is to
- * call node_addname with the inode and
- * transaction in the same state (inode locked
- * and joined, transaction clean) no matter how
- * we got to this step.
- *
- * At this point, we are still in
- * XFS_DAS_UNINIT, but when we come back, we'll
- * be a node, so we'll fall down into the node
- * handling code below
- */
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
- } else if (error) {
- return error;
- }
+ xfs_attr_item_init_da_state(attr);
+ error = xfs_attr_node_lookup(args, attr->xattri_da_state);
+ if (error != -EEXIST)
+ goto out;
+ error = 0;
- dac->dela_state = XFS_DAS_FOUND_LBLK;
- } else {
- error = xfs_attr_node_addname_find_attr(dac);
- if (error)
- return error;
+ state = attr->xattri_da_state;
+ ASSERT(state->path.blk[state->path.active - 1].bp != NULL);
+ ASSERT(state->path.blk[state->path.active - 1].magic ==
+ XFS_ATTR_LEAF_MAGIC);
- error = xfs_attr_node_addname(dac);
- if (error)
- return error;
+ error = xfs_attr_leaf_mark_incomplete(args, state);
+ if (error)
+ goto out;
+ if (args->rmtblkno > 0)
+ error = xfs_attr_rmtval_invalidate(args);
+out:
+ if (error) {
+ xfs_da_state_free(attr->xattri_da_state);
+ attr->xattri_da_state = NULL;
+ }
- dac->dela_state = XFS_DAS_FOUND_NBLK;
- }
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- case XFS_DAS_FOUND_LBLK:
- /*
- * If there was an out-of-line value, allocate the blocks we
- * identified for its storage and copy the value. This is done
- * after we create the attribute so that we don't overflow the
- * maximum size of a transaction and/or hit a deadlock.
- */
+ return error;
+}
- /* Open coded xfs_attr_rmtval_set without trans handling */
- if ((dac->flags & XFS_DAC_LEAF_ADDNAME_INIT) == 0) {
- dac->flags |= XFS_DAC_LEAF_ADDNAME_INIT;
- if (args->rmtblkno > 0) {
- error = xfs_attr_rmtval_find_space(dac);
- if (error)
- return error;
- }
- }
+/*
+ * Remove the original attr we have just replaced. This is dependent on the
+ * original lookup and insert placing the old attr in args->blkno/args->index
+ * and the new attr in args->blkno2/args->index2.
+ */
+static int
+xfs_attr_leaf_remove_attr(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = NULL;
+ int forkoff;
+ int error;
- /*
- * Repeat allocating remote blocks for the attr value until
- * blkcnt drops to zero.
- */
- if (dac->blkcnt > 0) {
- error = xfs_attr_rmtval_set_blk(dac);
- if (error)
- return error;
- trace_xfs_attr_set_iter_return(dac->dela_state,
- args->dp);
- return -EAGAIN;
- }
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+ &bp);
+ if (error)
+ return error;
- error = xfs_attr_rmtval_set_value(args);
- if (error)
- return error;
+ xfs_attr3_leaf_remove(bp, args);
- /*
- * If this is not a rename, clear the incomplete flag and we're
- * done.
- */
- if (!(args->op_flags & XFS_DA_OP_RENAME)) {
- if (args->rmtblkno > 0)
- error = xfs_attr3_leaf_clearflag(args);
- return error;
- }
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff)
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
- /*
- * If this is an atomic rename operation, we must "flip" the
- * incomplete flags on the "new" and "old" attribute/value pairs
- * so that one disappears and one appears atomically. Then we
- * must remove the "old" attribute/value pair.
- *
- * In a separate transaction, set the incomplete flag on the
- * "old" attr and clear the incomplete flag on the "new" attr.
- */
- error = xfs_attr3_leaf_flipflags(args);
- if (error)
- return error;
- /*
- * Commit the flag value change and start the next trans in
- * series.
- */
- dac->dela_state = XFS_DAS_FLIP_LFLAG;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- case XFS_DAS_FLIP_LFLAG:
- /*
- * Dismantle the "old" attribute/value pair by removing a
- * "remote" value (if it exists).
- */
- xfs_attr_restore_rmt_blk(args);
- error = xfs_attr_rmtval_invalidate(args);
- if (error)
- return error;
+ return error;
+}
- fallthrough;
- case XFS_DAS_RM_LBLK:
- /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
- dac->dela_state = XFS_DAS_RM_LBLK;
- if (args->rmtblkno) {
- error = xfs_attr_rmtval_remove(dac);
- if (error == -EAGAIN)
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
- if (error)
- return error;
+/*
+ * Shrink an attribute from leaf to shortform. Used by the node format remove
+ * path when the node format collapses to a single block and so we have to check
+ * if it can be collapsed further.
+ */
+static int
+xfs_attr_leaf_shrink(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp;
+ int forkoff;
+ int error;
- dac->dela_state = XFS_DAS_RD_LEAF;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- }
+ if (!xfs_attr_is_leaf(dp))
+ return 0;
- fallthrough;
- case XFS_DAS_RD_LEAF:
- /*
- * This is the last step for leaf format. Read the block with
- * the old attr, remove the old attr, check for shortform
- * conversion and return.
- */
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
- &bp);
- if (error)
- return error;
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
- xfs_attr3_leaf_remove(bp, args);
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff) {
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+ } else {
+ xfs_trans_brelse(args->trans, bp);
+ }
- forkoff = xfs_attr_shortform_allfit(bp, dp);
- if (forkoff)
- error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
- /* bp is gone due to xfs_da_shrink_inode */
+ return error;
+}
- return error;
+/*
+ * Run the attribute operation specified in @attr.
+ *
+ * This routine is meant to function as a delayed operation and will set the
+ * state to XFS_DAS_DONE when the operation is complete. Calling functions will
+ * need to handle this, and recall the function until either an error or
+ * XFS_DAS_DONE is detected.
+ */
+int
+xfs_attr_set_iter(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error = 0;
- case XFS_DAS_FOUND_NBLK:
- /*
- * Find space for remote blocks and fall into the allocation
- * state.
- */
- if (args->rmtblkno > 0) {
- error = xfs_attr_rmtval_find_space(dac);
- if (error)
- return error;
+ /* State machine switch */
+next_state:
+ switch (attr->xattri_dela_state) {
+ case XFS_DAS_UNINIT:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ case XFS_DAS_SF_ADD:
+ return xfs_attr_sf_addname(attr);
+ case XFS_DAS_LEAF_ADD:
+ return xfs_attr_leaf_addname(attr);
+ case XFS_DAS_NODE_ADD:
+ return xfs_attr_node_addname(attr);
+
+ case XFS_DAS_SF_REMOVE:
+ error = xfs_attr_sf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ case XFS_DAS_LEAF_REMOVE:
+ error = xfs_attr_leaf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ case XFS_DAS_NODE_REMOVE:
+ error = xfs_attr_node_removename_setup(attr);
+ if (error == -ENOATTR &&
+ (args->op_flags & XFS_DA_OP_RECOVERY)) {
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ error = 0;
+ break;
}
+ if (error)
+ return error;
+ attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT;
+ if (args->rmtblkno == 0)
+ attr->xattri_dela_state++;
+ break;
+ case XFS_DAS_LEAF_SET_RMT:
+ case XFS_DAS_NODE_SET_RMT:
+ error = xfs_attr_rmtval_find_space(attr);
+ if (error)
+ return error;
+ attr->xattri_dela_state++;
fallthrough;
- case XFS_DAS_ALLOC_NODE:
- /*
- * If there was an out-of-line value, allocate the blocks we
- * identified for its storage and copy the value. This is done
- * after we create the attribute so that we don't overflow the
- * maximum size of a transaction and/or hit a deadlock.
- */
- dac->dela_state = XFS_DAS_ALLOC_NODE;
- if (args->rmtblkno > 0) {
- if (dac->blkcnt > 0) {
- error = xfs_attr_rmtval_set_blk(dac);
- if (error)
- return error;
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
- }
-
- error = xfs_attr_rmtval_set_value(args);
- if (error)
- return error;
- }
- /*
- * If this was not a rename, clear the incomplete flag and we're
- * done.
- */
- if (!(args->op_flags & XFS_DA_OP_RENAME)) {
- if (args->rmtblkno > 0)
- error = xfs_attr3_leaf_clearflag(args);
- goto out;
- }
+ case XFS_DAS_LEAF_ALLOC_RMT:
+ case XFS_DAS_NODE_ALLOC_RMT:
+ error = xfs_attr_rmtval_alloc(attr);
+ if (error)
+ return error;
+ if (attr->xattri_dela_state == XFS_DAS_DONE)
+ break;
+ goto next_state;
+ case XFS_DAS_LEAF_REPLACE:
+ case XFS_DAS_NODE_REPLACE:
/*
- * If this is an atomic rename operation, we must "flip" the
- * incomplete flags on the "new" and "old" attribute/value pairs
- * so that one disappears and one appears atomically. Then we
- * must remove the "old" attribute/value pair.
- *
- * In a separate transaction, set the incomplete flag on the
- * "old" attr and clear the incomplete flag on the "new" attr.
+ * We must "flip" the incomplete flags on the "new" and "old"
+ * attribute/value pairs so that one disappears and one appears
+ * atomically.
*/
error = xfs_attr3_leaf_flipflags(args);
if (error)
- goto out;
+ return error;
/*
- * Commit the flag value change and start the next trans in
- * series
+ * We must commit the flag value change now to make it atomic
+ * and then we can start the next trans in series at REMOVE_OLD.
*/
- dac->dela_state = XFS_DAS_FLIP_NFLAG;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
+ attr->xattri_dela_state++;
+ break;
- case XFS_DAS_FLIP_NFLAG:
+ case XFS_DAS_LEAF_REMOVE_OLD:
+ case XFS_DAS_NODE_REMOVE_OLD:
/*
- * Dismantle the "old" attribute/value pair by removing a
- * "remote" value (if it exists).
+ * If we have a remote attr, start the process of removing it
+ * by invalidating any cached buffers.
+ *
+ * If we don't have a remote attr, we skip the remote block
+ * removal state altogether with a second state increment.
*/
xfs_attr_restore_rmt_blk(args);
-
- error = xfs_attr_rmtval_invalidate(args);
- if (error)
- return error;
-
- fallthrough;
- case XFS_DAS_RM_NBLK:
- /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
- dac->dela_state = XFS_DAS_RM_NBLK;
if (args->rmtblkno) {
- error = xfs_attr_rmtval_remove(dac);
- if (error == -EAGAIN)
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
-
+ error = xfs_attr_rmtval_invalidate(args);
if (error)
return error;
+ } else {
+ attr->xattri_dela_state++;
+ }
- dac->dela_state = XFS_DAS_CLR_FLAG;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
+ attr->xattri_dela_state++;
+ goto next_state;
+
+ case XFS_DAS_LEAF_REMOVE_RMT:
+ case XFS_DAS_NODE_REMOVE_RMT:
+ error = xfs_attr_rmtval_remove(attr);
+ if (error == -EAGAIN) {
+ error = 0;
+ break;
}
+ if (error)
+ return error;
- fallthrough;
- case XFS_DAS_CLR_FLAG:
/*
- * The last state for node format. Look up the old attr and
- * remove it.
+ * We've finished removing the remote attr blocks, so commit the
+ * transaction and move on to removing the attr name from the
+ * leaf/node block. Removing the attr might require a full
+ * transaction reservation for btree block freeing, so we
+ * can't do that in the same transaction where we removed the
+ * remote attr blocks.
*/
- error = xfs_attr_node_addname_clear_incomplete(dac);
+ attr->xattri_dela_state++;
+ break;
+
+ case XFS_DAS_LEAF_REMOVE_ATTR:
+ error = xfs_attr_leaf_remove_attr(attr);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+
+ case XFS_DAS_NODE_REMOVE_ATTR:
+ error = xfs_attr_node_remove_attr(attr);
+ if (!error)
+ error = xfs_attr_leaf_shrink(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
break;
default:
ASSERT(0);
break;
}
-out:
+
+ trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp);
return error;
}
@@ -648,6 +868,7 @@ xfs_attr_lookup(
{
struct xfs_inode *dp = args->dp;
struct xfs_buf *bp = NULL;
+ struct xfs_da_state *state;
int error;
if (!xfs_inode_hasattr(dp))
@@ -665,33 +886,85 @@ xfs_attr_lookup(
return error;
}
- return xfs_attr_node_hasname(args, NULL);
+ state = xfs_da_state_alloc(args);
+ error = xfs_attr_node_lookup(args, state);
+ xfs_da_state_free(state);
+ return error;
+}
+
+static int
+xfs_attr_intent_init(
+ struct xfs_da_args *args,
+ unsigned int op_flags, /* op flag (set or remove) */
+ struct xfs_attr_intent **attr) /* new xfs_attr_intent */
+{
+
+ struct xfs_attr_intent *new;
+
+ new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ new->xattri_op_flags = op_flags;
+ new->xattri_da_args = args;
+
+ *attr = new;
+ return 0;
}
-/*
- * Remove the attribute specified in @args.
- */
-int
-xfs_attr_remove_args(
+/* Sets an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_add(
struct xfs_da_args *args)
{
- int error;
- struct xfs_delattr_context dac = {
- .da_args = args,
- };
+ struct xfs_attr_intent *new;
+ int error = 0;
- do {
- error = xfs_attr_remove_iter(&dac);
- if (error != -EAGAIN)
- break;
+ error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new);
+ if (error)
+ return error;
- error = xfs_attr_trans_roll(&dac);
- if (error)
- return error;
+ new->xattri_dela_state = xfs_attr_init_add_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
- } while (true);
+ return 0;
+}
- return error;
+/* Sets an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_replace(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_intent *new;
+ int error = 0;
+
+ error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new);
+ if (error)
+ return error;
+
+ new->xattri_dela_state = xfs_attr_init_replace_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp);
+
+ return 0;
+}
+
+/* Removes an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_remove(
+ struct xfs_da_args *args)
+{
+
+ struct xfs_attr_intent *new;
+ int error;
+
+ error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new);
+ if (error)
+ return error;
+
+ new->xattri_dela_state = xfs_attr_init_remove_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp);
+
+ return 0;
}
/*
@@ -724,14 +997,14 @@ xfs_attr_set(
/*
* We have no control over the attribute names that userspace passes us
* to remove, so we have to allow the name lookup prior to attribute
- * removal to fail as well.
+ * removal to fail as well. Preserve the logged flag, since we need
+ * to pass that through to the logging code.
*/
- args->op_flags = XFS_DA_OP_OKNOENT;
+ args->op_flags = XFS_DA_OP_OKNOENT |
+ (args->op_flags & XFS_DA_OP_LOGGED);
if (args->value) {
XFS_STATS_INC(mp, xs_attr_set);
-
- args->op_flags |= XFS_DA_OP_ADDNAME;
args->total = xfs_attr_calc_size(args, &local);
/*
@@ -748,20 +1021,10 @@ xfs_attr_set(
return error;
}
- tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
- M_RES(mp)->tr_attrsetrt.tr_logres *
- args->total;
- tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- total = args->total;
-
if (!local)
rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen);
} else {
XFS_STATS_INC(mp, xs_attr_remove);
-
- tres = M_RES(mp)->tr_attrrm;
- total = XFS_ATTRRM_SPACE_RES(mp);
rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
}
@@ -769,6 +1032,7 @@ xfs_attr_set(
* Root fork attributes can use reserved data blocks for this
* operation if necessary
*/
+ xfs_init_attr_trans(args, &tres, &total);
error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans);
if (error)
return error;
@@ -776,33 +1040,43 @@ xfs_attr_set(
if (args->value || xfs_inode_hasattr(dp)) {
error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(args->trans, dp,
+ XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
if (error)
goto out_trans_cancel;
}
error = xfs_attr_lookup(args);
- if (args->value) {
- if (error == -EEXIST && (args->attr_flags & XATTR_CREATE))
- goto out_trans_cancel;
- if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- goto out_trans_cancel;
- if (error != -ENOATTR && error != -EEXIST)
+ switch (error) {
+ case -EEXIST:
+ /* if no value, we are performing a remove operation */
+ if (!args->value) {
+ error = xfs_attr_defer_remove(args);
+ break;
+ }
+ /* Pure create fails if the attr already exists */
+ if (args->attr_flags & XATTR_CREATE)
goto out_trans_cancel;
- error = xfs_attr_set_args(args);
- if (error)
- goto out_trans_cancel;
- /* shortform attribute has already been committed */
- if (!args->trans)
- goto out_unlock;
- } else {
- if (error != -EEXIST)
+ error = xfs_attr_defer_replace(args);
+ break;
+ case -ENOATTR:
+ /* Can't remove what isn't there. */
+ if (!args->value)
goto out_trans_cancel;
- error = xfs_attr_remove_args(args);
- if (error)
+ /* Pure replace fails if no existing attr to replace. */
+ if (args->attr_flags & XATTR_REPLACE)
goto out_trans_cancel;
+
+ error = xfs_attr_defer_add(args);
+ break;
+ default:
+ goto out_trans_cancel;
}
+ if (error)
+ goto out_trans_cancel;
/*
* If this is a synchronous mount, make sure that the
@@ -845,28 +1119,41 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
* Add a name to the shortform attribute list structure
* This is the external routine.
*/
-STATIC int
-xfs_attr_shortform_addname(xfs_da_args_t *args)
+static int
+xfs_attr_shortform_addname(
+ struct xfs_da_args *args)
{
- int newsize, forkoff, retval;
+ int newsize, forkoff;
+ int error;
trace_xfs_attr_sf_addname(args);
- retval = xfs_attr_shortform_lookup(args);
- if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- return retval;
- if (retval == -EEXIST) {
- if (args->attr_flags & XATTR_CREATE)
- return retval;
- retval = xfs_attr_sf_removename(args);
- if (retval)
- return retval;
+ error = xfs_attr_shortform_lookup(args);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ return error;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
+ return error;
+
+ error = xfs_attr_sf_removename(args);
+ if (error)
+ return error;
+
/*
- * Since we have removed the old attr, clear ATTR_REPLACE so
- * that the leaf format add routine won't trip over the attr
- * not being around.
+ * Since we have removed the old attr, clear XFS_DA_OP_REPLACE
+ * so that the new attr doesn't fit in shortform format, the
+ * leaf format add routine won't trip over the attr not being
+ * around.
*/
- args->attr_flags &= ~XATTR_REPLACE;
+ args->op_flags &= ~XFS_DA_OP_REPLACE;
+ break;
+ case 0:
+ break;
+ default:
+ return error;
}
if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
@@ -889,8 +1176,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
* External routines when attribute list is one block
*========================================================================*/
-/* Store info about a remote block */
-STATIC void
+/* Save the current remote block info and clear the current pointers. */
+static void
xfs_attr_save_rmt_blk(
struct xfs_da_args *args)
{
@@ -899,10 +1186,13 @@ xfs_attr_save_rmt_blk(
args->rmtblkno2 = args->rmtblkno;
args->rmtblkcnt2 = args->rmtblkcnt;
args->rmtvaluelen2 = args->rmtvaluelen;
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
+ args->rmtvaluelen = 0;
}
/* Set stored info about a remote block */
-STATIC void
+static void
xfs_attr_restore_rmt_blk(
struct xfs_da_args *args)
{
@@ -928,45 +1218,54 @@ xfs_attr_leaf_try_add(
struct xfs_da_args *args,
struct xfs_buf *bp)
{
- int retval;
+ int error;
/*
- * Look up the given attribute in the leaf block. Figure out if
- * the given flags produce an error or call for an atomic rename.
+ * If the caller provided a buffer to us, it is locked and held in
+ * the transaction because it just did a shortform to leaf conversion.
+ * Hence we don't need to read it again. Otherwise read in the leaf
+ * buffer.
*/
- retval = xfs_attr_leaf_hasname(args, &bp);
- if (retval != -ENOATTR && retval != -EEXIST)
- return retval;
- if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- goto out_brelse;
- if (retval == -EEXIST) {
- if (args->attr_flags & XATTR_CREATE)
+ if (bp) {
+ xfs_trans_bhold_release(args->trans, bp);
+ } else {
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Look up the xattr name to set the insertion point for the new xattr.
+ */
+ error = xfs_attr3_leaf_lookup_int(bp, args);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ goto out_brelse;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
goto out_brelse;
trace_xfs_attr_leaf_replace(args);
-
- /* save the attribute state for later removal*/
- args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
- xfs_attr_save_rmt_blk(args);
-
/*
- * clear the remote attr state now that it is saved so that the
- * values reflect the state of the attribute we are about to
+ * Save the existing remote attr state so that the current
+ * values reflect the state of the new attribute we are about to
* add, not the attribute we just found and will remove later.
*/
- args->rmtblkno = 0;
- args->rmtblkcnt = 0;
- args->rmtvaluelen = 0;
+ xfs_attr_save_rmt_blk(args);
+ break;
+ case 0:
+ break;
+ default:
+ goto out_brelse;
}
- /*
- * Add the attribute to the leaf block
- */
return xfs_attr3_leaf_add(bp, args);
out_brelse:
xfs_trans_brelse(args->trans, bp);
- return retval;
+ return error;
}
/*
@@ -1012,9 +1311,10 @@ xfs_attr_leaf_removename(
dp = args->dp;
error = xfs_attr_leaf_hasname(args, &bp);
-
if (error == -ENOATTR) {
xfs_trans_brelse(args->trans, bp);
+ if (args->op_flags & XFS_DA_OP_RECOVERY)
+ return 0;
return error;
} else if (error != -EEXIST)
return error;
@@ -1062,32 +1362,20 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
return error;
}
-/*
- * Return EEXIST if attr is found, or ENOATTR if not
- * statep: If not null is set to point at the found state. Caller will
- * be responsible for freeing the state in this case.
- */
+/* Return EEXIST if attr is found, or ENOATTR if not. */
STATIC int
-xfs_attr_node_hasname(
+xfs_attr_node_lookup(
struct xfs_da_args *args,
- struct xfs_da_state **statep)
+ struct xfs_da_state *state)
{
- struct xfs_da_state *state;
int retval, error;
- state = xfs_da_state_alloc(args);
- if (statep != NULL)
- *statep = state;
-
/*
* Search to see if name exists, and get back a pointer to it.
*/
error = xfs_da3_node_lookup_int(state, &retval);
if (error)
- retval = error;
-
- if (!statep)
- xfs_da_state_free(state);
+ return error;
return retval;
}
@@ -1098,46 +1386,48 @@ xfs_attr_node_hasname(
STATIC int
xfs_attr_node_addname_find_attr(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_intent *attr)
{
- struct xfs_da_args *args = dac->da_args;
- int retval;
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
/*
* Search to see if name already exists, and get back a pointer
* to where it should go.
*/
- retval = xfs_attr_node_hasname(args, &dac->da_state);
- if (retval != -ENOATTR && retval != -EEXIST)
- goto error;
-
- if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- goto error;
- if (retval == -EEXIST) {
- if (args->attr_flags & XATTR_CREATE)
+ xfs_attr_item_init_da_state(attr);
+ error = xfs_attr_node_lookup(args, attr->xattri_da_state);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ goto error;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
goto error;
- trace_xfs_attr_node_replace(args);
-
- /* save the attribute state for later removal*/
- args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
- xfs_attr_save_rmt_blk(args);
+ trace_xfs_attr_node_replace(args);
/*
- * clear the remote attr state now that it is saved so that the
- * values reflect the state of the attribute we are about to
+ * Save the existing remote attr state so that the current
+ * values reflect the state of the new attribute we are about to
* add, not the attribute we just found and will remove later.
*/
- args->rmtblkno = 0;
- args->rmtblkcnt = 0;
- args->rmtvaluelen = 0;
+ xfs_attr_save_rmt_blk(args);
+ break;
+ case 0:
+ break;
+ default:
+ goto error;
}
return 0;
error:
- if (dac->da_state)
- xfs_da_state_free(dac->da_state);
- return retval;
+ if (attr->xattri_da_state) {
+ xfs_da_state_free(attr->xattri_da_state);
+ attr->xattri_da_state = NULL;
+ }
+ return error;
}
/*
@@ -1146,25 +1436,16 @@ error:
* This will involve walking down the Btree, and may involve splitting
* leaf nodes and even splitting intermediate nodes up to and including
* the root node (a special case of an intermediate node).
- *
- * "Remote" attribute values confuse the issue and atomic rename operations
- * add a whole extra layer of confusion on top of that.
- *
- * This routine is meant to function as a delayed operation, and may return
- * -EAGAIN when the transaction needs to be rolled. Calling functions will need
- * to handle this, and recall the function until a successful error code is
- *returned.
*/
-STATIC int
-xfs_attr_node_addname(
- struct xfs_delattr_context *dac)
+static int
+xfs_attr_node_try_addname(
+ struct xfs_attr_intent *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_da_state *state = dac->da_state;
+ struct xfs_da_state *state = attr->xattri_da_state;
struct xfs_da_state_blk *blk;
int error;
- trace_xfs_attr_node_addname(args);
+ trace_xfs_attr_node_addname(state->args);
blk = &state->path.blk[state->path.active-1];
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
@@ -1175,25 +1456,9 @@ xfs_attr_node_addname(
/*
* Its really a single leaf node, but it had
* out-of-line values so it looked like it *might*
- * have been a b-tree.
- */
- xfs_da_state_free(state);
- state = NULL;
- error = xfs_attr3_leaf_to_node(args);
- if (error)
- goto out;
-
- /*
- * Now that we have converted the leaf to a node, we can
- * roll the transaction, and try xfs_attr3_leaf_add
- * again on re-entry. No need to set dela_state to do
- * this. dela_state is still unset by this function at
- * this point.
+ * have been a b-tree. Let the caller deal with this.
*/
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_node_addname_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
+ goto out;
}
/*
@@ -1205,7 +1470,6 @@ xfs_attr_node_addname(
error = xfs_da3_split(state);
if (error)
goto out;
- dac->flags |= XFS_DAC_DEFER_FINISH;
} else {
/*
* Addition succeeded, update Btree hashvals.
@@ -1214,148 +1478,12 @@ xfs_attr_node_addname(
}
out:
- if (state)
- xfs_da_state_free(state);
+ xfs_da_state_free(state);
+ attr->xattri_da_state = NULL;
return error;
}
-
-STATIC int
-xfs_attr_node_addname_clear_incomplete(
- struct xfs_delattr_context *dac)
-{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_da_state *state = NULL;
- int retval = 0;
- int error = 0;
-
- /*
- * Re-find the "old" attribute entry after any split ops. The INCOMPLETE
- * flag means that we will find the "old" attr, not the "new" one.
- */
- args->attr_filter |= XFS_ATTR_INCOMPLETE;
- state = xfs_da_state_alloc(args);
- state->inleaf = 0;
- error = xfs_da3_node_lookup_int(state, &retval);
- if (error)
- goto out;
-
- error = xfs_attr_node_removename(args, state);
-
- /*
- * Check to see if the tree needs to be collapsed.
- */
- if (retval && (state->path.active > 1)) {
- error = xfs_da3_join(state);
- if (error)
- goto out;
- }
- retval = error = 0;
-
-out:
- if (state)
- xfs_da_state_free(state);
- if (error)
- return error;
- return retval;
-}
-
-/*
- * Shrink an attribute from leaf to shortform
- */
-STATIC int
-xfs_attr_node_shrink(
- struct xfs_da_args *args,
- struct xfs_da_state *state)
-{
- struct xfs_inode *dp = args->dp;
- int error, forkoff;
- struct xfs_buf *bp;
-
- /*
- * Have to get rid of the copy of this dabuf in the state.
- */
- ASSERT(state->path.active == 1);
- ASSERT(state->path.blk[0].bp);
- state->path.blk[0].bp = NULL;
-
- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
- if (error)
- return error;
-
- forkoff = xfs_attr_shortform_allfit(bp, dp);
- if (forkoff) {
- error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
- /* bp is gone due to xfs_da_shrink_inode */
- } else
- xfs_trans_brelse(args->trans, bp);
-
- return error;
-}
-
-/*
- * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers
- * for later deletion of the entry.
- */
-STATIC int
-xfs_attr_leaf_mark_incomplete(
- struct xfs_da_args *args,
- struct xfs_da_state *state)
-{
- int error;
-
- /*
- * Fill in disk block numbers in the state structure
- * so that we can get the buffers back after we commit
- * several transactions in the following calls.
- */
- error = xfs_attr_fillstate(state);
- if (error)
- return error;
-
- /*
- * Mark the attribute as INCOMPLETE
- */
- return xfs_attr3_leaf_setflag(args);
-}
-
-/*
- * Initial setup for xfs_attr_node_removename. Make sure the attr is there and
- * the blocks are valid. Attr keys with remote blocks will be marked
- * incomplete.
- */
-STATIC
-int xfs_attr_node_removename_setup(
- struct xfs_delattr_context *dac)
-{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_da_state **state = &dac->da_state;
- int error;
-
- error = xfs_attr_node_hasname(args, state);
- if (error != -EEXIST)
- goto out;
- error = 0;
-
- ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL);
- ASSERT((*state)->path.blk[(*state)->path.active - 1].magic ==
- XFS_ATTR_LEAF_MAGIC);
-
- if (args->rmtblkno > 0) {
- error = xfs_attr_leaf_mark_incomplete(args, *state);
- if (error)
- goto out;
-
- error = xfs_attr_rmtval_invalidate(args);
- }
-out:
- if (error)
- xfs_da_state_free(*state);
-
- return error;
-}
-
-STATIC int
+static int
xfs_attr_node_removename(
struct xfs_da_args *args,
struct xfs_da_state *state)
@@ -1374,246 +1502,42 @@ xfs_attr_node_removename(
return retval;
}
-/*
- * Remove the attribute specified in @args.
- *
- * This will involve walking down the Btree, and may involve joining
- * leaf nodes and even joining intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- *
- * This routine is meant to function as either an in-line or delayed operation,
- * and may return -EAGAIN when the transaction needs to be rolled. Calling
- * functions will need to handle this, and call the function until a
- * successful error code is returned.
- */
-int
-xfs_attr_remove_iter(
- struct xfs_delattr_context *dac)
-{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_da_state *state = dac->da_state;
- int retval, error = 0;
- struct xfs_inode *dp = args->dp;
-
- trace_xfs_attr_node_removename(args);
-
- switch (dac->dela_state) {
- case XFS_DAS_UNINIT:
- if (!xfs_inode_hasattr(dp))
- return -ENOATTR;
-
- /*
- * Shortform or leaf formats don't require transaction rolls and
- * thus state transitions. Call the right helper and return.
- */
- if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
- return xfs_attr_sf_removename(args);
-
- if (xfs_attr_is_leaf(dp))
- return xfs_attr_leaf_removename(args);
-
- /*
- * Node format may require transaction rolls. Set up the
- * state context and fall into the state machine.
- */
- if (!dac->da_state) {
- error = xfs_attr_node_removename_setup(dac);
- if (error)
- return error;
- state = dac->da_state;
- }
-
- fallthrough;
- case XFS_DAS_RMTBLK:
- dac->dela_state = XFS_DAS_RMTBLK;
-
- /*
- * If there is an out-of-line value, de-allocate the blocks.
- * This is done before we remove the attribute so that we don't
- * overflow the maximum size of a transaction and/or hit a
- * deadlock.
- */
- if (args->rmtblkno > 0) {
- /*
- * May return -EAGAIN. Roll and repeat until all remote
- * blocks are removed.
- */
- error = xfs_attr_rmtval_remove(dac);
- if (error == -EAGAIN) {
- trace_xfs_attr_remove_iter_return(
- dac->dela_state, args->dp);
- return error;
- } else if (error) {
- goto out;
- }
-
- /*
- * Refill the state structure with buffers (the prior
- * calls released our buffers) and close out this
- * transaction before proceeding.
- */
- ASSERT(args->rmtblkno == 0);
- error = xfs_attr_refillstate(state);
- if (error)
- goto out;
- dac->dela_state = XFS_DAS_RM_NAME;
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_remove_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- }
-
- fallthrough;
- case XFS_DAS_RM_NAME:
- /*
- * If we came here fresh from a transaction roll, reattach all
- * the buffers to the current transaction.
- */
- if (dac->dela_state == XFS_DAS_RM_NAME) {
- error = xfs_attr_refillstate(state);
- if (error)
- goto out;
- }
-
- retval = xfs_attr_node_removename(args, state);
-
- /*
- * Check to see if the tree needs to be collapsed. If so, roll
- * the transacton and fall into the shrink state.
- */
- if (retval && (state->path.active > 1)) {
- error = xfs_da3_join(state);
- if (error)
- goto out;
-
- dac->flags |= XFS_DAC_DEFER_FINISH;
- dac->dela_state = XFS_DAS_RM_SHRINK;
- trace_xfs_attr_remove_iter_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
- }
-
- fallthrough;
- case XFS_DAS_RM_SHRINK:
- /*
- * If the result is small enough, push it all into the inode.
- * This is our final state so it's safe to return a dirty
- * transaction.
- */
- if (xfs_attr_is_leaf(dp))
- error = xfs_attr_node_shrink(args, state);
- ASSERT(error != -EAGAIN);
- break;
- default:
- ASSERT(0);
- error = -EINVAL;
- goto out;
- }
-out:
- if (state)
- xfs_da_state_free(state);
- return error;
-}
-
-/*
- * Fill in the disk block numbers in the state structure for the buffers
- * that are attached to the state structure.
- * This is done so that we can quickly reattach ourselves to those buffers
- * after some set of transaction commits have released these buffers.
- */
-STATIC int
-xfs_attr_fillstate(xfs_da_state_t *state)
+static int
+xfs_attr_node_remove_attr(
+ struct xfs_attr_intent *attr)
{
- xfs_da_state_path_t *path;
- xfs_da_state_blk_t *blk;
- int level;
-
- trace_xfs_attr_fillstate(state->args);
-
- /*
- * Roll down the "path" in the state structure, storing the on-disk
- * block number for those buffers in the "path".
- */
- path = &state->path;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->bp) {
- blk->disk_blkno = xfs_buf_daddr(blk->bp);
- blk->bp = NULL;
- } else {
- blk->disk_blkno = 0;
- }
- }
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_da_state *state = xfs_da_state_alloc(args);
+ int retval = 0;
+ int error = 0;
/*
- * Roll down the "altpath" in the state structure, storing the on-disk
- * block number for those buffers in the "altpath".
+ * The attr we are removing has already been marked incomplete, so
+ * we need to set the filter appropriately to re-find the "old"
+ * attribute entry after any split ops.
*/
- path = &state->altpath;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->bp) {
- blk->disk_blkno = xfs_buf_daddr(blk->bp);
- blk->bp = NULL;
- } else {
- blk->disk_blkno = 0;
- }
- }
-
- return 0;
-}
-
-/*
- * Reattach the buffers to the state structure based on the disk block
- * numbers stored in the state structure.
- * This is done after some set of transaction commits have released those
- * buffers from our grip.
- */
-STATIC int
-xfs_attr_refillstate(xfs_da_state_t *state)
-{
- xfs_da_state_path_t *path;
- xfs_da_state_blk_t *blk;
- int level, error;
-
- trace_xfs_attr_refillstate(state->args);
+ args->attr_filter |= XFS_ATTR_INCOMPLETE;
+ error = xfs_da3_node_lookup_int(state, &retval);
+ if (error)
+ goto out;
- /*
- * Roll down the "path" in the state structure, storing the on-disk
- * block number for those buffers in the "path".
- */
- path = &state->path;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->disk_blkno) {
- error = xfs_da3_node_read_mapped(state->args->trans,
- state->args->dp, blk->disk_blkno,
- &blk->bp, XFS_ATTR_FORK);
- if (error)
- return error;
- } else {
- blk->bp = NULL;
- }
- }
+ error = xfs_attr_node_removename(args, state);
/*
- * Roll down the "altpath" in the state structure, storing the on-disk
- * block number for those buffers in the "altpath".
+ * Check to see if the tree needs to be collapsed.
*/
- path = &state->altpath;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->disk_blkno) {
- error = xfs_da3_node_read_mapped(state->args->trans,
- state->args->dp, blk->disk_blkno,
- &blk->bp, XFS_ATTR_FORK);
- if (error)
- return error;
- } else {
- blk->bp = NULL;
- }
+ if (retval && (state->path.active > 1)) {
+ error = xfs_da3_join(state);
+ if (error)
+ goto out;
}
+ retval = error = 0;
- return 0;
+out:
+ xfs_da_state_free(state);
+ if (error)
+ return error;
+ return retval;
}
/*
@@ -1639,7 +1563,8 @@ xfs_attr_node_get(
/*
* Search to see if name exists, and get back a pointer to it.
*/
- error = xfs_attr_node_hasname(args, &state);
+ state = xfs_da_state_alloc(args);
+ error = xfs_attr_node_lookup(args, state);
if (error != -EEXIST)
goto out_release;
@@ -1658,8 +1583,7 @@ out_release:
state->path.blk[i].bp = NULL;
}
- if (state)
- xfs_da_state_free(state);
+ xfs_da_state_free(state);
return error;
}
@@ -1679,3 +1603,20 @@ xfs_attr_namecheck(
/* There shouldn't be any nulls here */
return !memchr(name, 0, length);
}
+
+int __init
+xfs_attr_intent_init_cache(void)
+{
+ xfs_attr_intent_cache = kmem_cache_create("xfs_attr_intent",
+ sizeof(struct xfs_attr_intent),
+ 0, 0, NULL);
+
+ return xfs_attr_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_attr_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_attr_intent_cache);
+ xfs_attr_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 5e71f719bdd5..b4a2fc77017e 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -425,7 +425,7 @@ struct xfs_attr_list_context {
*/
/*
- * Enum values for xfs_delattr_context.da_state
+ * Enum values for xfs_attr_intent.xattri_da_state
*
* These values are used by delayed attribute operations to keep track of where
* they were before they returned -EAGAIN. A return code of -EAGAIN signals the
@@ -434,46 +434,107 @@ struct xfs_attr_list_context {
* to where it was and resume executing where it left off.
*/
enum xfs_delattr_state {
- XFS_DAS_UNINIT = 0, /* No state has been set yet */
- XFS_DAS_RMTBLK, /* Removing remote blks */
- XFS_DAS_RM_NAME, /* Remove attr name */
- XFS_DAS_RM_SHRINK, /* We are shrinking the tree */
- XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */
- XFS_DAS_FOUND_NBLK, /* We found node blk for attr */
- XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */
- XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */
- XFS_DAS_RD_LEAF, /* Read in the new leaf */
- XFS_DAS_ALLOC_NODE, /* We are allocating node blocks */
- XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */
- XFS_DAS_RM_NBLK, /* A rename is removing node blocks */
- XFS_DAS_CLR_FLAG, /* Clear incomplete flag */
+ XFS_DAS_UNINIT = 0, /* No state has been set yet */
+
+ /*
+ * Initial sequence states. The replace setup code relies on the
+ * ADD and REMOVE states for a specific format to be sequential so
+ * that we can transform the initial operation to be performed
+ * according to the xfs_has_larp() state easily.
+ */
+ XFS_DAS_SF_ADD, /* Initial sf add state */
+ XFS_DAS_SF_REMOVE, /* Initial sf replace/remove state */
+
+ XFS_DAS_LEAF_ADD, /* Initial leaf add state */
+ XFS_DAS_LEAF_REMOVE, /* Initial leaf replace/remove state */
+
+ XFS_DAS_NODE_ADD, /* Initial node add state */
+ XFS_DAS_NODE_REMOVE, /* Initial node replace/remove state */
+
+ /* Leaf state set/replace/remove sequence */
+ XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */
+ XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */
+ XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */
+ XFS_DAS_LEAF_REMOVE_OLD, /* Start removing old attr from leaf */
+ XFS_DAS_LEAF_REMOVE_RMT, /* A rename is removing remote blocks */
+ XFS_DAS_LEAF_REMOVE_ATTR, /* Remove the old attr from a leaf */
+
+ /* Node state sequence, must match leaf state above */
+ XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */
+ XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */
+ XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */
+ XFS_DAS_NODE_REMOVE_OLD, /* Start removing old attr from node */
+ XFS_DAS_NODE_REMOVE_RMT, /* A rename is removing remote blocks */
+ XFS_DAS_NODE_REMOVE_ATTR, /* Remove the old attr from a node */
+
+ XFS_DAS_DONE, /* finished operation */
};
-/*
- * Defines for xfs_delattr_context.flags
- */
-#define XFS_DAC_DEFER_FINISH 0x01 /* finish the transaction */
-#define XFS_DAC_LEAF_ADDNAME_INIT 0x02 /* xfs_attr_leaf_addname init*/
+#define XFS_DAS_STRINGS \
+ { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \
+ { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \
+ { XFS_DAS_SF_REMOVE, "XFS_DAS_SF_REMOVE" }, \
+ { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \
+ { XFS_DAS_LEAF_REMOVE, "XFS_DAS_LEAF_REMOVE" }, \
+ { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \
+ { XFS_DAS_NODE_REMOVE, "XFS_DAS_NODE_REMOVE" }, \
+ { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \
+ { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \
+ { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \
+ { XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \
+ { XFS_DAS_LEAF_REMOVE_RMT, "XFS_DAS_LEAF_REMOVE_RMT" }, \
+ { XFS_DAS_LEAF_REMOVE_ATTR, "XFS_DAS_LEAF_REMOVE_ATTR" }, \
+ { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \
+ { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \
+ { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \
+ { XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \
+ { XFS_DAS_NODE_REMOVE_RMT, "XFS_DAS_NODE_REMOVE_RMT" }, \
+ { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \
+ { XFS_DAS_DONE, "XFS_DAS_DONE" }
+
+struct xfs_attri_log_nameval;
/*
* Context used for keeping track of delayed attribute operations
*/
-struct xfs_delattr_context {
- struct xfs_da_args *da_args;
-
- /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */
- struct xfs_bmbt_irec map;
- xfs_dablk_t lblkno;
- int blkcnt;
+struct xfs_attr_intent {
+ /*
+ * used to log this item to an intent containing a list of attrs to
+ * commit later
+ */
+ struct list_head xattri_list;
/* Used in xfs_attr_node_removename to roll through removing blocks */
- struct xfs_da_state *da_state;
+ struct xfs_da_state *xattri_da_state;
+
+ struct xfs_da_args *xattri_da_args;
+
+ /*
+ * Shared buffer containing the attr name and value so that the logging
+ * code can share large memory buffers between log items.
+ */
+ struct xfs_attri_log_nameval *xattri_nameval;
+
+ /*
+ * Used by xfs_attr_set to hold a leaf buffer across a transaction roll
+ */
+ struct xfs_buf *xattri_leaf_bp;
/* Used to keep track of current state of delayed operation */
- unsigned int flags;
- enum xfs_delattr_state dela_state;
+ enum xfs_delattr_state xattri_dela_state;
+
+ /*
+ * Attr operation being performed - XFS_ATTRI_OP_FLAGS_*
+ */
+ unsigned int xattri_op_flags;
+
+ /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */
+ xfs_dablk_t xattri_lblkno;
+ int xattri_blkcnt;
+ struct xfs_bmbt_irec xattri_map;
};
+
/*========================================================================
* Function prototypes for the kernel.
*========================================================================*/
@@ -489,11 +550,77 @@ bool xfs_attr_is_leaf(struct xfs_inode *ip);
int xfs_attr_get_ilocked(struct xfs_da_args *args);
int xfs_attr_get(struct xfs_da_args *args);
int xfs_attr_set(struct xfs_da_args *args);
-int xfs_attr_set_args(struct xfs_da_args *args);
-int xfs_attr_remove_args(struct xfs_da_args *args);
-int xfs_attr_remove_iter(struct xfs_delattr_context *dac);
+int xfs_attr_set_iter(struct xfs_attr_intent *attr);
+int xfs_attr_remove_iter(struct xfs_attr_intent *attr);
bool xfs_attr_namecheck(const void *name, size_t length);
-void xfs_delattr_context_init(struct xfs_delattr_context *dac,
- struct xfs_da_args *args);
+int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
+void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres,
+ unsigned int *total);
+
+/*
+ * Check to see if the attr should be upgraded from non-existent or shortform to
+ * single-leaf-block attribute list.
+ */
+static inline bool
+xfs_attr_is_shortform(
+ struct xfs_inode *ip)
+{
+ return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
+ (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_afp->if_nextents == 0);
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_add_state(struct xfs_da_args *args)
+{
+ /*
+ * When called from the completion of a attr remove to determine the
+ * next state, the attribute fork may be null. This can occur only occur
+ * on a pure remove, but we grab the next state before we check if a
+ * replace operation is being performed. If we are called from any other
+ * context, i_afp is guaranteed to exist. Hence if the attr fork is
+ * null, we were called from a pure remove operation and so we are done.
+ */
+ if (!args->dp->i_afp)
+ return XFS_DAS_DONE;
+
+ args->op_flags |= XFS_DA_OP_ADDNAME;
+ if (xfs_attr_is_shortform(args->dp))
+ return XFS_DAS_SF_ADD;
+ if (xfs_attr_is_leaf(args->dp))
+ return XFS_DAS_LEAF_ADD;
+ return XFS_DAS_NODE_ADD;
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_remove_state(struct xfs_da_args *args)
+{
+ args->op_flags |= XFS_DA_OP_REMOVE;
+ if (xfs_attr_is_shortform(args->dp))
+ return XFS_DAS_SF_REMOVE;
+ if (xfs_attr_is_leaf(args->dp))
+ return XFS_DAS_LEAF_REMOVE;
+ return XFS_DAS_NODE_REMOVE;
+}
+
+/*
+ * If we are logging the attributes, then we have to start with removal of the
+ * old attribute so that there is always consistent state that we can recover
+ * from if the system goes down part way through. We always log the new attr
+ * value, so even when we remove the attr first we still have the information in
+ * the log to finish the replace operation atomically.
+ */
+static inline enum xfs_delattr_state
+xfs_attr_init_replace_state(struct xfs_da_args *args)
+{
+ args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE;
+ if (args->op_flags & XFS_DA_OP_LOGGED)
+ return xfs_attr_init_remove_state(args);
+ return xfs_attr_init_add_state(args);
+}
+
+extern struct kmem_cache *xfs_attr_intent_cache;
+int __init xfs_attr_intent_init_cache(void);
+void xfs_attr_intent_destroy_cache(void);
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 014daa8c542d..37e7c33f6283 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -28,6 +28,7 @@
#include "xfs_dir2.h"
#include "xfs_log.h"
#include "xfs_ag.h"
+#include "xfs_errortag.h"
/*
@@ -310,6 +311,15 @@ xfs_attr3_leaf_verify(
return fa;
/*
+ * Empty leaf blocks should never occur; they imply the existence of a
+ * software bug that needs fixing. xfs_repair also flags them as a
+ * corruption that needs fixing, so we should never let these go to
+ * disk.
+ */
+ if (ichdr.count == 0)
+ return __this_address;
+
+ /*
* firstused is the block offset of the first name info structure.
* Make sure it doesn't go off the block or crash into the header.
*/
@@ -445,6 +455,14 @@ xfs_attr3_leaf_read(
* Namespace helper routines
*========================================================================*/
+/*
+ * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE
+ * flag on disk - if there's an incomplete attr then recovery needs to tear it
+ * down. If there's no incomplete attr, then recovery needs to tear that attr
+ * down to replace it with the attr that has been logged. In this case, the
+ * INCOMPLETE flag will not be set in attr->attr_filter, but rather
+ * XFS_DA_OP_RECOVERY will be set in args->op_flags.
+ */
static bool
xfs_attr_match(
struct xfs_da_args *args,
@@ -452,14 +470,18 @@ xfs_attr_match(
unsigned char *name,
int flags)
{
+
if (args->namelen != namelen)
return false;
if (memcmp(args->name, name, namelen) != 0)
return false;
- /*
- * If we are looking for incomplete entries, show only those, else only
- * show complete entries.
- */
+
+ /* Recovery ignores the INCOMPLETE flag. */
+ if ((args->op_flags & XFS_DA_OP_RECOVERY) &&
+ args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK))
+ return true;
+
+ /* All remaining matches need to be filtered by INCOMPLETE state. */
if (args->attr_filter !=
(flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE)))
return false;
@@ -798,6 +820,14 @@ xfs_attr_sf_removename(
sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
error = xfs_attr_sf_findname(args, &sfe, &base);
+
+ /*
+ * If we are recovering an operation, finding nothing to
+ * remove is not an error - it just means there was nothing
+ * to clean up.
+ */
+ if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY))
+ return 0;
if (error != -EEXIST)
return error;
size = xfs_attr_sf_entsize(sfe);
@@ -818,7 +848,7 @@ xfs_attr_sf_removename(
totsize -= size;
if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
- !(args->op_flags & XFS_DA_OP_ADDNAME)) {
+ !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) {
xfs_attr_fork_remove(dp, args->trans);
} else {
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
@@ -1127,9 +1157,17 @@ xfs_attr3_leaf_to_shortform(
goto out;
if (forkoff == -1) {
- ASSERT(xfs_has_attr2(dp->i_mount));
- ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
- xfs_attr_fork_remove(dp, args->trans);
+ /*
+ * Don't remove the attr fork if this operation is the first
+ * part of a attr replace operations. We're going to add a new
+ * attr immediately, so we need to keep the attr fork around in
+ * this case.
+ */
+ if (!(args->op_flags & XFS_DA_OP_REPLACE)) {
+ ASSERT(xfs_has_attr2(dp->i_mount));
+ ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
+ xfs_attr_fork_remove(dp, args->trans);
+ }
goto out;
}
@@ -1189,6 +1227,11 @@ xfs_attr3_leaf_to_node(
trace_xfs_attr_leaf_to_node(args);
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
+ error = -EIO;
+ goto out;
+ }
+
error = xfs_da_grow_inode(args, &blkno);
if (error)
goto out;
@@ -1486,8 +1529,9 @@ xfs_attr3_leaf_add_work(
entry->flags = args->attr_filter;
if (tmp)
entry->flags |= XFS_ATTR_LOCAL;
- if (args->op_flags & XFS_DA_OP_RENAME) {
- entry->flags |= XFS_ATTR_INCOMPLETE;
+ if (args->op_flags & XFS_DA_OP_REPLACE) {
+ if (!(args->op_flags & XFS_DA_OP_LOGGED))
+ entry->flags |= XFS_ATTR_INCOMPLETE;
if ((args->blkno2 == args->blkno) &&
(args->index2 <= args->index)) {
args->index2++;
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 83b95be9ded8..7298c148f848 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -568,14 +568,14 @@ xfs_attr_rmtval_stale(
*/
int
xfs_attr_rmtval_find_space(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_intent *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_bmbt_irec *map = &dac->map;
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_bmbt_irec *map = &attr->xattri_map;
int error;
- dac->lblkno = 0;
- dac->blkcnt = 0;
+ attr->xattri_lblkno = 0;
+ attr->xattri_blkcnt = 0;
args->rmtblkcnt = 0;
args->rmtblkno = 0;
memset(map, 0, sizeof(struct xfs_bmbt_irec));
@@ -584,8 +584,8 @@ xfs_attr_rmtval_find_space(
if (error)
return error;
- dac->blkcnt = args->rmtblkcnt;
- dac->lblkno = args->rmtblkno;
+ attr->xattri_blkcnt = args->rmtblkcnt;
+ attr->xattri_lblkno = args->rmtblkno;
return 0;
}
@@ -598,17 +598,18 @@ xfs_attr_rmtval_find_space(
*/
int
xfs_attr_rmtval_set_blk(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_intent *attr)
{
- struct xfs_da_args *args = dac->da_args;
+ struct xfs_da_args *args = attr->xattri_da_args;
struct xfs_inode *dp = args->dp;
- struct xfs_bmbt_irec *map = &dac->map;
+ struct xfs_bmbt_irec *map = &attr->xattri_map;
int nmap;
int error;
nmap = 1;
- error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)dac->lblkno,
- dac->blkcnt, XFS_BMAPI_ATTRFORK, args->total,
+ error = xfs_bmapi_write(args->trans, dp,
+ (xfs_fileoff_t)attr->xattri_lblkno,
+ attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total,
map, &nmap);
if (error)
return error;
@@ -618,8 +619,8 @@ xfs_attr_rmtval_set_blk(
(map->br_startblock != HOLESTARTBLOCK));
/* roll attribute extent map forwards */
- dac->lblkno += map->br_blockcount;
- dac->blkcnt -= map->br_blockcount;
+ attr->xattri_lblkno += map->br_blockcount;
+ attr->xattri_blkcnt -= map->br_blockcount;
return 0;
}
@@ -673,9 +674,9 @@ xfs_attr_rmtval_invalidate(
*/
int
xfs_attr_rmtval_remove(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_intent *attr)
{
- struct xfs_da_args *args = dac->da_args;
+ struct xfs_da_args *args = attr->xattri_da_args;
int error, done;
/*
@@ -695,8 +696,8 @@ xfs_attr_rmtval_remove(
* the parent
*/
if (!done) {
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_rmtval_remove_return(dac->dela_state, args->dp);
+ trace_xfs_attr_rmtval_remove_return(attr->xattri_dela_state,
+ args->dp);
return -EAGAIN;
}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index d72eff30ca18..d097ec6c4dc3 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -12,9 +12,9 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args);
int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
xfs_buf_flags_t incore_flags);
int xfs_attr_rmtval_invalidate(struct xfs_da_args *args);
-int xfs_attr_rmtval_remove(struct xfs_delattr_context *dac);
+int xfs_attr_rmtval_remove(struct xfs_attr_intent *attr);
int xfs_attr_rmt_find_hole(struct xfs_da_args *args);
int xfs_attr_rmtval_set_value(struct xfs_da_args *args);
-int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac);
-int xfs_attr_rmtval_find_space(struct xfs_delattr_context *dac);
+int xfs_attr_rmtval_set_blk(struct xfs_attr_intent *attr);
+int xfs_attr_rmtval_find_space(struct xfs_attr_intent *attr);
#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 74198dd82b03..6833110d1bd4 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -52,19 +52,17 @@ xfs_bmap_compute_maxlevels(
xfs_mount_t *mp, /* file system mount structure */
int whichfork) /* data or attr fork */
{
+ uint64_t maxblocks; /* max blocks at this level */
+ xfs_extnum_t maxleafents; /* max leaf entries possible */
int level; /* btree level */
- uint maxblocks; /* max blocks at this level */
- uint maxleafents; /* max leaf entries possible */
int maxrootrecs; /* max records in root block */
int minleafrecs; /* min records in leaf block */
int minnoderecs; /* min records in node block */
int sz; /* root block size */
/*
- * The maximum number of extents in a file, hence the maximum number of
- * leaf entries, is controlled by the size of the on-disk extent count,
- * either a signed 32-bit number for the data fork, or a signed 16-bit
- * number for the attr fork.
+ * The maximum number of extents in a fork, hence the maximum number of
+ * leaf entries, is controlled by the size of the on-disk extent count.
*
* Note that we can no longer assume that if we are in ATTR1 that the
* fork offset of all the inodes will be
@@ -74,22 +72,22 @@ xfs_bmap_compute_maxlevels(
* ATTR2 we have to assume the worst case scenario of a minimum size
* available.
*/
- if (whichfork == XFS_DATA_FORK) {
- maxleafents = MAXEXTNUM;
+ maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
+ whichfork);
+ if (whichfork == XFS_DATA_FORK)
sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
- } else {
- maxleafents = MAXAEXTNUM;
+ else
sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
- }
+
maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
minleafrecs = mp->m_bmap_dmnr[0];
minnoderecs = mp->m_bmap_dmnr[1];
- maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+ maxblocks = howmany_64(maxleafents, minleafrecs);
for (level = 1; maxblocks > 1; level++) {
if (maxblocks <= maxrootrecs)
maxblocks = 1;
else
- maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+ maxblocks = howmany_64(maxblocks, minnoderecs);
}
mp->m_bm_maxlevels[whichfork] = level;
ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk());
@@ -468,7 +466,7 @@ error0:
if (bp_release)
xfs_trans_brelse(NULL, bp);
error_norelse:
- xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+ xfs_warn(mp, "%s: BAD after btree leaves for %llu extents",
__func__, i);
xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -485,7 +483,7 @@ STATIC void
xfs_bmap_validate_ret(
xfs_fileoff_t bno,
xfs_filblks_t len,
- int flags,
+ uint32_t flags,
xfs_bmbt_irec_t *mval,
int nmap,
int ret_nmap)
@@ -1399,7 +1397,7 @@ xfs_bmap_add_extent_delay_real(
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t da_new; /* new count del alloc blocks used */
xfs_filblks_t da_old; /* old count del alloc blocks used */
xfs_filblks_t temp=0; /* value for da_new calculations */
@@ -1452,7 +1450,7 @@ xfs_bmap_add_extent_delay_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
- LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
/*
@@ -1470,13 +1468,13 @@ xfs_bmap_add_extent_delay_real(
new_endoff == RIGHT.br_startoff &&
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
new->br_state == RIGHT.br_state &&
- new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING)) !=
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= MAXEXTLEN))
+ <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;
error = 0;
@@ -1950,7 +1948,7 @@ xfs_bmap_add_extent_unwritten_real(
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_mount *mp = ip->i_mount;
struct xfs_bmbt_irec old;
@@ -2000,7 +1998,7 @@ xfs_bmap_add_extent_unwritten_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
- LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
/*
@@ -2018,13 +2016,13 @@ xfs_bmap_add_extent_unwritten_real(
new_endoff == RIGHT.br_startoff &&
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
new->br_state == RIGHT.br_state &&
- new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING)) !=
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= MAXEXTLEN))
+ <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;
/*
@@ -2479,7 +2477,7 @@ xfs_bmap_add_extent_hole_delay(
xfs_filblks_t newlen=0; /* new indirect size */
xfs_filblks_t oldlen=0; /* old indirect size */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t temp; /* temp for indirect calculations */
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -2510,15 +2508,15 @@ xfs_bmap_add_extent_hole_delay(
*/
if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
left.br_startoff + left.br_blockcount == new->br_startoff &&
- left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
- new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
(left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= MAXEXTLEN)))
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
state |= BMAP_RIGHT_CONTIG;
/*
@@ -2616,7 +2614,7 @@ xfs_bmap_add_extent_hole_real(
struct xfs_btree_cur **curp,
struct xfs_bmbt_irec *new,
int *logflagsp,
- int flags)
+ uint32_t flags)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
@@ -2626,7 +2624,7 @@ xfs_bmap_add_extent_hole_real(
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
int rval=0; /* return value (logging flags) */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_bmbt_irec old;
ASSERT(!isnullstartblock(new->br_startblock));
@@ -2661,17 +2659,17 @@ xfs_bmap_add_extent_hole_real(
left.br_startoff + left.br_blockcount == new->br_startoff &&
left.br_startblock + left.br_blockcount == new->br_startblock &&
left.br_state == new->br_state &&
- left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
new->br_startblock + new->br_blockcount == right.br_startblock &&
new->br_state == right.br_state &&
- new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= MAXEXTLEN))
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;
error = 0;
@@ -2906,15 +2904,15 @@ xfs_bmap_extsize_align(
/*
* For large extent hint sizes, the aligned extent might be larger than
- * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls
- * the length back under MAXEXTLEN. The outer allocation loops handle
- * short allocation just fine, so it is safe to do this. We only want to
- * do it when we are forced to, though, because it means more allocation
- * operations are required.
+ * XFS_BMBT_MAX_EXTLEN. In that case, reduce the size by an extsz so
+ * that it pulls the length back under XFS_BMBT_MAX_EXTLEN. The outer
+ * allocation loops handle short allocation just fine, so it is safe to
+ * do this. We only want to do it when we are forced to, though, because
+ * it means more allocation operations are required.
*/
- while (align_alen > MAXEXTLEN)
+ while (align_alen > XFS_MAX_BMBT_EXTLEN)
align_alen -= extsz;
- ASSERT(align_alen <= MAXEXTLEN);
+ ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN);
/*
* If the previous block overlaps with this proposed allocation
@@ -3004,9 +3002,9 @@ xfs_bmap_extsize_align(
return -EINVAL;
} else {
ASSERT(orig_off >= align_off);
- /* see MAXEXTLEN handling above */
+ /* see XFS_BMBT_MAX_EXTLEN handling above */
ASSERT(orig_end <= align_off + align_alen ||
- align_alen + extsz > MAXEXTLEN);
+ align_alen + extsz > XFS_MAX_BMBT_EXTLEN);
}
#ifdef DEBUG
@@ -3766,7 +3764,7 @@ xfs_bmapi_trim_map(
xfs_fileoff_t obno,
xfs_fileoff_t end,
int n,
- int flags)
+ uint32_t flags)
{
if ((flags & XFS_BMAPI_ENTIRE) ||
got->br_startoff + got->br_blockcount <= obno) {
@@ -3811,7 +3809,7 @@ xfs_bmapi_update_map(
xfs_fileoff_t obno,
xfs_fileoff_t end,
int *n,
- int flags)
+ uint32_t flags)
{
xfs_bmbt_irec_t *mval = *map;
@@ -3864,7 +3862,7 @@ xfs_bmapi_read(
xfs_filblks_t len,
struct xfs_bmbt_irec *mval,
int *nmap,
- int flags)
+ uint32_t flags)
{
struct xfs_mount *mp = ip->i_mount;
int whichfork = xfs_bmapi_whichfork(flags);
@@ -3971,7 +3969,7 @@ xfs_bmapi_reserve_delalloc(
* Cap the alloc length. Keep track of prealloc so we know whether to
* tag the inode before we return.
*/
- alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN);
+ alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
if (prealloc && alen >= len)
@@ -4104,7 +4102,7 @@ xfs_bmapi_allocate(
if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
bma->prev.br_startoff = NULLFILEOFF;
} else {
- bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
+ bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN);
if (!bma->eof)
bma->length = XFS_FILBLKS_MIN(bma->length,
bma->got.br_startoff - bma->offset);
@@ -4184,7 +4182,7 @@ xfs_bmapi_convert_unwritten(
struct xfs_bmalloca *bma,
struct xfs_bmbt_irec *mval,
xfs_filblks_t len,
- int flags)
+ uint32_t flags)
{
int whichfork = xfs_bmapi_whichfork(flags);
struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
@@ -4312,7 +4310,7 @@ xfs_bmapi_write(
struct xfs_inode *ip, /* incore inode */
xfs_fileoff_t bno, /* starting file offs. mapped */
xfs_filblks_t len, /* length to map in file */
- int flags, /* XFS_BMAPI_... */
+ uint32_t flags, /* XFS_BMAPI_... */
xfs_extlen_t total, /* total blocks needed */
struct xfs_bmbt_irec *mval, /* output: map values */
int *nmap) /* i/o: mval size/count */
@@ -4424,8 +4422,8 @@ xfs_bmapi_write(
* xfs_extlen_t and therefore 32 bits. Hence we have to
* check for 32-bit overflows and handle them here.
*/
- if (len > (xfs_filblks_t)MAXEXTLEN)
- bma.length = MAXEXTLEN;
+ if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN)
+ bma.length = XFS_MAX_BMBT_EXTLEN;
else
bma.length = len;
@@ -4526,14 +4524,16 @@ xfs_bmapi_convert_delalloc(
return error;
xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
error = xfs_iext_count_may_overflow(ip, whichfork,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;
- xfs_trans_ijoin(tp, ip, 0);
-
if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
bma.got.br_startoff > offset_fsb) {
/*
@@ -4560,7 +4560,8 @@ xfs_bmapi_convert_delalloc(
bma.ip = ip;
bma.wasdel = true;
bma.offset = bma.got.br_startoff;
- bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
+ bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount,
+ XFS_MAX_BMBT_EXTLEN);
bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
/*
@@ -4629,7 +4630,7 @@ xfs_bmapi_remap(
xfs_fileoff_t bno,
xfs_filblks_t len,
xfs_fsblock_t startblock,
- int flags)
+ uint32_t flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
@@ -4641,7 +4642,7 @@ xfs_bmapi_remap(
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(len > 0);
- ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
+ ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
XFS_BMAPI_NORMAP)));
@@ -4801,7 +4802,7 @@ xfs_bmap_del_extent_delay(
int64_t da_old, da_new, da_diff = 0;
xfs_fileoff_t del_endoff, got_endoff;
xfs_filblks_t got_indlen, new_indlen, stolen;
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
int error = 0;
bool isrt;
@@ -4926,7 +4927,7 @@ xfs_bmap_del_extent_cow(
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
struct xfs_bmbt_irec new;
xfs_fileoff_t del_endoff, got_endoff;
- int state = BMAP_COWFORK;
+ uint32_t state = BMAP_COWFORK;
XFS_STATS_INC(mp, xs_del_exlist);
@@ -4999,7 +5000,7 @@ xfs_bmap_del_extent_real(
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
int whichfork, /* data or attr fork */
- int bflags) /* bmapi flags */
+ uint32_t bflags) /* bmapi flags */
{
xfs_fsblock_t del_endblock=0; /* first block past del */
xfs_fileoff_t del_endoff; /* first offset past del */
@@ -5015,7 +5016,7 @@ xfs_bmap_del_extent_real(
xfs_bmbt_irec_t new; /* new record to be inserted */
/* REFERENCED */
uint qfield; /* quota field to update */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_bmbt_irec old;
mp = ip->i_mount;
@@ -5148,26 +5149,6 @@ xfs_bmap_del_extent_real(
* Deleting the middle of the extent.
*/
- /*
- * For directories, -ENOSPC is returned since a directory entry
- * remove operation must not fail due to low extent count
- * availability. -ENOSPC will be handled by higher layers of XFS
- * by letting the corresponding empty Data/Free blocks to linger
- * until a future remove operation. Dabtree blocks would be
- * swapped with the last block in the leaf space and then the
- * new last block will be unmapped.
- *
- * The above logic also applies to the source directory entry of
- * a rename operation.
- */
- error = xfs_iext_count_may_overflow(ip, whichfork, 1);
- if (error) {
- ASSERT(S_ISDIR(VFS_I(ip)->i_mode) &&
- whichfork == XFS_DATA_FORK);
- error = -ENOSPC;
- goto done;
- }
-
old = got;
got.br_blockcount = del->br_startoff - got.br_startoff;
@@ -5281,7 +5262,7 @@ __xfs_bunmapi(
struct xfs_inode *ip, /* incore inode */
xfs_fileoff_t start, /* first file offset deleted */
xfs_filblks_t *rlen, /* i/o: amount remaining */
- int flags, /* misc flags */
+ uint32_t flags, /* misc flags */
xfs_extnum_t nexts) /* number of extents max */
{
struct xfs_btree_cur *cur; /* bmap btree cursor */
@@ -5299,7 +5280,6 @@ __xfs_bunmapi(
int whichfork; /* data or attribute fork */
xfs_fsblock_t sum;
xfs_filblks_t len = *rlen; /* length to unmap in file */
- xfs_fileoff_t max_len;
xfs_fileoff_t end;
struct xfs_iext_cursor icur;
bool done = false;
@@ -5318,16 +5298,6 @@ __xfs_bunmapi(
ASSERT(len > 0);
ASSERT(nexts >= 0);
- /*
- * Guesstimate how many blocks we can unmap without running the risk of
- * blowing out the transaction with a mix of EFIs and reflink
- * adjustments.
- */
- if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
- max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
- else
- max_len = len;
-
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
return error;
@@ -5366,7 +5336,7 @@ __xfs_bunmapi(
extno = 0;
while (end != (xfs_fileoff_t)-1 && end >= start &&
- (nexts == 0 || extno < nexts) && max_len > 0) {
+ (nexts == 0 || extno < nexts)) {
/*
* Is the found extent after a hole in which end lives?
* Just back up to the previous extent, if so.
@@ -5400,14 +5370,6 @@ __xfs_bunmapi(
if (del.br_startoff + del.br_blockcount > end + 1)
del.br_blockcount = end + 1 - del.br_startoff;
- /* How much can we safely unmap? */
- if (max_len < del.br_blockcount) {
- del.br_startoff += del.br_blockcount - max_len;
- if (!wasdel)
- del.br_startblock += del.br_blockcount - max_len;
- del.br_blockcount = max_len;
- }
-
if (!isrt)
goto delete;
@@ -5543,7 +5505,6 @@ delete:
if (error)
goto error0;
- max_len -= del.br_blockcount;
end = del.br_startoff - 1;
nodelete:
/*
@@ -5609,7 +5570,7 @@ xfs_bunmapi(
struct xfs_inode *ip,
xfs_fileoff_t bno,
xfs_filblks_t len,
- int flags,
+ uint32_t flags,
xfs_extnum_t nexts,
int *done)
{
@@ -5641,7 +5602,7 @@ xfs_bmse_can_merge(
if ((left->br_startoff + left->br_blockcount != startoff) ||
(left->br_startblock + left->br_blockcount != got->br_startblock) ||
(left->br_state != got->br_state) ||
- (left->br_blockcount + got->br_blockcount > MAXEXTLEN))
+ (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN))
return false;
return true;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 03d9aaf87413..16db95b11589 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -39,7 +39,7 @@ struct xfs_bmalloca {
bool aeof; /* allocated space at eof */
bool conv; /* overwriting unwritten extents */
int datatype;/* data type being allocated */
- int flags;
+ uint32_t flags;
};
#define XFS_BMAP_MAX_NMAP 4
@@ -47,17 +47,17 @@ struct xfs_bmalloca {
/*
* Flags for xfs_bmapi_*
*/
-#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */
-#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */
-#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */
-#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */
-#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */
+#define XFS_BMAPI_ENTIRE (1u << 0) /* return entire extent untrimmed */
+#define XFS_BMAPI_METADATA (1u << 1) /* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK (1u << 2) /* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC (1u << 3) /* preallocating unwritten space */
+#define XFS_BMAPI_CONTIG (1u << 4) /* must allocate only one extent */
/*
* unwritten extent conversion - this needs write cache flushing and no additional
* allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
* from written to unwritten, otherwise convert from unwritten to written.
*/
-#define XFS_BMAPI_CONVERT 0x040
+#define XFS_BMAPI_CONVERT (1u << 5)
/*
* allocate zeroed extents - this requires all newly allocated user data extents
@@ -65,7 +65,7 @@ struct xfs_bmalloca {
* Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
* during the allocation range to zeroed written extents.
*/
-#define XFS_BMAPI_ZERO 0x080
+#define XFS_BMAPI_ZERO (1u << 6)
/*
* Map the inode offset to the block given in ap->firstblock. Primarily
@@ -75,16 +75,16 @@ struct xfs_bmalloca {
* For bunmapi, this flag unmaps the range without adjusting quota, reducing
* refcount, or freeing the blocks.
*/
-#define XFS_BMAPI_REMAP 0x100
+#define XFS_BMAPI_REMAP (1u << 7)
/* Map something in the CoW fork. */
-#define XFS_BMAPI_COWFORK 0x200
+#define XFS_BMAPI_COWFORK (1u << 8)
/* Skip online discard of freed extents */
-#define XFS_BMAPI_NODISCARD 0x1000
+#define XFS_BMAPI_NODISCARD (1u << 9)
/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
-#define XFS_BMAPI_NORMAP 0x2000
+#define XFS_BMAPI_NORMAP (1u << 10)
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
@@ -106,7 +106,7 @@ static inline int xfs_bmapi_aflag(int w)
(w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0));
}
-static inline int xfs_bmapi_whichfork(int bmapi_flags)
+static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags)
{
if (bmapi_flags & XFS_BMAPI_COWFORK)
return XFS_COW_FORK;
@@ -124,16 +124,16 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
/*
* Flags for xfs_bmap_add_extent*.
*/
-#define BMAP_LEFT_CONTIG (1 << 0)
-#define BMAP_RIGHT_CONTIG (1 << 1)
-#define BMAP_LEFT_FILLING (1 << 2)
-#define BMAP_RIGHT_FILLING (1 << 3)
-#define BMAP_LEFT_DELAY (1 << 4)
-#define BMAP_RIGHT_DELAY (1 << 5)
-#define BMAP_LEFT_VALID (1 << 6)
-#define BMAP_RIGHT_VALID (1 << 7)
-#define BMAP_ATTRFORK (1 << 8)
-#define BMAP_COWFORK (1 << 9)
+#define BMAP_LEFT_CONTIG (1u << 0)
+#define BMAP_RIGHT_CONTIG (1u << 1)
+#define BMAP_LEFT_FILLING (1u << 2)
+#define BMAP_RIGHT_FILLING (1u << 3)
+#define BMAP_LEFT_DELAY (1u << 4)
+#define BMAP_RIGHT_DELAY (1u << 5)
+#define BMAP_LEFT_VALID (1u << 6)
+#define BMAP_RIGHT_VALID (1u << 7)
+#define BMAP_ATTRFORK (1u << 8)
+#define BMAP_COWFORK (1u << 9)
#define XFS_BMAP_EXT_FLAGS \
{ BMAP_LEFT_CONTIG, "LC" }, \
@@ -183,15 +183,15 @@ int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
int whichfork);
int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
xfs_filblks_t len, struct xfs_bmbt_irec *mval,
- int *nmap, int flags);
+ int *nmap, uint32_t flags);
int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+ xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap);
int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags,
+ xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags,
xfs_extnum_t nexts);
int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+ xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extnum_t nexts, int *done);
int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
@@ -243,7 +243,7 @@ void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_bmbt_irec *imap);
-static inline int xfs_bmap_fork_to_state(int whichfork)
+static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
{
switch (whichfork) {
case XFS_ATTR_FORK:
@@ -260,7 +260,7 @@ xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
- int flags);
+ uint32_t flags);
extern struct kmem_cache *xfs_bmap_intent_cache;
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 453309fc85f2..2b77d45c215f 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -597,7 +597,11 @@ xfs_bmbt_maxrecs(
return xfs_bmbt_block_maxrecs(blocklen, leaf);
}
-/* Compute the max possible height for block mapping btrees. */
+/*
+ * Calculate the maximum possible height of the btree that the on-disk format
+ * supports. This is used for sizing structures large enough to support every
+ * possible configuration of a filesystem that might get mounted.
+ */
unsigned int
xfs_bmbt_maxlevels_ondisk(void)
{
@@ -611,7 +615,8 @@ xfs_bmbt_maxlevels_ondisk(void)
minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2;
/* One extra level for the inode root. */
- return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1;
+ return xfs_btree_compute_maxlevels(minrecs,
+ XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1;
}
/*
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c1500b238520..2eecc49fc1b2 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -52,6 +52,71 @@ xfs_btree_magic(
}
/*
+ * These sibling pointer checks are optimised for null sibling pointers. This
+ * happens a lot, and we don't need to byte swap at runtime if the sibling
+ * pointer is NULL.
+ *
+ * These are explicitly marked at inline because the cost of calling them as
+ * functions instead of inlining them is about 36 bytes extra code per call site
+ * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these
+ * two sibling check functions reduces the compiled code size by over 300
+ * bytes.
+ */
+static inline xfs_failaddr_t
+xfs_btree_check_lblock_siblings(
+ struct xfs_mount *mp,
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_fsblock_t fsb,
+ __be64 dsibling)
+{
+ xfs_fsblock_t sibling;
+
+ if (dsibling == cpu_to_be64(NULLFSBLOCK))
+ return NULL;
+
+ sibling = be64_to_cpu(dsibling);
+ if (sibling == fsb)
+ return __this_address;
+ if (level >= 0) {
+ if (!xfs_btree_check_lptr(cur, sibling, level + 1))
+ return __this_address;
+ } else {
+ if (!xfs_verify_fsbno(mp, sibling))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+static inline xfs_failaddr_t
+xfs_btree_check_sblock_siblings(
+ struct xfs_mount *mp,
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ __be32 dsibling)
+{
+ xfs_agblock_t sibling;
+
+ if (dsibling == cpu_to_be32(NULLAGBLOCK))
+ return NULL;
+
+ sibling = be32_to_cpu(dsibling);
+ if (sibling == agbno)
+ return __this_address;
+ if (level >= 0) {
+ if (!xfs_btree_check_sptr(cur, sibling, level + 1))
+ return __this_address;
+ } else {
+ if (!xfs_verify_agbno(mp, agno, sibling))
+ return __this_address;
+ }
+ return NULL;
+}
+
+/*
* Check a long btree block header. Return the address of the failing check,
* or NULL if everything is ok.
*/
@@ -65,6 +130,8 @@ __xfs_btree_check_lblock(
struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum;
int crc = xfs_has_crc(mp);
+ xfs_failaddr_t fa;
+ xfs_fsblock_t fsb = NULLFSBLOCK;
if (crc) {
if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
@@ -83,16 +150,16 @@ __xfs_btree_check_lblock(
if (be16_to_cpu(block->bb_numrecs) >
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
- level + 1))
- return __this_address;
- if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
- level + 1))
- return __this_address;
- return NULL;
+ if (bp)
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+
+ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ block->bb_u.l.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ block->bb_u.l.bb_rightsib);
+ return fa;
}
/* Check a long btree block header. */
@@ -130,6 +197,9 @@ __xfs_btree_check_sblock(
struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum;
int crc = xfs_has_crc(mp);
+ xfs_failaddr_t fa;
+ xfs_agblock_t agbno = NULLAGBLOCK;
+ xfs_agnumber_t agno = NULLAGNUMBER;
if (crc) {
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
@@ -146,16 +216,18 @@ __xfs_btree_check_sblock(
if (be16_to_cpu(block->bb_numrecs) >
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
- level + 1))
- return __this_address;
- if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
- level + 1))
- return __this_address;
- return NULL;
+ if (bp) {
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+ agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
+ }
+
+ fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno,
+ block->bb_u.s.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno,
+ agbno, block->bb_u.s.bb_rightsib);
+ return fa;
}
/* Check a short btree block header. */
@@ -373,8 +445,14 @@ xfs_btree_del_cursor(
break;
}
+ /*
+ * If we are doing a BMBT update, the number of unaccounted blocks
+ * allocated during this cursor life time should be zero. If it's not
+ * zero, then we should be shut down or on our way to shutdown due to
+ * cancelling a dirty transaction on error.
+ */
ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
- xfs_is_shutdown(cur->bc_mp));
+ xfs_is_shutdown(cur->bc_mp) || error != 0);
if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
kmem_free(cur->bc_ops);
if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
@@ -751,20 +829,20 @@ xfs_btree_lastrec(
*/
void
xfs_btree_offsets(
- int64_t fields, /* bitmask of fields */
+ uint32_t fields, /* bitmask of fields */
const short *offsets, /* table of field offsets */
int nbits, /* number of bits to inspect */
int *first, /* output: first byte offset */
int *last) /* output: last byte offset */
{
int i; /* current bit number */
- int64_t imask; /* mask for current bit number */
+ uint32_t imask; /* mask for current bit number */
ASSERT(fields != 0);
/*
* Find the lowest bit, so the first byte offset.
*/
- for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
+ for (i = 0, imask = 1u; ; i++, imask <<= 1) {
if (imask & fields) {
*first = offsets[i];
break;
@@ -773,7 +851,7 @@ xfs_btree_offsets(
/*
* Find the highest bit, so the last byte offset.
*/
- for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
+ for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) {
if (imask & fields) {
*last = offsets[i + 1] - 1;
break;
@@ -1456,7 +1534,7 @@ void
xfs_btree_log_block(
struct xfs_btree_cur *cur, /* btree cursor */
struct xfs_buf *bp, /* buffer containing btree block */
- int fields) /* mask of fields: XFS_BB_... */
+ uint32_t fields) /* mask of fields: XFS_BB_... */
{
int first; /* first byte offset logged */
int last; /* last byte offset logged */
@@ -3194,7 +3272,7 @@ xfs_btree_insrec(
struct xfs_btree_block *block; /* btree block */
struct xfs_buf *bp; /* buffer for block */
union xfs_btree_ptr nptr; /* new block ptr */
- struct xfs_btree_cur *ncur; /* new btree cursor */
+ struct xfs_btree_cur *ncur = NULL; /* new btree cursor */
union xfs_btree_key nkey; /* new block key */
union xfs_btree_key *lkey;
int optr; /* old key/record index */
@@ -3274,7 +3352,7 @@ xfs_btree_insrec(
#ifdef DEBUG
error = xfs_btree_check_block(cur, block, level, bp);
if (error)
- return error;
+ goto error0;
#endif
/*
@@ -3294,7 +3372,7 @@ xfs_btree_insrec(
for (i = numrecs - ptr; i >= 0; i--) {
error = xfs_btree_debug_check_ptr(cur, pp, i, level);
if (error)
- return error;
+ goto error0;
}
xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
@@ -3379,6 +3457,8 @@ xfs_btree_insrec(
return 0;
error0:
+ if (ncur)
+ xfs_btree_del_cursor(ncur, error);
return error;
}
@@ -4271,6 +4351,21 @@ xfs_btree_visit_block(
if (xfs_btree_ptr_is_null(cur, &rptr))
return -ENOENT;
+ /*
+ * We only visit blocks once in this walk, so we have to avoid the
+ * internal xfs_btree_lookup_get_block() optimisation where it will
+ * return the same block without checking if the right sibling points
+ * back to us and creates a cyclic reference in the btree.
+ */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp,
+ xfs_buf_daddr(bp)))
+ return -EFSCORRUPTED;
+ } else {
+ if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp,
+ xfs_buf_daddr(bp)))
+ return -EFSCORRUPTED;
+ }
return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
}
@@ -4445,20 +4540,21 @@ xfs_btree_lblock_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_fsblock_t fsb;
+ xfs_failaddr_t fa;
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
- if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
- return __this_address;
- if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
- return __this_address;
-
- return NULL;
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ block->bb_u.l.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ block->bb_u.l.bb_rightsib);
+ return fa;
}
/**
@@ -4499,7 +4595,9 @@ xfs_btree_sblock_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- xfs_agblock_t agno;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_failaddr_t fa;
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
@@ -4507,14 +4605,13 @@ xfs_btree_sblock_verify(
/* sibling pointer verification */
agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
- if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
- return __this_address;
- if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
- return __this_address;
-
- return NULL;
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
+ block->bb_u.s.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
+ block->bb_u.s.bb_rightsib);
+ return fa;
}
/*
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 22d9f411fde6..eef27858a013 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -68,19 +68,19 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
/*
* For logging record fields.
*/
-#define XFS_BB_MAGIC (1 << 0)
-#define XFS_BB_LEVEL (1 << 1)
-#define XFS_BB_NUMRECS (1 << 2)
-#define XFS_BB_LEFTSIB (1 << 3)
-#define XFS_BB_RIGHTSIB (1 << 4)
-#define XFS_BB_BLKNO (1 << 5)
-#define XFS_BB_LSN (1 << 6)
-#define XFS_BB_UUID (1 << 7)
-#define XFS_BB_OWNER (1 << 8)
+#define XFS_BB_MAGIC (1u << 0)
+#define XFS_BB_LEVEL (1u << 1)
+#define XFS_BB_NUMRECS (1u << 2)
+#define XFS_BB_LEFTSIB (1u << 3)
+#define XFS_BB_RIGHTSIB (1u << 4)
+#define XFS_BB_BLKNO (1u << 5)
+#define XFS_BB_LSN (1u << 6)
+#define XFS_BB_UUID (1u << 7)
+#define XFS_BB_OWNER (1u << 8)
#define XFS_BB_NUM_BITS 5
-#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
+#define XFS_BB_ALL_BITS ((1u << XFS_BB_NUM_BITS) - 1)
#define XFS_BB_NUM_BITS_CRC 9
-#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1)
+#define XFS_BB_ALL_BITS_CRC ((1u << XFS_BB_NUM_BITS_CRC) - 1)
/*
* Generic stats interface
@@ -345,7 +345,7 @@ xfs_btree_dup_cursor(
*/
void
xfs_btree_offsets(
- int64_t fields, /* bitmask of fields */
+ uint32_t fields, /* bitmask of fields */
const short *offsets,/* table of field offsets */
int nbits, /* number of bits to inspect */
int *first, /* output: first byte offset */
@@ -435,7 +435,7 @@ bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
/*
* Internal btree helpers also used by xfs_bmap.c.
*/
-void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t);
void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
/*
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 9dc1ecb9713d..e7201dc68f43 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -22,6 +22,7 @@
#include "xfs_trace.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_errortag.h"
/*
* xfs_da_btree.c
@@ -116,6 +117,17 @@ xfs_da_state_free(xfs_da_state_t *state)
kmem_cache_free(xfs_da_state_cache, state);
}
+void
+xfs_da_state_reset(
+ struct xfs_da_state *state,
+ struct xfs_da_args *args)
+{
+ xfs_da_state_kill_altpath(state);
+ memset(state, 0, sizeof(struct xfs_da_state));
+ state->args = args;
+ state->mp = state->args->dp->i_mount;
+}
+
static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork)
{
if (whichfork == XFS_DATA_FORK)
@@ -482,6 +494,9 @@ xfs_da3_split(
trace_xfs_da_split(state->args);
+ if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
+ return -EIO;
+
/*
* Walk back up the tree splitting/inserting/adjusting as necessary.
* If we need to insert and there isn't room, split the node, then
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 0faf7d9ac241..ffa3df5b2893 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -30,6 +30,7 @@ struct xfs_da_geometry {
unsigned int free_hdr_size; /* dir2 free header size */
unsigned int free_max_bests; /* # of bests entries in dir2 free */
xfs_dablk_t freeblk; /* blockno of free data v2 */
+ xfs_extnum_t max_extents; /* Max. extents in corresponding fork */
xfs_dir2_data_aoff_t data_first_offset;
size_t data_entry_offset;
@@ -76,27 +77,33 @@ typedef struct xfs_da_args {
xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
int rmtblkcnt2; /* remote attr value block count */
int rmtvaluelen2; /* remote attr value length in bytes */
- int op_flags; /* operation flags */
+ uint32_t op_flags; /* operation flags */
enum xfs_dacmp cmpresult; /* name compare result for lookups */
} xfs_da_args_t;
/*
* Operation flags:
*/
-#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */
-#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */
-#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */
-#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
-#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
-#define XFS_DA_OP_NOTIME 0x0020 /* don't update inode timestamps */
+#define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */
+#define XFS_DA_OP_REPLACE (1u << 1) /* this is an atomic replace op */
+#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */
+#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */
+#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */
+#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */
+#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */
+#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */
#define XFS_DA_OP_FLAGS \
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
- { XFS_DA_OP_RENAME, "RENAME" }, \
+ { XFS_DA_OP_REPLACE, "REPLACE" }, \
{ XFS_DA_OP_ADDNAME, "ADDNAME" }, \
{ XFS_DA_OP_OKNOENT, "OKNOENT" }, \
{ XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
- { XFS_DA_OP_NOTIME, "NOTIME" }
+ { XFS_DA_OP_NOTIME, "NOTIME" }, \
+ { XFS_DA_OP_REMOVE, "REMOVE" }, \
+ { XFS_DA_OP_RECOVERY, "RECOVERY" }, \
+ { XFS_DA_OP_LOGGED, "LOGGED" }
/*
* Storage for holding state during Btree searches and split/join ops.
@@ -197,7 +204,7 @@ int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp,
* Utility routines.
*/
-#define XFS_DABUF_MAP_HOLE_OK (1 << 0)
+#define XFS_DABUF_MAP_HOLE_OK (1u << 0)
int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
@@ -220,6 +227,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args);
void xfs_da_state_free(xfs_da_state_t *state);
+void xfs_da_state_reset(struct xfs_da_state *state, struct xfs_da_args *args);
void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from);
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 5a49caa5c9df..25e2841084e1 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -277,6 +277,7 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
* Directory address space divided into sections,
* spaces separated by 32GB.
*/
+#define XFS_DIR2_MAX_SPACES 3
#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
#define XFS_DIR2_DATA_SPACE 0
#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
@@ -688,10 +689,10 @@ struct xfs_attr3_leafblock {
#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
-#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
-#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
-#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
-#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
+#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT)
#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
/*
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 0805ade2d300..5a321b783398 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -22,6 +22,10 @@
#include "xfs_refcount.h"
#include "xfs_bmap.h"
#include "xfs_alloc.h"
+#include "xfs_buf.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
static struct kmem_cache *xfs_defer_pending_cache;
@@ -184,36 +188,61 @@ static const struct xfs_defer_op_type *defer_op_types[] = {
[XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type,
[XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type,
[XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
+ [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type,
};
-static void
+/*
+ * Ensure there's a log intent item associated with this deferred work item if
+ * the operation must be restarted on crash. Returns 1 if there's a log item;
+ * 0 if there isn't; or a negative errno.
+ */
+static int
xfs_defer_create_intent(
struct xfs_trans *tp,
struct xfs_defer_pending *dfp,
bool sort)
{
const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+ struct xfs_log_item *lip;
- if (!dfp->dfp_intent)
- dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
- dfp->dfp_count, sort);
+ if (dfp->dfp_intent)
+ return 1;
+
+ lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort);
+ if (!lip)
+ return 0;
+ if (IS_ERR(lip))
+ return PTR_ERR(lip);
+
+ dfp->dfp_intent = lip;
+ return 1;
}
/*
* For each pending item in the intake list, log its intent item and the
* associated extents, then add the entire intake list to the end of
* the pending list.
+ *
+ * Returns 1 if at least one log item was associated with the deferred work;
+ * 0 if there are no log items; or a negative errno.
*/
-STATIC void
+static int
xfs_defer_create_intents(
struct xfs_trans *tp)
{
struct xfs_defer_pending *dfp;
+ int ret = 0;
list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
+ int ret2;
+
trace_xfs_defer_create_intent(tp->t_mountp, dfp);
- xfs_defer_create_intent(tp, dfp, true);
+ ret2 = xfs_defer_create_intent(tp, dfp, true);
+ if (ret2 < 0)
+ return ret2;
+ ret |= ret2;
}
+ return ret;
}
/* Abort all the intents that were committed. */
@@ -449,6 +478,8 @@ xfs_defer_finish_one(
dfp->dfp_count--;
error = ops->finish_item(tp, dfp->dfp_done, li, &state);
if (error == -EAGAIN) {
+ int ret;
+
/*
* Caller wants a fresh transaction; put the work item
* back on the list and log a new log intent item to
@@ -459,7 +490,9 @@ xfs_defer_finish_one(
dfp->dfp_count++;
dfp->dfp_done = NULL;
dfp->dfp_intent = NULL;
- xfs_defer_create_intent(tp, dfp, false);
+ ret = xfs_defer_create_intent(tp, dfp, false);
+ if (ret < 0)
+ error = ret;
}
if (error)
@@ -487,7 +520,7 @@ int
xfs_defer_finish_noroll(
struct xfs_trans **tp)
{
- struct xfs_defer_pending *dfp;
+ struct xfs_defer_pending *dfp = NULL;
int error = 0;
LIST_HEAD(dop_pending);
@@ -506,17 +539,24 @@ xfs_defer_finish_noroll(
* of time that any one intent item can stick around in memory,
* pinning the log tail.
*/
- xfs_defer_create_intents(*tp);
- list_splice_init(&(*tp)->t_dfops, &dop_pending);
+ int has_intents = xfs_defer_create_intents(*tp);
- error = xfs_defer_trans_roll(tp);
- if (error)
- goto out_shutdown;
+ list_splice_init(&(*tp)->t_dfops, &dop_pending);
- /* Possibly relog intent items to keep the log moving. */
- error = xfs_defer_relog(tp, &dop_pending);
- if (error)
+ if (has_intents < 0) {
+ error = has_intents;
goto out_shutdown;
+ }
+ if (has_intents || dfp) {
+ error = xfs_defer_trans_roll(tp);
+ if (error)
+ goto out_shutdown;
+
+ /* Relog intent items to keep the log moving. */
+ error = xfs_defer_relog(tp, &dop_pending);
+ if (error)
+ goto out_shutdown;
+ }
dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
dfp_list);
@@ -665,13 +705,15 @@ xfs_defer_ops_capture(
if (list_empty(&tp->t_dfops))
return NULL;
+ error = xfs_defer_create_intents(tp);
+ if (error < 0)
+ return ERR_PTR(error);
+
/* Create an object to capture the defer ops. */
dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
INIT_LIST_HEAD(&dfc->dfc_list);
INIT_LIST_HEAD(&dfc->dfc_dfops);
- xfs_defer_create_intents(tp);
-
/* Move the dfops chain and transaction state to the capture struct. */
list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
@@ -748,6 +790,10 @@ xfs_defer_ops_capture_and_commit(
/* If we don't capture anything, commit transaction and exit. */
dfc = xfs_defer_ops_capture(tp);
+ if (IS_ERR(dfc)) {
+ xfs_trans_cancel(tp);
+ return PTR_ERR(dfc);
+ }
if (!dfc)
return xfs_trans_commit(tp);
@@ -774,17 +820,25 @@ xfs_defer_ops_continue(
struct xfs_trans *tp,
struct xfs_defer_resources *dres)
{
+ unsigned int i;
+
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
- /* Lock and join the captured inode to the new transaction. */
+ /* Lock the captured resources to the new transaction. */
if (dfc->dfc_held.dr_inos == 2)
xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
else if (dfc->dfc_held.dr_inos == 1)
xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL);
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_lock(dfc->dfc_held.dr_bp[i]);
+
+ /* Join the captured resources to the new transaction. */
xfs_defer_restore_resources(tp, &dfc->dfc_held);
memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources));
+ dres->dr_bufs = 0;
/* Move captured dfops chain and state to the transaction. */
list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
@@ -854,7 +908,9 @@ xfs_defer_init_item_caches(void)
error = xfs_extfree_intent_init_cache();
if (error)
goto err;
-
+ error = xfs_attr_intent_init_cache();
+ if (error)
+ goto err;
return 0;
err:
xfs_defer_destroy_item_caches();
@@ -865,6 +921,7 @@ err:
void
xfs_defer_destroy_item_caches(void)
{
+ xfs_attr_intent_destroy_cache();
xfs_extfree_intent_destroy_cache();
xfs_bmap_intent_destroy_cache();
xfs_refcount_intent_destroy_cache();
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 7bb8a31ad65b..114a3a4930a3 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -19,6 +19,7 @@ enum xfs_defer_ops_type {
XFS_DEFER_OPS_TYPE_RMAP,
XFS_DEFER_OPS_TYPE_FREE,
XFS_DEFER_OPS_TYPE_AGFL_FREE,
+ XFS_DEFER_OPS_TYPE_ATTR,
XFS_DEFER_OPS_TYPE_MAX,
};
@@ -63,6 +64,8 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+extern const struct xfs_defer_op_type xfs_attr_defer_type;
+
/*
* Deferred operation item relogging limits.
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 5f1e4799e8fa..3cd51fa3837b 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -150,6 +150,8 @@ xfs_da_mount(
dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
+ dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >>
+ mp->m_sb.sb_blocklog;
dageo->magicpct = (dageo->blksize * 37) / 100;
/* set up attribute geometry - single fsb only */
@@ -161,6 +163,12 @@ xfs_da_mount(
dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size;
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
+
+ if (xfs_has_large_extent_counts(mp))
+ dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+ else
+ dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
dageo->magicpct = (dageo->blksize * 37) / 100;
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index a23a52e643ad..5362908164b0 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -59,7 +59,10 @@
#define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36
#define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37
#define XFS_ERRTAG_AG_RESV_FAIL 38
-#define XFS_ERRTAG_MAX 39
+#define XFS_ERRTAG_LARP 39
+#define XFS_ERRTAG_DA_LEAF_SPLIT 40
+#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41
+#define XFS_ERRTAG_MAX 42
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -103,5 +106,8 @@
#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1
#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1
#define XFS_RANDOM_AG_RESV_FAIL 1
+#define XFS_RANDOM_LARP 1
+#define XFS_RANDOM_DA_LEAF_SPLIT 1
+#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index d665c04e69dd..afdfc8108c5f 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -372,12 +372,14 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */
#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */
+#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE| \
XFS_SB_FEAT_INCOMPAT_SPINODES| \
XFS_SB_FEAT_INCOMPAT_META_UUID| \
XFS_SB_FEAT_INCOMPAT_BIGTIME| \
- XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
+ XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
+ XFS_SB_FEAT_INCOMPAT_NREXT64)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -388,7 +390,9 @@ xfs_sb_has_incompat_feature(
return (sbp->sb_features_incompat & feature) != 0;
}
-#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS (1 << 0) /* Delayed Attributes */
+#define XFS_SB_FEAT_INCOMPAT_LOG_ALL \
+ (XFS_SB_FEAT_INCOMPAT_LOG_XATTRS)
#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
static inline bool
xfs_sb_has_incompat_log_feature(
@@ -413,6 +417,11 @@ xfs_sb_add_incompat_log_features(
sbp->sb_features_log_incompat |= features;
}
+static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp)
+{
+ return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat &
+ XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
+}
static inline bool
xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
@@ -525,26 +534,26 @@ typedef struct xfs_agf {
#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
-#define XFS_AGF_MAGICNUM 0x00000001
-#define XFS_AGF_VERSIONNUM 0x00000002
-#define XFS_AGF_SEQNO 0x00000004
-#define XFS_AGF_LENGTH 0x00000008
-#define XFS_AGF_ROOTS 0x00000010
-#define XFS_AGF_LEVELS 0x00000020
-#define XFS_AGF_FLFIRST 0x00000040
-#define XFS_AGF_FLLAST 0x00000080
-#define XFS_AGF_FLCOUNT 0x00000100
-#define XFS_AGF_FREEBLKS 0x00000200
-#define XFS_AGF_LONGEST 0x00000400
-#define XFS_AGF_BTREEBLKS 0x00000800
-#define XFS_AGF_UUID 0x00001000
-#define XFS_AGF_RMAP_BLOCKS 0x00002000
-#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000
-#define XFS_AGF_REFCOUNT_ROOT 0x00008000
-#define XFS_AGF_REFCOUNT_LEVEL 0x00010000
-#define XFS_AGF_SPARE64 0x00020000
+#define XFS_AGF_MAGICNUM (1u << 0)
+#define XFS_AGF_VERSIONNUM (1u << 1)
+#define XFS_AGF_SEQNO (1u << 2)
+#define XFS_AGF_LENGTH (1u << 3)
+#define XFS_AGF_ROOTS (1u << 4)
+#define XFS_AGF_LEVELS (1u << 5)
+#define XFS_AGF_FLFIRST (1u << 6)
+#define XFS_AGF_FLLAST (1u << 7)
+#define XFS_AGF_FLCOUNT (1u << 8)
+#define XFS_AGF_FREEBLKS (1u << 9)
+#define XFS_AGF_LONGEST (1u << 10)
+#define XFS_AGF_BTREEBLKS (1u << 11)
+#define XFS_AGF_UUID (1u << 12)
+#define XFS_AGF_RMAP_BLOCKS (1u << 13)
+#define XFS_AGF_REFCOUNT_BLOCKS (1u << 14)
+#define XFS_AGF_REFCOUNT_ROOT (1u << 15)
+#define XFS_AGF_REFCOUNT_LEVEL (1u << 16)
+#define XFS_AGF_SPARE64 (1u << 17)
#define XFS_AGF_NUM_BITS 18
-#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
+#define XFS_AGF_ALL_BITS ((1u << XFS_AGF_NUM_BITS) - 1)
#define XFS_AGF_FLAGS \
{ XFS_AGF_MAGICNUM, "MAGICNUM" }, \
@@ -619,22 +628,22 @@ typedef struct xfs_agi {
#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
-#define XFS_AGI_MAGICNUM (1 << 0)
-#define XFS_AGI_VERSIONNUM (1 << 1)
-#define XFS_AGI_SEQNO (1 << 2)
-#define XFS_AGI_LENGTH (1 << 3)
-#define XFS_AGI_COUNT (1 << 4)
-#define XFS_AGI_ROOT (1 << 5)
-#define XFS_AGI_LEVEL (1 << 6)
-#define XFS_AGI_FREECOUNT (1 << 7)
-#define XFS_AGI_NEWINO (1 << 8)
-#define XFS_AGI_DIRINO (1 << 9)
-#define XFS_AGI_UNLINKED (1 << 10)
+#define XFS_AGI_MAGICNUM (1u << 0)
+#define XFS_AGI_VERSIONNUM (1u << 1)
+#define XFS_AGI_SEQNO (1u << 2)
+#define XFS_AGI_LENGTH (1u << 3)
+#define XFS_AGI_COUNT (1u << 4)
+#define XFS_AGI_ROOT (1u << 5)
+#define XFS_AGI_LEVEL (1u << 6)
+#define XFS_AGI_FREECOUNT (1u << 7)
+#define XFS_AGI_NEWINO (1u << 8)
+#define XFS_AGI_DIRINO (1u << 9)
+#define XFS_AGI_UNLINKED (1u << 10)
#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
-#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
-#define XFS_AGI_FREE_ROOT (1 << 11)
-#define XFS_AGI_FREE_LEVEL (1 << 12)
-#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */
+#define XFS_AGI_ALL_BITS_R1 ((1u << XFS_AGI_NUM_BITS_R1) - 1)
+#define XFS_AGI_FREE_ROOT (1u << 11)
+#define XFS_AGI_FREE_LEVEL (1u << 12)
+#define XFS_AGI_IBLOCKS (1u << 13) /* both inobt/finobt block counters */
#define XFS_AGI_NUM_BITS_R2 14
/* disk block (xfs_daddr_t) in the AG */
@@ -791,16 +800,41 @@ struct xfs_dinode {
__be32 di_nlink; /* number of links to file */
__be16 di_projid_lo; /* lower part of owner's project id */
__be16 di_projid_hi; /* higher part owner's project id */
- __u8 di_pad[6]; /* unused, zeroed space */
- __be16 di_flushiter; /* incremented on flush */
+ union {
+ /* Number of data fork extents if NREXT64 is set */
+ __be64 di_big_nextents;
+
+ /* Padding for V3 inodes without NREXT64 set. */
+ __be64 di_v3_pad;
+
+ /* Padding and inode flush counter for V2 inodes. */
+ struct {
+ __u8 di_v2_pad[6];
+ __be16 di_flushiter;
+ };
+ };
xfs_timestamp_t di_atime; /* time last accessed */
xfs_timestamp_t di_mtime; /* time last modified */
xfs_timestamp_t di_ctime; /* time created/inode modified */
__be64 di_size; /* number of bytes in file */
__be64 di_nblocks; /* # of direct & btree blocks used */
__be32 di_extsize; /* basic/minimum extent size for file */
- __be32 di_nextents; /* number of extents in data fork */
- __be16 di_anextents; /* number of extents in attribute fork*/
+ union {
+ /*
+ * For V2 inodes and V3 inodes without NREXT64 set, this
+ * is the number of data and attr fork extents.
+ */
+ struct {
+ __be32 di_nextents;
+ __be16 di_anextents;
+ } __packed;
+
+ /* Number of attr fork extents if NREXT64 is set. */
+ struct {
+ __be32 di_big_anextents;
+ __be16 di_nrext64_pad;
+ } __packed;
+ } __packed;
__u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
__s8 di_aformat; /* format of attr fork's data */
__be32 di_dmevmask; /* DMIG event mask */
@@ -870,6 +904,56 @@ enum xfs_dinode_fmt {
{ XFS_DINODE_FMT_UUID, "uuid" }
/*
+ * Max values for extnum and aextnum.
+ *
+ * The original on-disk extent counts were held in signed fields, resulting in
+ * maximum extent counts of 2^31 and 2^15 for the data and attr forks
+ * respectively. Similarly the maximum extent length is limited to 2^21 blocks
+ * by the 21-bit wide blockcount field of a BMBT extent record.
+ *
+ * The newly introduced data fork extent counter can hold a 64-bit value,
+ * however the maximum number of extents in a file is also limited to 2^54
+ * extents by the 54-bit wide startoff field of a BMBT extent record.
+ *
+ * It is further limited by the maximum supported file size of 2^63
+ * *bytes*. This leads to a maximum extent count for maximally sized filesystem
+ * blocks (64kB) of:
+ *
+ * 2^63 bytes / 2^16 bytes per block = 2^47 blocks
+ *
+ * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence
+ * 2^48 was chosen as the maximum data fork extent count.
+ *
+ * The maximum file size that can be represented by the data fork extent counter
+ * in the worst case occurs when all extents are 1 block in length and each
+ * block is 1KB in size.
+ *
+ * With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing maximum extent count and
+ * with 1KB sized blocks, a file can reach upto,
+ * 1KB * (2^31) = 2TB
+ *
+ * This is much larger than the theoretical maximum size of a directory
+ * i.e. XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB.
+ *
+ * Hence, a directory inode can never overflow its data fork extent counter.
+ */
+#define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1))
+#define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1))
+
+/*
+ * When we upgrade an inode to the large extent counts, the maximum value by
+ * which the extent count can increase is bound by the change in size of the
+ * on-disk field. No upgrade operation should ever be adding more than a few
+ * tens of extents, so if we get a really large value it is a sign of a code bug
+ * or corruption.
+ */
+#define XFS_MAX_EXTCNT_UPGRADE_NR \
+ min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL, \
+ XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL)
+
+/*
* Inode minimum and maximum sizes.
*/
#define XFS_DINODE_MIN_LOG 8
@@ -918,10 +1002,6 @@ enum xfs_dinode_fmt {
((w) == XFS_DATA_FORK ? \
(dip)->di_format : \
(dip)->di_aformat)
-#define XFS_DFORK_NEXTENTS(dip,w) \
- ((w) == XFS_DATA_FORK ? \
- be32_to_cpu((dip)->di_nextents) : \
- be16_to_cpu((dip)->di_anextents))
/*
* For block and character special files the 32bit dev_t is stored at the
@@ -988,15 +1068,17 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
+#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
+#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
#define XFS_DIFLAG2_ANY \
(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
- XFS_DIFLAG2_BIGTIME)
+ XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
{
@@ -1004,6 +1086,13 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME));
}
+static inline bool xfs_dinode_has_large_extent_counts(
+ const struct xfs_dinode *dip)
+{
+ return dip->di_version >= 3 &&
+ (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64));
+}
+
/*
* Inode number format:
* low inopblog bits - offset in block
@@ -1085,10 +1174,10 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
#define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */
-#define XFS_DQTYPE_USER 0x01 /* user dquot record */
-#define XFS_DQTYPE_PROJ 0x02 /* project dquot record */
-#define XFS_DQTYPE_GROUP 0x04 /* group dquot record */
-#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */
+#define XFS_DQTYPE_USER (1u << 0) /* user dquot record */
+#define XFS_DQTYPE_PROJ (1u << 1) /* project dquot record */
+#define XFS_DQTYPE_GROUP (1u << 2) /* group dquot record */
+#define XFS_DQTYPE_BIGTIME (1u << 7) /* large expiry timestamps */
/* bitmask to determine if this is a user/group/project dquot */
#define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \
@@ -1596,6 +1685,8 @@ typedef struct xfs_bmdr_block {
#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1)
#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1)
+#define XFS_MAX_BMBT_EXTLEN ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK))
+
/*
* bmbt records have a file offset (block) field that is 54 bits wide, so this
* is the largest xfs_fileoff_t that we ever expect to see.
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 505533c43a92..1cfd5bc6520a 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */
#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */
#define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */
+#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */
/*
* Minimum and maximum sizes need for growth checks.
@@ -377,7 +378,7 @@ struct xfs_bulkstat {
uint32_t bs_extsize_blks; /* extent size hint, blocks */
uint32_t bs_nlink; /* number of links */
- uint32_t bs_extents; /* number of extents */
+ uint32_t bs_extents; /* 32-bit data fork extent counter */
uint32_t bs_aextents; /* attribute number of extents */
uint16_t bs_version; /* structure version */
uint16_t bs_forkoff; /* inode fork offset in bytes */
@@ -386,8 +387,9 @@ struct xfs_bulkstat {
uint16_t bs_checked; /* checked inode metadata */
uint16_t bs_mode; /* type and mode */
uint16_t bs_pad2; /* zeroed */
+ uint64_t bs_extents64; /* 64-bit data fork extent counter */
- uint64_t bs_pad[7]; /* zeroed */
+ uint64_t bs_pad[6]; /* zeroed */
};
#define XFS_BULKSTAT_VERSION_V1 (1)
@@ -459,17 +461,28 @@ struct xfs_bulk_ireq {
* Only return results from the specified @agno. If @ino is zero, start
* with the first inode of @agno.
*/
-#define XFS_BULK_IREQ_AGNO (1 << 0)
+#define XFS_BULK_IREQ_AGNO (1U << 0)
/*
* Return bulkstat information for a single inode, where @ino value is a
* special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_*
* values below. Not compatible with XFS_BULK_IREQ_AGNO.
*/
-#define XFS_BULK_IREQ_SPECIAL (1 << 1)
+#define XFS_BULK_IREQ_SPECIAL (1U << 1)
-#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
- XFS_BULK_IREQ_SPECIAL)
+/*
+ * Return data fork extent count via xfs_bulkstat->bs_extents64 field and assign
+ * 0 to xfs_bulkstat->bs_extents when the flag is set. Otherwise, use
+ * xfs_bulkstat->bs_extents for returning data fork extent count and set
+ * xfs_bulkstat->bs_extents64 to 0. In the second case, return -EOVERFLOW and
+ * assign 0 to xfs_bulkstat->bs_extents if data fork extent count is larger than
+ * XFS_MAX_EXTCNT_DATA_FORK_OLD.
+ */
+#define XFS_BULK_IREQ_NREXT64 (1U << 2)
+
+#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
+ XFS_BULK_IREQ_SPECIAL | \
+ XFS_BULK_IREQ_NREXT64)
/* Operate on the root directory inode. */
#define XFS_BULK_IREQ_SPECIAL_ROOT (1)
@@ -699,34 +712,34 @@ struct xfs_scrub_metadata {
#define XFS_SCRUB_TYPE_NR 25
/* i: Repair this metadata. */
-#define XFS_SCRUB_IFLAG_REPAIR (1 << 0)
+#define XFS_SCRUB_IFLAG_REPAIR (1u << 0)
/* o: Metadata object needs repair. */
-#define XFS_SCRUB_OFLAG_CORRUPT (1 << 1)
+#define XFS_SCRUB_OFLAG_CORRUPT (1u << 1)
/*
* o: Metadata object could be optimized. It's not corrupt, but
* we could improve on it somehow.
*/
-#define XFS_SCRUB_OFLAG_PREEN (1 << 2)
+#define XFS_SCRUB_OFLAG_PREEN (1u << 2)
/* o: Cross-referencing failed. */
-#define XFS_SCRUB_OFLAG_XFAIL (1 << 3)
+#define XFS_SCRUB_OFLAG_XFAIL (1u << 3)
/* o: Metadata object disagrees with cross-referenced metadata. */
-#define XFS_SCRUB_OFLAG_XCORRUPT (1 << 4)
+#define XFS_SCRUB_OFLAG_XCORRUPT (1u << 4)
/* o: Scan was not complete. */
-#define XFS_SCRUB_OFLAG_INCOMPLETE (1 << 5)
+#define XFS_SCRUB_OFLAG_INCOMPLETE (1u << 5)
/* o: Metadata object looked funny but isn't corrupt. */
-#define XFS_SCRUB_OFLAG_WARNING (1 << 6)
+#define XFS_SCRUB_OFLAG_WARNING (1u << 6)
/*
* o: IFLAG_REPAIR was set but metadata object did not need fixing or
* optimization and has therefore not been altered.
*/
-#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7)
+#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7)
#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR)
#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index b418fe0c0679..bf2f4bc89193 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2414,9 +2414,9 @@ out_drop:
*/
void
xfs_ialloc_log_agi(
- xfs_trans_t *tp, /* transaction pointer */
- struct xfs_buf *bp, /* allocation group header buffer */
- int fields) /* bitmask of fields to log */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint32_t fields)
{
int first; /* first byte number */
int last; /* last byte number */
@@ -2772,6 +2772,8 @@ xfs_ialloc_setup_geometry(
igeo->new_diflags2 = 0;
if (xfs_has_bigtime(mp))
igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
+ if (xfs_has_large_extent_counts(mp))
+ igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;
/* Compute inode btree geometry. */
igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 8b5c2b709022..a7705b6a1fd3 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -60,7 +60,7 @@ void
xfs_ialloc_log_agi(
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *bp, /* allocation group header buffer */
- int fields); /* bitmask of fields to log */
+ uint32_t fields); /* bitmask of fields to log */
/*
* Read in the allocation group header (inode allocation section)
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index cae9708c8587..3b1b63f9d886 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -279,6 +279,25 @@ xfs_inode_to_disk_ts(
return ts;
}
+static inline void
+xfs_inode_to_disk_iext_counters(
+ struct xfs_inode *ip,
+ struct xfs_dinode *to)
+{
+ if (xfs_inode_has_large_extent_counts(ip)) {
+ to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df));
+ to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(ip->i_afp));
+ /*
+ * We might be upgrading the inode to use larger extent counters
+ * than was previously used. Hence zero the unused field.
+ */
+ to->di_nrext64_pad = cpu_to_be16(0);
+ } else {
+ to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
+ to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp));
+ }
+}
+
void
xfs_inode_to_disk(
struct xfs_inode *ip,
@@ -296,7 +315,6 @@ xfs_inode_to_disk(
to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff);
to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16);
- memset(to->di_pad, 0, sizeof(to->di_pad));
to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime);
@@ -307,8 +325,6 @@ xfs_inode_to_disk(
to->di_size = cpu_to_be64(ip->i_disk_size);
to->di_nblocks = cpu_to_be64(ip->i_nblocks);
to->di_extsize = cpu_to_be32(ip->i_extsize);
- to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
- to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp));
to->di_forkoff = ip->i_forkoff;
to->di_aformat = xfs_ifork_format(ip->i_afp);
to->di_flags = cpu_to_be16(ip->i_diflags);
@@ -323,11 +339,14 @@ xfs_inode_to_disk(
to->di_lsn = cpu_to_be64(lsn);
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
- to->di_flushiter = 0;
+ to->di_v3_pad = 0;
} else {
to->di_version = 2;
to->di_flushiter = cpu_to_be16(ip->i_flushiter);
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
+
+ xfs_inode_to_disk_iext_counters(ip, to);
}
static xfs_failaddr_t
@@ -336,20 +355,40 @@ xfs_dinode_verify_fork(
struct xfs_mount *mp,
int whichfork)
{
- uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork);
+ xfs_extnum_t di_nextents;
+ xfs_extnum_t max_extents;
+ mode_t mode = be16_to_cpu(dip->di_mode);
+ uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork);
+ uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork);
+
+ di_nextents = xfs_dfork_nextents(dip, whichfork);
+
+ /*
+ * For fork types that can contain local data, check that the fork
+ * format matches the size of local data contained within the fork.
+ *
+ * For all types, check that when the size says the should be in extent
+ * or btree format, the inode isn't claiming it is in local format.
+ */
+ if (whichfork == XFS_DATA_FORK) {
+ if (S_ISDIR(mode) || S_ISLNK(mode)) {
+ if (be64_to_cpu(dip->di_size) <= fork_size &&
+ fork_format != XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
- switch (XFS_DFORK_FORMAT(dip, whichfork)) {
+ if (be64_to_cpu(dip->di_size) > fork_size &&
+ fork_format == XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
+
+ switch (fork_format) {
case XFS_DINODE_FMT_LOCAL:
/*
- * no local regular files yet
+ * No local regular files yet.
*/
- if (whichfork == XFS_DATA_FORK) {
- if (S_ISREG(be16_to_cpu(dip->di_mode)))
- return __this_address;
- if (be64_to_cpu(dip->di_size) >
- XFS_DFORK_SIZE(dip, mp, whichfork))
- return __this_address;
- }
+ if (S_ISREG(mode) && whichfork == XFS_DATA_FORK)
+ return __this_address;
if (di_nextents)
return __this_address;
break;
@@ -358,12 +397,11 @@ xfs_dinode_verify_fork(
return __this_address;
break;
case XFS_DINODE_FMT_BTREE:
- if (whichfork == XFS_ATTR_FORK) {
- if (di_nextents > MAXAEXTNUM)
- return __this_address;
- } else if (di_nextents > MAXEXTNUM) {
+ max_extents = xfs_iext_max_nextents(
+ xfs_dinode_has_large_extent_counts(dip),
+ whichfork);
+ if (di_nextents > max_extents)
return __this_address;
- }
break;
default:
return __this_address;
@@ -396,6 +434,24 @@ xfs_dinode_verify_forkoff(
return NULL;
}
+static xfs_failaddr_t
+xfs_dinode_verify_nrext64(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip)) {
+ if (!xfs_has_large_extent_counts(mp))
+ return __this_address;
+ if (dip->di_nrext64_pad != 0)
+ return __this_address;
+ } else if (dip->di_version >= 3) {
+ if (dip->di_v3_pad != 0)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
xfs_failaddr_t
xfs_dinode_verify(
struct xfs_mount *mp,
@@ -407,6 +463,9 @@ xfs_dinode_verify(
uint16_t flags;
uint64_t flags2;
uint64_t di_size;
+ xfs_extnum_t nextents;
+ xfs_extnum_t naextents;
+ xfs_filblks_t nblocks;
if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
return __this_address;
@@ -437,10 +496,19 @@ xfs_dinode_verify(
if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0)
return __this_address;
+ fa = xfs_dinode_verify_nrext64(mp, dip);
+ if (fa)
+ return fa;
+
+ nextents = xfs_dfork_data_extents(dip);
+ naextents = xfs_dfork_attr_extents(dip);
+ nblocks = be64_to_cpu(dip->di_nblocks);
+
/* Fork checks carried over from xfs_iformat_fork */
- if (mode &&
- be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) >
- be64_to_cpu(dip->di_nblocks))
+ if (mode && nextents + naextents > nblocks)
+ return __this_address;
+
+ if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
return __this_address;
if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize)
@@ -497,7 +565,7 @@ xfs_dinode_verify(
default:
return __this_address;
}
- if (dip->di_anextents)
+ if (naextents)
return __this_address;
}
@@ -639,7 +707,7 @@ xfs_inode_validate_extsize(
if (extsize_bytes % blocksize_bytes)
return __this_address;
- if (extsize > MAXEXTLEN)
+ if (extsize > XFS_MAX_BMBT_EXTLEN)
return __this_address;
if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2)
@@ -696,7 +764,7 @@ xfs_inode_validate_cowextsize(
if (cowextsize_bytes % mp->m_sb.sb_blocksize)
return __this_address;
- if (cowextsize > MAXEXTLEN)
+ if (cowextsize > XFS_MAX_BMBT_EXTLEN)
return __this_address;
if (cowextsize > mp->m_sb.sb_agblocks / 2)
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 9149f4f796fc..1a4cdf550f6d 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -36,7 +36,7 @@ xfs_init_local_fork(
int64_t size)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- int mem_size = size, real_size = 0;
+ int mem_size = size;
bool zero_terminate;
/*
@@ -50,8 +50,7 @@ xfs_init_local_fork(
mem_size++;
if (size) {
- real_size = roundup(mem_size, 4);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS);
+ ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS);
memcpy(ifp->if_u1.if_data, data, size);
if (zero_terminate)
ifp->if_u1.if_data[size] = '\0';
@@ -105,7 +104,7 @@ xfs_iformat_extents(
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
int state = xfs_bmap_fork_to_state(whichfork);
- int nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+ xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork);
int size = nex * sizeof(xfs_bmbt_rec_t);
struct xfs_iext_cursor icur;
struct xfs_bmbt_rec *dp;
@@ -117,8 +116,8 @@ xfs_iformat_extents(
* we just bail out rather than crash in kmem_alloc() or memcpy() below.
*/
if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
- xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
- (unsigned long long) ip->i_ino, nex);
+ xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).",
+ ip->i_ino, nex);
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_extents(1)", dip, sizeof(*dip),
__this_address);
@@ -230,7 +229,7 @@ xfs_iformat_data_fork(
* depend on it.
*/
ip->i_df.if_format = dip->di_format;
- ip->i_df.if_nextents = be32_to_cpu(dip->di_nextents);
+ ip->i_df.if_nextents = xfs_dfork_data_extents(dip);
switch (inode->i_mode & S_IFMT) {
case S_IFIFO:
@@ -295,14 +294,14 @@ xfs_iformat_attr_fork(
struct xfs_inode *ip,
struct xfs_dinode *dip)
{
+ xfs_extnum_t naextents = xfs_dfork_attr_extents(dip);
int error = 0;
/*
* Initialize the extent count early, as the per-format routines may
* depend on it.
*/
- ip->i_afp = xfs_ifork_alloc(dip->di_aformat,
- be16_to_cpu(dip->di_anextents));
+ ip->i_afp = xfs_ifork_alloc(dip->di_aformat, naextents);
switch (ip->i_afp->if_format) {
case XFS_DINODE_FMT_LOCAL:
@@ -497,12 +496,7 @@ xfs_idata_realloc(
return;
}
- /*
- * For inline data, the underlying buffer must be a multiple of 4 bytes
- * in size so that it can be logged and stay on word boundaries.
- * We enforce that here.
- */
- ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4),
+ ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size,
GFP_NOFS | __GFP_NOFAIL);
ifp->if_bytes = new_size;
}
@@ -744,7 +738,8 @@ xfs_iext_count_may_overflow(
if (whichfork == XFS_COW_FORK)
return 0;
- max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM;
+ max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip),
+ whichfork);
if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
max_exts = 10;
@@ -755,3 +750,27 @@ xfs_iext_count_may_overflow(
return 0;
}
+
+/*
+ * Upgrade this inode's extent counter fields to be able to handle a potential
+ * increase in the extent count by nr_to_add. Normally this is the same
+ * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG.
+ */
+int
+xfs_iext_count_upgrade(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ uint nr_to_add)
+{
+ ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR);
+
+ if (!xfs_has_large_extent_counts(ip->i_mount) ||
+ xfs_inode_has_large_extent_counts(ip) ||
+ XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+ return -EFBIG;
+
+ ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 3d64a3acb0ed..4f68c1f20beb 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -21,9 +21,9 @@ struct xfs_ifork {
void *if_root; /* extent tree root */
char *if_data; /* inline file data */
} if_u1;
+ xfs_extnum_t if_nextents; /* # of extents in this fork */
short if_broot_bytes; /* bytes allocated for root */
int8_t if_format; /* format of this fork */
- xfs_extnum_t if_nextents; /* # of extents in this fork */
};
/*
@@ -40,19 +40,6 @@ struct xfs_ifork {
#define XFS_IEXT_PUNCH_HOLE_CNT (1)
/*
- * Directory entry addition can cause the following,
- * 1. Data block can be added/removed.
- * A new extent can cause extent count to increase by 1.
- * 2. Free disk block can be added/removed.
- * Same behaviour as described above for Data block.
- * 3. Dabtree blocks.
- * XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new
- * extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH.
- */
-#define XFS_IEXT_DIR_MANIP_CNT(mp) \
- ((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount)
-
-/*
* Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to
* be added. One extra extent for dabtree in case a local attr is
* large enough to cause a double split. It can also cause extent
@@ -133,6 +120,65 @@ static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp)
return ifp->if_format;
}
+static inline xfs_extnum_t xfs_iext_max_nextents(bool has_large_extent_counts,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ case XFS_COW_FORK:
+ if (has_large_extent_counts)
+ return XFS_MAX_EXTCNT_DATA_FORK_LARGE;
+ return XFS_MAX_EXTCNT_DATA_FORK_SMALL;
+
+ case XFS_ATTR_FORK:
+ if (has_large_extent_counts)
+ return XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+ return XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
+ default:
+ ASSERT(0);
+ return 0;
+ }
+}
+
+static inline xfs_extnum_t
+xfs_dfork_data_extents(
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ return be64_to_cpu(dip->di_big_nextents);
+
+ return be32_to_cpu(dip->di_nextents);
+}
+
+static inline xfs_extnum_t
+xfs_dfork_attr_extents(
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ return be32_to_cpu(dip->di_big_anextents);
+
+ return be16_to_cpu(dip->di_anextents);
+}
+
+static inline xfs_extnum_t
+xfs_dfork_nextents(
+ struct xfs_dinode *dip,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ return xfs_dfork_data_extents(dip);
+ case XFS_ATTR_FORK:
+ return xfs_dfork_attr_extents(dip);
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ return 0;
+}
+
struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format,
xfs_extnum_t nextents);
struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
@@ -229,6 +275,8 @@ int xfs_ifork_verify_local_data(struct xfs_inode *ip);
int xfs_ifork_verify_local_attr(struct xfs_inode *ip);
int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
int nr_to_add);
+int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
+ uint nr_to_add);
/* returns true if the fork has extents but they are not read in yet. */
static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index b322db523d65..b351b9dc6561 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -69,7 +69,6 @@ static inline uint xlog_get_cycle(char *ptr)
/* Log Clients */
#define XFS_TRANSACTION 0x69
-#define XFS_VOLUME 0x2
#define XFS_LOG 0xaa
#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
@@ -114,7 +113,12 @@ struct xfs_unmount_log_format {
#define XLOG_REG_TYPE_CUD_FORMAT 24
#define XLOG_REG_TYPE_BUI_FORMAT 25
#define XLOG_REG_TYPE_BUD_FORMAT 26
-#define XLOG_REG_TYPE_MAX 26
+#define XLOG_REG_TYPE_ATTRI_FORMAT 27
+#define XLOG_REG_TYPE_ATTRD_FORMAT 28
+#define XLOG_REG_TYPE_ATTR_NAME 29
+#define XLOG_REG_TYPE_ATTR_VALUE 30
+#define XLOG_REG_TYPE_MAX 30
+
/*
* Flags to log operation header
@@ -237,6 +241,8 @@ typedef struct xfs_trans_header {
#define XFS_LI_CUD 0x1243
#define XFS_LI_BUI 0x1244 /* bmbt update intent */
#define XFS_LI_BUD 0x1245
+#define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/
+#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -252,7 +258,9 @@ typedef struct xfs_trans_header {
{ XFS_LI_CUI, "XFS_LI_CUI" }, \
{ XFS_LI_CUD, "XFS_LI_CUD" }, \
{ XFS_LI_BUI, "XFS_LI_BUI" }, \
- { XFS_LI_BUD, "XFS_LI_BUD" }
+ { XFS_LI_BUD, "XFS_LI_BUD" }, \
+ { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \
+ { XFS_LI_ATTRD, "XFS_LI_ATTRD" }
/*
* Inode Log Item Format definitions.
@@ -388,16 +396,41 @@ struct xfs_log_dinode {
uint32_t di_nlink; /* number of links to file */
uint16_t di_projid_lo; /* lower part of owner's project id */
uint16_t di_projid_hi; /* higher part of owner's project id */
- uint8_t di_pad[6]; /* unused, zeroed space */
- uint16_t di_flushiter; /* incremented on flush */
+ union {
+ /* Number of data fork extents if NREXT64 is set */
+ uint64_t di_big_nextents;
+
+ /* Padding for V3 inodes without NREXT64 set. */
+ uint64_t di_v3_pad;
+
+ /* Padding and inode flush counter for V2 inodes. */
+ struct {
+ uint8_t di_v2_pad[6]; /* V2 inode zeroed space */
+ uint16_t di_flushiter; /* V2 inode incremented on flush */
+ };
+ };
xfs_log_timestamp_t di_atime; /* time last accessed */
xfs_log_timestamp_t di_mtime; /* time last modified */
xfs_log_timestamp_t di_ctime; /* time created/inode modified */
xfs_fsize_t di_size; /* number of bytes in file */
xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
- xfs_extnum_t di_nextents; /* number of extents in data fork */
- xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
+ union {
+ /*
+ * For V2 inodes and V3 inodes without NREXT64 set, this
+ * is the number of data and attr fork extents.
+ */
+ struct {
+ uint32_t di_nextents;
+ uint16_t di_anextents;
+ } __packed;
+
+ /* Number of attr fork extents if NREXT64 is set. */
+ struct {
+ uint32_t di_big_anextents;
+ uint16_t di_nrext64_pad;
+ } __packed;
+ } __packed;
uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
int8_t di_aformat; /* format of attr fork's data */
uint32_t di_dmevmask; /* DMIG event mask */
@@ -869,4 +902,44 @@ struct xfs_icreate_log {
__be32 icl_gen; /* inode generation number to use */
};
+/*
+ * Flags for deferred attribute operations.
+ * Upper bits are flags, lower byte is type code
+ */
+#define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */
+#define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */
+#define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */
+#define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */
+
+/*
+ * alfi_attr_filter captures the state of xfs_da_args.attr_filter, so it should
+ * never have any other bits set.
+ */
+#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \
+ XFS_ATTR_SECURE | \
+ XFS_ATTR_INCOMPLETE)
+
+/*
+ * This is the structure used to lay out an attr log item in the
+ * log.
+ */
+struct xfs_attri_log_format {
+ uint16_t alfi_type; /* attri log item type */
+ uint16_t alfi_size; /* size of this item */
+ uint32_t __pad; /* pad to 64 bit aligned */
+ uint64_t alfi_id; /* attri identifier */
+ uint64_t alfi_ino; /* the inode for this attr operation */
+ uint32_t alfi_op_flags; /* marks the op as a set or remove */
+ uint32_t alfi_name_len; /* attr name length */
+ uint32_t alfi_value_len; /* attr value length */
+ uint32_t alfi_attr_filter;/* attr filter flags */
+};
+
+struct xfs_attrd_log_format {
+ uint16_t alfd_type; /* attrd log item type */
+ uint16_t alfd_size; /* size of this item */
+ uint32_t __pad; /* pad to 64 bit aligned */
+ uint64_t alfd_alf_id; /* id of corresponding attri */
+};
+
#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index ff69a0000817..2420865f3007 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -72,6 +72,8 @@ extern const struct xlog_recover_item_ops xlog_rui_item_ops;
extern const struct xlog_recover_item_ops xlog_rud_item_ops;
extern const struct xlog_recover_item_ops xlog_cui_item_ops;
extern const struct xlog_recover_item_ops xlog_cud_item_ops;
+extern const struct xlog_recover_item_ops xlog_attri_item_ops;
+extern const struct xlog_recover_item_ops xlog_attrd_item_ops;
/*
* Macros, structures, prototypes for internal log manager use.
@@ -108,12 +110,6 @@ struct xlog_recover {
#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr)
-/*
- * This is the number of entries in the l_buf_cancel_table used during
- * recovery.
- */
-#define XLOG_BC_TABLE_SIZE 64
-
#define XLOG_RECOVER_CRCPASS 0
#define XLOG_RECOVER_PASS1 1
#define XLOG_RECOVER_PASS2 2
@@ -126,5 +122,13 @@ int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino,
struct xfs_inode **ipp);
void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type,
uint64_t intent_id);
+int xlog_alloc_buf_cancel_table(struct xlog *log);
+void xlog_free_buf_cancel_table(struct xlog *log);
+
+#ifdef DEBUG
+void xlog_check_buf_cancel_table(struct xlog *log);
+#else
+#define xlog_check_buf_cancel_table(log) do { } while (0)
+#endif
#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index 67798ff5e14e..9975b93a7412 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -14,6 +14,7 @@
#include "xfs_trans_space.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
+#include "xfs_trace.h"
/*
* Calculate the maximum length in bytes that would be required for a local
@@ -37,6 +38,65 @@ xfs_log_calc_max_attrsetm_res(
}
/*
+ * Compute an alternate set of log reservation sizes for use exclusively with
+ * minimum log size calculations.
+ */
+static void
+xfs_log_calc_trans_resv_for_minlogblocks(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resv)
+{
+ unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
+
+ /*
+ * In the early days of rmap+reflink, we always set the rmap maxlevels
+ * to 9 even if the AG was small enough that it would never grow to
+ * that height. Transaction reservation sizes influence the minimum
+ * log size calculation, which influences the size of the log that mkfs
+ * creates. Use the old value here to ensure that newly formatted
+ * small filesystems will mount on older kernels.
+ */
+ if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
+ mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+
+ xfs_trans_resv_calc(mp, resv);
+
+ if (xfs_has_reflink(mp)) {
+ /*
+ * In the early days of reflink, typical log operation counts
+ * were greatly overestimated.
+ */
+ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ resv->tr_itruncate.tr_logcount =
+ XFS_ITRUNCATE_LOG_COUNT_REFLINK;
+ resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ } else if (xfs_has_rmapbt(mp)) {
+ /*
+ * In the early days of non-reflink rmap, the impact of rmapbt
+ * updates on log counts were not taken into account at all.
+ */
+ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ }
+
+ /*
+ * In the early days of reflink, we did not use deferred refcount
+ * update log items, so log reservations must be recomputed using the
+ * old calculations.
+ */
+ resv->tr_write.tr_logres =
+ xfs_calc_write_reservation_minlogsize(mp);
+ resv->tr_itruncate.tr_logres =
+ xfs_calc_itruncate_reservation_minlogsize(mp);
+ resv->tr_qm_dqalloc.tr_logres =
+ xfs_calc_qm_dqalloc_reservation_minlogsize(mp);
+
+ /* Put everything back the way it was. This goes at the end. */
+ mp->m_rmap_maxlevels = rmap_maxlevels;
+}
+
+/*
* Iterate over the log space reservation table to figure out and return
* the maximum one in terms of the pre-calculated values which were done
* at mount time.
@@ -46,19 +106,25 @@ xfs_log_get_max_trans_res(
struct xfs_mount *mp,
struct xfs_trans_res *max_resp)
{
+ struct xfs_trans_resv resv = {};
struct xfs_trans_res *resp;
struct xfs_trans_res *end_resp;
+ unsigned int i;
int log_space = 0;
int attr_space;
attr_space = xfs_log_calc_max_attrsetm_res(mp);
- resp = (struct xfs_trans_res *)M_RES(mp);
- end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
- for (; resp < end_resp; resp++) {
+ xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv);
+
+ resp = (struct xfs_trans_res *)&resv;
+ end_resp = (struct xfs_trans_res *)(&resv + 1);
+ for (i = 0; resp < end_resp; i++, resp++) {
int tmp = resp->tr_logcount > 1 ?
resp->tr_logres * resp->tr_logcount :
resp->tr_logres;
+
+ trace_xfs_trans_resv_calc_minlogsize(mp, i, resp);
if (log_space < tmp) {
log_space = tmp;
*max_resp = *resp; /* struct copy */
@@ -66,9 +132,10 @@ xfs_log_get_max_trans_res(
}
if (attr_space > log_space) {
- *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */
+ *max_resp = resv.tr_attrsetm; /* struct copy */
max_resp->tr_logres = attr_space;
}
+ trace_xfs_log_get_max_trans_res(mp, max_resp);
}
/*
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index a02c5062f9b2..cb035da3f990 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -16,7 +16,6 @@
* and quota-limits. This is a waste in the common case, but hey ...
*/
typedef uint64_t xfs_qcnt_t;
-typedef uint16_t xfs_qwarncnt_t;
typedef uint8_t xfs_dqtype_t;
@@ -29,8 +28,8 @@ typedef uint8_t xfs_dqtype_t;
/*
* flags for q_flags field in the dquot.
*/
-#define XFS_DQFLAG_DIRTY (1 << 0) /* dquot is dirty */
-#define XFS_DQFLAG_FREEING (1 << 1) /* dquot is being torn down */
+#define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */
+#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */
#define XFS_DQFLAG_STRINGS \
{ XFS_DQFLAG_DIRTY, "DIRTY" }, \
@@ -73,29 +72,45 @@ typedef uint8_t xfs_dqtype_t;
* to a single function. None of these XFS_QMOPT_* flags are meant to have
* persistent values (ie. their values can and will change between versions)
*/
-#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
-#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
-#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
-#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
-#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
+#define XFS_QMOPT_UQUOTA (1u << 0) /* user dquot requested */
+#define XFS_QMOPT_GQUOTA (1u << 1) /* group dquot requested */
+#define XFS_QMOPT_PQUOTA (1u << 2) /* project dquot requested */
+#define XFS_QMOPT_FORCE_RES (1u << 3) /* ignore quota limits */
+#define XFS_QMOPT_SBVERSION (1u << 4) /* change superblock version num */
/*
* flags to xfs_trans_mod_dquot to indicate which field needs to be
* modified.
*/
-#define XFS_QMOPT_RES_REGBLKS 0x0010000
-#define XFS_QMOPT_RES_RTBLKS 0x0020000
-#define XFS_QMOPT_BCOUNT 0x0040000
-#define XFS_QMOPT_ICOUNT 0x0080000
-#define XFS_QMOPT_RTBCOUNT 0x0100000
-#define XFS_QMOPT_DELBCOUNT 0x0200000
-#define XFS_QMOPT_DELRTBCOUNT 0x0400000
-#define XFS_QMOPT_RES_INOS 0x0800000
+#define XFS_QMOPT_RES_REGBLKS (1u << 7)
+#define XFS_QMOPT_RES_RTBLKS (1u << 8)
+#define XFS_QMOPT_BCOUNT (1u << 9)
+#define XFS_QMOPT_ICOUNT (1u << 10)
+#define XFS_QMOPT_RTBCOUNT (1u << 11)
+#define XFS_QMOPT_DELBCOUNT (1u << 12)
+#define XFS_QMOPT_DELRTBCOUNT (1u << 13)
+#define XFS_QMOPT_RES_INOS (1u << 14)
/*
* flags for dqalloc.
*/
-#define XFS_QMOPT_INHERIT 0x1000000
+#define XFS_QMOPT_INHERIT (1u << 31)
+
+#define XFS_QMOPT_FLAGS \
+ { XFS_QMOPT_UQUOTA, "UQUOTA" }, \
+ { XFS_QMOPT_PQUOTA, "PQUOTA" }, \
+ { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \
+ { XFS_QMOPT_SBVERSION, "SBVERSION" }, \
+ { XFS_QMOPT_GQUOTA, "GQUOTA" }, \
+ { XFS_QMOPT_INHERIT, "INHERIT" }, \
+ { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \
+ { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \
+ { XFS_QMOPT_BCOUNT, "BCOUNT" }, \
+ { XFS_QMOPT_ICOUNT, "ICOUNT" }, \
+ { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \
+ { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \
+ { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \
+ { XFS_QMOPT_RES_INOS, "RES_INOS" }
/*
* flags to xfs_trans_mod_dquot.
@@ -114,6 +129,7 @@ typedef uint8_t xfs_dqtype_t;
(XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+
extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
struct xfs_disk_dquot *ddq, xfs_dqid_t id);
extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 327ba25e9e17..97e9e6020596 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -886,8 +886,13 @@ xfs_refcount_still_have_space(
{
unsigned long overhead;
- overhead = cur->bc_ag.refc.shape_changes *
- xfs_allocfree_log_count(cur->bc_mp, 1);
+ /*
+ * Worst case estimate: full splits of the free space and rmap btrees
+ * to handle each of the shape changes to the refcount btree.
+ */
+ overhead = xfs_allocfree_block_count(cur->bc_mp,
+ cur->bc_ag.refc.shape_changes);
+ overhead += cur->bc_mp->m_refc_maxlevels;
overhead *= cur->bc_mp->m_sb.sb_blocksize;
/*
@@ -960,6 +965,7 @@ xfs_refcount_adjust_extents(
* Either cover the hole (increment) or
* delete the range (decrement).
*/
+ cur->bc_ag.refc.nr_ops++;
if (tmp.rc_refcount) {
error = xfs_refcount_insert(cur, &tmp,
&found_tmp);
@@ -970,7 +976,6 @@ xfs_refcount_adjust_extents(
error = -EFSCORRUPTED;
goto out_error;
}
- cur->bc_ag.refc.nr_ops++;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
@@ -1001,11 +1006,11 @@ xfs_refcount_adjust_extents(
ext.rc_refcount += adj;
trace_xfs_refcount_modify_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, &ext);
+ cur->bc_ag.refc.nr_ops++;
if (ext.rc_refcount > 1) {
error = xfs_refcount_update(cur, &ext);
if (error)
goto out_error;
- cur->bc_ag.refc.nr_ops++;
} else if (ext.rc_refcount == 1) {
error = xfs_refcount_delete(cur, &found_rec);
if (error)
@@ -1014,7 +1019,6 @@ xfs_refcount_adjust_extents(
error = -EFSCORRUPTED;
goto out_error;
}
- cur->bc_ag.refc.nr_ops++;
goto advloop;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 9eb01edbd89d..e8b322de7f3d 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -67,14 +67,17 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
* log (plus any key updates) so we'll conservatively assume 32 bytes
* per record. We must also leave space for btree splits on both ends
* of the range and space for the CUD and a new CUI.
+ *
+ * Each EFI that we attach to the transaction is assumed to consume ~32 bytes.
+ * This is a low estimate for an EFI tracking a single extent (16 bytes for the
+ * EFI header, 16 for the extent, and 12 for the xlog op header), but the
+ * estimate is acceptable if there's more than one extent being freed.
+ * In the worst case of freeing every other block during a refcount decrease
+ * operation, we amortize the space used for one EFI log item across 16
+ * extents.
*/
#define XFS_REFCOUNT_ITEM_OVERHEAD 32
-static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
-{
- return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
-}
-
extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
union xfs_btree_rec;
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index cd322174dbff..2845019d31da 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -34,18 +34,32 @@ int
xfs_rmap_lookup_le(
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
- xfs_extlen_t len,
uint64_t owner,
uint64_t offset,
unsigned int flags,
+ struct xfs_rmap_irec *irec,
int *stat)
{
+ int get_stat = 0;
+ int error;
+
cur->bc_rec.r.rm_startblock = bno;
- cur->bc_rec.r.rm_blockcount = len;
+ cur->bc_rec.r.rm_blockcount = 0;
cur->bc_rec.r.rm_owner = owner;
cur->bc_rec.r.rm_offset = offset;
cur->bc_rec.r.rm_flags = flags;
- return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+ if (error || !(*stat) || !irec)
+ return error;
+
+ error = xfs_rmap_get_rec(cur, irec, &get_stat);
+ if (error)
+ return error;
+ if (!get_stat)
+ return -EFSCORRUPTED;
+
+ return 0;
}
/*
@@ -251,7 +265,6 @@ out_bad_rec:
struct xfs_find_left_neighbor_info {
struct xfs_rmap_irec high;
struct xfs_rmap_irec *irec;
- int *stat;
};
/* For each rmap given, figure out if it matches the key we want. */
@@ -276,7 +289,6 @@ xfs_rmap_find_left_neighbor_helper(
return 0;
*info->irec = *rec;
- *info->stat = 1;
return -ECANCELED;
}
@@ -285,7 +297,7 @@ xfs_rmap_find_left_neighbor_helper(
* return a match with the same owner and adjacent physical and logical
* block ranges.
*/
-int
+STATIC int
xfs_rmap_find_left_neighbor(
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
@@ -296,6 +308,7 @@ xfs_rmap_find_left_neighbor(
int *stat)
{
struct xfs_find_left_neighbor_info info;
+ int found = 0;
int error;
*stat = 0;
@@ -313,21 +326,44 @@ xfs_rmap_find_left_neighbor(
info.high.rm_flags = flags;
info.high.rm_blockcount = 0;
info.irec = irec;
- info.stat = stat;
trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
- error = xfs_rmap_query_range(cur, &info.high, &info.high,
- xfs_rmap_find_left_neighbor_helper, &info);
- if (error == -ECANCELED)
- error = 0;
- if (*stat)
- trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, irec->rm_startblock,
- irec->rm_blockcount, irec->rm_owner,
- irec->rm_offset, irec->rm_flags);
- return error;
+ /*
+ * Historically, we always used the range query to walk every reverse
+ * mapping that could possibly overlap the key that the caller asked
+ * for, and filter out the ones that don't. That is very slow when
+ * there are a lot of records.
+ *
+ * However, there are two scenarios where the classic btree search can
+ * produce correct results -- if the index contains a record that is an
+ * exact match for the lookup key; and if there are no other records
+ * between the record we want and the key we supplied.
+ *
+ * As an optimization, try a non-overlapped lookup first. This makes
+ * extent conversion and remap operations run a bit faster if the
+ * physical extents aren't being shared. If we don't find what we
+ * want, we fall back to the overlapped query.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+ &found);
+ if (error)
+ return error;
+ if (found)
+ error = xfs_rmap_find_left_neighbor_helper(cur, irec, &info);
+ if (!error)
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_find_left_neighbor_helper, &info);
+ if (error != -ECANCELED)
+ return error;
+
+ *stat = 1;
+ trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+ irec->rm_flags);
+ return 0;
}
/* For each rmap given, figure out if it matches the key we want. */
@@ -353,7 +389,6 @@ xfs_rmap_lookup_le_range_helper(
return 0;
*info->irec = *rec;
- *info->stat = 1;
return -ECANCELED;
}
@@ -374,6 +409,7 @@ xfs_rmap_lookup_le_range(
int *stat)
{
struct xfs_find_left_neighbor_info info;
+ int found = 0;
int error;
info.high.rm_startblock = bno;
@@ -386,20 +422,44 @@ xfs_rmap_lookup_le_range(
info.high.rm_blockcount = 0;
*stat = 0;
info.irec = irec;
- info.stat = stat;
- trace_xfs_rmap_lookup_le_range(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
- error = xfs_rmap_query_range(cur, &info.high, &info.high,
- xfs_rmap_lookup_le_range_helper, &info);
- if (error == -ECANCELED)
- error = 0;
- if (*stat)
- trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, irec->rm_startblock,
- irec->rm_blockcount, irec->rm_owner,
- irec->rm_offset, irec->rm_flags);
- return error;
+ trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ bno, 0, owner, offset, flags);
+
+ /*
+ * Historically, we always used the range query to walk every reverse
+ * mapping that could possibly overlap the key that the caller asked
+ * for, and filter out the ones that don't. That is very slow when
+ * there are a lot of records.
+ *
+ * However, there are two scenarios where the classic btree search can
+ * produce correct results -- if the index contains a record that is an
+ * exact match for the lookup key; and if there are no other records
+ * between the record we want and the key we supplied.
+ *
+ * As an optimization, try a non-overlapped lookup first. This makes
+ * scrub run much faster on most filesystems because bmbt records are
+ * usually an exact match for rmap records. If we don't find what we
+ * want, we fall back to the overlapped query.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+ &found);
+ if (error)
+ return error;
+ if (found)
+ error = xfs_rmap_lookup_le_range_helper(cur, irec, &info);
+ if (!error)
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_lookup_le_range_helper, &info);
+ if (error != -ECANCELED)
+ return error;
+
+ *stat = 1;
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+ irec->rm_flags);
+ return 0;
}
/*
@@ -510,7 +570,7 @@ xfs_rmap_unmap(
* for the AG headers at rm_startblock == 0 created by mkfs/growfs that
* will not ever be removed from the tree.
*/
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i);
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec, &i);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -518,13 +578,6 @@ xfs_rmap_unmap(
goto out_error;
}
- error = xfs_rmap_get_rec(cur, &ltrec, &i);
- if (error)
- goto out_error;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto out_error;
- }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
@@ -786,18 +839,11 @@ xfs_rmap_map(
* record for our insertion point. This will also give us the record for
* start block contiguity tests.
*/
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec,
&have_lt);
if (error)
goto out_error;
if (have_lt) {
- error = xfs_rmap_get_rec(cur, &ltrec, &have_lt);
- if (error)
- goto out_error;
- if (XFS_IS_CORRUPT(mp, have_lt != 1)) {
- error = -EFSCORRUPTED;
- goto out_error;
- }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
@@ -1022,7 +1068,7 @@ xfs_rmap_convert(
* record for our insertion point. This will also give us the record for
* start block contiguity tests.
*/
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, &PREV, &i);
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -1030,13 +1076,6 @@ xfs_rmap_convert(
goto done;
}
- error = xfs_rmap_get_rec(cur, &PREV, &i);
- if (error)
- goto done;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
- }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
cur->bc_ag.pag->pag_agno, PREV.rm_startblock,
PREV.rm_blockcount, PREV.rm_owner,
@@ -1140,7 +1179,7 @@ xfs_rmap_convert(
_RET_IP_);
/* reset the cursor back to PREV */
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i);
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -2677,7 +2716,7 @@ xfs_rmap_record_exists(
ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) ||
(flags & XFS_RMAP_BMBT_BLOCK));
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec,
&has_record);
if (error)
return error;
@@ -2686,14 +2725,6 @@ xfs_rmap_record_exists(
return 0;
}
- error = xfs_rmap_get_rec(cur, &irec, &has_record);
- if (error)
- return error;
- if (!has_record) {
- *has_rmap = false;
- return 0;
- }
-
*has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno &&
irec.rm_startblock + irec.rm_blockcount >= bno + len);
return 0;
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index b718ebeda372..54741a591a17 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -122,8 +122,8 @@ int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp,
const struct xfs_owner_info *oinfo);
int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, uint64_t owner, uint64_t offset,
- unsigned int flags, int *stat);
+ uint64_t owner, uint64_t offset, unsigned int flags,
+ struct xfs_rmap_irec *irec, int *stat);
int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, uint64_t owner, uint64_t offset,
unsigned int flags, int *stat);
@@ -184,9 +184,6 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
xfs_fsblock_t startblock, xfs_filblks_t blockcount,
xfs_exntst_t state, struct xfs_btree_cur **pcur);
-int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- uint64_t owner, uint64_t offset, unsigned int flags,
- struct xfs_rmap_irec *irec, int *stat);
int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
uint64_t owner, uint64_t offset, unsigned int flags,
struct xfs_rmap_irec *irec, int *stat);
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 5740ba664867..fa180ab66b73 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1008,6 +1008,7 @@ xfs_rtfree_extent(
/* Find all the free records within a given range. */
int
xfs_rtalloc_query_range(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *low_rec,
const struct xfs_rtalloc_rec *high_rec,
@@ -1015,7 +1016,6 @@ xfs_rtalloc_query_range(
void *priv)
{
struct xfs_rtalloc_rec rec;
- struct xfs_mount *mp = tp->t_mountp;
xfs_rtblock_t rtstart;
xfs_rtblock_t rtend;
xfs_rtblock_t high_key;
@@ -1048,7 +1048,7 @@ xfs_rtalloc_query_range(
rec.ar_startext = rtstart;
rec.ar_extcount = rtend - rtstart + 1;
- error = fn(tp, &rec, priv);
+ error = fn(mp, tp, &rec, priv);
if (error)
break;
}
@@ -1062,6 +1062,7 @@ xfs_rtalloc_query_range(
/* Find all the free records. */
int
xfs_rtalloc_query_all(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
void *priv)
@@ -1069,10 +1070,10 @@ xfs_rtalloc_query_all(
struct xfs_rtalloc_rec keys[2];
keys[0].ar_startext = 0;
- keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1;
+ keys[1].ar_startext = mp->m_sb.sb_rextents - 1;
keys[0].ar_extcount = keys[1].ar_extcount = 0;
- return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv);
+ return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv);
}
/* Is the given extent all free? */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index f4e84aa1d50a..a20cade590e9 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -31,15 +31,66 @@
*/
/*
+ * Check that all the V4 feature bits that the V5 filesystem format requires are
+ * correctly set.
+ */
+static bool
+xfs_sb_validate_v5_features(
+ struct xfs_sb *sbp)
+{
+ /* We must not have any unknown V4 feature bits set */
+ if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS)
+ return false;
+
+ /*
+ * The CRC bit is considered an invalid V4 flag, so we have to add it
+ * manually to the OKBITS mask.
+ */
+ if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS |
+ XFS_SB_VERSION2_CRCBIT))
+ return false;
+
+ /* Now check all the required V4 feature flags are set. */
+
+#define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \
+ XFS_SB_VERSION_ALIGNBIT | \
+ XFS_SB_VERSION_LOGV2BIT | \
+ XFS_SB_VERSION_EXTFLGBIT | \
+ XFS_SB_VERSION_DIRV2BIT | \
+ XFS_SB_VERSION_MOREBITSBIT)
+
+#define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
+ XFS_SB_VERSION2_ATTR2BIT | \
+ XFS_SB_VERSION2_PROJID32BIT | \
+ XFS_SB_VERSION2_CRCBIT)
+
+ if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS)
+ return false;
+ if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS)
+ return false;
+ return true;
+}
+
+/*
* We support all XFS versions newer than a v4 superblock with V2 directories.
*/
bool
xfs_sb_good_version(
struct xfs_sb *sbp)
{
- /* all v5 filesystems are supported */
+ /*
+ * All v5 filesystems are supported, but we must check that all the
+ * required v4 feature flags are enabled correctly as the code checks
+ * those flags and not for v5 support.
+ */
if (xfs_sb_is_v5(sbp))
- return true;
+ return xfs_sb_validate_v5_features(sbp);
+
+ /* We must not have any unknown v4 feature bits set */
+ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
+ ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+ (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
+ return false;
/* versions prior to v4 are not supported */
if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4)
@@ -51,12 +102,6 @@ xfs_sb_good_version(
if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
return false;
- /* And must not have any unknown v4 feature bits set */
- if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
- ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
- (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
- return false;
-
/* It's a supported v4 filesystem */
return true;
}
@@ -70,6 +115,8 @@ xfs_sb_version_to_features(
/* optional V4 features */
if (sbp->sb_rblocks > 0)
features |= XFS_FEAT_REALTIME;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)
+ features |= XFS_FEAT_NLINK;
if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)
features |= XFS_FEAT_ATTR;
if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT)
@@ -124,6 +171,9 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_BIGTIME;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
features |= XFS_FEAT_NEEDSREPAIR;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64)
+ features |= XFS_FEAT_NREXT64;
+
return features;
}
@@ -262,12 +312,15 @@ xfs_validate_sb_common(
bool has_dalign;
if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
- xfs_warn(mp, "bad magic number");
+ xfs_warn(mp,
+"Superblock has bad magic number 0x%x. Not an XFS filesystem?",
+ be32_to_cpu(dsb->sb_magicnum));
return -EWRONGFS;
}
if (!xfs_sb_good_version(sbp)) {
- xfs_warn(mp, "bad version");
+ xfs_warn(mp,
+"Superblock has unknown features enabled or corrupted feature masks.");
return -EWRONGFS;
}
@@ -911,6 +964,11 @@ xfs_log_sb(
* reservations that have been taken out percpu counters. If we have an
* unclean shutdown, this will be corrected by log recovery rebuilding
* the counters from the AGF block counts.
+ *
+ * Do not update sb_frextents here because it is not part of the lazy
+ * sb counters, despite having a percpu counter. It is always kept
+ * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas()
+ * and hence we don't need have to update it here.
*/
if (xfs_has_lazysbcount(mp)) {
mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
@@ -1135,6 +1193,8 @@ xfs_fs_geometry(
} else {
geo->logsectsize = BBSIZE;
}
+ if (xfs_has_large_extent_counts(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 25c4cab58851..c4381388c0c1 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -54,13 +54,23 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
/*
* Values for t_flags.
*/
-#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
-#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
-#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
-#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
-#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
-#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */
-#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */
+/* Transaction needs to be logged */
+#define XFS_TRANS_DIRTY (1u << 0)
+/* Superblock is dirty and needs to be logged */
+#define XFS_TRANS_SB_DIRTY (1u << 1)
+/* Transaction took a permanent log reservation */
+#define XFS_TRANS_PERM_LOG_RES (1u << 2)
+/* Synchronous transaction commit needed */
+#define XFS_TRANS_SYNC (1u << 3)
+/* Transaction can use reserve block pool */
+#define XFS_TRANS_RESERVE (1u << 4)
+/* Transaction should avoid VFS level superblock write accounting */
+#define XFS_TRANS_NO_WRITECOUNT (1u << 5)
+/* Transaction has freed blocks returned to it's reservation */
+#define XFS_TRANS_RES_FDBLKS (1u << 6)
+/* Transaction contains an intent done log item */
+#define XFS_TRANS_HAS_INTENT_DONE (1u << 7)
+
/*
* LOWMODE is used by the allocator to activate the lowspace algorithm - when
* free space is running low the extent allocator may choose to allocate an
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index f0b38f4aba80..8b9bd178a487 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -213,7 +213,7 @@ xfs_symlink_shortform_verify(
/*
* Zero length symlinks should never occur in memory as they are
- * never alllowed to exist on disk.
+ * never allowed to exist on disk.
*/
if (!size)
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 6f83d9b306ee..e9913c2c5a24 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -56,15 +56,14 @@ xfs_calc_buf_res(
* Per-extent log reservation for the btree changes involved in freeing or
* allocating an extent. In classic XFS there were two trees that will be
* modified (bnobt + cntbt). With rmap enabled, there are three trees
- * (rmapbt). With reflink, there are four trees (refcountbt). The number of
- * blocks reserved is based on the formula:
+ * (rmapbt). The number of blocks reserved is based on the formula:
*
* num trees * ((2 blocks/level * max depth) - 1)
*
* Keep in mind that max depth is calculated separately for each type of tree.
*/
uint
-xfs_allocfree_log_count(
+xfs_allocfree_block_count(
struct xfs_mount *mp,
uint num_ops)
{
@@ -73,13 +72,24 @@ xfs_allocfree_log_count(
blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1);
if (xfs_has_rmapbt(mp))
blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
- if (xfs_has_reflink(mp))
- blocks += num_ops * (2 * mp->m_refc_maxlevels - 1);
return blocks;
}
/*
+ * Per-extent log reservation for refcount btree changes. These are never done
+ * in the same transaction as an allocation or a free, so we compute them
+ * separately.
+ */
+static unsigned int
+xfs_refcountbt_block_count(
+ struct xfs_mount *mp,
+ unsigned int num_ops)
+{
+ return num_ops * (2 * mp->m_refc_maxlevels - 1);
+}
+
+/*
* Logging inodes is really tricksy. They are logged in memory format,
* which means that what we write into the log doesn't directly translate into
* the amount of space they use on disk.
@@ -136,7 +146,7 @@ xfs_calc_inobt_res(
{
return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels,
XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -183,7 +193,7 @@ xfs_calc_inode_chunk_res(
{
uint res, size = 0;
- res = xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
if (alloc) {
/* icreate tx uses ordered buffers */
@@ -199,18 +209,18 @@ xfs_calc_inode_chunk_res(
/*
* Per-extent log reservation for the btree changes involved in freeing or
* allocating a realtime extent. We have to be able to log as many rtbitmap
- * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents,
- * as well as the realtime summary block.
+ * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime
+ * extents, as well as the realtime summary block.
*/
static unsigned int
-xfs_rtalloc_log_count(
+xfs_rtalloc_block_count(
struct xfs_mount *mp,
unsigned int num_ops)
{
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
unsigned int rtbmp_bytes;
- rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY;
+ rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY;
return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
}
@@ -233,6 +243,28 @@ xfs_rtalloc_log_count(
* register overflow from temporaries in the calculations.
*/
+/*
+ * Compute the log reservation required to handle the refcount update
+ * transaction. Refcount updates are always done via deferred log items.
+ *
+ * This is calculated as:
+ * Data device refcount updates (t1):
+ * the agfs of the ags containing the blocks: nr_ops * sector size
+ * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+static unsigned int
+xfs_calc_refcountbt_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr_ops)
+{
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+
+ if (!xfs_has_reflink(mp))
+ return 0;
+
+ return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+}
/*
* In a write transaction we can allocate a maximum of 2
@@ -247,7 +279,7 @@ xfs_rtalloc_log_count(
* the inode's bmap btree: max depth * block size
* the agfs of the ags from which the extents are allocated: 2 * sector
* the superblock free block counter: sector size
- * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ * the realtime bitmap: ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
* the realtime summary: 1 block
* the allocation btrees: 2 trees * (2 * max depth - 1) * block size
* And the bmap_finish transaction can free bmap blocks in a join (t3):
@@ -255,34 +287,65 @@ xfs_rtalloc_log_count(
* the agfls of the ags containing the blocks: 2 * sector size
* the super block free block counter: sector size
* the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
*/
STATIC uint
xfs_calc_write_reservation(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool for_minlogsize)
{
- unsigned int t1, t2, t3;
+ unsigned int t1, t2, t3, t4;
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) +
xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
if (xfs_has_realtime(mp)) {
t2 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
blksz) +
xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz);
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 1), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), blksz);
} else {
t2 = 0;
}
t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
- return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ /*
+ * In the early days of reflink, we included enough reservation to log
+ * two refcountbt splits for each transaction. The codebase runs
+ * refcountbt updates in separate transactions now, so to compute the
+ * minimum log size, add the refcountbtree splits back to t1 and t3 and
+ * do not account them separately as t4. Reflink did not support
+ * realtime when the reservations were established, so no adjustment to
+ * t2 is needed.
+ */
+ if (for_minlogsize) {
+ unsigned int adj = 0;
+
+ if (xfs_has_reflink(mp))
+ adj = xfs_calc_buf_res(
+ xfs_refcountbt_block_count(mp, 2),
+ blksz);
+ t1 += adj;
+ t3 += adj;
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ }
+
+ t4 = xfs_calc_refcountbt_reservation(mp, 1);
+ return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
+
+unsigned int
+xfs_calc_write_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_write_reservation(mp, true);
}
/*
@@ -299,33 +362,62 @@ xfs_calc_write_reservation(
* the agf for each of the ags: 2 * sector size
* the agfl for each of the ags: 2 * sector size
* the super block to reflect the freed blocks: sector size
- * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ * the realtime bitmap:
+ * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
* the realtime summary: 2 exts * 1 block
* worst case split in allocation btrees per extent assuming 2 extents:
* 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
*/
STATIC uint
xfs_calc_itruncate_reservation(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool for_minlogsize)
{
- unsigned int t1, t2, t3;
+ unsigned int t1, t2, t3, t4;
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz);
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz);
if (xfs_has_realtime(mp)) {
t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
} else {
t3 = 0;
}
- return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ /*
+ * In the early days of reflink, we included enough reservation to log
+ * four refcountbt splits in the same transaction as bnobt/cntbt
+ * updates. The codebase runs refcountbt updates in separate
+ * transactions now, so to compute the minimum log size, add the
+ * refcount btree splits back here and do not compute them separately
+ * as t4. Reflink did not support realtime when the reservations were
+ * established, so do not adjust t3.
+ */
+ if (for_minlogsize) {
+ if (xfs_has_reflink(mp))
+ t2 += xfs_calc_buf_res(
+ xfs_refcountbt_block_count(mp, 4),
+ blksz);
+
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ }
+
+ t4 = xfs_calc_refcountbt_reservation(mp, 2);
+ return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
+
+unsigned int
+xfs_calc_itruncate_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_itruncate_reservation(mp, true);
}
/*
@@ -349,7 +441,7 @@ xfs_calc_rename_reservation(
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
XFS_FSB_TO_B(mp, 1))));
}
@@ -389,7 +481,7 @@ xfs_calc_link_reservation(
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1))));
}
@@ -427,7 +519,7 @@ xfs_calc_remove_reservation(
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
@@ -572,7 +664,7 @@ xfs_calc_growdata_reservation(
struct xfs_mount *mp)
{
return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -594,7 +686,7 @@ xfs_calc_growrtalloc_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_inode_res(mp, 1) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -670,7 +762,7 @@ xfs_calc_addafork_reservation(
xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -693,7 +785,7 @@ xfs_calc_attrinval_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4),
XFS_FSB_TO_B(mp, 1))));
}
@@ -760,7 +852,7 @@ xfs_calc_attrrm_reservation(
XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
(xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
@@ -791,13 +883,21 @@ xfs_calc_qm_setqlim_reservation(void)
*/
STATIC uint
xfs_calc_qm_dqalloc_reservation(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool for_minlogsize)
{
- return xfs_calc_write_reservation(mp) +
+ return xfs_calc_write_reservation(mp, for_minlogsize) +
xfs_calc_buf_res(1,
XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
}
+unsigned int
+xfs_calc_qm_dqalloc_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_qm_dqalloc_reservation(mp, true);
+}
+
/*
* Syncing the incore super block changes to disk.
* the super block to reflect the changes: sector size
@@ -814,36 +914,18 @@ xfs_trans_resv_calc(
struct xfs_mount *mp,
struct xfs_trans_resv *resp)
{
- unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
-
- /*
- * In the early days of rmap+reflink, we always set the rmap maxlevels
- * to 9 even if the AG was small enough that it would never grow to
- * that height. Transaction reservation sizes influence the minimum
- * log size calculation, which influences the size of the log that mkfs
- * creates. Use the old value here to ensure that newly formatted
- * small filesystems will mount on older kernels.
- */
- if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
- mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+ int logcount_adj = 0;
/*
* The following transactions are logged in physical format and
* require a permanent reservation on space.
*/
- resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
- if (xfs_has_reflink(mp))
- resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
- else
- resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false);
+ resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
- resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
- if (xfs_has_reflink(mp))
- resp->tr_itruncate.tr_logcount =
- XFS_ITRUNCATE_LOG_COUNT_REFLINK;
- else
- resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false);
+ resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
@@ -899,11 +981,9 @@ xfs_trans_resv_calc(
resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
- resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
- if (xfs_has_reflink(mp))
- resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
- else
- resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp,
+ false);
+ resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
/*
@@ -930,6 +1010,19 @@ xfs_trans_resv_calc(
resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
- /* Put everything back the way it was. This goes at the end. */
- mp->m_rmap_maxlevels = rmap_maxlevels;
+ /*
+ * Add one logcount for BUI items that appear with rmap or reflink,
+ * one logcount for refcount intent items, and one logcount for rmap
+ * intent items.
+ */
+ if (xfs_has_reflink(mp) || xfs_has_rmapbt(mp))
+ logcount_adj++;
+ if (xfs_has_reflink(mp))
+ logcount_adj++;
+ if (xfs_has_rmapbt(mp))
+ logcount_adj++;
+
+ resp->tr_itruncate.tr_logcount += logcount_adj;
+ resp->tr_write.tr_logcount += logcount_adj;
+ resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index fc4e9b369a3a..0554b9d775d2 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -73,7 +73,6 @@ struct xfs_trans_resv {
#define XFS_DEFAULT_LOG_COUNT 1
#define XFS_DEFAULT_PERM_LOG_COUNT 2
#define XFS_ITRUNCATE_LOG_COUNT 2
-#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
#define XFS_INACTIVE_LOG_COUNT 2
#define XFS_CREATE_LOG_COUNT 2
#define XFS_CREATE_TMPFILE_LOG_COUNT 2
@@ -83,13 +82,24 @@ struct xfs_trans_resv {
#define XFS_LINK_LOG_COUNT 2
#define XFS_RENAME_LOG_COUNT 2
#define XFS_WRITE_LOG_COUNT 2
-#define XFS_WRITE_LOG_COUNT_REFLINK 8
#define XFS_ADDAFORK_LOG_COUNT 2
#define XFS_ATTRINVAL_LOG_COUNT 1
#define XFS_ATTRSET_LOG_COUNT 3
#define XFS_ATTRRM_LOG_COUNT 3
+/*
+ * Original log operation counts were overestimated in the early days of
+ * reflink. These are retained here purely for minimum log size calculations
+ * and must not be used for runtime reservations.
+ */
+#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
+#define XFS_WRITE_LOG_COUNT_REFLINK 8
+
void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
-uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops);
+uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
+
+unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b6da06b40989..373f64a492a4 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -12,8 +12,8 @@ typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
typedef uint32_t xfs_extlen_t; /* extent length in blocks */
typedef uint32_t xfs_agnumber_t; /* allocation group number */
-typedef int32_t xfs_extnum_t; /* # of extents in a file */
-typedef int16_t xfs_aextnum_t; /* # extents in an attribute fork */
+typedef uint64_t xfs_extnum_t; /* # of extents in a file */
+typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */
typedef int64_t xfs_fsize_t; /* bytes in a file */
typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
@@ -57,13 +57,6 @@ typedef void * xfs_failaddr_t;
#define NULLAGINO ((xfs_agino_t)-1)
/*
- * Max values for extlen, extnum, aextnum.
- */
-#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */
-#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */
-#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */
-
-/*
* Minimum and maximum blocksize and sectorsize.
* The blocksize upper limit is pretty much arbitrary.
* The sectorsize upper limit is due to sizeof(sb_sectsize).
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index a4cbbc346f60..285995ba3947 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -133,29 +133,13 @@ xchk_bmap_get_rmap(
if (info->is_shared) {
error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno,
owner, offset, rflags, rmap, &has_rmap);
- if (!xchk_should_check_xref(info->sc, &error,
- &info->sc->sa.rmap_cur))
- return false;
- goto out;
+ } else {
+ error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno,
+ owner, offset, rflags, rmap, &has_rmap);
}
-
- /*
- * Otherwise, use the (faster) regular lookup.
- */
- error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, 0, owner,
- offset, rflags, &has_rmap);
- if (!xchk_should_check_xref(info->sc, &error,
- &info->sc->sa.rmap_cur))
+ if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur))
return false;
- if (!has_rmap)
- goto out;
- error = xfs_rmap_get_rec(info->sc->sa.rmap_cur, rmap, &has_rmap);
- if (!xchk_should_check_xref(info->sc, &error,
- &info->sc->sa.rmap_cur))
- return false;
-
-out:
if (!has_rmap)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@@ -350,7 +334,7 @@ xchk_bmap_iextent(
irec->br_startoff);
/* Make sure the extent points to a valid place. */
- if (irec->br_blockcount > MAXEXTLEN)
+ if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
if (info->is_rt &&
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index bf1f3607d0b6..97b54ac3075f 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -23,6 +23,8 @@
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index eac15af7b08c..51820b40ab1c 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -232,7 +232,8 @@ xchk_dinode(
size_t fork_recs;
unsigned long long isize;
uint64_t flags2;
- uint32_t nextents;
+ xfs_extnum_t nextents;
+ xfs_extnum_t naextents;
prid_t prid;
uint16_t flags;
uint16_t mode;
@@ -390,8 +391,10 @@ xchk_dinode(
xchk_inode_extsize(sc, dip, ino, mode, flags);
+ nextents = xfs_dfork_data_extents(dip);
+ naextents = xfs_dfork_attr_extents(dip);
+
/* di_nextents */
- nextents = be32_to_cpu(dip->di_nextents);
fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
switch (dip->di_format) {
case XFS_DINODE_FMT_EXTENTS:
@@ -411,7 +414,7 @@ xchk_dinode(
/* di_forkoff */
if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
xchk_ino_set_corrupt(sc, ino);
- if (dip->di_anextents != 0 && dip->di_forkoff == 0)
+ if (naextents != 0 && dip->di_forkoff == 0)
xchk_ino_set_corrupt(sc, ino);
if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS)
xchk_ino_set_corrupt(sc, ino);
@@ -423,19 +426,18 @@ xchk_dinode(
xchk_ino_set_corrupt(sc, ino);
/* di_anextents */
- nextents = be16_to_cpu(dip->di_anextents);
fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
switch (dip->di_aformat) {
case XFS_DINODE_FMT_EXTENTS:
- if (nextents > fork_recs)
+ if (naextents > fork_recs)
xchk_ino_set_corrupt(sc, ino);
break;
case XFS_DINODE_FMT_BTREE:
- if (nextents <= fork_recs)
+ if (naextents <= fork_recs)
xchk_ino_set_corrupt(sc, ino);
break;
default:
- if (nextents != 0)
+ if (naextents != 0)
xchk_ino_set_corrupt(sc, ino);
}
@@ -513,14 +515,14 @@ xchk_inode_xref_bmap(
&nextents, &count);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
- if (nextents < be32_to_cpu(dip->di_nextents))
+ if (nextents < xfs_dfork_data_extents(dip))
xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
&nextents, &acount);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
- if (nextents != be16_to_cpu(dip->di_anextents))
+ if (nextents != xfs_dfork_attr_extents(dip))
xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
/* Check nblocks against the inode. */
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 8fa012057405..0a3bde64c675 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -40,6 +40,7 @@ xchk_setup_rt(
/* Scrub a free extent record from the realtime bitmap. */
STATIC int
xchk_rtbitmap_rec(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
@@ -48,10 +49,10 @@ xchk_rtbitmap_rec(
xfs_rtblock_t startblock;
xfs_rtblock_t blockcount;
- startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize;
- blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize;
+ startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
+ blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
- if (!xfs_verify_rtext(sc->mp, startblock, blockcount))
+ if (!xfs_verify_rtext(mp, startblock, blockcount))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
return 0;
}
@@ -114,7 +115,7 @@ xchk_rtbitmap(
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return error;
- error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc);
+ error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
goto out;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index b11870d07c56..2e8e400f10a9 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -340,20 +340,6 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
},
};
-/* This isn't a stable feature, warn once per day. */
-static inline void
-xchk_experimental_warning(
- struct xfs_mount *mp)
-{
- static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT(
- "xchk_warning", 86400 * HZ, 1);
- ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE);
-
- if (__ratelimit(&scrub_warning))
- xfs_alert(mp,
-"EXPERIMENTAL online scrub feature in use. Use at your own risk!");
-}
-
static int
xchk_validate_inputs(
struct xfs_mount *mp,
@@ -478,7 +464,8 @@ xfs_scrub_metadata(
if (error)
goto out;
- xchk_experimental_warning(mp);
+ xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB,
+ "EXPERIMENTAL online scrub feature in use. Use at your own risk!");
sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL);
if (!sc) {
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 5c52ee869272..b744c62052b6 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -10,13 +10,14 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_acl.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_trans.h"
+#include "xfs_xattr.h"
#include <linux/posix_acl_xattr.h>
@@ -202,7 +203,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
xfs_acl_to_disk(args.value, acl);
}
- error = xfs_attr_set(&args);
+ error = xfs_attr_change(&args);
kmem_free(args.value);
/*
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index bb6abdcb265d..263404d0bfda 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -16,11 +16,13 @@ extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
void xfs_forget_acl(struct inode *inode, const char *name);
#else
-static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu)
+#define xfs_get_acl NULL
+#define xfs_set_acl NULL
+static inline int __xfs_set_acl(struct inode *inode, struct posix_acl *acl,
+ int type)
{
- return NULL;
+ return 0;
}
-# define xfs_set_acl NULL
static inline void xfs_forget_acl(struct inode *inode, const char *name)
{
}
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
new file mode 100644
index 000000000000..135d44133477
--- /dev/null
+++ b/fs/xfs/xfs_attr_item.c
@@ -0,0 +1,885 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022 Oracle. All Rights Reserved.
+ * Author: Allison Henderson <allison.henderson@oracle.com>
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_shared.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_item.h"
+#include "xfs_trace.h"
+#include "xfs_trans_space.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+
+struct kmem_cache *xfs_attri_cache;
+struct kmem_cache *xfs_attrd_cache;
+
+static const struct xfs_item_ops xfs_attri_item_ops;
+static const struct xfs_item_ops xfs_attrd_item_ops;
+static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip);
+
+static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_attri_log_item, attri_item);
+}
+
+/*
+ * Shared xattr name/value buffers for logged extended attribute operations
+ *
+ * When logging updates to extended attributes, we can create quite a few
+ * attribute log intent items for a single xattr update. To avoid cycling the
+ * memory allocator and memcpy overhead, the name (and value, for setxattr)
+ * are kept in a refcounted object that is shared across all related log items
+ * and the upper-level deferred work state structure. The shared buffer has
+ * a control structure, followed by the name, and then the value.
+ */
+
+static inline struct xfs_attri_log_nameval *
+xfs_attri_log_nameval_get(
+ struct xfs_attri_log_nameval *nv)
+{
+ if (!refcount_inc_not_zero(&nv->refcount))
+ return NULL;
+ return nv;
+}
+
+static inline void
+xfs_attri_log_nameval_put(
+ struct xfs_attri_log_nameval *nv)
+{
+ if (!nv)
+ return;
+ if (refcount_dec_and_test(&nv->refcount))
+ kvfree(nv);
+}
+
+static inline struct xfs_attri_log_nameval *
+xfs_attri_log_nameval_alloc(
+ const void *name,
+ unsigned int name_len,
+ const void *value,
+ unsigned int value_len)
+{
+ struct xfs_attri_log_nameval *nv;
+
+ /*
+ * This could be over 64kB in length, so we have to use kvmalloc() for
+ * this. But kvmalloc() utterly sucks, so we use our own version.
+ */
+ nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) +
+ name_len + value_len);
+ if (!nv)
+ return nv;
+
+ nv->name.i_addr = nv + 1;
+ nv->name.i_len = name_len;
+ nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME;
+ memcpy(nv->name.i_addr, name, name_len);
+
+ if (value_len) {
+ nv->value.i_addr = nv->name.i_addr + name_len;
+ nv->value.i_len = value_len;
+ memcpy(nv->value.i_addr, value, value_len);
+ } else {
+ nv->value.i_addr = NULL;
+ nv->value.i_len = 0;
+ }
+ nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE;
+
+ refcount_set(&nv->refcount, 1);
+ return nv;
+}
+
+STATIC void
+xfs_attri_item_free(
+ struct xfs_attri_log_item *attrip)
+{
+ kmem_free(attrip->attri_item.li_lv_shadow);
+ xfs_attri_log_nameval_put(attrip->attri_nameval);
+ kmem_cache_free(xfs_attri_cache, attrip);
+}
+
+/*
+ * Freeing the attrip requires that we remove it from the AIL if it has already
+ * been placed there. However, the ATTRI may not yet have been placed in the
+ * AIL when called by xfs_attri_release() from ATTRD processing due to the
+ * ordering of committed vs unpin operations in bulk insert operations. Hence
+ * the reference count to ensure only the last caller frees the ATTRI.
+ */
+STATIC void
+xfs_attri_release(
+ struct xfs_attri_log_item *attrip)
+{
+ ASSERT(atomic_read(&attrip->attri_refcount) > 0);
+ if (!atomic_dec_and_test(&attrip->attri_refcount))
+ return;
+
+ xfs_trans_ail_delete(&attrip->attri_item, 0);
+ xfs_attri_item_free(attrip);
+}
+
+STATIC void
+xfs_attri_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
+
+ *nvecs += 2;
+ *nbytes += sizeof(struct xfs_attri_log_format) +
+ xlog_calc_iovec_len(nv->name.i_len);
+
+ if (!nv->value.i_len)
+ return;
+
+ *nvecs += 1;
+ *nbytes += xlog_calc_iovec_len(nv->value.i_len);
+}
+
+/*
+ * This is called to fill in the log iovecs for the given attri log
+ * item. We use 1 iovec for the attri_format_item, 1 for the name, and
+ * another for the value if it is present
+ */
+STATIC void
+xfs_attri_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+ struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
+
+ attrip->attri_format.alfi_type = XFS_LI_ATTRI;
+ attrip->attri_format.alfi_size = 1;
+
+ /*
+ * This size accounting must be done before copying the attrip into the
+ * iovec. If we do it after, the wrong size will be recorded to the log
+ * and we trip across assertion checks for bad region sizes later during
+ * the log recovery.
+ */
+
+ ASSERT(nv->name.i_len > 0);
+ attrip->attri_format.alfi_size++;
+
+ if (nv->value.i_len > 0)
+ attrip->attri_format.alfi_size++;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT,
+ &attrip->attri_format,
+ sizeof(struct xfs_attri_log_format));
+ xlog_copy_from_iovec(lv, &vecp, &nv->name);
+ if (nv->value.i_len > 0)
+ xlog_copy_from_iovec(lv, &vecp, &nv->value);
+}
+
+/*
+ * The unpin operation is the last place an ATTRI is manipulated in the log. It
+ * is either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the ATTRI transaction has been successfully committed to make
+ * it this far. Therefore, we expect whoever committed the ATTRI to either
+ * construct and commit the ATTRD or drop the ATTRD's reference in the event of
+ * error. Simply drop the log's ATTRI reference now that the log is done with
+ * it.
+ */
+STATIC void
+xfs_attri_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ xfs_attri_release(ATTRI_ITEM(lip));
+}
+
+
+STATIC void
+xfs_attri_item_release(
+ struct xfs_log_item *lip)
+{
+ xfs_attri_release(ATTRI_ITEM(lip));
+}
+
+/*
+ * Allocate and initialize an attri item. Caller may allocate an additional
+ * trailing buffer for name and value
+ */
+STATIC struct xfs_attri_log_item *
+xfs_attri_init(
+ struct xfs_mount *mp,
+ struct xfs_attri_log_nameval *nv)
+{
+ struct xfs_attri_log_item *attrip;
+
+ attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ /*
+ * Grab an extra reference to the name/value buffer for this log item.
+ * The caller retains its own reference!
+ */
+ attrip->attri_nameval = xfs_attri_log_nameval_get(nv);
+ ASSERT(attrip->attri_nameval);
+
+ xfs_log_item_init(mp, &attrip->attri_item, XFS_LI_ATTRI,
+ &xfs_attri_item_ops);
+ attrip->attri_format.alfi_id = (uintptr_t)(void *)attrip;
+ atomic_set(&attrip->attri_refcount, 2);
+
+ return attrip;
+}
+
+/*
+ * Copy an attr format buffer from the given buf, and into the destination attr
+ * format structure.
+ */
+STATIC int
+xfs_attri_copy_format(
+ struct xfs_log_iovec *buf,
+ struct xfs_attri_log_format *dst_attr_fmt)
+{
+ struct xfs_attri_log_format *src_attr_fmt = buf->i_addr;
+ size_t len;
+
+ len = sizeof(struct xfs_attri_log_format);
+ if (buf->i_len != len) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ return -EFSCORRUPTED;
+ }
+
+ memcpy((char *)dst_attr_fmt, (char *)src_attr_fmt, len);
+ return 0;
+}
+
+static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_attrd_log_item, attrd_item);
+}
+
+STATIC void
+xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp)
+{
+ kmem_free(attrdp->attrd_item.li_lv_shadow);
+ kmem_cache_free(xfs_attrd_cache, attrdp);
+}
+
+STATIC void
+xfs_attrd_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_attrd_log_format);
+}
+
+/*
+ * This is called to fill in the log iovecs for the given attrd log item. We use
+ * only 1 iovec for the attrd_format, and we point that at the attr_log_format
+ * structure embedded in the attrd item.
+ */
+STATIC void
+xfs_attrd_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ attrdp->attrd_format.alfd_type = XFS_LI_ATTRD;
+ attrdp->attrd_format.alfd_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRD_FORMAT,
+ &attrdp->attrd_format,
+ sizeof(struct xfs_attrd_log_format));
+}
+
+/*
+ * The ATTRD is either committed or aborted if the transaction is canceled. If
+ * the transaction is canceled, drop our reference to the ATTRI and free the
+ * ATTRD.
+ */
+STATIC void
+xfs_attrd_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip);
+
+ xfs_attri_release(attrdp->attrd_attrip);
+ xfs_attrd_item_free(attrdp);
+}
+
+static struct xfs_log_item *
+xfs_attrd_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &ATTRD_ITEM(lip)->attrd_attrip->attri_item;
+}
+
+/*
+ * Performs one step of an attribute update intent and marks the attrd item
+ * dirty.. An attr operation may be a set or a remove. Note that the
+ * transaction is marked dirty regardless of whether the operation succeeds or
+ * fails to support the ATTRI/ATTRD lifecycle rules.
+ */
+STATIC int
+xfs_xattri_finish_update(
+ struct xfs_attr_intent *attr,
+ struct xfs_attrd_log_item *attrdp)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
+ error = -EIO;
+ goto out;
+ }
+
+ error = xfs_attr_set_iter(attr);
+ if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
+ error = -EAGAIN;
+out:
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the ATTRI and frees the ATTRD
+ * 2.) shuts down the filesystem
+ */
+ args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
+
+ /*
+ * attr intent/done items are null when logged attributes are disabled
+ */
+ if (attrdp)
+ set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
+
+ return error;
+}
+
+/* Log an attr to the intent item. */
+STATIC void
+xfs_attr_log_item(
+ struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip,
+ const struct xfs_attr_intent *attr)
+{
+ struct xfs_attri_log_format *attrp;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags);
+
+ /*
+ * At this point the xfs_attr_intent has been constructed, and we've
+ * created the log intent. Fill in the attri log item and log format
+ * structure with fields from this xfs_attr_intent
+ */
+ attrp = &attrip->attri_format;
+ attrp->alfi_ino = attr->xattri_da_args->dp->i_ino;
+ ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK));
+ attrp->alfi_op_flags = attr->xattri_op_flags;
+ attrp->alfi_value_len = attr->xattri_nameval->value.i_len;
+ attrp->alfi_name_len = attr->xattri_nameval->name.i_len;
+ ASSERT(!(attr->xattri_da_args->attr_filter & ~XFS_ATTRI_FILTER_MASK));
+ attrp->alfi_attr_filter = attr->xattri_da_args->attr_filter;
+}
+
+/* Get an ATTRI. */
+static struct xfs_log_item *
+xfs_attr_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_attri_log_item *attrip;
+ struct xfs_attr_intent *attr;
+ struct xfs_da_args *args;
+
+ ASSERT(count == 1);
+
+ /*
+ * Each attr item only performs one attribute operation at a time, so
+ * this is a list of one
+ */
+ attr = list_first_entry_or_null(items, struct xfs_attr_intent,
+ xattri_list);
+ args = attr->xattri_da_args;
+
+ if (!(args->op_flags & XFS_DA_OP_LOGGED))
+ return NULL;
+
+ /*
+ * Create a buffer to store the attribute name and value. This buffer
+ * will be shared between the higher level deferred xattr work state
+ * and the lower level xattr log items.
+ */
+ if (!attr->xattri_nameval) {
+ /*
+ * Transfer our reference to the name/value buffer to the
+ * deferred work state structure.
+ */
+ attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name,
+ args->namelen, args->value, args->valuelen);
+ }
+ if (!attr->xattri_nameval)
+ return ERR_PTR(-ENOMEM);
+
+ attrip = xfs_attri_init(mp, attr->xattri_nameval);
+ xfs_trans_add_item(tp, &attrip->attri_item);
+ xfs_attr_log_item(tp, attrip, attr);
+
+ return &attrip->attri_item;
+}
+
+static inline void
+xfs_attr_free_item(
+ struct xfs_attr_intent *attr)
+{
+ if (attr->xattri_da_state)
+ xfs_da_state_free(attr->xattri_da_state);
+ xfs_attri_log_nameval_put(attr->xattri_nameval);
+ if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY)
+ kmem_free(attr);
+ else
+ kmem_cache_free(xfs_attr_intent_cache, attr);
+}
+
+/* Process an attr. */
+STATIC int
+xfs_attr_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_attr_intent *attr;
+ struct xfs_attrd_log_item *done_item = NULL;
+ int error;
+
+ attr = container_of(item, struct xfs_attr_intent, xattri_list);
+ if (done)
+ done_item = ATTRD_ITEM(done);
+
+ /*
+ * Always reset trans after EAGAIN cycle
+ * since the transaction is new
+ */
+ attr->xattri_da_args->trans = tp;
+
+ error = xfs_xattri_finish_update(attr, done_item);
+ if (error != -EAGAIN)
+ xfs_attr_free_item(attr);
+
+ return error;
+}
+
+/* Abort all pending ATTRs. */
+STATIC void
+xfs_attr_abort_intent(
+ struct xfs_log_item *intent)
+{
+ xfs_attri_release(ATTRI_ITEM(intent));
+}
+
+/* Cancel an attr */
+STATIC void
+xfs_attr_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_attr_intent *attr;
+
+ attr = container_of(item, struct xfs_attr_intent, xattri_list);
+ xfs_attr_free_item(attr);
+}
+
+STATIC bool
+xfs_attri_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return ATTRI_ITEM(lip)->attri_format.alfi_id == intent_id;
+}
+
+/* Is this recovered ATTRI format ok? */
+static inline bool
+xfs_attri_validate(
+ struct xfs_mount *mp,
+ struct xfs_attri_log_format *attrp)
+{
+ unsigned int op = attrp->alfi_op_flags &
+ XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+
+ if (attrp->__pad != 0)
+ return false;
+
+ if (attrp->alfi_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)
+ return false;
+
+ if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK)
+ return false;
+
+ /* alfi_op_flags should be either a set or remove */
+ switch (op) {
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ break;
+ default:
+ return false;
+ }
+
+ if (attrp->alfi_value_len > XATTR_SIZE_MAX)
+ return false;
+
+ if ((attrp->alfi_name_len > XATTR_NAME_MAX) ||
+ (attrp->alfi_name_len == 0))
+ return false;
+
+ return xfs_verify_ino(mp, attrp->alfi_ino);
+}
+
+/*
+ * Process an attr intent item that was recovered from the log. We need to
+ * delete the attr that it describes.
+ */
+STATIC int
+xfs_attri_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_attr_intent *attr;
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ struct xfs_inode *ip;
+ struct xfs_da_args *args;
+ struct xfs_trans *tp;
+ struct xfs_trans_res tres;
+ struct xfs_attri_log_format *attrp;
+ struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
+ int error, ret = 0;
+ int total;
+ int local;
+ struct xfs_attrd_log_item *done_item = NULL;
+
+ /*
+ * First check the validity of the attr described by the ATTRI. If any
+ * are bad, then assume that all are bad and just toss the ATTRI.
+ */
+ attrp = &attrip->attri_format;
+ if (!xfs_attri_validate(mp, attrp) ||
+ !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len))
+ return -EFSCORRUPTED;
+
+ error = xlog_recover_iget(mp, attrp->alfi_ino, &ip);
+ if (error)
+ return error;
+
+ attr = kmem_zalloc(sizeof(struct xfs_attr_intent) +
+ sizeof(struct xfs_da_args), KM_NOFS);
+ args = (struct xfs_da_args *)(attr + 1);
+
+ attr->xattri_da_args = args;
+ attr->xattri_op_flags = attrp->alfi_op_flags &
+ XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+
+ /*
+ * We're reconstructing the deferred work state structure from the
+ * recovered log item. Grab a reference to the name/value buffer and
+ * attach it to the new work state.
+ */
+ attr->xattri_nameval = xfs_attri_log_nameval_get(nv);
+ ASSERT(attr->xattri_nameval);
+
+ args->dp = ip;
+ args->geo = mp->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->name = nv->name.i_addr;
+ args->namelen = nv->name.i_len;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
+ args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK;
+ args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT |
+ XFS_DA_OP_LOGGED;
+
+ ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb));
+
+ switch (attr->xattri_op_flags) {
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ args->value = nv->value.i_addr;
+ args->valuelen = nv->value.i_len;
+ args->total = xfs_attr_calc_size(args, &local);
+ if (xfs_inode_hasattr(args->dp))
+ attr->xattri_dela_state = xfs_attr_init_replace_state(args);
+ else
+ attr->xattri_dela_state = xfs_attr_init_add_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ if (!xfs_inode_hasattr(args->dp))
+ goto out;
+ attr->xattri_dela_state = xfs_attr_init_remove_state(args);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ xfs_init_attr_trans(args, &tres, &total);
+ error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ goto out;
+
+ args->trans = tp;
+ done_item = xfs_trans_get_attrd(tp, attrip);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ ret = xfs_xattri_finish_update(attr, done_item);
+ if (ret == -EAGAIN) {
+ /* There's more work to do, so add it to this transaction */
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list);
+ } else
+ error = ret;
+
+ if (error) {
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+ }
+
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+
+out_unlock:
+ if (attr->xattri_leaf_bp)
+ xfs_buf_relse(attr->xattri_leaf_bp);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+out:
+ if (ret != -EAGAIN)
+ xfs_attr_free_item(attr);
+ return error;
+}
+
+/* Re-log an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_attri_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_attrd_log_item *attrdp;
+ struct xfs_attri_log_item *old_attrip;
+ struct xfs_attri_log_item *new_attrip;
+ struct xfs_attri_log_format *new_attrp;
+ struct xfs_attri_log_format *old_attrp;
+
+ old_attrip = ATTRI_ITEM(intent);
+ old_attrp = &old_attrip->attri_format;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ attrdp = xfs_trans_get_attrd(tp, old_attrip);
+ set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
+
+ /*
+ * Create a new log item that shares the same name/value buffer as the
+ * old log item.
+ */
+ new_attrip = xfs_attri_init(tp->t_mountp, old_attrip->attri_nameval);
+ new_attrp = &new_attrip->attri_format;
+
+ new_attrp->alfi_ino = old_attrp->alfi_ino;
+ new_attrp->alfi_op_flags = old_attrp->alfi_op_flags;
+ new_attrp->alfi_value_len = old_attrp->alfi_value_len;
+ new_attrp->alfi_name_len = old_attrp->alfi_name_len;
+ new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter;
+
+ xfs_trans_add_item(tp, &new_attrip->attri_item);
+ set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags);
+
+ return &new_attrip->attri_item;
+}
+
+STATIC int
+xlog_recover_attri_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_attri_log_item *attrip;
+ struct xfs_attri_log_format *attri_formatp;
+ struct xfs_attri_log_nameval *nv;
+ const void *attr_value = NULL;
+ const void *attr_name;
+ int error;
+
+ attri_formatp = item->ri_buf[0].i_addr;
+ attr_name = item->ri_buf[1].i_addr;
+
+ /* Validate xfs_attri_log_format before the large memory allocation */
+ if (!xfs_attri_validate(mp, attri_formatp)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+
+ if (attri_formatp->alfi_value_len)
+ attr_value = item->ri_buf[2].i_addr;
+
+ /*
+ * Memory alloc failure will cause replay to abort. We attach the
+ * name/value buffer to the recovered incore log item and drop our
+ * reference.
+ */
+ nv = xfs_attri_log_nameval_alloc(attr_name,
+ attri_formatp->alfi_name_len, attr_value,
+ attri_formatp->alfi_value_len);
+ if (!nv)
+ return -ENOMEM;
+
+ attrip = xfs_attri_init(mp, nv);
+ error = xfs_attri_copy_format(&item->ri_buf[0], &attrip->attri_format);
+ if (error)
+ goto out;
+
+ /*
+ * The ATTRI has two references. One for the ATTRD and one for ATTRI to
+ * ensure it makes it into the AIL. Insert the ATTRI into the AIL
+ * directly and drop the ATTRI reference. Note that
+ * xfs_trans_ail_update() drops the AIL lock.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn);
+ xfs_attri_release(attrip);
+ xfs_attri_log_nameval_put(nv);
+ return 0;
+out:
+ xfs_attri_item_free(attrip);
+ xfs_attri_log_nameval_put(nv);
+ return error;
+}
+
+/*
+ * This routine is called to allocate an "attr free done" log item.
+ */
+static struct xfs_attrd_log_item *
+xfs_trans_get_attrd(struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip)
+{
+ struct xfs_attrd_log_item *attrdp;
+
+ ASSERT(tp != NULL);
+
+ attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD,
+ &xfs_attrd_item_ops);
+ attrdp->attrd_attrip = attrip;
+ attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id;
+
+ xfs_trans_add_item(tp, &attrdp->attrd_item);
+ return attrdp;
+}
+
+/* Get an ATTRD so we can process all the attrs. */
+static struct xfs_log_item *
+xfs_attr_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ if (!intent)
+ return NULL;
+
+ return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item;
+}
+
+const struct xfs_defer_op_type xfs_attr_defer_type = {
+ .max_items = 1,
+ .create_intent = xfs_attr_create_intent,
+ .abort_intent = xfs_attr_abort_intent,
+ .create_done = xfs_attr_create_done,
+ .finish_item = xfs_attr_finish_item,
+ .cancel_item = xfs_attr_cancel_item,
+};
+
+/*
+ * This routine is called when an ATTRD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding ATTRI if
+ * it was still in the log. To do this it searches the AIL for the ATTRI with
+ * an id equal to that in the ATTRD format structure. If we find it we drop
+ * the ATTRD reference, which removes the ATTRI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_attrd_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_attrd_log_format *attrd_formatp;
+
+ attrd_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_ATTRI,
+ attrd_formatp->alfd_alf_id);
+ return 0;
+}
+
+static const struct xfs_item_ops xfs_attri_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_attri_item_size,
+ .iop_format = xfs_attri_item_format,
+ .iop_unpin = xfs_attri_item_unpin,
+ .iop_release = xfs_attri_item_release,
+ .iop_recover = xfs_attri_item_recover,
+ .iop_match = xfs_attri_item_match,
+ .iop_relog = xfs_attri_item_relog,
+};
+
+const struct xlog_recover_item_ops xlog_attri_item_ops = {
+ .item_type = XFS_LI_ATTRI,
+ .commit_pass2 = xlog_recover_attri_commit_pass2,
+};
+
+static const struct xfs_item_ops xfs_attrd_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
+ .iop_size = xfs_attrd_item_size,
+ .iop_format = xfs_attrd_item_format,
+ .iop_release = xfs_attrd_item_release,
+ .iop_intent = xfs_attrd_item_intent,
+};
+
+const struct xlog_recover_item_ops xlog_attrd_item_ops = {
+ .item_type = XFS_LI_ATTRD,
+ .commit_pass2 = xlog_recover_attrd_commit_pass2,
+};
diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h
new file mode 100644
index 000000000000..3280a7930287
--- /dev/null
+++ b/fs/xfs/xfs_attr_item.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (C) 2022 Oracle. All Rights Reserved.
+ * Author: Allison Henderson <allison.henderson@oracle.com>
+ */
+#ifndef __XFS_ATTR_ITEM_H__
+#define __XFS_ATTR_ITEM_H__
+
+/* kernel only ATTRI/ATTRD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+
+struct xfs_attri_log_nameval {
+ struct xfs_log_iovec name;
+ struct xfs_log_iovec value;
+ refcount_t refcount;
+
+ /* name and value follow the end of this struct */
+};
+
+/*
+ * This is the "attr intention" log item. It is used to log the fact that some
+ * extended attribute operations need to be processed. An operation is
+ * currently either a set or remove. Set or remove operations are described by
+ * the xfs_attr_intent which may be logged to this intent.
+ *
+ * During a normal attr operation, name and value point to the name and value
+ * fields of the caller's xfs_da_args structure. During a recovery, the name
+ * and value buffers are copied from the log, and stored in a trailing buffer
+ * attached to the xfs_attr_intent until they are committed. They are freed
+ * when the xfs_attr_intent itself is freed when the work is done.
+ */
+struct xfs_attri_log_item {
+ struct xfs_log_item attri_item;
+ atomic_t attri_refcount;
+ struct xfs_attri_log_nameval *attri_nameval;
+ struct xfs_attri_log_format attri_format;
+};
+
+/*
+ * This is the "attr done" log item. It is used to log the fact that some attrs
+ * earlier mentioned in an attri item have been freed.
+ */
+struct xfs_attrd_log_item {
+ struct xfs_log_item attrd_item;
+ struct xfs_attri_log_item *attrd_attrip;
+ struct xfs_attrd_log_format attrd_format;
+};
+
+extern struct kmem_cache *xfs_attri_cache;
+extern struct kmem_cache *xfs_attrd_cache;
+
+#endif /* __XFS_ATTR_ITEM_H__ */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 2d1e5134cebe..90a14e85e76d 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -15,6 +15,7 @@
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_sf.h"
#include "xfs_attr_leaf.h"
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 761dde155099..51f66e982484 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -39,6 +39,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
+ kmem_free(buip->bui_item.li_lv_shadow);
kmem_cache_free(xfs_bui_cache, buip);
}
@@ -54,10 +55,11 @@ xfs_bui_release(
struct xfs_bui_log_item *buip)
{
ASSERT(atomic_read(&buip->bui_refcount) > 0);
- if (atomic_dec_and_test(&buip->bui_refcount)) {
- xfs_trans_ail_delete(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_bui_item_free(buip);
- }
+ if (!atomic_dec_and_test(&buip->bui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&buip->bui_item, 0);
+ xfs_bui_item_free(buip);
}
@@ -198,14 +200,24 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
+ kmem_free(budp->bud_item.li_lv_shadow);
kmem_cache_free(xfs_bud_cache, budp);
}
+static struct xfs_log_item *
+xfs_bud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &BUD_ITEM(lip)->bud_buip->bui_item;
+}
+
static const struct xfs_item_ops xfs_bud_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_bud_item_size,
.iop_format = xfs_bud_item_format,
.iop_release = xfs_bud_item_release,
+ .iop_intent = xfs_bud_item_intent,
};
static struct xfs_bud_log_item *
@@ -254,7 +266,7 @@ xfs_trans_log_finish_bmap_update(
* 1.) releases the BUI and frees the BUD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
return error;
@@ -506,6 +518,8 @@ xfs_bui_item_recover(
iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, iext_delta);
if (error)
goto err_cancel;
@@ -584,6 +598,7 @@ xfs_bui_item_relog(
}
static const struct xfs_item_ops xfs_bui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_bui_item_size,
.iop_format = xfs_bui_item_format,
.iop_unpin = xfs_bui_item_unpin,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index eb2e387ba528..52be58372c63 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -119,14 +119,14 @@ retry:
*/
ralen = ap->length / mp->m_sb.sb_rextsize;
/*
- * If the old value was close enough to MAXEXTLEN that
+ * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that
* we rounded up to it, cut it back so it's valid again.
* Note that if it's a really large request (bigger than
- * MAXEXTLEN), we don't hear about that number, and can't
+ * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't
* adjust the starting point to match it.
*/
- if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
- ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+ if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN)
+ ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize;
/*
* Lock out modifications to both the RT bitmap and summary inodes
@@ -839,9 +839,11 @@ xfs_alloc_file_space(
* count, hence we need to limit the number of blocks we are
* trying to reserve to avoid an overflow. We can't allocate
* more than @nimaps extents, and an extent is limited on disk
- * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+ * to XFS_BMBT_MAX_EXTLEN (21 bits), so use that to enforce the
+ * limit.
*/
- resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
+ resblks = min_t(xfs_fileoff_t, (e - s),
+ (XFS_MAX_BMBT_EXTLEN * nimaps));
if (unlikely(rt)) {
dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
rblocks = resblks;
@@ -857,6 +859,9 @@ xfs_alloc_file_space(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto error;
@@ -912,6 +917,8 @@ xfs_unmap_extent(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
@@ -1193,6 +1200,8 @@ xfs_insert_file_space(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
@@ -1421,6 +1430,9 @@ xfs_swap_extent_rmap(
error = xfs_iext_count_may_overflow(ip,
XFS_DATA_FORK,
XFS_IEXT_SWAP_RMAP_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_SWAP_RMAP_CNT);
if (error)
goto out;
}
@@ -1429,6 +1441,9 @@ xfs_swap_extent_rmap(
error = xfs_iext_count_may_overflow(tip,
XFS_DATA_FORK,
XFS_IEXT_SWAP_RMAP_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_SWAP_RMAP_CNT);
if (error)
goto out;
}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index e11e9ef2338f..4d8a6aece995 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -8,15 +8,18 @@
/* kernel only definitions */
+struct xfs_buf;
+struct xfs_mount;
+
/* buf log item flags */
-#define XFS_BLI_HOLD 0x01
-#define XFS_BLI_DIRTY 0x02
-#define XFS_BLI_STALE 0x04
-#define XFS_BLI_LOGGED 0x08
-#define XFS_BLI_INODE_ALLOC_BUF 0x10
-#define XFS_BLI_STALE_INODE 0x20
-#define XFS_BLI_INODE_BUF 0x40
-#define XFS_BLI_ORDERED 0x80
+#define XFS_BLI_HOLD (1u << 0)
+#define XFS_BLI_DIRTY (1u << 1)
+#define XFS_BLI_STALE (1u << 2)
+#define XFS_BLI_LOGGED (1u << 3)
+#define XFS_BLI_INODE_ALLOC_BUF (1u << 4)
+#define XFS_BLI_STALE_INODE (1u << 5)
+#define XFS_BLI_INODE_BUF (1u << 6)
+#define XFS_BLI_ORDERED (1u << 7)
#define XFS_BLI_FLAGS \
{ XFS_BLI_HOLD, "HOLD" }, \
@@ -28,11 +31,6 @@
{ XFS_BLI_INODE_BUF, "INODE_BUF" }, \
{ XFS_BLI_ORDERED, "ORDERED" }
-
-struct xfs_buf;
-struct xfs_mount;
-struct xfs_buf_log_item;
-
/*
* This is the in core log item structure used to track information
* needed to log buffers. It tracks how many times the lock has been
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index e484251dc9c8..ffa94102094d 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -24,6 +24,15 @@
#include "xfs_quota.h"
/*
+ * This is the number of entries in the l_buf_cancel_table used during
+ * recovery.
+ */
+#define XLOG_BC_TABLE_SIZE 64
+
+#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
+ ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))
+
+/*
* This structure is used during recovery to record the buf log items which
* have been canceled and should not be replayed.
*/
@@ -993,3 +1002,60 @@ const struct xlog_recover_item_ops xlog_buf_item_ops = {
.commit_pass1 = xlog_recover_buf_commit_pass1,
.commit_pass2 = xlog_recover_buf_commit_pass2,
};
+
+#ifdef DEBUG
+void
+xlog_check_buf_cancel_table(
+ struct xlog *log)
+{
+ int i;
+
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+ ASSERT(list_empty(&log->l_buf_cancel_table[i]));
+}
+#endif
+
+int
+xlog_alloc_buf_cancel_table(
+ struct xlog *log)
+{
+ void *p;
+ int i;
+
+ ASSERT(log->l_buf_cancel_table == NULL);
+
+ p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ log->l_buf_cancel_table = p;
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+
+ return 0;
+}
+
+void
+xlog_free_buf_cancel_table(
+ struct xlog *log)
+{
+ int i;
+
+ if (!log->l_buf_cancel_table)
+ return;
+
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) {
+ struct xfs_buf_cancel *bc;
+
+ while ((bc = list_first_entry_or_null(
+ &log->l_buf_cancel_table[i],
+ struct xfs_buf_cancel, bc_list))) {
+ list_del(&bc->bc_list);
+ kmem_free(bc);
+ }
+ }
+
+ kmem_free(log->l_buf_cancel_table);
+ log->l_buf_cancel_table = NULL;
+}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 5afedcbc78c7..5a6c3c3c4de2 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -136,10 +136,7 @@ xfs_qm_adjust_res_timer(
res->timer = xfs_dquot_set_timeout(mp,
ktime_get_real_seconds() + qlim->time);
} else {
- if (res->timer == 0)
- res->warnings = 0;
- else
- res->timer = 0;
+ res->timer = 0;
}
}
@@ -322,6 +319,9 @@ xfs_dquot_disk_alloc(
error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, quotip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto err_cancel;
@@ -589,10 +589,6 @@ xfs_dquot_from_disk(
dqp->q_ino.count = be64_to_cpu(ddqp->d_icount);
dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount);
- dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns);
- dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns);
- dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns);
-
dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer);
dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer);
dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer);
@@ -634,9 +630,9 @@ xfs_dquot_to_disk(
ddqp->d_icount = cpu_to_be64(dqp->q_ino.count);
ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count);
- ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings);
- ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings);
- ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings);
+ ddqp->d_bwarns = 0;
+ ddqp->d_iwarns = 0;
+ ddqp->d_rtbwarns = 0;
ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer);
ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer);
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 6b5e3cf40c8b..80c8f851a2f3 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -44,14 +44,6 @@ struct xfs_dquot_res {
* in seconds since the Unix epoch.
*/
time64_t timer;
-
- /*
- * For root dquots, this is the maximum number of warnings that will
- * be issued for this quota type. Otherwise, this is the number of
- * warnings issued against this quota. Note that none of this is
- * implemented.
- */
- xfs_qwarncnt_t warnings;
};
static inline bool
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 749fd18c4f32..296faa41d81d 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -57,6 +57,9 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_REDUCE_MAX_IEXTENTS,
XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT,
XFS_RANDOM_AG_RESV_FAIL,
+ XFS_RANDOM_LARP,
+ XFS_RANDOM_DA_LEAF_SPLIT,
+ XFS_RANDOM_ATTR_LEAF_TO_NODE,
};
struct xfs_errortag_attr {
@@ -170,6 +173,9 @@ XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR);
XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS);
XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT);
XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL);
+XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP);
+XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT);
+XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -211,6 +217,9 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents),
XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent),
XFS_ERRORTAG_ATTR_LIST(ag_resv_fail),
+ XFS_ERRORTAG_ATTR_LIST(larp),
+ XFS_ERRORTAG_ATTR_LIST(da_leaf_split),
+ XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
NULL,
};
ATTRIBUTE_GROUPS(xfs_errortag);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 5735d5ea87ee..5191e9145e55 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -64,16 +64,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
* XFS panic tags -- allow a call to xfs_alert_tag() be turned into
* a panic by setting xfs_panic_mask in a sysctl.
*/
-#define XFS_NO_PTAG 0
-#define XFS_PTAG_IFLUSH 0x00000001
-#define XFS_PTAG_LOGRES 0x00000002
-#define XFS_PTAG_AILDELETE 0x00000004
-#define XFS_PTAG_ERROR_REPORT 0x00000008
-#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010
-#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
-#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
-#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
-#define XFS_PTAG_VERIFIER_ERROR 0x00000100
+#define XFS_NO_PTAG 0u
+#define XFS_PTAG_IFLUSH (1u << 0)
+#define XFS_PTAG_LOGRES (1u << 1)
+#define XFS_PTAG_AILDELETE (1u << 2)
+#define XFS_PTAG_ERROR_REPORT (1u << 3)
+#define XFS_PTAG_SHUTDOWN_CORRUPT (1u << 4)
+#define XFS_PTAG_SHUTDOWN_IOERROR (1u << 5)
+#define XFS_PTAG_SHUTDOWN_LOGERROR (1u << 6)
+#define XFS_PTAG_FSBLOCK_ZERO (1u << 7)
+#define XFS_PTAG_VERIFIER_ERROR (1u << 8)
#define XFS_PTAG_STRINGS \
{ XFS_NO_PTAG, "none" }, \
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 0e50f2c9348e..765be054dffe 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -58,10 +58,11 @@ xfs_efi_release(
struct xfs_efi_log_item *efip)
{
ASSERT(atomic_read(&efip->efi_refcount) > 0);
- if (atomic_dec_and_test(&efip->efi_refcount)) {
- xfs_trans_ail_delete(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_efi_item_free(efip);
- }
+ if (!atomic_dec_and_test(&efip->efi_refcount))
+ return;
+
+ xfs_trans_ail_delete(&efip->efi_item, 0);
+ xfs_efi_item_free(efip);
}
/*
@@ -306,11 +307,20 @@ xfs_efd_item_release(
xfs_efd_item_free(efdp);
}
+static struct xfs_log_item *
+xfs_efd_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &EFD_ITEM(lip)->efd_efip->efi_item;
+}
+
static const struct xfs_item_ops xfs_efd_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_efd_item_size,
.iop_format = xfs_efd_item_format,
.iop_release = xfs_efd_item_release,
+ .iop_intent = xfs_efd_item_intent,
};
/*
@@ -380,7 +390,7 @@ xfs_trans_free_extent(
* 1.) releases the EFI and frees the EFD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
next_extent = efdp->efd_next_extent;
@@ -688,6 +698,7 @@ xfs_efi_item_relog(
}
static const struct xfs_item_ops xfs_efi_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_efi_item_size,
.iop_format = xfs_efi_item_format,
.iop_unpin = xfs_efi_item_unpin,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 85c412107a10..5a171c0b244b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -310,7 +310,7 @@ STATIC ssize_t
xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
- int *iolock)
+ unsigned int *iolock)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -513,7 +513,7 @@ xfs_file_dio_write_aligned(
struct kiocb *iocb,
struct iov_iter *from)
{
- int iolock = XFS_IOLOCK_SHARED;
+ unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
ret = xfs_ilock_iocb(iocb, iolock);
@@ -566,7 +566,7 @@ xfs_file_dio_write_unaligned(
{
size_t isize = i_size_read(VFS_I(ip));
size_t count = iov_iter_count(from);
- int iolock = XFS_IOLOCK_SHARED;
+ unsigned int iolock = XFS_IOLOCK_SHARED;
unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
ssize_t ret;
@@ -576,9 +576,9 @@ xfs_file_dio_write_unaligned(
* don't even bother trying the fast path in this case.
*/
if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
-retry_exclusive:
if (iocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
+retry_exclusive:
iolock = XFS_IOLOCK_EXCL;
flags = IOMAP_DIO_FORCE_WAIT;
}
@@ -655,7 +655,7 @@ xfs_file_dax_write(
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- int iolock = XFS_IOLOCK_EXCL;
+ unsigned int iolock = XFS_IOLOCK_EXCL;
ssize_t ret, error = 0;
loff_t pos;
@@ -694,13 +694,11 @@ xfs_file_buffered_write(
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
bool cleared_space = false;
- int iolock;
+ unsigned int iolock;
if (iocb->ki_flags & IOCB_NOWAIT)
return -EOPNOTSUPP;
@@ -767,9 +765,7 @@ xfs_file_write_iter(
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
size_t ocount = iov_iter_count(from);
@@ -1167,12 +1163,10 @@ xfs_file_open(
struct inode *inode,
struct file *file)
{
- if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
- return -EFBIG;
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
- return 0;
+ return generic_file_open(inode, file);
}
STATIC int
@@ -1181,7 +1175,7 @@ xfs_dir_open(
struct file *file)
{
struct xfs_inode *ip = XFS_I(inode);
- int mode;
+ unsigned int mode;
int error;
error = xfs_file_open(inode, file);
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 6a3ce0f6dc9e..be9bcf8a1f99 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -128,11 +128,12 @@ xfs_filestream_pick_ag(
if (!pag->pagf_init) {
err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
if (err) {
- xfs_perag_put(pag);
- if (err != -EAGAIN)
+ if (err != -EAGAIN) {
+ xfs_perag_put(pag);
return err;
+ }
/* Couldn't lock the AGF, skip this AG. */
- continue;
+ goto next_ag;
}
}
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 10e1cb71439e..bb23199f65c3 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -450,11 +450,11 @@ xfs_getfsmap_logdev(
/* Transform a rtbitmap "record" into a fsmap */
STATIC int
xfs_getfsmap_rtdev_rtbitmap_helper(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
{
- struct xfs_mount *mp = tp->t_mountp;
struct xfs_getfsmap_info *info = priv;
struct xfs_rmap_irec irec;
xfs_daddr_t rec_daddr;
@@ -535,7 +535,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
do_div(alow.ar_startext, mp->m_sb.sb_rextsize);
if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize))
ahigh.ar_startext++;
- error = xfs_rtalloc_query_range(tp, &alow, &ahigh,
+ error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh,
xfs_getfsmap_rtdev_rtbitmap_helper, info);
if (error)
goto err;
@@ -547,7 +547,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
info->last = true;
ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext);
- error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info);
+ error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info);
if (error)
goto err;
err:
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 68f74549fa22..d4a77c53f94b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -149,12 +149,7 @@ xfs_growfs_data_private(
error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount,
delta, &lastag_extended);
} else {
- static struct ratelimit_state shrink_warning = \
- RATELIMIT_STATE_INIT("shrink_warning", 86400 * HZ, 1);
- ratelimit_set_flags(&shrink_warning, RATELIMIT_MSG_ON_RELEASE);
-
- if (__ratelimit(&shrink_warning))
- xfs_alert(mp,
+ xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK,
"EXPERIMENTAL online shrink feature in use. Use at your own risk!");
error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, -delta);
@@ -349,10 +344,7 @@ xfs_fs_counts(
cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
xfs_fdblocks_unavailable(mp);
-
- spin_lock(&mp->m_sb_lock);
- cnt->freertx = mp->m_sb.sb_frextents;
- spin_unlock(&mp->m_sb_lock);
+ cnt->freertx = percpu_counter_read_positive(&mp->m_frextents);
}
/*
@@ -512,7 +504,7 @@ xfs_fs_goingdown(
void
xfs_do_force_shutdown(
struct xfs_mount *mp,
- int flags,
+ uint32_t flags,
char *fname,
int lnnum)
{
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index f62fa652c2fd..4d0a98f920ca 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -41,5 +41,6 @@ struct xfs_globals xfs_globals = {
#endif
#ifdef DEBUG
.pwork_threads = -1, /* automatic thread detection */
+ .larp = false, /* log attribute replay */
#endif
};
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index bffd6eb0b298..5269354b1b69 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1916,13 +1916,16 @@ xfs_inodegc_want_queue_rt_file(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
- uint64_t freertx;
if (!XFS_IS_REALTIME_INODE(ip))
return false;
- freertx = READ_ONCE(mp->m_sb.sb_frextents);
- return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT];
+ if (__percpu_counter_compare(&mp->m_frextents,
+ mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
+ XFS_FDBLOCKS_BATCH) < 0)
+ return true;
+
+ return false;
}
#else
# define xfs_inodegc_want_queue_rt_file(ip) (false)
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 508e184e3b8f..b05314d48176 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,6 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
+ kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip));
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 39ae53efb3ab..52d6f2c7d58b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -416,10 +416,12 @@ xfs_lockdep_subclass_ok(
* parent locking. Care must be taken to ensure we don't overrun the subclass
* storage fields in the class mask we build.
*/
-static inline int
-xfs_lock_inumorder(int lock_mode, int subclass)
+static inline uint
+xfs_lock_inumorder(
+ uint lock_mode,
+ uint subclass)
{
- int class = 0;
+ uint class = 0;
ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
XFS_ILOCK_RTSUM)));
@@ -464,7 +466,10 @@ xfs_lock_inodes(
int inodes,
uint lock_mode)
{
- int attempts = 0, i, j, try_lock;
+ int attempts = 0;
+ uint i;
+ int j;
+ bool try_lock;
struct xfs_log_item *lp;
/*
@@ -489,9 +494,9 @@ xfs_lock_inodes(
} else if (lock_mode & XFS_MMAPLOCK_EXCL)
ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
- try_lock = 0;
- i = 0;
again:
+ try_lock = false;
+ i = 0;
for (; i < inodes; i++) {
ASSERT(ips[i]);
@@ -506,7 +511,7 @@ again:
for (j = (i - 1); j >= 0 && !try_lock; j--) {
lp = &ips[j]->i_itemp->ili_item;
if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
- try_lock++;
+ try_lock = true;
}
}
@@ -546,8 +551,6 @@ again:
if ((attempts % 5) == 0) {
delay(1); /* Don't just spin the CPU */
}
- i = 0;
- try_lock = 0;
goto again;
}
}
@@ -1024,11 +1027,6 @@ xfs_create(
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
- error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto out_trans_cancel;
-
/*
* A newly created regular or special file just has one directory
* entry pointing to them, but a directory also the "." entry
@@ -1242,11 +1240,6 @@ xfs_link(
if (error)
goto std_return;
- error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto error_return;
-
/*
* If we are using project inheritance, we only allow hard link
* creation in our tree when the project IDs are the same; else
@@ -2629,7 +2622,7 @@ xfs_ifree(
*/
error = xfs_difree(tp, pag, ip->i_ino, &xic);
if (error)
- return error;
+ goto out;
error = xfs_iunlink_remove(tp, pag, ip);
if (error)
@@ -3212,35 +3205,6 @@ retry:
/*
* Check for expected errors before we dirty the transaction
* so we can return an error without a transaction abort.
- *
- * Extent count overflow check:
- *
- * From the perspective of src_dp, a rename operation is essentially a
- * directory entry remove operation. Hence the only place where we check
- * for extent count overflow for src_dp is in
- * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns
- * -ENOSPC when it detects a possible extent count overflow and in
- * response, the higher layers of directory handling code do the
- * following:
- * 1. Data/Free blocks: XFS lets these blocks linger until a
- * future remove operation removes them.
- * 2. Dabtree blocks: XFS swaps the blocks with the last block in the
- * Leaf space and unmaps the last block.
- *
- * For target_dp, there are two cases depending on whether the
- * destination directory entry exists or not.
- *
- * When destination directory entry does not exist (i.e. target_ip ==
- * NULL), extent count overflow check is performed only when transaction
- * has a non-zero sized space reservation associated with it. With a
- * zero-sized space reservation, XFS allows a rename operation to
- * continue only when the directory has sufficient free space in its
- * data/leaf/free space blocks to hold the new entry.
- *
- * When destination directory entry exists (i.e. target_ip != NULL), all
- * we need to do is change the inode number associated with the already
- * existing entry. Hence there is no need to perform an extent count
- * overflow check.
*/
if (target_ip == NULL) {
/*
@@ -3251,12 +3215,6 @@ retry:
error = xfs_dir_canenter(tp, target_dp, target_name);
if (error)
goto out_trans_cancel;
- } else {
- error = xfs_iext_count_may_overflow(target_dp,
- XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto out_trans_cancel;
}
} else {
/*
@@ -3424,18 +3382,12 @@ retry:
* inode number of the whiteout inode rather than removing it
* altogether.
*/
- if (wip) {
+ if (wip)
error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
spaceres);
- } else {
- /*
- * NOTE: We don't need to check for extent count overflow here
- * because the dir remove name code will leave the dir block in
- * place if the extent count would overflow.
- */
+ else
error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
spaceres);
- }
if (error)
goto out_trans_cancel;
@@ -3517,8 +3469,8 @@ xfs_iflush(
if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) >
ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: detected corrupt incore inode %Lu, "
- "total extents = %d, nblocks = %Ld, ptr "PTR_FMT,
+ "%s: detected corrupt incore inode %llu, "
+ "total extents = %llu nblocks = %lld, ptr "PTR_FMT,
__func__, ip->i_ino,
ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp),
ip->i_nblocks, ip);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 740ab13d1aa2..7be6f8e705ab 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -218,6 +218,11 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME;
}
+static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
+{
+ return ip->i_diflags2 & XFS_DIFLAG2_NREXT64;
+}
+
/*
* Return the buftarg used for data allocations on a given inode.
*/
@@ -278,12 +283,12 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
* 1<<16 - 1<<32-1 -- lockdep annotation (integers)
*/
-#define XFS_IOLOCK_EXCL (1<<0)
-#define XFS_IOLOCK_SHARED (1<<1)
-#define XFS_ILOCK_EXCL (1<<2)
-#define XFS_ILOCK_SHARED (1<<3)
-#define XFS_MMAPLOCK_EXCL (1<<4)
-#define XFS_MMAPLOCK_SHARED (1<<5)
+#define XFS_IOLOCK_EXCL (1u << 0)
+#define XFS_IOLOCK_SHARED (1u << 1)
+#define XFS_ILOCK_EXCL (1u << 2)
+#define XFS_ILOCK_SHARED (1u << 3)
+#define XFS_MMAPLOCK_EXCL (1u << 4)
+#define XFS_MMAPLOCK_SHARED (1u << 5)
#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
@@ -350,19 +355,19 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
*/
#define XFS_IOLOCK_SHIFT 16
#define XFS_IOLOCK_MAX_SUBCLASS 3
-#define XFS_IOLOCK_DEP_MASK 0x000f0000
+#define XFS_IOLOCK_DEP_MASK 0x000f0000u
#define XFS_MMAPLOCK_SHIFT 20
#define XFS_MMAPLOCK_NUMORDER 0
#define XFS_MMAPLOCK_MAX_SUBCLASS 3
-#define XFS_MMAPLOCK_DEP_MASK 0x00f00000
+#define XFS_MMAPLOCK_DEP_MASK 0x00f00000u
#define XFS_ILOCK_SHIFT 24
-#define XFS_ILOCK_PARENT_VAL 5
+#define XFS_ILOCK_PARENT_VAL 5u
#define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1)
-#define XFS_ILOCK_RTBITMAP_VAL 6
-#define XFS_ILOCK_RTSUM_VAL 7
-#define XFS_ILOCK_DEP_MASK 0xff000000
+#define XFS_ILOCK_RTBITMAP_VAL 6u
+#define XFS_ILOCK_RTSUM_VAL 7u
+#define XFS_ILOCK_DEP_MASK 0xff000000u
#define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT)
#define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT)
#define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 9e6ef55cf29e..721def0639fd 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -71,7 +71,7 @@ xfs_inode_item_data_fork_size(
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- *nbytes += roundup(ip->i_df.if_bytes, 4);
+ *nbytes += xlog_calc_iovec_len(ip->i_df.if_bytes);
*nvecs += 1;
}
break;
@@ -112,7 +112,7 @@ xfs_inode_item_attr_fork_size(
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
ip->i_afp->if_bytes > 0) {
- *nbytes += roundup(ip->i_afp->if_bytes, 4);
+ *nbytes += xlog_calc_iovec_len(ip->i_afp->if_bytes);
*nvecs += 1;
}
break;
@@ -204,17 +204,12 @@ xfs_inode_item_format_data_fork(
~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- /*
- * Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed
- * to be there by xfs_idata_realloc().
- */
- data_bytes = roundup(ip->i_df.if_bytes, 4);
ASSERT(ip->i_df.if_u1.if_data != NULL);
ASSERT(ip->i_disk_size > 0);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
- ip->i_df.if_u1.if_data, data_bytes);
- ilf->ilf_dsize = (unsigned)data_bytes;
+ ip->i_df.if_u1.if_data,
+ ip->i_df.if_bytes);
+ ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_DDATA;
@@ -288,17 +283,11 @@ xfs_inode_item_format_attr_fork(
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
ip->i_afp->if_bytes > 0) {
- /*
- * Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed
- * to be there by xfs_idata_realloc().
- */
- data_bytes = roundup(ip->i_afp->if_bytes, 4);
ASSERT(ip->i_afp->if_u1.if_data != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
ip->i_afp->if_u1.if_data,
- data_bytes);
- ilf->ilf_asize = (unsigned)data_bytes;
+ ip->i_afp->if_bytes);
+ ilf->ilf_asize = (unsigned)ip->i_afp->if_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_ADATA;
@@ -359,6 +348,21 @@ xfs_copy_dm_fields_to_log_dinode(
}
}
+static inline void
+xfs_inode_to_log_dinode_iext_counters(
+ struct xfs_inode *ip,
+ struct xfs_log_dinode *to)
+{
+ if (xfs_inode_has_large_extent_counts(ip)) {
+ to->di_big_nextents = xfs_ifork_nextents(&ip->i_df);
+ to->di_big_anextents = xfs_ifork_nextents(ip->i_afp);
+ to->di_nrext64_pad = 0;
+ } else {
+ to->di_nextents = xfs_ifork_nextents(&ip->i_df);
+ to->di_anextents = xfs_ifork_nextents(ip->i_afp);
+ }
+}
+
static void
xfs_inode_to_log_dinode(
struct xfs_inode *ip,
@@ -374,7 +378,6 @@ xfs_inode_to_log_dinode(
to->di_projid_lo = ip->i_projid & 0xffff;
to->di_projid_hi = ip->i_projid >> 16;
- memset(to->di_pad, 0, sizeof(to->di_pad));
memset(to->di_pad3, 0, sizeof(to->di_pad3));
to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
@@ -386,8 +389,6 @@ xfs_inode_to_log_dinode(
to->di_size = ip->i_disk_size;
to->di_nblocks = ip->i_nblocks;
to->di_extsize = ip->i_extsize;
- to->di_nextents = xfs_ifork_nextents(&ip->i_df);
- to->di_anextents = xfs_ifork_nextents(ip->i_afp);
to->di_forkoff = ip->i_forkoff;
to->di_aformat = xfs_ifork_format(ip->i_afp);
to->di_flags = ip->i_diflags;
@@ -407,11 +408,14 @@ xfs_inode_to_log_dinode(
to->di_lsn = lsn;
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
- to->di_flushiter = 0;
+ to->di_v3_pad = 0;
} else {
to->di_version = 2;
to->di_flushiter = ip->i_flushiter;
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
+
+ xfs_inode_to_log_dinode_iext_counters(ip, to);
}
/*
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 239dd2e3384e..d28ffaebd067 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -142,6 +142,29 @@ xfs_log_dinode_to_disk_ts(
return ts;
}
+static inline bool xfs_log_dinode_has_large_extent_counts(
+ const struct xfs_log_dinode *ld)
+{
+ return ld->di_version >= 3 &&
+ (ld->di_flags2 & XFS_DIFLAG2_NREXT64);
+}
+
+static inline void
+xfs_log_dinode_to_disk_iext_counters(
+ struct xfs_log_dinode *from,
+ struct xfs_dinode *to)
+{
+ if (xfs_log_dinode_has_large_extent_counts(from)) {
+ to->di_big_nextents = cpu_to_be64(from->di_big_nextents);
+ to->di_big_anextents = cpu_to_be32(from->di_big_anextents);
+ to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad);
+ } else {
+ to->di_nextents = cpu_to_be32(from->di_nextents);
+ to->di_anextents = cpu_to_be16(from->di_anextents);
+ }
+
+}
+
STATIC void
xfs_log_dinode_to_disk(
struct xfs_log_dinode *from,
@@ -158,7 +181,6 @@ xfs_log_dinode_to_disk(
to->di_nlink = cpu_to_be32(from->di_nlink);
to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
- memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime);
to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime);
@@ -167,8 +189,6 @@ xfs_log_dinode_to_disk(
to->di_size = cpu_to_be64(from->di_size);
to->di_nblocks = cpu_to_be64(from->di_nblocks);
to->di_extsize = cpu_to_be32(from->di_extsize);
- to->di_nextents = cpu_to_be32(from->di_nextents);
- to->di_anextents = cpu_to_be16(from->di_anextents);
to->di_forkoff = from->di_forkoff;
to->di_aformat = from->di_aformat;
to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
@@ -184,12 +204,66 @@ xfs_log_dinode_to_disk(
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
to->di_lsn = cpu_to_be64(lsn);
- memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+ memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &from->di_uuid);
- to->di_flushiter = 0;
+ to->di_v3_pad = 0;
} else {
to->di_flushiter = cpu_to_be16(from->di_flushiter);
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
+
+ xfs_log_dinode_to_disk_iext_counters(from, to);
+}
+
+STATIC int
+xlog_dinode_verify_extent_counts(
+ struct xfs_mount *mp,
+ struct xfs_log_dinode *ldip)
+{
+ xfs_extnum_t nextents;
+ xfs_aextnum_t anextents;
+
+ if (xfs_log_dinode_has_large_extent_counts(ldip)) {
+ if (!xfs_has_large_extent_counts(mp) ||
+ (ldip->di_nrext64_pad != 0)) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode large extent count format",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, large extent counts %d, padding 0x%x",
+ ldip->di_ino, xfs_has_large_extent_counts(mp),
+ ldip->di_nrext64_pad);
+ return -EFSCORRUPTED;
+ }
+
+ nextents = ldip->di_big_nextents;
+ anextents = ldip->di_big_anextents;
+ } else {
+ if (ldip->di_version == 3 && ldip->di_v3_pad != 0) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode di_v3_pad",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, di_v3_pad 0x%llx",
+ ldip->di_ino, ldip->di_v3_pad);
+ return -EFSCORRUPTED;
+ }
+
+ nextents = ldip->di_nextents;
+ anextents = ldip->di_anextents;
+ }
+
+ if (unlikely(nextents + anextents > ldip->di_nblocks)) {
+ XFS_CORRUPTION_ERROR("Bad log dinode extent counts",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx",
+ ldip->di_ino, xfs_has_large_extent_counts(mp), nextents,
+ anextents, ldip->di_nblocks);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
}
STATIC int
@@ -317,13 +391,12 @@ xlog_recover_inode_commit_pass2(
if (unlikely(S_ISREG(ldip->di_mode))) {
if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
(ldip->di_format != XFS_DINODE_FMT_BTREE)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode data fork format for regular file",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad regular inode log record, rec ptr "PTR_FMT", "
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
- __func__, item, dip, bp, in_f->ilf_ino);
+ "Bad inode 0x%llx, data fork format 0x%x",
+ in_f->ilf_ino, ldip->di_format);
error = -EFSCORRUPTED;
goto out_release;
}
@@ -331,49 +404,37 @@ xlog_recover_inode_commit_pass2(
if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
(ldip->di_format != XFS_DINODE_FMT_BTREE) &&
(ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode data fork format for directory",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad dir inode log record, rec ptr "PTR_FMT", "
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
- __func__, item, dip, bp, in_f->ilf_ino);
+ "Bad inode 0x%llx, data fork format 0x%x",
+ in_f->ilf_ino, ldip->di_format);
error = -EFSCORRUPTED;
goto out_release;
}
}
- if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
- xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
- "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld",
- __func__, item, dip, bp, in_f->ilf_ino,
- ldip->di_nextents + ldip->di_anextents,
- ldip->di_nblocks);
- error = -EFSCORRUPTED;
+
+ error = xlog_dinode_verify_extent_counts(mp, ldip);
+ if (error)
goto out_release;
- }
+
if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR("Bad log dinode fork offset",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
- "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__,
- item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
+ "Bad inode 0x%llx, di_forkoff 0x%x",
+ in_f->ilf_ino, ldip->di_forkoff);
error = -EFSCORRUPTED;
goto out_release;
}
isize = xfs_log_dinode_size(mp);
if (unlikely(item->ri_buf[1].i_len > isize)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW,
+ mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad inode log record length %d, rec ptr "PTR_FMT,
- __func__, item->ri_buf[1].i_len, item);
+ "Bad inode 0x%llx log dinode size 0x%x",
+ in_f->ilf_ino, item->ri_buf[1].i_len);
error = -EFSCORRUPTED;
goto out_release;
}
@@ -401,7 +462,7 @@ xlog_recover_inode_commit_pass2(
ASSERT(in_f->ilf_size <= 4);
ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
ASSERT(!(fields & XFS_ILOG_DFORK) ||
- (len == in_f->ilf_dsize));
+ (len == xlog_calc_iovec_len(in_f->ilf_dsize)));
switch (fields & XFS_ILOG_DFORK) {
case XFS_ILOG_DDATA:
@@ -436,7 +497,7 @@ xlog_recover_inode_commit_pass2(
}
len = item->ri_buf[attr_index].i_len;
src = item->ri_buf[attr_index].i_addr;
- ASSERT(len == in_f->ilf_asize);
+ ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize));
switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
case XFS_ILOG_ADATA:
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 83481005317a..0d67ff8a8961 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -15,6 +15,8 @@
#include "xfs_iwalk.h"
#include "xfs_itable.h"
#include "xfs_error.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -35,8 +37,7 @@
#include "xfs_health.h"
#include "xfs_reflink.h"
#include "xfs_ioctl.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
+#include "xfs_xattr.h"
#include <linux/mount.h>
#include <linux/namei.h>
@@ -524,7 +525,7 @@ xfs_attrmulti_attr_set(
args.valuelen = len;
}
- error = xfs_attr_set(&args);
+ error = xfs_attr_change(&args);
if (!error && (flags & XFS_IOC_ATTR_ROOT))
xfs_forget_acl(inode, name);
kfree(args.value);
@@ -813,6 +814,9 @@ xfs_bulk_ireq_setup(
if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
return -ECANCELED;
+ if (hdr->flags & XFS_BULK_IREQ_NREXT64)
+ breq->flags |= XFS_IBULK_NREXT64;
+
return 0;
}
@@ -1092,7 +1096,8 @@ xfs_flags2diflags2(
{
uint64_t di_flags2 =
(ip->i_diflags2 & (XFS_DIFLAG2_REFLINK |
- XFS_DIFLAG2_BIGTIME));
+ XFS_DIFLAG2_BIGTIME |
+ XFS_DIFLAG2_NREXT64));
if (xflags & FS_XFLAG_DAX)
di_flags2 |= XFS_DIFLAG2_DAX;
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index ca25ed89b706..2f54b701eead 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -17,6 +17,8 @@
#include "xfs_itable.h"
#include "xfs_fsops.h"
#include "xfs_rtalloc.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_ioctl.h"
#include "xfs_ioctl32.h"
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index e552ce541ec2..5a393259a3a3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -251,6 +251,8 @@ xfs_iomap_write_direct(
return error;
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, nr_exts);
if (error)
goto out_trans_cancel;
@@ -402,7 +404,7 @@ xfs_iomap_prealloc_size(
*/
plen = prev.br_blockcount;
while (xfs_iext_prev_extent(ifp, &ncur, &got)) {
- if (plen > MAXEXTLEN / 2 ||
+ if (plen > XFS_MAX_BMBT_EXTLEN / 2 ||
isnullstartblock(got.br_startblock) ||
got.br_startoff + got.br_blockcount != prev.br_startoff ||
got.br_startblock + got.br_blockcount != prev.br_startblock)
@@ -414,23 +416,23 @@ xfs_iomap_prealloc_size(
/*
* If the size of the extents is greater than half the maximum extent
* length, then use the current offset as the basis. This ensures that
- * for large files the preallocation size always extends to MAXEXTLEN
- * rather than falling short due to things like stripe unit/width
- * alignment of real extents.
+ * for large files the preallocation size always extends to
+ * XFS_BMBT_MAX_EXTLEN rather than falling short due to things like stripe
+ * unit/width alignment of real extents.
*/
alloc_blocks = plen * 2;
- if (alloc_blocks > MAXEXTLEN)
+ if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
alloc_blocks = XFS_B_TO_FSB(mp, offset);
qblocks = alloc_blocks;
/*
- * MAXEXTLEN is not a power of two value but we round the prealloc down
- * to the nearest power of two value after throttling. To prevent the
- * round down from unconditionally reducing the maximum supported
- * prealloc size, we round up first, apply appropriate throttling,
- * round down and cap the value to MAXEXTLEN.
+ * XFS_BMBT_MAX_EXTLEN is not a power of two value but we round the prealloc
+ * down to the nearest power of two value after throttling. To prevent
+ * the round down from unconditionally reducing the maximum supported
+ * prealloc size, we round up first, apply appropriate throttling, round
+ * down and cap the value to XFS_BMBT_MAX_EXTLEN.
*/
- alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
+ alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN),
alloc_blocks);
freesp = percpu_counter_read_positive(&mp->m_fdblocks);
@@ -478,14 +480,14 @@ xfs_iomap_prealloc_size(
*/
if (alloc_blocks)
alloc_blocks = rounddown_pow_of_two(alloc_blocks);
- if (alloc_blocks > MAXEXTLEN)
- alloc_blocks = MAXEXTLEN;
+ if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
+ alloc_blocks = XFS_MAX_BMBT_EXTLEN;
/*
* If we are still trying to allocate more space than is
* available, squash the prealloc hard. This can happen if we
* have a large file on a small filesystem and the above
- * lowspace thresholds are smaller than MAXEXTLEN.
+ * lowspace thresholds are smaller than XFS_BMBT_MAX_EXTLEN.
*/
while (alloc_blocks && alloc_blocks >= freesp)
alloc_blocks >>= 4;
@@ -555,6 +557,9 @@ xfs_iomap_write_unwritten(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_WRITE_UNWRITTEN_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_WRITE_UNWRITTEN_CNT);
if (error)
goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index b34e8e4344a8..29f5b8b8aca6 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -13,6 +13,8 @@
#include "xfs_inode.h"
#include "xfs_acl.h"
#include "xfs_quota.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_trans.h"
#include "xfs_trace.h"
@@ -22,6 +24,7 @@
#include "xfs_iomap.h"
#include "xfs_error.h"
#include "xfs_ioctl.h"
+#include "xfs_xattr.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
@@ -59,7 +62,7 @@ xfs_initxattrs(
.value = xattr->value,
.valuelen = xattr->value_len,
};
- error = xfs_attr_set(&args);
+ error = xfs_attr_change(&args);
if (error < 0)
break;
}
@@ -209,7 +212,6 @@ xfs_generic_create(
if (unlikely(error))
goto out_cleanup_inode;
-#ifdef CONFIG_XFS_POSIX_ACL
if (default_acl) {
error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
if (error)
@@ -220,7 +222,6 @@ xfs_generic_create(
if (error)
goto out_cleanup_inode;
}
-#endif
xfs_setup_iops(ip);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c08c79d9e311..f74c9fff72bb 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -64,6 +64,7 @@ xfs_bulkstat_one_int(
struct xfs_inode *ip; /* incore inode pointer */
struct inode *inode;
struct xfs_bulkstat *buf = bc->buf;
+ xfs_extnum_t nextents;
int error = -EINVAL;
if (xfs_internal_inum(mp, ino))
@@ -102,7 +103,13 @@ xfs_bulkstat_one_int(
buf->bs_xflags = xfs_ip2xflags(ip);
buf->bs_extsize_blks = ip->i_extsize;
- buf->bs_extents = xfs_ifork_nextents(&ip->i_df);
+
+ nextents = xfs_ifork_nextents(&ip->i_df);
+ if (!(bc->breq->flags & XFS_IBULK_NREXT64))
+ buf->bs_extents = min(nextents, XFS_MAX_EXTCNT_DATA_FORK_SMALL);
+ else
+ buf->bs_extents64 = nextents;
+
xfs_bulkstat_health(ip, buf);
buf->bs_aextents = xfs_ifork_nextents(ip->i_afp);
buf->bs_forkoff = XFS_IFORK_BOFF(ip);
@@ -256,6 +263,7 @@ xfs_bulkstat(
.breq = breq,
};
struct xfs_trans *tp;
+ unsigned int iwalk_flags = 0;
int error;
if (breq->mnt_userns != &init_user_ns) {
@@ -279,7 +287,10 @@ xfs_bulkstat(
if (error)
goto out;
- error = xfs_iwalk(breq->mp, tp, breq->startino, breq->flags,
+ if (breq->flags & XFS_IBULK_SAME_AG)
+ iwalk_flags |= XFS_IWALK_SAME_AG;
+
+ error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags,
xfs_bulkstat_iwalk, breq->icount, &bc);
xfs_trans_cancel(tp);
out:
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 7078d10c9b12..e2d0eba43f35 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -17,7 +17,10 @@ struct xfs_ibulk {
};
/* Only iterate within the same AG as startino */
-#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG)
+#define XFS_IBULK_SAME_AG (1U << 0)
+
+/* Fill out the bs_extents64 field if set. */
+#define XFS_IBULK_NREXT64 (1U << 1)
/*
* Advance the user buffer pointer by one record of the given size. If the
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
index 37a795f03267..83699089755e 100644
--- a/fs/xfs/xfs_iwalk.h
+++ b/fs/xfs/xfs_iwalk.h
@@ -26,7 +26,7 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
unsigned int inode_records, bool poll, void *data);
/* Only iterate inodes within the same AG as @startino. */
-#define XFS_IWALK_SAME_AG (0x1)
+#define XFS_IWALK_SAME_AG (1U << 0)
#define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 499e15b24215..1e972f884a81 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -49,7 +49,6 @@ xlog_state_get_iclog_space(
int len,
struct xlog_in_core **iclog,
struct xlog_ticket *ticket,
- int *continued_write,
int *logoffsetp);
STATIC void
xlog_grant_push_ail(
@@ -61,10 +60,6 @@ xlog_sync(
struct xlog_in_core *iclog);
#if defined(DEBUG)
STATIC void
-xlog_verify_dest_ptr(
- struct xlog *log,
- void *ptr);
-STATIC void
xlog_verify_grant_tail(
struct xlog *log);
STATIC void
@@ -77,7 +72,6 @@ xlog_verify_tail_lsn(
struct xlog *log,
struct xlog_in_core *iclog);
#else
-#define xlog_verify_dest_ptr(a,b)
#define xlog_verify_grant_tail(a)
#define xlog_verify_iclog(a,b,c)
#define xlog_verify_tail_lsn(a,b)
@@ -90,6 +84,62 @@ xlog_iclogs_empty(
static int
xfs_log_cover(struct xfs_mount *);
+/*
+ * We need to make sure the buffer pointer returned is naturally aligned for the
+ * biggest basic data type we put into it. We have already accounted for this
+ * padding when sizing the buffer.
+ *
+ * However, this padding does not get written into the log, and hence we have to
+ * track the space used by the log vectors separately to prevent log space hangs
+ * due to inaccurate accounting (i.e. a leak) of the used log space through the
+ * CIL context ticket.
+ *
+ * We also add space for the xlog_op_header that describes this region in the
+ * log. This prepends the data region we return to the caller to copy their data
+ * into, so do all the static initialisation of the ophdr now. Because the ophdr
+ * is not 8 byte aligned, we have to be careful to ensure that we align the
+ * start of the buffer such that the region we return to the call is 8 byte
+ * aligned and packed against the tail of the ophdr.
+ */
+void *
+xlog_prepare_iovec(
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp,
+ uint type)
+{
+ struct xfs_log_iovec *vec = *vecp;
+ struct xlog_op_header *oph;
+ uint32_t len;
+ void *buf;
+
+ if (vec) {
+ ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
+ vec++;
+ } else {
+ vec = &lv->lv_iovecp[0];
+ }
+
+ len = lv->lv_buf_len + sizeof(struct xlog_op_header);
+ if (!IS_ALIGNED(len, sizeof(uint64_t))) {
+ lv->lv_buf_len = round_up(len, sizeof(uint64_t)) -
+ sizeof(struct xlog_op_header);
+ }
+
+ vec->i_type = type;
+ vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+
+ oph = vec->i_addr;
+ oph->oh_clientid = XFS_TRANSACTION;
+ oph->oh_res2 = 0;
+ oph->oh_flags = 0;
+
+ buf = vec->i_addr + sizeof(struct xlog_op_header);
+ ASSERT(IS_ALIGNED((unsigned long)buf, sizeof(uint64_t)));
+
+ *vecp = vec;
+ return buf;
+}
+
static void
xlog_grant_sub_space(
struct xlog *log,
@@ -322,30 +372,6 @@ xlog_grant_head_check(
return error;
}
-static void
-xlog_tic_reset_res(xlog_ticket_t *tic)
-{
- tic->t_res_num = 0;
- tic->t_res_arr_sum = 0;
- tic->t_res_num_ophdrs = 0;
-}
-
-static void
-xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
-{
- if (tic->t_res_num == XLOG_TIC_LEN_MAX) {
- /* add to overflow and start again */
- tic->t_res_o_flow += tic->t_res_arr_sum;
- tic->t_res_num = 0;
- tic->t_res_arr_sum = 0;
- }
-
- tic->t_res_arr[tic->t_res_num].r_len = len;
- tic->t_res_arr[tic->t_res_num].r_type = type;
- tic->t_res_arr_sum += len;
- tic->t_res_num++;
-}
-
bool
xfs_log_writable(
struct xfs_mount *mp)
@@ -395,8 +421,6 @@ xfs_log_regrant(
xlog_grant_push_ail(log, tic->t_unit_res);
tic->t_curr_res = tic->t_unit_res;
- xlog_tic_reset_res(tic);
-
if (tic->t_cnt > 0)
return 0;
@@ -434,10 +458,9 @@ out_error:
int
xfs_log_reserve(
struct xfs_mount *mp,
- int unit_bytes,
- int cnt,
+ int unit_bytes,
+ int cnt,
struct xlog_ticket **ticp,
- uint8_t client,
bool permanent)
{
struct xlog *log = mp->m_log;
@@ -445,15 +468,13 @@ xfs_log_reserve(
int need_bytes;
int error = 0;
- ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
-
if (xlog_is_shutdown(log))
return -EIO;
XFS_STATS_INC(mp, xs_try_logspace);
ASSERT(*ticp == NULL);
- tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent);
+ tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent);
*ticp = tic;
xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -901,12 +922,22 @@ xlog_write_unmount_record(
struct xlog *log,
struct xlog_ticket *ticket)
{
- struct xfs_unmount_log_format ulf = {
- .magic = XLOG_UNMOUNT_TYPE,
+ struct {
+ struct xlog_op_header ophdr;
+ struct xfs_unmount_log_format ulf;
+ } unmount_rec = {
+ .ophdr = {
+ .oh_clientid = XFS_LOG,
+ .oh_tid = cpu_to_be32(ticket->t_tid),
+ .oh_flags = XLOG_UNMOUNT_TRANS,
+ },
+ .ulf = {
+ .magic = XLOG_UNMOUNT_TYPE,
+ },
};
struct xfs_log_iovec reg = {
- .i_addr = &ulf,
- .i_len = sizeof(ulf),
+ .i_addr = &unmount_rec,
+ .i_len = sizeof(unmount_rec),
.i_type = XLOG_REG_TYPE_UNMOUNT,
};
struct xfs_log_vec vec = {
@@ -914,10 +945,14 @@ xlog_write_unmount_record(
.lv_iovecp = &reg,
};
+ BUILD_BUG_ON((sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_unmount_log_format)) !=
+ sizeof(unmount_rec));
+
/* account for space used by record data */
- ticket->t_curr_res -= sizeof(ulf);
+ ticket->t_curr_res -= sizeof(unmount_rec);
- return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS);
+ return xlog_write(log, NULL, &vec, ticket, reg.i_len);
}
/*
@@ -933,7 +968,7 @@ xlog_unmount_write(
struct xlog_ticket *tic = NULL;
int error;
- error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
+ error = xfs_log_reserve(mp, 600, 1, &tic, 0);
if (error)
goto out_err;
@@ -1584,9 +1619,6 @@ xlog_alloc_log(
GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!iclog->ic_data)
goto out_free_iclog;
-#ifdef DEBUG
- log->l_iclog_bak[i] = &iclog->ic_header;
-#endif
head = &iclog->ic_header;
memset(head, 0, sizeof(xlog_rec_header_t));
head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
@@ -1602,7 +1634,7 @@ xlog_alloc_log(
iclog->ic_log = log;
atomic_set(&iclog->ic_refcnt, 0);
INIT_LIST_HEAD(&iclog->ic_callbacks);
- iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
+ iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize;
init_waitqueue_head(&iclog->ic_force_wait);
init_waitqueue_head(&iclog->ic_write_wait);
@@ -2111,63 +2143,11 @@ xlog_print_tic_res(
struct xfs_mount *mp,
struct xlog_ticket *ticket)
{
- uint i;
- uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
-
- /* match with XLOG_REG_TYPE_* in xfs_log.h */
-#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str
- static char *res_type_str[] = {
- REG_TYPE_STR(BFORMAT, "bformat"),
- REG_TYPE_STR(BCHUNK, "bchunk"),
- REG_TYPE_STR(EFI_FORMAT, "efi_format"),
- REG_TYPE_STR(EFD_FORMAT, "efd_format"),
- REG_TYPE_STR(IFORMAT, "iformat"),
- REG_TYPE_STR(ICORE, "icore"),
- REG_TYPE_STR(IEXT, "iext"),
- REG_TYPE_STR(IBROOT, "ibroot"),
- REG_TYPE_STR(ILOCAL, "ilocal"),
- REG_TYPE_STR(IATTR_EXT, "iattr_ext"),
- REG_TYPE_STR(IATTR_BROOT, "iattr_broot"),
- REG_TYPE_STR(IATTR_LOCAL, "iattr_local"),
- REG_TYPE_STR(QFORMAT, "qformat"),
- REG_TYPE_STR(DQUOT, "dquot"),
- REG_TYPE_STR(QUOTAOFF, "quotaoff"),
- REG_TYPE_STR(LRHEADER, "LR header"),
- REG_TYPE_STR(UNMOUNT, "unmount"),
- REG_TYPE_STR(COMMIT, "commit"),
- REG_TYPE_STR(TRANSHDR, "trans header"),
- REG_TYPE_STR(ICREATE, "inode create"),
- REG_TYPE_STR(RUI_FORMAT, "rui_format"),
- REG_TYPE_STR(RUD_FORMAT, "rud_format"),
- REG_TYPE_STR(CUI_FORMAT, "cui_format"),
- REG_TYPE_STR(CUD_FORMAT, "cud_format"),
- REG_TYPE_STR(BUI_FORMAT, "bui_format"),
- REG_TYPE_STR(BUD_FORMAT, "bud_format"),
- };
- BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
-#undef REG_TYPE_STR
-
xfs_warn(mp, "ticket reservation summary:");
- xfs_warn(mp, " unit res = %d bytes",
- ticket->t_unit_res);
- xfs_warn(mp, " current res = %d bytes",
- ticket->t_curr_res);
- xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)",
- ticket->t_res_arr_sum, ticket->t_res_o_flow);
- xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)",
- ticket->t_res_num_ophdrs, ophdr_spc);
- xfs_warn(mp, " ophdr + reg = %u bytes",
- ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc);
- xfs_warn(mp, " num regions = %u",
- ticket->t_res_num);
-
- for (i = 0; i < ticket->t_res_num; i++) {
- uint r_type = ticket->t_res_arr[i].r_type;
- xfs_warn(mp, "region[%u]: %s - %u bytes", i,
- ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
- "bad-rtype" : res_type_str[r_type]),
- ticket->t_res_arr[i].r_len);
- }
+ xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res);
+ xfs_warn(mp, " current res = %d bytes", ticket->t_curr_res);
+ xfs_warn(mp, " original count = %d", ticket->t_ocnt);
+ xfs_warn(mp, " remaining count = %d", ticket->t_cnt);
}
/*
@@ -2220,187 +2200,226 @@ xlog_print_trans(
}
}
+static inline void
+xlog_write_iovec(
+ struct xlog_in_core *iclog,
+ uint32_t *log_offset,
+ void *data,
+ uint32_t write_len,
+ int *bytes_left,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
+{
+ ASSERT(*log_offset < iclog->ic_log->l_iclog_size);
+ ASSERT(*log_offset % sizeof(int32_t) == 0);
+ ASSERT(write_len % sizeof(int32_t) == 0);
+
+ memcpy(iclog->ic_datap + *log_offset, data, write_len);
+ *log_offset += write_len;
+ *bytes_left -= write_len;
+ (*record_cnt)++;
+ *data_cnt += write_len;
+}
+
/*
- * Calculate the potential space needed by the log vector. We may need a start
- * record, and each region gets its own struct xlog_op_header and may need to be
- * double word aligned.
+ * Write log vectors into a single iclog which is guaranteed by the caller
+ * to have enough space to write the entire log vector into.
*/
-static int
-xlog_write_calc_vec_length(
+static void
+xlog_write_full(
+ struct xfs_log_vec *lv,
struct xlog_ticket *ticket,
- struct xfs_log_vec *log_vector,
- uint optype)
+ struct xlog_in_core *iclog,
+ uint32_t *log_offset,
+ uint32_t *len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
{
- struct xfs_log_vec *lv;
- int headers = 0;
- int len = 0;
- int i;
+ int index;
- if (optype & XLOG_START_TRANS)
- headers++;
+ ASSERT(*log_offset + *len <= iclog->ic_size ||
+ iclog->ic_state == XLOG_STATE_WANT_SYNC);
- for (lv = log_vector; lv; lv = lv->lv_next) {
- /* we don't write ordered log vectors */
- if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
- continue;
-
- headers += lv->lv_niovecs;
-
- for (i = 0; i < lv->lv_niovecs; i++) {
- struct xfs_log_iovec *vecp = &lv->lv_iovecp[i];
+ /*
+ * Ordered log vectors have no regions to write so this
+ * loop will naturally skip them.
+ */
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
+ struct xlog_op_header *ophdr = reg->i_addr;
- len += vecp->i_len;
- xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
- }
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ xlog_write_iovec(iclog, log_offset, reg->i_addr,
+ reg->i_len, len, record_cnt, data_cnt);
}
-
- ticket->t_res_num_ophdrs += headers;
- len += headers * sizeof(struct xlog_op_header);
-
- return len;
-}
-
-static void
-xlog_write_start_rec(
- struct xlog_op_header *ophdr,
- struct xlog_ticket *ticket)
-{
- ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
- ophdr->oh_clientid = ticket->t_clientid;
- ophdr->oh_len = 0;
- ophdr->oh_flags = XLOG_START_TRANS;
- ophdr->oh_res2 = 0;
}
-static xlog_op_header_t *
-xlog_write_setup_ophdr(
- struct xlog *log,
- struct xlog_op_header *ophdr,
+static int
+xlog_write_get_more_iclog_space(
struct xlog_ticket *ticket,
- uint flags)
+ struct xlog_in_core **iclogp,
+ uint32_t *log_offset,
+ uint32_t len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
{
- ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
- ophdr->oh_clientid = ticket->t_clientid;
- ophdr->oh_res2 = 0;
-
- /* are we copying a commit or unmount record? */
- ophdr->oh_flags = flags;
+ struct xlog_in_core *iclog = *iclogp;
+ struct xlog *log = iclog->ic_log;
+ int error;
- /*
- * We've seen logs corrupted with bad transaction client ids. This
- * makes sure that XFS doesn't generate them on. Turn this into an EIO
- * and shut down the filesystem.
- */
- switch (ophdr->oh_clientid) {
- case XFS_TRANSACTION:
- case XFS_VOLUME:
- case XFS_LOG:
- break;
- default:
- xfs_warn(log->l_mp,
- "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT,
- ophdr->oh_clientid, ticket);
- return NULL;
- }
+ spin_lock(&log->l_icloglock);
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC);
+ xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+ error = xlog_state_release_iclog(log, iclog);
+ spin_unlock(&log->l_icloglock);
+ if (error)
+ return error;
- return ophdr;
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ log_offset);
+ if (error)
+ return error;
+ *record_cnt = 0;
+ *data_cnt = 0;
+ *iclogp = iclog;
+ return 0;
}
/*
- * Set up the parameters of the region copy into the log. This has
- * to handle region write split across multiple log buffers - this
- * state is kept external to this function so that this code can
- * be written in an obvious, self documenting manner.
+ * Write log vectors into a single iclog which is smaller than the current chain
+ * length. We write until we cannot fit a full record into the remaining space
+ * and then stop. We return the log vector that is to be written that cannot
+ * wholly fit in the iclog.
*/
static int
-xlog_write_setup_copy(
+xlog_write_partial(
+ struct xfs_log_vec *lv,
struct xlog_ticket *ticket,
- struct xlog_op_header *ophdr,
- int space_available,
- int space_required,
- int *copy_off,
- int *copy_len,
- int *last_was_partial_copy,
- int *bytes_consumed)
-{
- int still_to_copy;
-
- still_to_copy = space_required - *bytes_consumed;
- *copy_off = *bytes_consumed;
-
- if (still_to_copy <= space_available) {
- /* write of region completes here */
- *copy_len = still_to_copy;
- ophdr->oh_len = cpu_to_be32(*copy_len);
- if (*last_was_partial_copy)
- ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
- *last_was_partial_copy = 0;
- *bytes_consumed = 0;
- return 0;
- }
+ struct xlog_in_core **iclogp,
+ uint32_t *log_offset,
+ uint32_t *len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
+{
+ struct xlog_in_core *iclog = *iclogp;
+ struct xlog_op_header *ophdr;
+ int index = 0;
+ uint32_t rlen;
+ int error;
+
+ /* walk the logvec, copying until we run out of space in the iclog */
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
+ uint32_t reg_offset = 0;
+
+ /*
+ * The first region of a continuation must have a non-zero
+ * length otherwise log recovery will just skip over it and
+ * start recovering from the next opheader it finds. Because we
+ * mark the next opheader as a continuation, recovery will then
+ * incorrectly add the continuation to the previous region and
+ * that breaks stuff.
+ *
+ * Hence if there isn't space for region data after the
+ * opheader, then we need to start afresh with a new iclog.
+ */
+ if (iclog->ic_size - *log_offset <=
+ sizeof(struct xlog_op_header)) {
+ error = xlog_write_get_more_iclog_space(ticket,
+ &iclog, log_offset, *len, record_cnt,
+ data_cnt);
+ if (error)
+ return error;
+ }
- /* partial write of region, needs extra log op header reservation */
- *copy_len = space_available;
- ophdr->oh_len = cpu_to_be32(*copy_len);
- ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
- if (*last_was_partial_copy)
- ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
- *bytes_consumed += *copy_len;
- (*last_was_partial_copy)++;
+ ophdr = reg->i_addr;
+ rlen = min_t(uint32_t, reg->i_len, iclog->ic_size - *log_offset);
- /* account for new log op header */
- ticket->t_curr_res -= sizeof(struct xlog_op_header);
- ticket->t_res_num_ophdrs++;
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_len = cpu_to_be32(rlen - sizeof(struct xlog_op_header));
+ if (rlen != reg->i_len)
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
- return sizeof(struct xlog_op_header);
-}
+ xlog_write_iovec(iclog, log_offset, reg->i_addr,
+ rlen, len, record_cnt, data_cnt);
-static int
-xlog_write_copy_finish(
- struct xlog *log,
- struct xlog_in_core *iclog,
- uint flags,
- int *record_cnt,
- int *data_cnt,
- int *partial_copy,
- int *partial_copy_len,
- int log_offset)
-{
- int error;
+ /* If we wrote the whole region, move to the next. */
+ if (rlen == reg->i_len)
+ continue;
- if (*partial_copy) {
/*
- * This iclog has already been marked WANT_SYNC by
- * xlog_state_get_iclog_space.
+ * We now have a partially written iovec, but it can span
+ * multiple iclogs so we loop here. First we release the iclog
+ * we currently have, then we get a new iclog and add a new
+ * opheader. Then we continue copying from where we were until
+ * we either complete the iovec or fill the iclog. If we
+ * complete the iovec, then we increment the index and go right
+ * back to the top of the outer loop. if we fill the iclog, we
+ * run the inner loop again.
+ *
+ * This is complicated by the tail of a region using all the
+ * space in an iclog and hence requiring us to release the iclog
+ * and get a new one before returning to the outer loop. We must
+ * always guarantee that we exit this inner loop with at least
+ * space for log transaction opheaders left in the current
+ * iclog, hence we cannot just terminate the loop at the end
+ * of the of the continuation. So we loop while there is no
+ * space left in the current iclog, and check for the end of the
+ * continuation after getting a new iclog.
*/
- spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
- *record_cnt = 0;
- *data_cnt = 0;
- goto release_iclog;
- }
+ do {
+ /*
+ * Ensure we include the continuation opheader in the
+ * space we need in the new iclog by adding that size
+ * to the length we require. This continuation opheader
+ * needs to be accounted to the ticket as the space it
+ * consumes hasn't been accounted to the lv we are
+ * writing.
+ */
+ error = xlog_write_get_more_iclog_space(ticket,
+ &iclog, log_offset,
+ *len + sizeof(struct xlog_op_header),
+ record_cnt, data_cnt);
+ if (error)
+ return error;
- *partial_copy = 0;
- *partial_copy_len = 0;
+ ophdr = iclog->ic_datap + *log_offset;
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_clientid = XFS_TRANSACTION;
+ ophdr->oh_res2 = 0;
+ ophdr->oh_flags = XLOG_WAS_CONT_TRANS;
- if (iclog->ic_size - log_offset > sizeof(xlog_op_header_t))
- return 0;
+ ticket->t_curr_res -= sizeof(struct xlog_op_header);
+ *log_offset += sizeof(struct xlog_op_header);
+ *data_cnt += sizeof(struct xlog_op_header);
- /* no more space in this iclog - push it. */
- spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
- *record_cnt = 0;
- *data_cnt = 0;
+ /*
+ * If rlen fits in the iclog, then end the region
+ * continuation. Otherwise we're going around again.
+ */
+ reg_offset += rlen;
+ rlen = reg->i_len - reg_offset;
+ if (rlen <= iclog->ic_size - *log_offset)
+ ophdr->oh_flags |= XLOG_END_TRANS;
+ else
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
- if (iclog->ic_state == XLOG_STATE_ACTIVE)
- xlog_state_switch_iclogs(log, iclog, 0);
- else
- ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- xlog_is_shutdown(log));
-release_iclog:
- error = xlog_state_release_iclog(log, iclog);
- spin_unlock(&log->l_icloglock);
- return error;
+ rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset);
+ ophdr->oh_len = cpu_to_be32(rlen);
+
+ xlog_write_iovec(iclog, log_offset,
+ reg->i_addr + reg_offset,
+ rlen, len, record_cnt, data_cnt);
+
+ } while (ophdr->oh_flags & XLOG_CONTINUE_TRANS);
+ }
+
+ /*
+ * No more iovecs remain in this logvec so return the next log vec to
+ * the caller so it can go back to fast path copying.
+ */
+ *iclogp = iclog;
+ return 0;
}
/*
@@ -2449,27 +2468,16 @@ xlog_write(
struct xfs_cil_ctx *ctx,
struct xfs_log_vec *log_vector,
struct xlog_ticket *ticket,
- uint optype)
+ uint32_t len)
+
{
struct xlog_in_core *iclog = NULL;
struct xfs_log_vec *lv = log_vector;
- struct xfs_log_iovec *vecp = lv->lv_iovecp;
- int index = 0;
- int len;
- int partial_copy = 0;
- int partial_copy_len = 0;
- int contwr = 0;
- int record_cnt = 0;
- int data_cnt = 0;
+ uint32_t record_cnt = 0;
+ uint32_t data_cnt = 0;
int error = 0;
+ int log_offset;
- /*
- * If this is a commit or unmount transaction, we don't need a start
- * record to be written. We do, however, have to account for the
- * commit or unmount header that gets written. Hence we always have
- * to account for an extra xlog_op_header here.
- */
- ticket->t_curr_res -= sizeof(struct xlog_op_header);
if (ticket->t_curr_res < 0) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"ctx ticket reservation ran out. Need to up reservation");
@@ -2477,144 +2485,54 @@ xlog_write(
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
- len = xlog_write_calc_vec_length(ticket, log_vector, optype);
- while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
- void *ptr;
- int log_offset;
-
- error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
- &contwr, &log_offset);
- if (error)
- return error;
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ &log_offset);
+ if (error)
+ return error;
- ASSERT(log_offset <= iclog->ic_size - 1);
- ptr = iclog->ic_datap + log_offset;
+ ASSERT(log_offset <= iclog->ic_size - 1);
- /*
- * If we have a context pointer, pass it the first iclog we are
- * writing to so it can record state needed for iclog write
- * ordering.
- */
- if (ctx) {
- xlog_cil_set_ctx_write_state(ctx, iclog);
- ctx = NULL;
- }
+ /*
+ * If we have a context pointer, pass it the first iclog we are
+ * writing to so it can record state needed for iclog write
+ * ordering.
+ */
+ if (ctx)
+ xlog_cil_set_ctx_write_state(ctx, iclog);
+ while (lv) {
/*
- * This loop writes out as many regions as can fit in the amount
- * of space which was allocated by xlog_state_get_iclog_space().
+ * If the entire log vec does not fit in the iclog, punt it to
+ * the partial copy loop which can handle this case.
*/
- while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
- struct xfs_log_iovec *reg;
- struct xlog_op_header *ophdr;
- int copy_len;
- int copy_off;
- bool ordered = false;
- bool wrote_start_rec = false;
-
- /* ordered log vectors have no regions to write */
- if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
- ASSERT(lv->lv_niovecs == 0);
- ordered = true;
- goto next_lv;
- }
-
- reg = &vecp[index];
- ASSERT(reg->i_len % sizeof(int32_t) == 0);
- ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
-
- /*
- * Before we start formatting log vectors, we need to
- * write a start record. Only do this for the first
- * iclog we write to.
- */
- if (optype & XLOG_START_TRANS) {
- xlog_write_start_rec(ptr, ticket);
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- sizeof(struct xlog_op_header));
- optype &= ~XLOG_START_TRANS;
- wrote_start_rec = true;
- }
-
- ophdr = xlog_write_setup_ophdr(log, ptr, ticket, optype);
- if (!ophdr)
- return -EIO;
-
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- sizeof(struct xlog_op_header));
-
- len += xlog_write_setup_copy(ticket, ophdr,
- iclog->ic_size-log_offset,
- reg->i_len,
- &copy_off, &copy_len,
- &partial_copy,
- &partial_copy_len);
- xlog_verify_dest_ptr(log, ptr);
-
- /*
- * Copy region.
- *
- * Unmount records just log an opheader, so can have
- * empty payloads with no data region to copy. Hence we
- * only copy the payload if the vector says it has data
- * to copy.
- */
- ASSERT(copy_len >= 0);
- if (copy_len > 0) {
- memcpy(ptr, reg->i_addr + copy_off, copy_len);
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- copy_len);
- }
- copy_len += sizeof(struct xlog_op_header);
- record_cnt++;
- if (wrote_start_rec) {
- copy_len += sizeof(struct xlog_op_header);
- record_cnt++;
- }
- data_cnt += contwr ? copy_len : 0;
-
- error = xlog_write_copy_finish(log, iclog, optype,
- &record_cnt, &data_cnt,
- &partial_copy,
- &partial_copy_len,
- log_offset);
- if (error)
+ if (lv->lv_niovecs &&
+ lv->lv_bytes > iclog->ic_size - log_offset) {
+ error = xlog_write_partial(lv, ticket, &iclog,
+ &log_offset, &len, &record_cnt,
+ &data_cnt);
+ if (error) {
+ /*
+ * We have no iclog to release, so just return
+ * the error immediately.
+ */
return error;
-
- /*
- * if we had a partial copy, we need to get more iclog
- * space but we don't want to increment the region
- * index because there is still more is this region to
- * write.
- *
- * If we completed writing this region, and we flushed
- * the iclog (indicated by resetting of the record
- * count), then we also need to get more log space. If
- * this was the last record, though, we are done and
- * can just return.
- */
- if (partial_copy)
- break;
-
- if (++index == lv->lv_niovecs) {
-next_lv:
- lv = lv->lv_next;
- index = 0;
- if (lv)
- vecp = lv->lv_iovecp;
- }
- if (record_cnt == 0 && !ordered) {
- if (!lv)
- return 0;
- break;
}
+ } else {
+ xlog_write_full(lv, ticket, iclog, &log_offset,
+ &len, &record_cnt, &data_cnt);
}
+ lv = lv->lv_next;
}
-
ASSERT(len == 0);
+ /*
+ * We've already been guaranteed that the last writes will fit inside
+ * the current iclog, and hence it will already have the space used by
+ * those writes accounted to it. Hence we do not need to update the
+ * iclog with the number of bytes written here.
+ */
spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
+ xlog_state_finish_copy(log, iclog, record_cnt, 0);
error = xlog_state_release_iclog(log, iclog);
spin_unlock(&log->l_icloglock);
@@ -2971,7 +2889,6 @@ xlog_state_get_iclog_space(
int len,
struct xlog_in_core **iclogp,
struct xlog_ticket *ticket,
- int *continued_write,
int *logoffsetp)
{
int log_offset;
@@ -3008,9 +2925,6 @@ restart:
*/
if (log_offset == 0) {
ticket->t_curr_res -= log->l_iclog_hsize;
- xlog_tic_add_region(ticket,
- log->l_iclog_hsize,
- XLOG_REG_TYPE_LRHEADER);
head->h_cycle = cpu_to_be32(log->l_curr_cycle);
head->h_lsn = cpu_to_be64(
xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
@@ -3052,13 +2966,10 @@ restart:
* iclogs (to mark it taken), this particular iclog will release/sync
* to disk in xlog_write().
*/
- if (len <= iclog->ic_size - iclog->ic_offset) {
- *continued_write = 0;
+ if (len <= iclog->ic_size - iclog->ic_offset)
iclog->ic_offset += len;
- } else {
- *continued_write = 1;
+ else
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
- }
*iclogp = iclog;
ASSERT(iclog->ic_offset <= iclog->ic_size);
@@ -3090,7 +3001,6 @@ xfs_log_ticket_regrant(
xlog_grant_sub_space(log, &log->l_write_head.grant,
ticket->t_curr_res);
ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
trace_xfs_log_ticket_regrant_sub(log, ticket);
@@ -3101,7 +3011,6 @@ xfs_log_ticket_regrant(
trace_xfs_log_ticket_regrant_exit(log, ticket);
ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
}
xfs_log_ticket_put(ticket);
@@ -3591,7 +3500,6 @@ xlog_ticket_alloc(
struct xlog *log,
int unit_bytes,
int cnt,
- char client,
bool permanent)
{
struct xlog_ticket *tic;
@@ -3609,40 +3517,14 @@ xlog_ticket_alloc(
tic->t_cnt = cnt;
tic->t_ocnt = cnt;
tic->t_tid = prandom_u32();
- tic->t_clientid = client;
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
- xlog_tic_reset_res(tic);
-
return tic;
}
#if defined(DEBUG)
/*
- * Make sure that the destination ptr is within the valid data region of
- * one of the iclogs. This uses backup pointers stored in a different
- * part of the log in case we trash the log structure.
- */
-STATIC void
-xlog_verify_dest_ptr(
- struct xlog *log,
- void *ptr)
-{
- int i;
- int good_ptr = 0;
-
- for (i = 0; i < log->l_iclog_bufs; i++) {
- if (ptr >= log->l_iclog_bak[i] &&
- ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
- good_ptr++;
- }
-
- if (!good_ptr)
- xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
-}
-
-/*
* Check to make sure the grant write head didn't just over lap the tail. If
* the cycles are the same, we can't be overlapping. Otherwise, make sure that
* the cycles differ by exactly one and check the byte count.
@@ -3769,7 +3651,7 @@ xlog_verify_iclog(
if (field_offset & 0x1ff) {
clientid = ophead->oh_clientid;
} else {
- idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
+ idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3780,11 +3662,12 @@ xlog_verify_iclog(
iclog->ic_header.h_cycle_data[idx]);
}
}
- if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
+ if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) {
xfs_warn(log->l_mp,
- "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx",
- __func__, clientid, ophead,
+ "%s: op %d invalid clientid %d op "PTR_FMT" offset 0x%lx",
+ __func__, i, clientid, ophead,
(unsigned long)field_offset);
+ }
/* check length */
p = &ophead->oh_len;
@@ -3792,8 +3675,7 @@ xlog_verify_iclog(
if (field_offset & 0x1ff) {
op_len = be32_to_cpu(ophead->oh_len);
} else {
- idx = BTOBBT((uintptr_t)&ophead->oh_len -
- (uintptr_t)iclog->ic_datap);
+ idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3829,7 +3711,7 @@ xlog_verify_iclog(
bool
xlog_force_shutdown(
struct xlog *log,
- int shutdown_flags)
+ uint32_t shutdown_flags)
{
bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index dc1b77b92fc1..f3ce046a7d45 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -21,46 +21,59 @@ struct xfs_log_vec {
#define XFS_LOG_VEC_ORDERED (-1)
-static inline void *
-xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
- uint type)
+/*
+ * Calculate the log iovec length for a given user buffer length. Intended to be
+ * used by ->iop_size implementations when sizing buffers of arbitrary
+ * alignments.
+ */
+static inline int
+xlog_calc_iovec_len(int len)
{
- struct xfs_log_iovec *vec = *vecp;
+ return roundup(len, sizeof(uint32_t));
+}
+
+void *xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+ uint type);
- if (vec) {
- ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
- vec++;
- } else {
- vec = &lv->lv_iovecp[0];
+static inline void
+xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec,
+ int data_len)
+{
+ struct xlog_op_header *oph = vec->i_addr;
+ int len;
+
+ /*
+ * Always round up the length to the correct alignment so callers don't
+ * need to know anything about this log vec layout requirement. This
+ * means we have to zero the area the data to be written does not cover.
+ * This is complicated by fact the payload region is offset into the
+ * logvec region by the opheader that tracks the payload.
+ */
+ len = xlog_calc_iovec_len(data_len);
+ if (len - data_len != 0) {
+ char *buf = vec->i_addr + sizeof(struct xlog_op_header);
+
+ memset(buf + data_len, 0, len - data_len);
}
- vec->i_type = type;
- vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+ /*
+ * The opheader tracks aligned payload length, whilst the logvec tracks
+ * the overall region length.
+ */
+ oph->oh_len = cpu_to_be32(len);
- ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t)));
+ len += sizeof(struct xlog_op_header);
+ lv->lv_buf_len += len;
+ lv->lv_bytes += len;
+ vec->i_len = len;
- *vecp = vec;
- return vec->i_addr;
+ /* Catch buffer overruns */
+ ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size);
}
/*
- * We need to make sure the next buffer is naturally aligned for the biggest
- * basic data type we put into it. We already accounted for this padding when
- * sizing the buffer.
- *
- * However, this padding does not get written into the log, and hence we have to
- * track the space used by the log vectors separately to prevent log space hangs
- * due to inaccurate accounting (i.e. a leak) of the used log space through the
- * CIL context ticket.
+ * Copy the amount of data requested by the caller into a new log iovec.
*/
-static inline void
-xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
-{
- lv->lv_buf_len += round_up(len, sizeof(uint64_t));
- lv->lv_bytes += len;
- vec->i_len = len;
-}
-
static inline void *
xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
uint type, void *data, int len)
@@ -73,6 +86,13 @@ xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
return buf;
}
+static inline void *
+xlog_copy_from_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+ const struct xfs_log_iovec *src)
+{
+ return xlog_copy_iovec(lv, vecp, src->i_type, src->i_addr, src->i_len);
+}
+
/*
* By comparing each component, we don't have to worry about extra
* endian issues in treating two 32 bit numbers as one 64 bit number
@@ -117,15 +137,11 @@ int xfs_log_mount_finish(struct xfs_mount *mp);
void xfs_log_mount_cancel(struct xfs_mount *);
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
-void xfs_log_space_wake(struct xfs_mount *mp);
-int xfs_log_reserve(struct xfs_mount *mp,
- int length,
- int count,
- struct xlog_ticket **ticket,
- uint8_t clientid,
- bool permanent);
-int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
-void xfs_log_unmount(struct xfs_mount *mp);
+void xfs_log_space_wake(struct xfs_mount *mp);
+int xfs_log_reserve(struct xfs_mount *mp, int length, int count,
+ struct xlog_ticket **ticket, bool permanent);
+int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
+void xfs_log_unmount(struct xfs_mount *mp);
bool xfs_log_writable(struct xfs_mount *mp);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
@@ -140,9 +156,10 @@ void xfs_log_clean(struct xfs_mount *mp);
bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
-bool xlog_force_shutdown(struct xlog *log, int shutdown_flags);
+bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags);
void xlog_use_incompat_feat(struct xlog *log);
void xlog_drop_incompat_feat(struct xlog *log);
+int xfs_attr_use_log_assist(struct xfs_mount *mp);
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index c9f55e4f0957..db6cb7800251 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -37,7 +37,7 @@ xlog_cil_ticket_alloc(
{
struct xlog_ticket *tic;
- tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0);
+ tic = xlog_ticket_alloc(log, 0, 1, 0);
/*
* set the current reservation to zero so we know to steal the basic
@@ -48,6 +48,38 @@ xlog_cil_ticket_alloc(
}
/*
+ * Check if the current log item was first committed in this sequence.
+ * We can't rely on just the log item being in the CIL, we have to check
+ * the recorded commit sequence number.
+ *
+ * Note: for this to be used in a non-racy manner, it has to be called with
+ * CIL flushing locked out. As a result, it should only be used during the
+ * transaction commit process when deciding what to format into the item.
+ */
+static bool
+xlog_item_in_current_chkpt(
+ struct xfs_cil *cil,
+ struct xfs_log_item *lip)
+{
+ if (list_empty(&lip->li_cil))
+ return false;
+
+ /*
+ * li_seq is written on the first commit of a log item to record the
+ * first checkpoint it is written to. Hence if it is different to the
+ * current sequence, we're in a new checkpoint.
+ */
+ return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
+}
+
+bool
+xfs_log_item_in_current_chkpt(
+ struct xfs_log_item *lip)
+{
+ return xlog_item_in_current_chkpt(lip->li_log->l_cilp, lip);
+}
+
+/*
* Unavoidable forward declaration - xlog_cil_push_work() calls
* xlog_cil_ctx_alloc() itself.
*/
@@ -103,39 +135,6 @@ xlog_cil_iovec_space(
}
/*
- * shadow buffers can be large, so we need to use kvmalloc() here to ensure
- * success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts to fall
- * back to vmalloc, so we can't actually do anything useful with gfp flags to
- * control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() will do
- * direct reclaim and compaction in the slow path, both of which are
- * horrendously expensive. We just want kmalloc to fail fast and fall back to
- * vmalloc if it can't get somethign straight away from the free lists or buddy
- * allocator. Hence we have to open code kvmalloc outselves here.
- *
- * Also, we are in memalloc_nofs_save task context here, so despite the use of
- * GFP_KERNEL here, we are actually going to be doing GFP_NOFS allocations. This
- * is actually the only way to make vmalloc() do GFP_NOFS allocations, so lets
- * just all pretend this is a GFP_KERNEL context operation....
- */
-static inline void *
-xlog_cil_kvmalloc(
- size_t buf_size)
-{
- gfp_t flags = GFP_KERNEL;
- void *p;
-
- flags &= ~__GFP_DIRECT_RECLAIM;
- flags |= __GFP_NOWARN | __GFP_NORETRY;
- do {
- p = kmalloc(buf_size, flags);
- if (!p)
- p = vmalloc(buf_size);
- } while (!p);
-
- return p;
-}
-
-/*
* Allocate or pin log vector buffers for CIL insertion.
*
* The CIL currently uses disposable buffers for copying a snapshot of the
@@ -214,13 +213,20 @@ xlog_cil_alloc_shadow_bufs(
}
/*
- * We 64-bit align the length of each iovec so that the start
- * of the next one is naturally aligned. We'll need to
- * account for that slack space here. Then round nbytes up
- * to 64-bit alignment so that the initial buffer alignment is
- * easy to calculate and verify.
+ * We 64-bit align the length of each iovec so that the start of
+ * the next one is naturally aligned. We'll need to account for
+ * that slack space here.
+ *
+ * We also add the xlog_op_header to each region when
+ * formatting, but that's not accounted to the size of the item
+ * at this point. Hence we'll need an addition number of bytes
+ * for each vector to hold an opheader.
+ *
+ * Then round nbytes up to 64-bit alignment so that the initial
+ * buffer alignment is easy to calculate and verify.
*/
- nbytes += niovecs * sizeof(uint64_t);
+ nbytes += niovecs *
+ (sizeof(uint64_t) + sizeof(struct xlog_op_header));
nbytes = round_up(nbytes, sizeof(uint64_t));
/*
@@ -244,7 +250,7 @@ xlog_cil_alloc_shadow_bufs(
* storage.
*/
kmem_free(lip->li_lv_shadow);
- lv = xlog_cil_kvmalloc(buf_size);
+ lv = xlog_kvmalloc(buf_size);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
@@ -277,22 +283,18 @@ xlog_cil_alloc_shadow_bufs(
/*
* Prepare the log item for insertion into the CIL. Calculate the difference in
- * log space and vectors it will consume, and if it is a new item pin it as
- * well.
+ * log space it will consume, and if it is a new item pin it as well.
*/
STATIC void
xfs_cil_prepare_item(
struct xlog *log,
struct xfs_log_vec *lv,
struct xfs_log_vec *old_lv,
- int *diff_len,
- int *diff_iovecs)
+ int *diff_len)
{
/* Account for the new LV being passed in */
- if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
*diff_len += lv->lv_bytes;
- *diff_iovecs += lv->lv_niovecs;
- }
/*
* If there is no old LV, this is the first time we've seen the item in
@@ -309,7 +311,6 @@ xfs_cil_prepare_item(
ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
*diff_len -= old_lv->lv_bytes;
- *diff_iovecs -= old_lv->lv_niovecs;
lv->lv_item->li_lv_shadow = old_lv;
}
@@ -358,12 +359,10 @@ static void
xlog_cil_insert_format_items(
struct xlog *log,
struct xfs_trans *tp,
- int *diff_len,
- int *diff_iovecs)
+ int *diff_len)
{
struct xfs_log_item *lip;
-
/* Bail out if we didn't find a log item. */
if (list_empty(&tp->t_items)) {
ASSERT(0);
@@ -406,7 +405,6 @@ xlog_cil_insert_format_items(
* set the item up as though it is a new insertion so
* that the space reservation accounting is correct.
*/
- *diff_iovecs -= lv->lv_niovecs;
*diff_len -= lv->lv_bytes;
/* Ensure the lv is set up according to ->iop_size */
@@ -431,7 +429,7 @@ xlog_cil_insert_format_items(
ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
lip->li_ops->iop_format(lip, lv);
insert:
- xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
+ xfs_cil_prepare_item(log, lv, old_lv, diff_len);
}
}
@@ -445,13 +443,13 @@ insert:
static void
xlog_cil_insert_items(
struct xlog *log,
- struct xfs_trans *tp)
+ struct xfs_trans *tp,
+ uint32_t released_space)
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_cil_ctx *ctx = cil->xc_ctx;
struct xfs_log_item *lip;
int len = 0;
- int diff_iovecs = 0;
int iclog_space;
int iovhdr_res = 0, split_res = 0, ctx_res = 0;
@@ -461,15 +459,10 @@ xlog_cil_insert_items(
* We can do this safely because the context can't checkpoint until we
* are done so it doesn't matter exactly how we update the CIL.
*/
- xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);
+ xlog_cil_insert_format_items(log, tp, &len);
spin_lock(&cil->xc_cil_lock);
- /* account for space used by new iovec headers */
- iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t);
- len += iovhdr_res;
- ctx->nvecs += diff_iovecs;
-
/* attach the transaction to the CIL if it has any busy extents */
if (!list_empty(&tp->t_busy))
list_splice_init(&tp->t_busy, &ctx->busy_extents);
@@ -500,7 +493,9 @@ xlog_cil_insert_items(
ASSERT(tp->t_ticket->t_curr_res >= len);
}
tp->t_ticket->t_curr_res -= len;
+ tp->t_ticket->t_curr_res += released_space;
ctx->space_used += len;
+ ctx->space_used -= released_space;
/*
* If we've overrun the reservation, dump the tx details before we move
@@ -822,7 +817,8 @@ restart:
static int
xlog_cil_write_chain(
struct xfs_cil_ctx *ctx,
- struct xfs_log_vec *chain)
+ struct xfs_log_vec *chain,
+ uint32_t chain_len)
{
struct xlog *log = ctx->cil->xc_log;
int error;
@@ -830,7 +826,7 @@ xlog_cil_write_chain(
error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD);
if (error)
return error;
- return xlog_write(log, ctx, chain, ctx->ticket, XLOG_START_TRANS);
+ return xlog_write(log, ctx, chain, ctx->ticket, chain_len);
}
/*
@@ -844,9 +840,14 @@ xlog_cil_write_commit_record(
struct xfs_cil_ctx *ctx)
{
struct xlog *log = ctx->cil->xc_log;
+ struct xlog_op_header ophdr = {
+ .oh_clientid = XFS_TRANSACTION,
+ .oh_tid = cpu_to_be32(ctx->ticket->t_tid),
+ .oh_flags = XLOG_COMMIT_TRANS,
+ };
struct xfs_log_iovec reg = {
- .i_addr = NULL,
- .i_len = 0,
+ .i_addr = &ophdr,
+ .i_len = sizeof(struct xlog_op_header),
.i_type = XLOG_REG_TYPE_COMMIT,
};
struct xfs_log_vec vec = {
@@ -862,12 +863,138 @@ xlog_cil_write_commit_record(
if (error)
return error;
- error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS);
+ /* account for space used by record data */
+ ctx->ticket->t_curr_res -= reg.i_len;
+ error = xlog_write(log, ctx, &vec, ctx->ticket, reg.i_len);
if (error)
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
return error;
}
+struct xlog_cil_trans_hdr {
+ struct xlog_op_header oph[2];
+ struct xfs_trans_header thdr;
+ struct xfs_log_iovec lhdr[2];
+};
+
+/*
+ * Build a checkpoint transaction header to begin the journal transaction. We
+ * need to account for the space used by the transaction header here as it is
+ * not accounted for in xlog_write().
+ *
+ * This is the only place we write a transaction header, so we also build the
+ * log opheaders that indicate the start of a log transaction and wrap the
+ * transaction header. We keep the start record in it's own log vector rather
+ * than compacting them into a single region as this ends up making the logic
+ * in xlog_write() for handling empty opheaders for start, commit and unmount
+ * records much simpler.
+ */
+static void
+xlog_cil_build_trans_hdr(
+ struct xfs_cil_ctx *ctx,
+ struct xlog_cil_trans_hdr *hdr,
+ struct xfs_log_vec *lvhdr,
+ int num_iovecs)
+{
+ struct xlog_ticket *tic = ctx->ticket;
+ __be32 tid = cpu_to_be32(tic->t_tid);
+
+ memset(hdr, 0, sizeof(*hdr));
+
+ /* Log start record */
+ hdr->oph[0].oh_tid = tid;
+ hdr->oph[0].oh_clientid = XFS_TRANSACTION;
+ hdr->oph[0].oh_flags = XLOG_START_TRANS;
+
+ /* log iovec region pointer */
+ hdr->lhdr[0].i_addr = &hdr->oph[0];
+ hdr->lhdr[0].i_len = sizeof(struct xlog_op_header);
+ hdr->lhdr[0].i_type = XLOG_REG_TYPE_LRHEADER;
+
+ /* log opheader */
+ hdr->oph[1].oh_tid = tid;
+ hdr->oph[1].oh_clientid = XFS_TRANSACTION;
+ hdr->oph[1].oh_len = cpu_to_be32(sizeof(struct xfs_trans_header));
+
+ /* transaction header in host byte order format */
+ hdr->thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
+ hdr->thdr.th_type = XFS_TRANS_CHECKPOINT;
+ hdr->thdr.th_tid = tic->t_tid;
+ hdr->thdr.th_num_items = num_iovecs;
+
+ /* log iovec region pointer */
+ hdr->lhdr[1].i_addr = &hdr->oph[1];
+ hdr->lhdr[1].i_len = sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_trans_header);
+ hdr->lhdr[1].i_type = XLOG_REG_TYPE_TRANSHDR;
+
+ lvhdr->lv_niovecs = 2;
+ lvhdr->lv_iovecp = &hdr->lhdr[0];
+ lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len;
+ lvhdr->lv_next = ctx->lv_chain;
+
+ tic->t_curr_res -= lvhdr->lv_bytes;
+}
+
+/*
+ * Pull all the log vectors off the items in the CIL, and remove the items from
+ * the CIL. We don't need the CIL lock here because it's only needed on the
+ * transaction commit side which is currently locked out by the flush lock.
+ *
+ * If a log item is marked with a whiteout, we do not need to write it to the
+ * journal and so we just move them to the whiteout list for the caller to
+ * dispose of appropriately.
+ */
+static void
+xlog_cil_build_lv_chain(
+ struct xfs_cil *cil,
+ struct xfs_cil_ctx *ctx,
+ struct list_head *whiteouts,
+ uint32_t *num_iovecs,
+ uint32_t *num_bytes)
+{
+ struct xfs_log_vec *lv = NULL;
+
+ while (!list_empty(&cil->xc_cil)) {
+ struct xfs_log_item *item;
+
+ item = list_first_entry(&cil->xc_cil,
+ struct xfs_log_item, li_cil);
+
+ if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) {
+ list_move(&item->li_cil, whiteouts);
+ trace_xfs_cil_whiteout_skip(item);
+ continue;
+ }
+
+ list_del_init(&item->li_cil);
+ if (!ctx->lv_chain)
+ ctx->lv_chain = item->li_lv;
+ else
+ lv->lv_next = item->li_lv;
+ lv = item->li_lv;
+ item->li_lv = NULL;
+ *num_iovecs += lv->lv_niovecs;
+
+ /* we don't write ordered log vectors */
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
+ *num_bytes += lv->lv_bytes;
+ }
+}
+
+static void
+xlog_cil_cleanup_whiteouts(
+ struct list_head *whiteouts)
+{
+ while (!list_empty(whiteouts)) {
+ struct xfs_log_item *item = list_first_entry(whiteouts,
+ struct xfs_log_item, li_cil);
+ list_del_init(&item->li_cil);
+ trace_xfs_cil_whiteout_unpin(item);
+ item->li_ops->iop_unpin(item, 1);
+ }
+}
+
/*
* Push the Committed Item List to the log.
*
@@ -890,16 +1017,15 @@ xlog_cil_push_work(
container_of(work, struct xfs_cil_ctx, push_work);
struct xfs_cil *cil = ctx->cil;
struct xlog *log = cil->xc_log;
- struct xfs_log_vec *lv;
struct xfs_cil_ctx *new_ctx;
- struct xlog_ticket *tic;
- int num_iovecs;
+ int num_iovecs = 0;
+ int num_bytes = 0;
int error = 0;
- struct xfs_trans_header thdr;
- struct xfs_log_iovec lhdr;
+ struct xlog_cil_trans_hdr thdr;
struct xfs_log_vec lvhdr = { NULL };
xfs_csn_t push_seq;
bool push_commit_stable;
+ LIST_HEAD (whiteouts);
new_ctx = xlog_cil_ctx_alloc();
new_ctx->ticket = xlog_cil_ticket_alloc(log);
@@ -968,28 +1094,7 @@ xlog_cil_push_work(
list_add(&ctx->committing, &cil->xc_committing);
spin_unlock(&cil->xc_push_lock);
- /*
- * Pull all the log vectors off the items in the CIL, and remove the
- * items from the CIL. We don't need the CIL lock here because it's only
- * needed on the transaction commit side which is currently locked out
- * by the flush lock.
- */
- lv = NULL;
- num_iovecs = 0;
- while (!list_empty(&cil->xc_cil)) {
- struct xfs_log_item *item;
-
- item = list_first_entry(&cil->xc_cil,
- struct xfs_log_item, li_cil);
- list_del_init(&item->li_cil);
- if (!ctx->lv_chain)
- ctx->lv_chain = item->li_lv;
- else
- lv->lv_next = item->li_lv;
- lv = item->li_lv;
- item->li_lv = NULL;
- num_iovecs += lv->lv_niovecs;
- }
+ xlog_cil_build_lv_chain(cil, ctx, &whiteouts, &num_iovecs, &num_bytes);
/*
* Switch the contexts so we can drop the context lock and move out
@@ -1025,26 +1130,11 @@ xlog_cil_push_work(
* Build a checkpoint transaction header and write it to the log to
* begin the transaction. We need to account for the space used by the
* transaction header here as it is not accounted for in xlog_write().
- *
- * The LSN we need to pass to the log items on transaction commit is
- * the LSN reported by the first log vector write. If we use the commit
- * record lsn then we can move the tail beyond the grant write head.
*/
- tic = ctx->ticket;
- thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
- thdr.th_type = XFS_TRANS_CHECKPOINT;
- thdr.th_tid = tic->t_tid;
- thdr.th_num_items = num_iovecs;
- lhdr.i_addr = &thdr;
- lhdr.i_len = sizeof(xfs_trans_header_t);
- lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
- tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
-
- lvhdr.lv_niovecs = 1;
- lvhdr.lv_iovecp = &lhdr;
- lvhdr.lv_next = ctx->lv_chain;
-
- error = xlog_cil_write_chain(ctx, &lvhdr);
+ xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs);
+ num_bytes += lvhdr.lv_bytes;
+
+ error = xlog_cil_write_chain(ctx, &lvhdr, num_bytes);
if (error)
goto out_abort_free_ticket;
@@ -1052,7 +1142,7 @@ xlog_cil_push_work(
if (error)
goto out_abort_free_ticket;
- xfs_log_ticket_ungrant(log, tic);
+ xfs_log_ticket_ungrant(log, ctx->ticket);
/*
* If the checkpoint spans multiple iclogs, wait for all previous iclogs
@@ -1107,6 +1197,7 @@ xlog_cil_push_work(
/* Not safe to reference ctx now! */
spin_unlock(&log->l_icloglock);
+ xlog_cil_cleanup_whiteouts(&whiteouts);
return;
out_skip:
@@ -1116,8 +1207,9 @@ out_skip:
return;
out_abort_free_ticket:
- xfs_log_ticket_ungrant(log, tic);
+ xfs_log_ticket_ungrant(log, ctx->ticket);
ASSERT(xlog_is_shutdown(log));
+ xlog_cil_cleanup_whiteouts(&whiteouts);
if (!ctx->commit_iclog) {
xlog_cil_committed(ctx);
return;
@@ -1267,6 +1359,43 @@ xlog_cil_empty(
}
/*
+ * If there are intent done items in this transaction and the related intent was
+ * committed in the current (same) CIL checkpoint, we don't need to write either
+ * the intent or intent done item to the journal as the change will be
+ * journalled atomically within this checkpoint. As we cannot remove items from
+ * the CIL here, mark the related intent with a whiteout so that the CIL push
+ * can remove it rather than writing it to the journal. Then remove the intent
+ * done item from the current transaction and release it so it doesn't get put
+ * into the CIL at all.
+ */
+static uint32_t
+xlog_cil_process_intents(
+ struct xfs_cil *cil,
+ struct xfs_trans *tp)
+{
+ struct xfs_log_item *lip, *ilip, *next;
+ uint32_t len = 0;
+
+ list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+ if (!(lip->li_ops->flags & XFS_ITEM_INTENT_DONE))
+ continue;
+
+ ilip = lip->li_ops->iop_intent(lip);
+ if (!ilip || !xlog_item_in_current_chkpt(cil, ilip))
+ continue;
+ set_bit(XFS_LI_WHITEOUT, &ilip->li_flags);
+ trace_xfs_cil_whiteout_mark(ilip);
+ len += ilip->li_lv->lv_bytes;
+ kmem_free(ilip->li_lv);
+ ilip->li_lv = NULL;
+
+ xfs_trans_del_item(lip);
+ lip->li_ops->iop_release(lip);
+ }
+ return len;
+}
+
+/*
* Commit a transaction with the given vector to the Committed Item List.
*
* To do this, we need to format the item, pin it in memory if required and
@@ -1288,6 +1417,7 @@ xlog_cil_commit(
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_log_item *lip, *next;
+ uint32_t released_space = 0;
/*
* Do all necessary memory allocation before we lock the CIL.
@@ -1299,7 +1429,10 @@ xlog_cil_commit(
/* lock out background commit */
down_read(&cil->xc_ctx_lock);
- xlog_cil_insert_items(log, tp);
+ if (tp->t_flags & XFS_TRANS_HAS_INTENT_DONE)
+ released_space = xlog_cil_process_intents(cil, tp);
+
+ xlog_cil_insert_items(log, tp, released_space);
if (regrant && !xlog_is_shutdown(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
@@ -1456,32 +1589,6 @@ out_shutdown:
}
/*
- * Check if the current log item was first committed in this sequence.
- * We can't rely on just the log item being in the CIL, we have to check
- * the recorded commit sequence number.
- *
- * Note: for this to be used in a non-racy manner, it has to be called with
- * CIL flushing locked out. As a result, it should only be used during the
- * transaction commit process when deciding what to format into the item.
- */
-bool
-xfs_log_item_in_current_chkpt(
- struct xfs_log_item *lip)
-{
- struct xfs_cil *cil = lip->li_log->l_cilp;
-
- if (list_empty(&lip->li_cil))
- return false;
-
- /*
- * li_seq is written on the first commit of a log item to record the
- * first checkpoint it is written to. Hence if it is different to the
- * current sequence, we're in a new checkpoint.
- */
- return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
-}
-
-/*
* Perform initial CIL structure initialisation.
*/
int
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 401cdc400980..686c01eb3661 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -51,8 +51,8 @@ enum xlog_iclog_state {
/*
* In core log flags
*/
-#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */
-#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */
+#define XLOG_ICL_NEED_FLUSH (1u << 0) /* iclog needs REQ_PREFLUSH */
+#define XLOG_ICL_NEED_FUA (1u << 1) /* iclog needs REQ_FUA */
#define XLOG_ICL_STRINGS \
{ XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \
@@ -62,7 +62,7 @@ enum xlog_iclog_state {
/*
* Log ticket flags
*/
-#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */
+#define XLOG_TIC_PERM_RESERV (1u << 0) /* permanent reservation */
#define XLOG_TIC_FLAGS \
{ XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
@@ -142,19 +142,6 @@ enum xlog_iclog_state {
#define XLOG_COVER_OPS 5
-/* Ticket reservation region accounting */
-#define XLOG_TIC_LEN_MAX 15
-
-/*
- * Reservation region
- * As would be stored in xfs_log_iovec but without the i_addr which
- * we don't care about.
- */
-typedef struct xlog_res {
- uint r_len; /* region length :4 */
- uint r_type; /* region's transaction type :4 */
-} xlog_res_t;
-
typedef struct xlog_ticket {
struct list_head t_queue; /* reserve/write queue */
struct task_struct *t_task; /* task that owns this ticket */
@@ -164,15 +151,7 @@ typedef struct xlog_ticket {
int t_unit_res; /* unit reservation in bytes : 4 */
char t_ocnt; /* original count : 1 */
char t_cnt; /* current count : 1 */
- char t_clientid; /* who does this belong to; : 1 */
- char t_flags; /* properties of reservation : 1 */
-
- /* reservation array fields */
- uint t_res_num; /* num in array : 4 */
- uint t_res_num_ophdrs; /* num op hdrs : 4 */
- uint t_res_arr_sum; /* array sum : 4 */
- uint t_res_o_flow; /* sum overflow : 4 */
- xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */
+ uint8_t t_flags; /* properties of reservation : 1 */
} xlog_ticket_t;
/*
@@ -211,7 +190,7 @@ typedef struct xlog_in_core {
u32 ic_offset;
enum xlog_iclog_state ic_state;
unsigned int ic_flags;
- char *ic_datap; /* pointer to iclog data */
+ void *ic_datap; /* pointer to iclog data */
struct list_head ic_callbacks;
/* reference counts need their own cacheline */
@@ -242,7 +221,6 @@ struct xfs_cil_ctx {
xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
struct xlog_in_core *commit_iclog;
struct xlog_ticket *ticket; /* chkpt ticket */
- int nvecs; /* number of regions */
int space_used; /* aggregate size of regions */
struct list_head busy_extents; /* busy extents in chkpt */
struct xfs_log_vec *lv_chain; /* logvecs being pushed */
@@ -441,10 +419,6 @@ struct xlog {
struct xfs_kobj l_kobj;
- /* The following field are used for debugging; need to hold icloglock */
-#ifdef DEBUG
- void *l_iclog_bak[XLOG_MAX_ICLOGS];
-#endif
/* log recovery lsn tracking (for buffer submission */
xfs_lsn_t l_recovery_lsn;
@@ -454,9 +428,6 @@ struct xlog {
struct rw_semaphore l_incompat_users;
};
-#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
- ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))
-
/*
* Bits for operational state
*/
@@ -509,27 +480,14 @@ extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
char *dp, int size);
extern struct kmem_cache *xfs_log_ticket_cache;
-struct xlog_ticket *
-xlog_ticket_alloc(
- struct xlog *log,
- int unit_bytes,
- int count,
- char client,
- bool permanent);
-
-static inline void
-xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
-{
- *ptr += bytes;
- *len -= bytes;
- *off += bytes;
-}
+struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes,
+ int count, bool permanent);
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void xlog_print_trans(struct xfs_trans *);
int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx,
struct xfs_log_vec *log_vector, struct xlog_ticket *tic,
- uint optype);
+ uint32_t len);
void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
@@ -690,4 +648,38 @@ xlog_valid_lsn(
return valid;
}
+/*
+ * Log vector and shadow buffers can be large, so we need to use kvmalloc() here
+ * to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts
+ * to fall back to vmalloc, so we can't actually do anything useful with gfp
+ * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc()
+ * will do direct reclaim and compaction in the slow path, both of which are
+ * horrendously expensive. We just want kmalloc to fail fast and fall back to
+ * vmalloc if it can't get somethign straight away from the free lists or
+ * buddy allocator. Hence we have to open code kvmalloc outselves here.
+ *
+ * This assumes that the caller uses memalloc_nofs_save task context here, so
+ * despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS
+ * allocations. This is actually the only way to make vmalloc() do GFP_NOFS
+ * allocations, so lets just all pretend this is a GFP_KERNEL context
+ * operation....
+ */
+static inline void *
+xlog_kvmalloc(
+ size_t buf_size)
+{
+ gfp_t flags = GFP_KERNEL;
+ void *p;
+
+ flags &= ~__GFP_DIRECT_RECLAIM;
+ flags |= __GFP_NOWARN | __GFP_NORETRY;
+ do {
+ p = kmalloc(buf_size, flags);
+ if (!p)
+ p = vmalloc(buf_size);
+ } while (!p);
+
+ return p;
+}
+
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index c4ad4296c540..5f7e4e6e33ce 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -39,13 +39,6 @@ STATIC int
xlog_clear_stale_blocks(
struct xlog *,
xfs_lsn_t);
-#if defined(DEBUG)
-STATIC void
-xlog_recover_check_summary(
- struct xlog *);
-#else
-#define xlog_recover_check_summary(log)
-#endif
STATIC int
xlog_do_recovery_pass(
struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
@@ -1800,6 +1793,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
&xlog_cud_item_ops,
&xlog_bui_item_ops,
&xlog_bud_item_ops,
+ &xlog_attri_item_ops,
+ &xlog_attrd_item_ops,
};
static const struct xlog_recover_item_ops *
@@ -3228,7 +3223,7 @@ xlog_do_log_recovery(
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk)
{
- int error, i;
+ int error;
ASSERT(head_blk != tail_blk);
@@ -3236,37 +3231,25 @@ xlog_do_log_recovery(
* First do a pass to find all of the cancelled buf log items.
* Store them in the buf_cancel_table for use in the second pass.
*/
- log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
- sizeof(struct list_head),
- 0);
- for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
- INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+ error = xlog_alloc_buf_cancel_table(log);
+ if (error)
+ return error;
error = xlog_do_recovery_pass(log, head_blk, tail_blk,
XLOG_RECOVER_PASS1, NULL);
- if (error != 0) {
- kmem_free(log->l_buf_cancel_table);
- log->l_buf_cancel_table = NULL;
- return error;
- }
+ if (error != 0)
+ goto out_cancel;
+
/*
* Then do a second pass to actually recover the items in the log.
* When it is complete free the table of buf cancel items.
*/
error = xlog_do_recovery_pass(log, head_blk, tail_blk,
XLOG_RECOVER_PASS2, NULL);
-#ifdef DEBUG
- if (!error) {
- int i;
-
- for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
- ASSERT(list_empty(&log->l_buf_cancel_table[i]));
- }
-#endif /* DEBUG */
-
- kmem_free(log->l_buf_cancel_table);
- log->l_buf_cancel_table = NULL;
-
+ if (!error)
+ xlog_check_buf_cancel_table(log);
+out_cancel:
+ xlog_free_buf_cancel_table(log);
return error;
}
@@ -3337,8 +3320,6 @@ xlog_do_recover(
}
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
- xlog_recover_check_summary(log);
-
/* Normal transactions can now occur */
clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
return 0;
@@ -3481,7 +3462,6 @@ xlog_recover_finish(
}
xlog_recover_process_iunlinks(log);
- xlog_recover_check_summary(log);
/*
* Recover any CoW staging blocks that are still referenced by the
@@ -3515,52 +3495,3 @@ xlog_recover_cancel(
xlog_recover_cancel_intents(log);
}
-#if defined(DEBUG)
-/*
- * Read all of the agf and agi counters and check that they
- * are consistent with the superblock counters.
- */
-STATIC void
-xlog_recover_check_summary(
- struct xlog *log)
-{
- struct xfs_mount *mp = log->l_mp;
- struct xfs_perag *pag;
- struct xfs_buf *agfbp;
- struct xfs_buf *agibp;
- xfs_agnumber_t agno;
- uint64_t freeblks;
- uint64_t itotal;
- uint64_t ifree;
- int error;
-
- freeblks = 0LL;
- itotal = 0LL;
- ifree = 0LL;
- for_each_perag(mp, agno, pag) {
- error = xfs_read_agf(mp, NULL, pag->pag_agno, 0, &agfbp);
- if (error) {
- xfs_alert(mp, "%s agf read failed agno %d error %d",
- __func__, pag->pag_agno, error);
- } else {
- struct xfs_agf *agfp = agfbp->b_addr;
-
- freeblks += be32_to_cpu(agfp->agf_freeblks) +
- be32_to_cpu(agfp->agf_flcount);
- xfs_buf_relse(agfbp);
- }
-
- error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp);
- if (error) {
- xfs_alert(mp, "%s agi read failed agno %d error %d",
- __func__, pag->pag_agno, error);
- } else {
- struct xfs_agi *agi = agibp->b_addr;
-
- itotal += be32_to_cpu(agi->agi_count);
- ifree += be32_to_cpu(agi->agi_freecount);
- xfs_buf_relse(agibp);
- }
- }
-}
-#endif /* DEBUG */
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index bc66d95c8d4c..8f495cc23903 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -27,42 +27,34 @@ __xfs_printk(
printk("%sXFS: %pV\n", level, vaf);
}
-#define define_xfs_printk_level(func, kern_level) \
-void func(const struct xfs_mount *mp, const char *fmt, ...) \
-{ \
- struct va_format vaf; \
- va_list args; \
- int level; \
- \
- va_start(args, fmt); \
- \
- vaf.fmt = fmt; \
- vaf.va = &args; \
- \
- __xfs_printk(kern_level, mp, &vaf); \
- va_end(args); \
- \
- if (!kstrtoint(kern_level, 0, &level) && \
- level <= LOGLEVEL_ERR && \
- xfs_error_level >= XFS_ERRLEVEL_HIGH) \
- xfs_stack_trace(); \
-} \
-
-define_xfs_printk_level(xfs_emerg, KERN_EMERG);
-define_xfs_printk_level(xfs_alert, KERN_ALERT);
-define_xfs_printk_level(xfs_crit, KERN_CRIT);
-define_xfs_printk_level(xfs_err, KERN_ERR);
-define_xfs_printk_level(xfs_warn, KERN_WARNING);
-define_xfs_printk_level(xfs_notice, KERN_NOTICE);
-define_xfs_printk_level(xfs_info, KERN_INFO);
-#ifdef DEBUG
-define_xfs_printk_level(xfs_debug, KERN_DEBUG);
-#endif
+void
+xfs_printk_level(
+ const char *kern_level,
+ const struct xfs_mount *mp,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ int level;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ __xfs_printk(kern_level, mp, &vaf);
+
+ va_end(args);
+
+ if (!kstrtoint(kern_level, 0, &level) &&
+ level <= LOGLEVEL_ERR &&
+ xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+}
void
-xfs_alert_tag(
+_xfs_alert_tag(
const struct xfs_mount *mp,
- int panic_tag,
+ uint32_t panic_tag,
const char *fmt, ...)
{
struct va_format vaf;
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index bb9860ec9a93..cc323775a12c 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -6,33 +6,46 @@
struct xfs_mount;
-extern __printf(2, 3)
-void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...);
extern __printf(3, 4)
-void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_err(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_info(const struct xfs_mount *mp, const char *fmt, ...);
+void xfs_printk_level(const char *kern_level, const struct xfs_mount *mp,
+ const char *fmt, ...);
+#define xfs_printk_index_wrap(kern_level, mp, fmt, ...) \
+({ \
+ printk_index_subsys_emit("%sXFS%s: ", kern_level, fmt); \
+ xfs_printk_level(kern_level, mp, fmt, ##__VA_ARGS__); \
+})
+#define xfs_emerg(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_EMERG, mp, fmt, ##__VA_ARGS__)
+#define xfs_alert(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_ALERT, mp, fmt, ##__VA_ARGS__)
+#define xfs_crit(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_CRIT, mp, fmt, ##__VA_ARGS__)
+#define xfs_err(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_ERR, mp, fmt, ##__VA_ARGS__)
+#define xfs_warn(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_WARNING, mp, fmt, ##__VA_ARGS__)
+#define xfs_notice(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_NOTICE, mp, fmt, ##__VA_ARGS__)
+#define xfs_info(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_INFO, mp, fmt, ##__VA_ARGS__)
#ifdef DEBUG
-extern __printf(2, 3)
-void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...);
+#define xfs_debug(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_DEBUG, mp, fmt, ##__VA_ARGS__)
#else
-static inline __printf(2, 3)
-void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
-{
-}
+#define xfs_debug(mp, fmt, ...) do {} while (0)
#endif
+#define xfs_alert_tag(mp, tag, fmt, ...) \
+({ \
+ printk_index_subsys_emit("%sXFS%s: ", KERN_ALERT, fmt); \
+ _xfs_alert_tag(mp, tag, fmt, ##__VA_ARGS__); \
+})
+
+extern __printf(3, 4)
+void _xfs_alert_tag(const struct xfs_mount *mp, uint32_t tag,
+ const char *fmt, ...);
+
#define xfs_printk_ratelimited(func, dev, fmt, ...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
@@ -62,6 +75,12 @@ do { \
#define xfs_debug_ratelimited(dev, fmt, ...) \
xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__)
+#define xfs_warn_mount(mp, warntag, fmt, ...) \
+do { \
+ if (xfs_should_warn((mp), (warntag))) \
+ xfs_warn((mp), (fmt), ##__VA_ARGS__); \
+} while (0)
+
#define xfs_warn_once(dev, fmt, ...) \
xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__)
#define xfs_notice_once(dev, fmt, ...) \
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c5f153c3693f..daa8d29c46b4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -468,6 +468,8 @@ STATIC int
xfs_check_summary_counts(
struct xfs_mount *mp)
{
+ int error = 0;
+
/*
* The AG0 superblock verifier rejects in-progress filesystems,
* so we should never see the flag set this far into mounting.
@@ -506,11 +508,32 @@ xfs_check_summary_counts(
* superblock to be correct and we don't need to do anything here.
* Otherwise, recalculate the summary counters.
*/
- if ((!xfs_has_lazysbcount(mp) || xfs_is_clean(mp)) &&
- !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS))
- return 0;
+ if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) ||
+ xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) {
+ error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
+ if (error)
+ return error;
+ }
- return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
+ /*
+ * Older kernels misused sb_frextents to reflect both incore
+ * reservations made by running transactions and the actual count of
+ * free rt extents in the ondisk metadata. Transactions committed
+ * during runtime can therefore contain a superblock update that
+ * undercounts the number of free rt extents tracked in the rt bitmap.
+ * A clean unmount record will have the correct frextents value since
+ * there can be no other transactions running at that point.
+ *
+ * If we're mounting the rt volume after recovering the log, recompute
+ * frextents from the rtbitmap file to fix the inconsistency.
+ */
+ if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
+ error = xfs_rtalloc_reinit_frextents(mp);
+ if (error)
+ return error;
+ }
+
+ return 0;
}
/*
@@ -784,11 +807,6 @@ xfs_mountfs(
goto out_inodegc_shrinker;
}
- /* Make sure the summary counts are ok. */
- error = xfs_check_summary_counts(mp);
- if (error)
- goto out_log_dealloc;
-
/* Enable background inode inactivation workers. */
xfs_inodegc_start(mp);
xfs_blockgc_start(mp);
@@ -844,6 +862,11 @@ xfs_mountfs(
goto out_rele_rip;
}
+ /* Make sure the summary counts are ok. */
+ error = xfs_check_summary_counts(mp);
+ if (error)
+ goto out_rtunmount;
+
/*
* If this is a read-only mount defer the superblock updates until
* the next remount into writeable mode. Otherwise we would never
@@ -1087,24 +1110,33 @@ xfs_fs_writable(
return true;
}
+/* Adjust m_fdblocks or m_frextents. */
int
-xfs_mod_fdblocks(
+xfs_mod_freecounter(
struct xfs_mount *mp,
+ struct percpu_counter *counter,
int64_t delta,
bool rsvd)
{
int64_t lcounter;
long long res_used;
+ uint64_t set_aside = 0;
s32 batch;
- uint64_t set_aside;
+ bool has_resv_pool;
+
+ ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
+ has_resv_pool = (counter == &mp->m_fdblocks);
+ if (rsvd)
+ ASSERT(has_resv_pool);
if (delta > 0) {
/*
* If the reserve pool is depleted, put blocks back into it
* first. Most of the time the pool is full.
*/
- if (likely(mp->m_resblks == mp->m_resblks_avail)) {
- percpu_counter_add(&mp->m_fdblocks, delta);
+ if (likely(!has_resv_pool ||
+ mp->m_resblks == mp->m_resblks_avail)) {
+ percpu_counter_add(counter, delta);
return 0;
}
@@ -1116,7 +1148,7 @@ xfs_mod_fdblocks(
} else {
delta -= res_used;
mp->m_resblks_avail = mp->m_resblks;
- percpu_counter_add(&mp->m_fdblocks, delta);
+ percpu_counter_add(counter, delta);
}
spin_unlock(&mp->m_sb_lock);
return 0;
@@ -1130,7 +1162,7 @@ xfs_mod_fdblocks(
* then make everything serialise as we are real close to
* ENOSPC.
*/
- if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
+ if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
XFS_FDBLOCKS_BATCH) < 0)
batch = 1;
else
@@ -1147,9 +1179,10 @@ xfs_mod_fdblocks(
* problems (i.e. transaction abort, pagecache discards, etc.) than
* slightly premature -ENOSPC.
*/
- set_aside = xfs_fdblocks_unavailable(mp);
- percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
- if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
+ if (has_resv_pool)
+ set_aside = xfs_fdblocks_unavailable(mp);
+ percpu_counter_add_batch(counter, delta, batch);
+ if (__percpu_counter_compare(counter, set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
/* we had space! */
return 0;
@@ -1160,8 +1193,8 @@ xfs_mod_fdblocks(
* that took us to ENOSPC.
*/
spin_lock(&mp->m_sb_lock);
- percpu_counter_add(&mp->m_fdblocks, -delta);
- if (!rsvd)
+ percpu_counter_add(counter, -delta);
+ if (!has_resv_pool || !rsvd)
goto fdblocks_enospc;
lcounter = (long long)mp->m_resblks_avail + delta;
@@ -1178,24 +1211,6 @@ fdblocks_enospc:
return -ENOSPC;
}
-int
-xfs_mod_frextents(
- struct xfs_mount *mp,
- int64_t delta)
-{
- int64_t lcounter;
- int ret = 0;
-
- spin_lock(&mp->m_sb_lock);
- lcounter = mp->m_sb.sb_frextents + delta;
- if (lcounter < 0)
- ret = -ENOSPC;
- else
- mp->m_sb.sb_frextents = lcounter;
- spin_unlock(&mp->m_sb_lock);
- return ret;
-}
-
/*
* Used to free the superblock along various error paths.
*/
@@ -1341,7 +1356,6 @@ xfs_clear_incompat_log_features(
if (xfs_sb_has_incompat_log_feature(&mp->m_sb,
XFS_SB_FEAT_INCOMPAT_LOG_ALL)) {
- xfs_info(mp, "Clearing log incompat feature flags.");
xfs_sb_remove_incompat_log_features(&mp->m_sb);
ret = true;
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f6dc19de8322..ba5d42abf66e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -183,6 +183,8 @@ typedef struct xfs_mount {
struct percpu_counter m_icount; /* allocated inodes counter */
struct percpu_counter m_ifree; /* free inodes counter */
struct percpu_counter m_fdblocks; /* free block counter */
+ struct percpu_counter m_frextents; /* free rt extent counter */
+
/*
* Count of data device blocks reserved for delayed allocations,
* including indlen blocks. Does not include allocated CoW staging
@@ -276,6 +278,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */
#define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */
#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
+#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
/* Mount features */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
@@ -338,6 +341,7 @@ __XFS_HAS_FEAT(realtime, REALTIME)
__XFS_HAS_FEAT(inobtcounts, INOBTCNT)
__XFS_HAS_FEAT(bigtime, BIGTIME)
__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
+__XFS_HAS_FEAT(large_extent_counts, NREXT64)
/*
* Mount features
@@ -387,6 +391,13 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
*/
#define XFS_OPSTATE_BLOCKGC_ENABLED 6
+/* Kernel has logged a warning about online fsck being used on this fs. */
+#define XFS_OPSTATE_WARNED_SCRUB 7
+/* Kernel has logged a warning about shrink being used on this fs. */
+#define XFS_OPSTATE_WARNED_SHRINK 8
+/* Kernel has logged a warning about logged xattr updates being used. */
+#define XFS_OPSTATE_WARNED_LARP 9
+
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
{ \
@@ -409,6 +420,12 @@ __XFS_IS_OPSTATE(readonly, READONLY)
__XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED)
__XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
+static inline bool
+xfs_should_warn(struct xfs_mount *mp, long nr)
+{
+ return !test_and_set_bit(nr, &mp->m_opstate);
+}
+
#define XFS_OPSTATE_STRINGS \
{ (1UL << XFS_OPSTATE_UNMOUNTING), "unmounting" }, \
{ (1UL << XFS_OPSTATE_CLEAN), "clean" }, \
@@ -416,7 +433,10 @@ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
{ (1UL << XFS_OPSTATE_INODE32), "inode32" }, \
{ (1UL << XFS_OPSTATE_READONLY), "read_only" }, \
{ (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \
- { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }
+ { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \
+ { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \
+ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \
+ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }
/*
* Max and min values for mount-option defined I/O
@@ -425,16 +445,15 @@ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
#define XFS_MAX_IO_LOG 30 /* 1G */
#define XFS_MIN_IO_LOG PAGE_SHIFT
-#define xfs_is_shutdown(mp) xfs_is_shutdown(mp)
-void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
+void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname,
int lnnum);
#define xfs_force_shutdown(m,f) \
xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
-#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
-#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
-#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
-#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
+#define SHUTDOWN_META_IO_ERROR (1u << 0) /* write attempt to metadata failed */
+#define SHUTDOWN_LOG_IO_ERROR (1u << 1) /* write attempt to the log failed */
+#define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */
+#define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */
#define XFS_SHUTDOWN_STRINGS \
{ SHUTDOWN_META_IO_ERROR, "metadata_io" }, \
@@ -494,9 +513,20 @@ xfs_fdblocks_unavailable(
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
}
-extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
- bool reserved);
-extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
+int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+ int64_t delta, bool rsvd);
+
+static inline int
+xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved)
+{
+ return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved);
+}
+
+static inline int
+xfs_mod_frextents(struct xfs_mount *mp, int64_t delta)
+{
+ return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false);
+}
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 25991923c1a8..758702b9495f 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -132,6 +132,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format, 40);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format, 16);
/*
* The v5 superblock format extended several v4 header structures with
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index f165d1a3de1d..abf08bbf34a9 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -582,9 +582,6 @@ xfs_qm_init_timelimits(
defq->blk.time = XFS_QM_BTIMELIMIT;
defq->ino.time = XFS_QM_ITIMELIMIT;
defq->rtb.time = XFS_QM_RTBTIMELIMIT;
- defq->blk.warn = XFS_QM_BWARNLIMIT;
- defq->ino.warn = XFS_QM_IWARNLIMIT;
- defq->rtb.warn = XFS_QM_RTBWARNLIMIT;
/*
* We try to get the limits from the superuser's limits fields.
@@ -608,12 +605,6 @@ xfs_qm_init_timelimits(
defq->ino.time = dqp->q_ino.timer;
if (dqp->q_rtb.timer)
defq->rtb.time = dqp->q_rtb.timer;
- if (dqp->q_blk.warnings)
- defq->blk.warn = dqp->q_blk.warnings;
- if (dqp->q_ino.warnings)
- defq->ino.warn = dqp->q_ino.warnings;
- if (dqp->q_rtb.warnings)
- defq->rtb.warn = dqp->q_rtb.warnings;
xfs_qm_dqdestroy(dqp);
}
@@ -1317,8 +1308,15 @@ xfs_qm_quotacheck(
error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true,
NULL);
- if (error)
+ if (error) {
+ /*
+ * The inode walk may have partially populated the dquot
+ * caches. We must purge them before disabling quota and
+ * tearing down the quotainfo, or else the dquots will leak.
+ */
+ xfs_qm_dqpurge_all(mp);
goto error_return;
+ }
/*
* We've made all the changes that we need to make incore. Flush them
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 5bb12717ea28..9683f0457d19 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -34,7 +34,6 @@ struct xfs_quota_limits {
xfs_qcnt_t hard; /* default hard limit */
xfs_qcnt_t soft; /* default soft limit */
time64_t time; /* limit for timers */
- xfs_qwarncnt_t warn; /* limit for warnings */
};
/* Defaults for each quota type: time limits, warn limits, usage limits */
@@ -134,10 +133,6 @@ struct xfs_dquot_acct {
#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */
#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */
-#define XFS_QM_BWARNLIMIT 5
-#define XFS_QM_IWARNLIMIT 5
-#define XFS_QM_RTBWARNLIMIT 5
-
extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
/* quota ops */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 7d5a31827681..74ac9ca9e119 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -217,8 +217,7 @@ xfs_qm_scall_quotaon(
return 0;
}
-#define XFS_QC_MASK \
- (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK)
+#define XFS_QC_MASK (QC_LIMIT_MASK | QC_TIMER_MASK)
/*
* Adjust limits of this quota, and the defaults if passed in. Returns true
@@ -251,17 +250,6 @@ xfs_setqlim_limits(
}
static inline void
-xfs_setqlim_warns(
- struct xfs_dquot_res *res,
- struct xfs_quota_limits *qlim,
- int warns)
-{
- res->warnings = warns;
- if (qlim)
- qlim->warn = warns;
-}
-
-static inline void
xfs_setqlim_timer(
struct xfs_mount *mp,
struct xfs_dquot_res *res,
@@ -354,8 +342,6 @@ xfs_qm_scall_setqlim(
if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk"))
xfs_dquot_set_prealloc_limits(dqp);
- if (newlim->d_fieldmask & QC_SPC_WARNS)
- xfs_setqlim_warns(res, qlim, newlim->d_spc_warns);
if (newlim->d_fieldmask & QC_SPC_TIMER)
xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer);
@@ -370,8 +356,6 @@ xfs_qm_scall_setqlim(
qlim = id == 0 ? &defq->rtb : NULL;
xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb");
- if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
- xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns);
if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer);
@@ -386,8 +370,6 @@ xfs_qm_scall_setqlim(
qlim = id == 0 ? &defq->ino : NULL;
xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino");
- if (newlim->d_fieldmask & QC_INO_WARNS)
- xfs_setqlim_warns(res, qlim, newlim->d_ino_warns);
if (newlim->d_fieldmask & QC_INO_TIMER)
xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer);
@@ -428,13 +410,13 @@ xfs_qm_scall_getquota_fill_qc(
dst->d_ino_count = dqp->q_ino.reserved;
dst->d_spc_timer = dqp->q_blk.timer;
dst->d_ino_timer = dqp->q_ino.timer;
- dst->d_ino_warns = dqp->q_ino.warnings;
- dst->d_spc_warns = dqp->q_blk.warnings;
+ dst->d_ino_warns = 0;
+ dst->d_spc_warns = 0;
dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit);
dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit);
dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved);
dst->d_rt_spc_timer = dqp->q_rtb.timer;
- dst->d_rt_spc_warns = dqp->q_rtb.warnings;
+ dst->d_rt_spc_warns = 0;
/*
* Internally, we don't reset all the timers when quota enforcement
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 07989bd67728..9c162e69976b 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -40,9 +40,9 @@ xfs_qm_fill_state(
tstate->spc_timelimit = (u32)defq->blk.time;
tstate->ino_timelimit = (u32)defq->ino.time;
tstate->rt_spc_timelimit = (u32)defq->rtb.time;
- tstate->spc_warnlimit = defq->blk.warn;
- tstate->ino_warnlimit = defq->ino.warn;
- tstate->rt_spc_warnlimit = defq->rtb.warn;
+ tstate->spc_warnlimit = 0;
+ tstate->ino_warnlimit = 0;
+ tstate->rt_spc_warnlimit = 0;
if (tempqip)
xfs_irele(ip);
}
@@ -98,7 +98,7 @@ xfs_quota_type(int type)
}
}
-#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK | QC_WARNS_MASK)
+#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK)
/*
* Adjust quota timers & warnings
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 0d868c93144d..7e97bf19793d 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
+ kmem_free(cuip->cui_item.li_lv_shadow);
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
@@ -53,10 +54,11 @@ xfs_cui_release(
struct xfs_cui_log_item *cuip)
{
ASSERT(atomic_read(&cuip->cui_refcount) > 0);
- if (atomic_dec_and_test(&cuip->cui_refcount)) {
- xfs_trans_ail_delete(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_cui_item_free(cuip);
- }
+ if (!atomic_dec_and_test(&cuip->cui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&cuip->cui_item, 0);
+ xfs_cui_item_free(cuip);
}
@@ -204,14 +206,24 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
+ kmem_free(cudp->cud_item.li_lv_shadow);
kmem_cache_free(xfs_cud_cache, cudp);
}
+static struct xfs_log_item *
+xfs_cud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &CUD_ITEM(lip)->cud_cuip->cui_item;
+}
+
static const struct xfs_item_ops xfs_cud_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_cud_item_size,
.iop_format = xfs_cud_item_format,
.iop_release = xfs_cud_item_release,
+ .iop_intent = xfs_cud_item_intent,
};
static struct xfs_cud_log_item *
@@ -259,7 +271,7 @@ xfs_trans_log_finish_refcount_update(
* 1.) releases the CUI and frees the CUD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
return error;
@@ -600,6 +612,7 @@ xfs_cui_item_relog(
}
static const struct xfs_item_ops xfs_cui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_cui_item_size,
.iop_format = xfs_cui_item_format,
.iop_unpin = xfs_cui_item_unpin,
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 54e68e5693fd..e7a7c00d93be 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -586,21 +586,21 @@ out:
STATIC int
xfs_reflink_end_cow_extent(
struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb,
- xfs_fileoff_t *end_fsb)
+ xfs_fileoff_t *offset_fsb,
+ xfs_fileoff_t end_fsb)
{
- struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got, del, data;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- xfs_filblks_t rlen;
unsigned int resblks;
+ int nmaps;
int error;
/* No COW extents? That's easy! */
if (ifp->if_bytes == 0) {
- *end_fsb = offset_fsb;
+ *offset_fsb = end_fsb;
return 0;
}
@@ -620,6 +620,9 @@ xfs_reflink_end_cow_extent(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_REFLINK_END_COW_CNT);
if (error)
goto out_cancel;
@@ -628,42 +631,66 @@ xfs_reflink_end_cow_extent(
* left by the time I/O completes for the loser of the race. In that
* case we are done.
*/
- if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
- got.br_startoff + got.br_blockcount <= offset_fsb) {
- *end_fsb = offset_fsb;
+ if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
+ got.br_startoff >= end_fsb) {
+ *offset_fsb = end_fsb;
goto out_cancel;
}
/*
- * Structure copy @got into @del, then trim @del to the range that we
- * were asked to remap. We preserve @got for the eventual CoW fork
- * deletion; from now on @del represents the mapping that we're
- * actually remapping.
- */
- del = got;
- xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);
-
- ASSERT(del.br_blockcount > 0);
-
- /*
* Only remap real extents that contain data. With AIO, speculative
* preallocations can leak into the range we are called upon, and we
- * need to skip them.
+ * need to skip them. Preserve @got for the eventual CoW fork
+ * deletion; from now on @del represents the mapping that we're
+ * actually remapping.
*/
- if (!xfs_bmap_is_written_extent(&got)) {
- *end_fsb = del.br_startoff;
- goto out_cancel;
+ while (!xfs_bmap_is_written_extent(&got)) {
+ if (!xfs_iext_next_extent(ifp, &icur, &got) ||
+ got.br_startoff >= end_fsb) {
+ *offset_fsb = end_fsb;
+ goto out_cancel;
+ }
}
+ del = got;
- /* Unmap the old blocks in the data fork. */
- rlen = del.br_blockcount;
- error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
+ /* Grab the corresponding mapping in the data fork. */
+ nmaps = 1;
+ error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
+ &nmaps, 0);
if (error)
goto out_cancel;
- /* Trim the extent to whatever got unmapped. */
- xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
- trace_xfs_reflink_cow_remap(ip, &del);
+ /* We can only remap the smaller of the two extent sizes. */
+ data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
+ del.br_blockcount = data.br_blockcount;
+
+ trace_xfs_reflink_cow_remap_from(ip, &del);
+ trace_xfs_reflink_cow_remap_to(ip, &data);
+
+ if (xfs_bmap_is_real_extent(&data)) {
+ /*
+ * If the extent we're remapping is backed by storage (written
+ * or not), unmap the extent and drop its refcount.
+ */
+ xfs_bmap_unmap_extent(tp, ip, &data);
+ xfs_refcount_decrease_extent(tp, &data);
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
+ -data.br_blockcount);
+ } else if (data.br_startblock == DELAYSTARTBLOCK) {
+ int done;
+
+ /*
+ * If the extent we're remapping is a delalloc reservation,
+ * we can use the regular bunmapi function to release the
+ * incore state. Dropping the delalloc reservation takes care
+ * of the quota reservation for us.
+ */
+ error = xfs_bunmapi(NULL, ip, data.br_startoff,
+ data.br_blockcount, 0, 1, &done);
+ if (error)
+ goto out_cancel;
+ ASSERT(done);
+ }
/* Free the CoW orphan record. */
xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
@@ -684,7 +711,7 @@ xfs_reflink_end_cow_extent(
return error;
/* Update the caller about how much progress we made. */
- *end_fsb = del.br_startoff;
+ *offset_fsb = del.br_startoff + del.br_blockcount;
return 0;
out_cancel:
@@ -712,7 +739,7 @@ xfs_reflink_end_cow(
end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
/*
- * Walk backwards until we're out of the I/O range. The loop function
+ * Walk forwards until we've remapped the I/O range. The loop function
* repeatedly cycles the ILOCK to allocate one transaction per remapped
* extent.
*
@@ -744,7 +771,7 @@ xfs_reflink_end_cow(
* blocks will be remapped.
*/
while (end_fsb > offset_fsb && !error)
- error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);
+ error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
if (error)
trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
@@ -1121,6 +1148,8 @@ xfs_reflink_remap_extent(
++iext_delta;
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, iext_delta);
if (error)
goto out_cancel;
@@ -1133,7 +1162,7 @@ xfs_reflink_remap_extent(
xfs_refcount_decrease_extent(tp, &smap);
qdelta -= smap.br_blockcount;
} else if (smap.br_startblock == DELAYSTARTBLOCK) {
- xfs_filblks_t len = smap.br_blockcount;
+ int done;
/*
* If the extent we're unmapping is a delalloc reservation,
@@ -1141,10 +1170,11 @@ xfs_reflink_remap_extent(
* incore state. Dropping the delalloc reservation takes care
* of the quota reservation for us.
*/
- error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
+ error = xfs_bunmapi(NULL, ip, smap.br_startoff,
+ smap.br_blockcount, 0, 1, &done);
if (error)
goto out_cancel;
- ASSERT(len == 0);
+ ASSERT(done);
}
/*
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index a22b2d19ef91..fef92e02f3bb 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
+ kmem_free(ruip->rui_item.li_lv_shadow);
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
@@ -53,10 +54,11 @@ xfs_rui_release(
struct xfs_rui_log_item *ruip)
{
ASSERT(atomic_read(&ruip->rui_refcount) > 0);
- if (atomic_dec_and_test(&ruip->rui_refcount)) {
- xfs_trans_ail_delete(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_rui_item_free(ruip);
- }
+ if (!atomic_dec_and_test(&ruip->rui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&ruip->rui_item, 0);
+ xfs_rui_item_free(ruip);
}
STATIC void
@@ -227,14 +229,24 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
+ kmem_free(rudp->rud_item.li_lv_shadow);
kmem_cache_free(xfs_rud_cache, rudp);
}
+static struct xfs_log_item *
+xfs_rud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &RUD_ITEM(lip)->rud_ruip->rui_item;
+}
+
static const struct xfs_item_ops xfs_rud_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_rud_item_size,
.iop_format = xfs_rud_item_format,
.iop_release = xfs_rud_item_release,
+ .iop_intent = xfs_rud_item_intent,
};
static struct xfs_rud_log_item *
@@ -327,7 +339,7 @@ xfs_trans_log_finish_rmap_update(
* 1.) releases the RUI and frees the RUD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
return error;
@@ -630,6 +642,7 @@ xfs_rui_item_relog(
}
static const struct xfs_item_ops xfs_rui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_rui_item_size,
.iop_format = xfs_rui_item_format,
.iop_unpin = xfs_rui_item_unpin,
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index b8c79ee791af..292d5e54a92c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -806,6 +806,9 @@ xfs_growfs_rt_alloc(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;
@@ -1284,6 +1287,44 @@ xfs_rtmount_init(
return 0;
}
+static int
+xfs_rtalloc_count_frextent(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ uint64_t *valp = priv;
+
+ *valp += rec->ar_extcount;
+ return 0;
+}
+
+/*
+ * Reinitialize the number of free realtime extents from the realtime bitmap.
+ * Callers must ensure that there is no other activity in the filesystem.
+ */
+int
+xfs_rtalloc_reinit_frextents(
+ struct xfs_mount *mp)
+{
+ uint64_t val = 0;
+ int error;
+
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent,
+ &val);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_frextents = val;
+ spin_unlock(&mp->m_sb_lock);
+ percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
+ return 0;
+}
+
/*
* Get the bitmap and summary inodes and the summary cache into the mount
* structure at mount time.
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 91b00289509b..62c7ad79cbb6 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -22,6 +22,7 @@ struct xfs_rtalloc_rec {
};
typedef int (*xfs_rtalloc_query_range_fn)(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv);
@@ -123,27 +124,29 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
-int xfs_rtalloc_query_range(struct xfs_trans *tp,
+int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
const struct xfs_rtalloc_rec *low_rec,
const struct xfs_rtalloc_rec *high_rec,
xfs_rtalloc_query_range_fn fn, void *priv);
-int xfs_rtalloc_query_all(struct xfs_trans *tp,
+int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
void *priv);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
bool *is_free);
+int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
#else
# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
# define xfs_growfs_rt(mp,in) (ENOSYS)
# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS)
-# define xfs_rtalloc_query_all(t,f,p) (ENOSYS)
+# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS)
# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS)
# define xfs_verify_rtbno(m, r) (false)
# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS)
+# define xfs_rtalloc_reinit_frextents(m) (0)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index a276b8111f63..ed18160e6181 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -38,6 +38,8 @@
#include "xfs_pwork.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
+#include "xfs_attr_item.h"
+#include "xfs_xattr.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -843,9 +845,11 @@ xfs_fs_statfs(
if (XFS_IS_REALTIME_MOUNT(mp) &&
(ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
+ s64 freertx;
+
statp->f_blocks = sbp->sb_rblocks;
- statp->f_bavail = statp->f_bfree =
- sbp->sb_frextents * sbp->sb_rextsize;
+ freertx = percpu_counter_sum_positive(&mp->m_frextents);
+ statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize;
}
return 0;
@@ -1015,8 +1019,14 @@ xfs_init_percpu_counters(
if (error)
goto free_fdblocks;
+ error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
+ if (error)
+ goto free_delalloc;
+
return 0;
+free_delalloc:
+ percpu_counter_destroy(&mp->m_delalloc_blks);
free_fdblocks:
percpu_counter_destroy(&mp->m_fdblocks);
free_ifree:
@@ -1033,6 +1043,7 @@ xfs_reinit_percpu_counters(
percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
+ percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
}
static void
@@ -1045,6 +1056,7 @@ xfs_destroy_percpu_counters(
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
percpu_counter_destroy(&mp->m_delalloc_blks);
+ percpu_counter_destroy(&mp->m_frextents);
}
static int
@@ -1635,6 +1647,10 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
+ if (xfs_has_large_extent_counts(mp))
+ xfs_warn(mp,
+ "EXPERIMENTAL Large extent counts feature in use. Use at your own risk!");
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -2065,8 +2081,24 @@ xfs_init_caches(void)
if (!xfs_bui_cache)
goto out_destroy_bud_cache;
+ xfs_attrd_cache = kmem_cache_create("xfs_attrd_item",
+ sizeof(struct xfs_attrd_log_item),
+ 0, 0, NULL);
+ if (!xfs_attrd_cache)
+ goto out_destroy_bui_cache;
+
+ xfs_attri_cache = kmem_cache_create("xfs_attri_item",
+ sizeof(struct xfs_attri_log_item),
+ 0, 0, NULL);
+ if (!xfs_attri_cache)
+ goto out_destroy_attrd_cache;
+
return 0;
+ out_destroy_attrd_cache:
+ kmem_cache_destroy(xfs_attrd_cache);
+ out_destroy_bui_cache:
+ kmem_cache_destroy(xfs_bui_cache);
out_destroy_bud_cache:
kmem_cache_destroy(xfs_bud_cache);
out_destroy_cui_cache:
@@ -2113,6 +2145,8 @@ xfs_destroy_caches(void)
* destroy caches.
*/
rcu_barrier();
+ kmem_cache_destroy(xfs_attri_cache);
+ kmem_cache_destroy(xfs_attrd_cache);
kmem_cache_destroy(xfs_bui_cache);
kmem_cache_destroy(xfs_bud_cache);
kmem_cache_destroy(xfs_cui_cache);
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 167d23f92ffe..3cd5a51bace1 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -91,7 +91,6 @@ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
xfs_agnumber_t agcount);
extern const struct export_operations xfs_export_operations;
-extern const struct xattr_handler *xfs_xattr_handlers[];
extern const struct quotactl_ops xfs_quotactl_operations;
extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index affbedf78160..4145ba872547 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -226,11 +226,6 @@ xfs_symlink(
goto out_trans_cancel;
}
- error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto out_trans_cancel;
-
/*
* Allocate an inode for the symlink.
*/
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 7692e76ead33..f78ad6b10ea5 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -83,6 +83,7 @@ extern xfs_param_t xfs_params;
struct xfs_globals {
#ifdef DEBUG
int pwork_threads; /* parallel workqueue threads */
+ bool larp; /* log attribute replay */
#endif
int log_recovery_delay; /* log recovery delay (secs) */
int mount_delay; /* mount setup delay (secs) */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 574b80c29fe1..f7faf6e70d7f 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -228,6 +228,29 @@ pwork_threads_show(
return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads);
}
XFS_SYSFS_ATTR_RW(pwork_threads);
+
+static ssize_t
+larp_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &xfs_globals.larp);
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+STATIC ssize_t
+larp_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.larp);
+}
+XFS_SYSFS_ATTR_RW(larp);
#endif /* DEBUG */
static struct attribute *xfs_dbg_attrs[] = {
@@ -237,6 +260,7 @@ static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(always_cow),
#ifdef DEBUG
ATTR_LIST(pwork_threads),
+ ATTR_LIST(larp),
#endif
NULL,
};
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index b141ef78c755..d32026585c1b 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -418,6 +418,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__field(unsigned, lockval)
__field(unsigned, flags)
__field(unsigned long, caller_ip)
+ __field(const void *, buf_ops)
),
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
@@ -428,9 +429,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
__entry->caller_ip = caller_ip;
+ __entry->buf_ops = bp->b_ops;
),
TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
- "lock %d flags %s caller %pS",
+ "lock %d flags %s bufops %pS caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->bno,
__entry->nblks,
@@ -438,6 +440,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->pincount,
__entry->lockval,
__print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+ __entry->buf_ops,
(void *)__entry->caller_ip)
)
@@ -1096,22 +1099,6 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_done);
DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before);
DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after);
-#define XFS_QMOPT_FLAGS \
- { XFS_QMOPT_UQUOTA, "UQUOTA" }, \
- { XFS_QMOPT_PQUOTA, "PQUOTA" }, \
- { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \
- { XFS_QMOPT_SBVERSION, "SBVERSION" }, \
- { XFS_QMOPT_GQUOTA, "GQUOTA" }, \
- { XFS_QMOPT_INHERIT, "INHERIT" }, \
- { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \
- { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \
- { XFS_QMOPT_BCOUNT, "BCOUNT" }, \
- { XFS_QMOPT_ICOUNT, "ICOUNT" }, \
- { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \
- { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \
- { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \
- { XFS_QMOPT_RES_INOS, "RES_INOS" }
-
TRACE_EVENT(xfs_trans_mod_dquot,
TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp,
unsigned int field, int64_t delta),
@@ -1348,6 +1335,9 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin);
DECLARE_EVENT_CLASS(xfs_ail_class,
TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
@@ -1924,7 +1914,7 @@ DECLARE_EVENT_CLASS(xfs_da_class,
__field(int, namelen)
__field(xfs_dahash_t, hashval)
__field(xfs_ino_t, inumber)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
),
TP_fast_assign(
__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
@@ -1990,7 +1980,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__field(xfs_dahash_t, hashval)
__field(unsigned int, attr_filter)
__field(unsigned int, attr_flags)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
),
TP_fast_assign(
__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
@@ -2097,7 +2087,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_space_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
__field(int, idx)
),
TP_fast_assign(
@@ -2128,7 +2118,7 @@ TRACE_EVENT(xfs_dir2_leafn_moveents,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
__field(int, src_idx)
__field(int, dst_idx)
__field(int, count)
@@ -2169,7 +2159,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__field(int, which)
__field(xfs_ino_t, ino)
__field(int, format)
- __field(int, nex)
+ __field(xfs_extnum_t, nex)
__field(int, broot_size)
__field(int, fork_off)
),
@@ -2182,7 +2172,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__entry->broot_size = ip->i_df.if_broot_bytes;
__entry->fork_off = XFS_IFORK_BOFF(ip);
),
- TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
+ TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, "
"broot size %d, forkoff 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
@@ -3418,7 +3408,8 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
@@ -3513,7 +3504,7 @@ DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key);
DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key);
DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping);
-TRACE_EVENT(xfs_trans_resv_calc,
+DECLARE_EVENT_CLASS(xfs_trans_resv_class,
TP_PROTO(struct xfs_mount *mp, unsigned int type,
struct xfs_trans_res *res),
TP_ARGS(mp, type, res),
@@ -3537,6 +3528,33 @@ TRACE_EVENT(xfs_trans_resv_calc,
__entry->logres,
__entry->logcount,
__entry->logflags)
+)
+
+#define DEFINE_TRANS_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_trans_resv_class, name, \
+ TP_PROTO(struct xfs_mount *mp, unsigned int type, \
+ struct xfs_trans_res *res), \
+ TP_ARGS(mp, type, res))
+DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc);
+DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc_minlogsize);
+
+TRACE_EVENT(xfs_log_get_max_trans_res,
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_trans_res *res),
+ TP_ARGS(mp, res),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint, logres)
+ __field(int, logcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->logres = res->tr_logres;
+ __entry->logcount = res->tr_logcount;
+ ),
+ TP_printk("dev %d:%d logres %u logcount %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->logres,
+ __entry->logcount)
);
DECLARE_EVENT_CLASS(xfs_trans_class,
@@ -4111,6 +4129,27 @@ DEFINE_ICLOG_EVENT(xlog_iclog_want_sync);
DEFINE_ICLOG_EVENT(xlog_iclog_wait_on);
DEFINE_ICLOG_EVENT(xlog_iclog_write);
+TRACE_DEFINE_ENUM(XFS_DAS_UNINIT);
+TRACE_DEFINE_ENUM(XFS_DAS_SF_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_SF_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_OLD);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_ATTR);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_SET_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REPLACE);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_OLD);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_ATTR);
+TRACE_DEFINE_ENUM(XFS_DAS_DONE);
+
DECLARE_EVENT_CLASS(xfs_das_state_class,
TP_PROTO(int das, struct xfs_inode *ip),
TP_ARGS(das, ip),
@@ -4122,8 +4161,9 @@ DECLARE_EVENT_CLASS(xfs_das_state_class,
__entry->das = das;
__entry->ino = ip->i_ino;
),
- TP_printk("state change %d ino 0x%llx",
- __entry->das, __entry->ino)
+ TP_printk("state change %s ino 0x%llx",
+ __print_symbolic(__entry->das, XFS_DAS_STRINGS),
+ __entry->ino)
)
#define DEFINE_DAS_STATE_EVENT(name) \
@@ -4132,9 +4172,15 @@ DEFINE_EVENT(xfs_das_state_class, name, \
TP_ARGS(das, ip))
DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove);
+
TRACE_EVENT(xfs_force_shutdown,
TP_PROTO(struct xfs_mount *mp, int ptag, int flags, const char *fname,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 0ac717aad380..82cf0189c0db 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -32,7 +32,6 @@ static void
xfs_trans_trace_reservations(
struct xfs_mount *mp)
{
- struct xfs_trans_res resv;
struct xfs_trans_res *res;
struct xfs_trans_res *end_res;
int i;
@@ -41,8 +40,6 @@ xfs_trans_trace_reservations(
end_res = (struct xfs_trans_res *)(M_RES(mp) + 1);
for (i = 0; res < end_res; i++, res++)
trace_xfs_trans_resv_calc(mp, i, res);
- xfs_log_get_max_trans_res(mp, &resv);
- trace_xfs_trans_resv_calc(mp, -1, &resv);
}
#else
# define xfs_trans_trace_reservations(mp)
@@ -194,11 +191,9 @@ xfs_trans_reserve(
ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
error = xfs_log_regrant(mp, tp->t_ticket);
} else {
- error = xfs_log_reserve(mp,
- resp->tr_logres,
+ error = xfs_log_reserve(mp, resp->tr_logres,
resp->tr_logcount,
- &tp->t_ticket, XFS_TRANSACTION,
- permanent);
+ &tp->t_ticket, permanent);
}
if (error)
@@ -498,10 +493,31 @@ xfs_trans_apply_sb_deltas(
be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta);
}
- if (tp->t_frextents_delta)
- be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta);
- if (tp->t_res_frextents_delta)
- be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta);
+ /*
+ * Updating frextents requires careful handling because it does not
+ * behave like the lazysb counters because we cannot rely on log
+ * recovery in older kenels to recompute the value from the rtbitmap.
+ * This means that the ondisk frextents must be consistent with the
+ * rtbitmap.
+ *
+ * Therefore, log the frextents change to the ondisk superblock and
+ * update the incore superblock so that future calls to xfs_log_sb
+ * write the correct value ondisk.
+ *
+ * Don't touch m_frextents because it includes incore reservations,
+ * and those are handled by the unreserve function.
+ */
+ if (tp->t_frextents_delta || tp->t_res_frextents_delta) {
+ struct xfs_mount *mp = tp->t_mountp;
+ int64_t rtxdelta;
+
+ rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta;
+
+ spin_lock(&mp->m_sb_lock);
+ be64_add_cpu(&sbp->sb_frextents, rtxdelta);
+ mp->m_sb.sb_frextents += rtxdelta;
+ spin_unlock(&mp->m_sb_lock);
+ }
if (tp->t_dblocks_delta) {
be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
@@ -614,7 +630,12 @@ xfs_trans_unreserve_and_mod_sb(
if (ifreedelta)
percpu_counter_add(&mp->m_ifree, ifreedelta);
- if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
+ if (rtxdelta) {
+ error = xfs_mod_frextents(mp, rtxdelta);
+ ASSERT(!error);
+ }
+
+ if (!(tp->t_flags & XFS_TRANS_SB_DIRTY))
return;
/* apply remaining deltas */
@@ -622,7 +643,12 @@ xfs_trans_unreserve_and_mod_sb(
mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta;
mp->m_sb.sb_icount += idelta;
mp->m_sb.sb_ifree += ifreedelta;
- mp->m_sb.sb_frextents += rtxdelta;
+ /*
+ * Do not touch sb_frextents here because we are dealing with incore
+ * reservation. sb_frextents is not part of the lazy sb counters so it
+ * must be consistent with the ondisk rtbitmap and must never include
+ * incore reservations.
+ */
mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
mp->m_sb.sb_agcount += tp->t_agcount_delta;
mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 0c82673238f4..9561f193e7e1 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -55,13 +55,15 @@ struct xfs_log_item {
#define XFS_LI_IN_AIL 0
#define XFS_LI_ABORTED 1
#define XFS_LI_FAILED 2
-#define XFS_LI_DIRTY 3 /* log item dirty in transaction */
+#define XFS_LI_DIRTY 3
+#define XFS_LI_WHITEOUT 4
#define XFS_LI_FLAGS \
- { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \
- { (1 << XFS_LI_ABORTED), "ABORTED" }, \
- { (1 << XFS_LI_FAILED), "FAILED" }, \
- { (1 << XFS_LI_DIRTY), "DIRTY" }
+ { (1u << XFS_LI_IN_AIL), "IN_AIL" }, \
+ { (1u << XFS_LI_ABORTED), "ABORTED" }, \
+ { (1u << XFS_LI_FAILED), "FAILED" }, \
+ { (1u << XFS_LI_DIRTY), "DIRTY" }, \
+ { (1u << XFS_LI_WHITEOUT), "WHITEOUT" }
struct xfs_item_ops {
unsigned flags;
@@ -78,30 +80,32 @@ struct xfs_item_ops {
bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
struct xfs_trans *tp);
+ struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done);
};
-/* Is this log item a deferred action intent? */
+/*
+ * Log item ops flags
+ */
+/*
+ * Release the log item when the journal commits instead of inserting into the
+ * AIL for writeback tracking and/or log tail pinning.
+ */
+#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0)
+#define XFS_ITEM_INTENT (1 << 1)
+#define XFS_ITEM_INTENT_DONE (1 << 2)
+
static inline bool
xlog_item_is_intent(struct xfs_log_item *lip)
{
- return lip->li_ops->iop_recover != NULL &&
- lip->li_ops->iop_match != NULL;
+ return lip->li_ops->flags & XFS_ITEM_INTENT;
}
-/* Is this a log intent-done item? */
static inline bool
xlog_item_is_intent_done(struct xfs_log_item *lip)
{
- return lip->li_ops->iop_unpin == NULL &&
- lip->li_ops->iop_push == NULL;
+ return lip->li_ops->flags & XFS_ITEM_INTENT_DONE;
}
-/*
- * Release the log item as soon as committed. This is for items just logging
- * intents that never need to be written back in place.
- */
-#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0)
-
void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
int type, const struct xfs_item_ops *ops);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 9ba7e6b9bed3..aa00cf67ad72 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -597,13 +597,11 @@ xfs_dqresv_check(
if (softlimit && total_count > softlimit) {
time64_t now = ktime_get_real_seconds();
- if ((res->timer != 0 && now > res->timer) ||
- (res->warnings != 0 && res->warnings >= qlim->warn)) {
+ if (res->timer != 0 && now > res->timer) {
*fatal = true;
return QUOTA_NL_ISOFTLONGWARN;
}
- res->warnings++;
return QUOTA_NL_ISOFTWARN;
}
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 0d050f8829ef..c325a28b89a8 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -12,12 +12,104 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_acl.h"
-#include "xfs_da_btree.h"
+#include "xfs_log.h"
+#include "xfs_xattr.h"
#include <linux/posix_acl_xattr.h>
+/*
+ * Get permission to use log-assisted atomic exchange of file extents.
+ *
+ * Callers must not be running any transactions or hold any inode locks, and
+ * they must release the permission by calling xlog_drop_incompat_feat
+ * when they're done.
+ */
+static inline int
+xfs_attr_grab_log_assist(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+
+ /*
+ * Protect ourselves from an idle log clearing the logged xattrs log
+ * incompat feature bit.
+ */
+ xlog_use_incompat_feat(mp->m_log);
+
+ /*
+ * If log-assisted xattrs are already enabled, the caller can use the
+ * log assisted swap functions with the log-incompat reference we got.
+ */
+ if (xfs_sb_version_haslogxattrs(&mp->m_sb))
+ return 0;
+
+ /* Enable log-assisted xattrs. */
+ error = xfs_add_incompat_log_feature(mp,
+ XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
+ if (error)
+ goto drop_incompat;
+
+ xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP,
+ "EXPERIMENTAL logged extended attributes feature in use. Use at your own risk!");
+
+ return 0;
+drop_incompat:
+ xlog_drop_incompat_feat(mp->m_log);
+ return error;
+}
+
+static inline void
+xfs_attr_rele_log_assist(
+ struct xfs_mount *mp)
+{
+ xlog_drop_incompat_feat(mp->m_log);
+}
+
+static inline bool
+xfs_attr_want_log_assist(
+ struct xfs_mount *mp)
+{
+#ifdef DEBUG
+ /* Logged xattrs require a V5 super for log_incompat */
+ return xfs_has_crc(mp) && xfs_globals.larp;
+#else
+ return false;
+#endif
+}
+
+/*
+ * Set or remove an xattr, having grabbed the appropriate logging resources
+ * prior to calling libxfs.
+ */
+int
+xfs_attr_change(
+ struct xfs_da_args *args)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+ bool use_logging = false;
+ int error;
+
+ ASSERT(!(args->op_flags & XFS_DA_OP_LOGGED));
+
+ if (xfs_attr_want_log_assist(mp)) {
+ error = xfs_attr_grab_log_assist(mp);
+ if (error)
+ return error;
+
+ args->op_flags |= XFS_DA_OP_LOGGED;
+ use_logging = true;
+ }
+
+ error = xfs_attr_set(args);
+
+ if (use_logging)
+ xfs_attr_rele_log_assist(mp);
+ return error;
+}
+
static int
xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
@@ -56,7 +148,7 @@ xfs_xattr_set(const struct xattr_handler *handler,
};
int error;
- error = xfs_attr_set(&args);
+ error = xfs_attr_change(&args);
if (!error && (handler->flags & XFS_ATTR_ROOT))
xfs_forget_acl(inode, name);
return error;
diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h
new file mode 100644
index 000000000000..2b09133b1b9b
--- /dev/null
+++ b/fs/xfs/xfs_xattr.h
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_XATTR_H__
+#define __XFS_XATTR_H__
+
+int xfs_attr_change(struct xfs_da_args *args);
+
+extern const struct xattr_handler *xfs_xattr_handlers[];
+
+#endif /* __XFS_XATTR_H__ */
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index bcb21aea990a..053299758deb 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -110,15 +110,51 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
}
}
-static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
- unsigned int flags, struct iomap *iomap,
- struct iomap *srcmap)
+static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
struct super_block *sb = inode->i_sb;
loff_t isize;
- /* All I/Os should always be within the file maximum size */
+ /*
+ * All blocks are always mapped below EOF. If reading past EOF,
+ * act as if there is a hole up to the file maximum size.
+ */
+ mutex_lock(&zi->i_truncate_mutex);
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
+ isize = i_size_read(inode);
+ if (iomap->offset >= isize) {
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->length = length;
+ } else {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
+ iomap->length = isize - iomap->offset;
+ }
+ mutex_unlock(&zi->i_truncate_mutex);
+
+ trace_zonefs_iomap_begin(inode, iomap);
+
+ return 0;
+}
+
+static const struct iomap_ops zonefs_read_iomap_ops = {
+ .iomap_begin = zonefs_read_iomap_begin,
+};
+
+static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ loff_t isize;
+
+ /* All write I/Os should always be within the file maximum size */
if (WARN_ON_ONCE(offset + length > zi->i_max_size))
return -EIO;
@@ -128,7 +164,7 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
* operation.
*/
if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
- (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)))
+ !(flags & IOMAP_DIRECT)))
return -EIO;
/*
@@ -137,47 +173,44 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
* write pointer) and unwriten beyond.
*/
mutex_lock(&zi->i_truncate_mutex);
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
+ iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
isize = i_size_read(inode);
- if (offset >= isize)
+ if (iomap->offset >= isize) {
iomap->type = IOMAP_UNWRITTEN;
- else
+ iomap->length = zi->i_max_size - iomap->offset;
+ } else {
iomap->type = IOMAP_MAPPED;
- if (flags & IOMAP_WRITE)
- length = zi->i_max_size - offset;
- else
- length = min(length, isize - offset);
+ iomap->length = isize - iomap->offset;
+ }
mutex_unlock(&zi->i_truncate_mutex);
- iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
- iomap->length = ALIGN(offset + length, sb->s_blocksize) - iomap->offset;
- iomap->bdev = inode->i_sb->s_bdev;
- iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
-
trace_zonefs_iomap_begin(inode, iomap);
return 0;
}
-static const struct iomap_ops zonefs_iomap_ops = {
- .iomap_begin = zonefs_iomap_begin,
+static const struct iomap_ops zonefs_write_iomap_ops = {
+ .iomap_begin = zonefs_write_iomap_begin,
};
static int zonefs_read_folio(struct file *unused, struct folio *folio)
{
- return iomap_read_folio(folio, &zonefs_iomap_ops);
+ return iomap_read_folio(folio, &zonefs_read_iomap_ops);
}
static void zonefs_readahead(struct readahead_control *rac)
{
- iomap_readahead(rac, &zonefs_iomap_ops);
+ iomap_readahead(rac, &zonefs_read_iomap_ops);
}
/*
* Map blocks for page writeback. This is used only on conventional zone files,
* which implies that the page range can only be within the fixed inode size.
*/
-static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset)
+static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct inode *inode, loff_t offset)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
@@ -191,12 +224,12 @@ static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc,
offset < wpc->iomap.offset + wpc->iomap.length)
return 0;
- return zonefs_iomap_begin(inode, offset, zi->i_max_size - offset,
- IOMAP_WRITE, &wpc->iomap, NULL);
+ return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset,
+ IOMAP_WRITE, &wpc->iomap, NULL);
}
static const struct iomap_writeback_ops zonefs_writeback_ops = {
- .map_blocks = zonefs_map_blocks,
+ .map_blocks = zonefs_write_map_blocks,
};
static int zonefs_writepage(struct page *page, struct writeback_control *wbc)
@@ -226,7 +259,8 @@ static int zonefs_swap_activate(struct swap_info_struct *sis,
return -EINVAL;
}
- return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops);
+ return iomap_swapfile_activate(sis, swap_file, span,
+ &zonefs_read_iomap_ops);
}
static const struct address_space_operations zonefs_file_aops = {
@@ -647,7 +681,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
/* Serialize against truncates */
filemap_invalidate_lock_shared(inode->i_mapping);
- ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);
@@ -899,7 +933,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
if (append)
ret = zonefs_file_dio_append(iocb, from);
else
- ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
+ ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
&zonefs_write_dio_ops, 0, NULL, 0);
if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
(ret > 0 || ret == -EIOCBQUEUED)) {
@@ -948,7 +982,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
if (ret <= 0)
goto inode_unlock;
- ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops);
+ ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
if (ret > 0)
iocb->ki_pos += ret;
else if (ret == -EIO)
@@ -1041,7 +1075,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
goto inode_unlock;
}
file_accessed(iocb->ki_filp);
- ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
+ ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
&zonefs_read_dio_ops, 0, NULL, 0);
} else {
ret = generic_file_read_iter(iocb, to);
@@ -1085,7 +1119,8 @@ static int zonefs_seq_file_write_open(struct inode *inode)
if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
- if (wro > sbi->s_max_wro_seq_files) {
+ if (sbi->s_max_wro_seq_files
+ && wro > sbi->s_max_wro_seq_files) {
atomic_dec(&sbi->s_wro_seq_files);
ret = -EBUSY;
goto unlock;
@@ -1760,12 +1795,6 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
atomic_set(&sbi->s_wro_seq_files, 0);
sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev);
- if (!sbi->s_max_wro_seq_files &&
- sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
- zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
- sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
- }
-
atomic_set(&sbi->s_active_seq_files, 0);
sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev);
@@ -1790,6 +1819,14 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
zonefs_info(sb, "Mounting %u zones",
blkdev_nr_zones(sb->s_bdev->bd_disk));
+ if (!sbi->s_max_wro_seq_files &&
+ !sbi->s_max_active_seq_files &&
+ sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
+ zonefs_info(sb,
+ "No open and active zone limits. Ignoring explicit_open mount option\n");
+ sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
+ }
+
/* Create root directory inode */
ret = -ENOMEM;
inode = new_inode(sb);