summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs_vfs.h1
-rw-r--r--fs/9p/vfs_addr.c353
-rw-r--r--fs/9p/vfs_file.c129
-rw-r--r--fs/9p/vfs_inode.c16
-rw-r--r--fs/9p/vfs_inode_dotl.c8
-rw-r--r--fs/9p/vfs_super.c14
-rw-r--r--fs/Kconfig9
-rw-r--r--fs/Makefile4
-rw-r--r--fs/affs/affs.h1
-rw-r--r--fs/affs/super.c2
-rw-r--r--fs/afs/dir.c32
-rw-r--r--fs/afs/dynroot.c11
-rw-r--r--fs/afs/file.c221
-rw-r--r--fs/afs/flock.c60
-rw-r--r--fs/afs/inode.c28
-rw-r--r--fs/afs/internal.h78
-rw-r--r--fs/afs/main.c3
-rw-r--r--fs/afs/proc.c5
-rw-r--r--fs/afs/server.c14
-rw-r--r--fs/afs/super.c2
-rw-r--r--fs/afs/volume.c4
-rw-r--r--fs/afs/write.c826
-rw-r--r--fs/aio.c17
-rw-r--r--fs/anon_inodes.c51
-rw-r--r--fs/attr.c2
-rw-r--r--fs/backing-file.c4
-rw-r--r--fs/bcachefs/Makefile2
-rw-r--r--fs/bcachefs/alloc_background.c91
-rw-r--r--fs/bcachefs/alloc_background_format.h92
-rw-r--r--fs/bcachefs/alloc_foreground.c7
-rw-r--r--fs/bcachefs/backpointers.c108
-rw-r--r--fs/bcachefs/backpointers.h1
-rw-r--r--fs/bcachefs/bcachefs.h17
-rw-r--r--fs/bcachefs/bcachefs_format.h888
-rw-r--r--fs/bcachefs/bkey.c2
-rw-r--r--fs/bcachefs/bkey_methods.c9
-rw-r--r--fs/bcachefs/bkey_methods.h10
-rw-r--r--fs/bcachefs/bset.c7
-rw-r--r--fs/bcachefs/bset.h3
-rw-r--r--fs/bcachefs/btree_cache.c12
-rw-r--r--fs/bcachefs/btree_cache.h19
-rw-r--r--fs/bcachefs/btree_gc.c36
-rw-r--r--fs/bcachefs/btree_io.c38
-rw-r--r--fs/bcachefs/btree_iter.c6
-rw-r--r--fs/bcachefs/btree_iter.h5
-rw-r--r--fs/bcachefs/btree_locking.c44
-rw-r--r--fs/bcachefs/btree_locking.h9
-rw-r--r--fs/bcachefs/btree_trans_commit.c35
-rw-r--r--fs/bcachefs/btree_types.h12
-rw-r--r--fs/bcachefs/btree_update_interior.c11
-rw-r--r--fs/bcachefs/btree_update_interior.h42
-rw-r--r--fs/bcachefs/btree_write_buffer.c7
-rw-r--r--fs/bcachefs/buckets.c148
-rw-r--r--fs/bcachefs/buckets.h17
-rw-r--r--fs/bcachefs/buckets_types.h15
-rw-r--r--fs/bcachefs/clock.c4
-rw-r--r--fs/bcachefs/compress.h8
-rw-r--r--fs/bcachefs/data_update.c6
-rw-r--r--fs/bcachefs/debug.c18
-rw-r--r--fs/bcachefs/dirent_format.h42
-rw-r--r--fs/bcachefs/ec.c6
-rw-r--r--fs/bcachefs/ec_format.h19
-rw-r--r--fs/bcachefs/extents.c11
-rw-r--r--fs/bcachefs/extents.h2
-rw-r--r--fs/bcachefs/extents_format.h295
-rw-r--r--fs/bcachefs/eytzinger.h4
-rw-r--r--fs/bcachefs/fs-io-buffered.c21
-rw-r--r--fs/bcachefs/fs-io-direct.c6
-rw-r--r--fs/bcachefs/fs-io-pagecache.c37
-rw-r--r--fs/bcachefs/fs-io-pagecache.h2
-rw-r--r--fs/bcachefs/fs-io.c9
-rw-r--r--fs/bcachefs/fs-ioctl.c15
-rw-r--r--fs/bcachefs/fs.c9
-rw-r--r--fs/bcachefs/fsck.c23
-rw-r--r--fs/bcachefs/inode.c29
-rw-r--r--fs/bcachefs/inode_format.h166
-rw-r--r--fs/bcachefs/io_misc.c4
-rw-r--r--fs/bcachefs/io_write.c14
-rw-r--r--fs/bcachefs/journal.c111
-rw-r--r--fs/bcachefs/journal_io.c12
-rw-r--r--fs/bcachefs/journal_reclaim.c10
-rw-r--r--fs/bcachefs/logged_ops_format.h30
-rw-r--r--fs/bcachefs/mean_and_variance.h2
-rw-r--r--fs/bcachefs/move.c65
-rw-r--r--fs/bcachefs/opts.c4
-rw-r--r--fs/bcachefs/opts.h9
-rw-r--r--fs/bcachefs/printbuf.c1
-rw-r--r--fs/bcachefs/quota_format.h47
-rw-r--r--fs/bcachefs/rebalance.c13
-rw-r--r--fs/bcachefs/recovery.c13
-rw-r--r--fs/bcachefs/reflink.c21
-rw-r--r--fs/bcachefs/reflink.h8
-rw-r--r--fs/bcachefs/reflink_format.h33
-rw-r--r--fs/bcachefs/replicas.c28
-rw-r--r--fs/bcachefs/sb-clean.c2
-rw-r--r--fs/bcachefs/sb-counters.c (renamed from fs/bcachefs/counters.c)2
-rw-r--r--fs/bcachefs/sb-counters.h (renamed from fs/bcachefs/counters.h)7
-rw-r--r--fs/bcachefs/sb-counters_format.h98
-rw-r--r--fs/bcachefs/sb-members.c6
-rw-r--r--fs/bcachefs/snapshot.c6
-rw-r--r--fs/bcachefs/snapshot_format.h36
-rw-r--r--fs/bcachefs/str_hash.h22
-rw-r--r--fs/bcachefs/subvolume_format.h35
-rw-r--r--fs/bcachefs/super-io.c28
-rw-r--r--fs/bcachefs/super.c14
-rw-r--r--fs/bcachefs/super_types.h2
-rw-r--r--fs/bcachefs/sysfs.c15
-rw-r--r--fs/bcachefs/thread_with_file.c2
-rw-r--r--fs/bcachefs/trace.h76
-rw-r--r--fs/bcachefs/util.c32
-rw-r--r--fs/bcachefs/util.h7
-rw-r--r--fs/bcachefs/xattr.c5
-rw-r--r--fs/bcachefs/xattr_format.h19
-rw-r--r--fs/btrfs/block-group.c80
-rw-r--r--fs/btrfs/block-group.h7
-rw-r--r--fs/btrfs/block-rsv.c2
-rw-r--r--fs/btrfs/block-rsv.h32
-rw-r--r--fs/btrfs/compression.c23
-rw-r--r--fs/btrfs/compression.h4
-rw-r--r--fs/btrfs/defrag.c2
-rw-r--r--fs/btrfs/delalloc-space.c29
-rw-r--r--fs/btrfs/dev-replace.c38
-rw-r--r--fs/btrfs/disk-io.c29
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-tree.c53
-rw-r--r--fs/btrfs/extent_io.c186
-rw-r--r--fs/btrfs/inode.c70
-rw-r--r--fs/btrfs/ioctl.c30
-rw-r--r--fs/btrfs/lzo.c34
-rw-r--r--fs/btrfs/qgroup.c14
-rw-r--r--fs/btrfs/ref-verify.c6
-rw-r--r--fs/btrfs/scrub.c36
-rw-r--r--fs/btrfs/send.c23
-rw-r--r--fs/btrfs/space-info.c26
-rw-r--r--fs/btrfs/subpage.c3
-rw-r--r--fs/btrfs/super.c8
-rw-r--r--fs/btrfs/transaction.c40
-rw-r--r--fs/btrfs/tree-checker.c2
-rw-r--r--fs/btrfs/volumes.c94
-rw-r--r--fs/btrfs/volumes.h4
-rw-r--r--fs/btrfs/zlib.c73
-rw-r--r--fs/btrfs/zoned.c53
-rw-r--r--fs/buffer.c22
-rw-r--r--fs/cachefiles/Kconfig2
-rw-r--r--fs/cachefiles/cache.c2
-rw-r--r--fs/cachefiles/daemon.c1
-rw-r--r--fs/cachefiles/internal.h2
-rw-r--r--fs/cachefiles/io.c34
-rw-r--r--fs/cachefiles/ondemand.c5
-rw-r--r--fs/ceph/Kconfig1
-rw-r--r--fs/ceph/addr.c33
-rw-r--r--fs/ceph/cache.h45
-rw-r--r--fs/ceph/caps.c83
-rw-r--r--fs/ceph/dir.c21
-rw-r--r--fs/ceph/export.c2
-rw-r--r--fs/ceph/file.c8
-rw-r--r--fs/ceph/inode.c6
-rw-r--r--fs/ceph/locks.c74
-rw-r--r--fs/ceph/mds_client.c92
-rw-r--r--fs/ceph/mds_client.h7
-rw-r--r--fs/ceph/mdsmap.c7
-rw-r--r--fs/ceph/mdsmap.h6
-rw-r--r--fs/ceph/quota.c39
-rw-r--r--fs/ceph/super.h16
-rw-r--r--fs/coda/inode.c143
-rw-r--r--fs/coredump.c45
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/crypto/fname.c8
-rw-r--r--fs/crypto/hooks.c15
-rw-r--r--fs/dcache.c7
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/dlm/plock.c44
-rw-r--r--fs/ecryptfs/crypto.c10
-rw-r--r--fs/efivarfs/internal.h2
-rw-r--r--fs/efivarfs/super.c7
-rw-r--r--fs/efivarfs/vars.c23
-rw-r--r--fs/efs/super.c118
-rw-r--r--fs/erofs/Kconfig7
-rw-r--r--fs/erofs/compress.h5
-rw-r--r--fs/erofs/data.c7
-rw-r--r--fs/erofs/decompressor.c10
-rw-r--r--fs/erofs/decompressor_deflate.c19
-rw-r--r--fs/erofs/decompressor_lzma.c17
-rw-r--r--fs/erofs/fscache.c15
-rw-r--r--fs/erofs/inode.c2
-rw-r--r--fs/erofs/internal.h2
-rw-r--r--fs/erofs/namei.c28
-rw-r--r--fs/erofs/super.c16
-rw-r--r--fs/erofs/utils.c2
-rw-r--r--fs/erofs/zdata.c98
-rw-r--r--fs/erofs/zmap.c23
-rw-r--r--fs/eventfd.c16
-rw-r--r--fs/eventpoll.c16
-rw-r--r--fs/exec.c107
-rw-r--r--fs/exfat/exfat_fs.h1
-rw-r--r--fs/exfat/file.c37
-rw-r--r--fs/exfat/inode.c7
-rw-r--r--fs/exfat/nls.c14
-rw-r--r--fs/exfat/super.c20
-rw-r--r--fs/exportfs/expfs.c2
-rw-r--r--fs/ext4/ext4.h10
-rw-r--r--fs/ext4/extents.c124
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/fsmap.c8
-rw-r--r--fs/ext4/indirect.c2
-rw-r--r--fs/ext4/inode.c90
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/mballoc.c140
-rw-r--r--fs/ext4/mballoc.h1
-rw-r--r--fs/ext4/move_extent.c10
-rw-r--r--fs/ext4/namei.c1
-rw-r--r--fs/ext4/super.c57
-rw-r--r--fs/ext4/symlink.c8
-rw-r--r--fs/f2fs/f2fs.h14
-rw-r--r--fs/f2fs/namei.c1
-rw-r--r--fs/f2fs/segment.c15
-rw-r--r--fs/f2fs/super.c16
-rw-r--r--fs/fat/inode.c3
-rw-r--r--fs/fcntl.c72
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/file_table.c83
-rw-r--r--fs/fs-writeback.c35
-rw-r--r--fs/fs_parser.c4
-rw-r--r--fs/fscache/Kconfig40
-rw-r--r--fs/fscache/Makefile16
-rw-r--r--fs/fscache/internal.h277
-rw-r--r--fs/fuse/cuse.c3
-rw-r--r--fs/fuse/file.c14
-rw-r--r--fs/fuse/fuse_i.h1
-rw-r--r--fs/fuse/inode.c15
-rw-r--r--fs/gfs2/bmap.c2
-rw-r--r--fs/gfs2/dentry.c23
-rw-r--r--fs/gfs2/file.c16
-rw-r--r--fs/gfs2/inode.c8
-rw-r--r--fs/gfs2/ops_fstype.c2
-rw-r--r--fs/hfsplus/hfsplus_fs.h1
-rw-r--r--fs/hfsplus/super.c12
-rw-r--r--fs/hfsplus/wrapper.c2
-rw-r--r--fs/hugetlbfs/inode.c44
-rw-r--r--fs/inode.c8
-rw-r--r--fs/internal.h8
-rw-r--r--fs/ioctl.c33
-rw-r--r--fs/iomap/buffered-io.c579
-rw-r--r--fs/iomap/direct-io.c1
-rw-r--r--fs/iomap/trace.h48
-rw-r--r--fs/jfs/jfs_dmap.c8
-rw-r--r--fs/jfs/jfs_logmgr.c26
-rw-r--r--fs/jfs/jfs_logmgr.h2
-rw-r--r--fs/jfs/jfs_mount.c2
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/kernfs/dir.c62
-rw-r--r--fs/kernfs/file.c2
-rw-r--r--fs/kernfs/mount.c7
-rw-r--r--fs/libfs.c344
-rw-r--r--fs/lockd/clnt4xdr.c14
-rw-r--r--fs/lockd/clntlock.c2
-rw-r--r--fs/lockd/clntproc.c65
-rw-r--r--fs/lockd/clntxdr.c14
-rw-r--r--fs/lockd/svc4proc.c10
-rw-r--r--fs/lockd/svclock.c64
-rw-r--r--fs/lockd/svcproc.c10
-rw-r--r--fs/lockd/svcsubs.c24
-rw-r--r--fs/lockd/xdr.c14
-rw-r--r--fs/lockd/xdr4.c14
-rw-r--r--fs/locks.c894
-rw-r--r--fs/mbcache.c4
-rw-r--r--fs/minix/inode.c2
-rw-r--r--fs/mnt_idmapping.c2
-rw-r--r--fs/mpage.c1
-rw-r--r--fs/namei.c12
-rw-r--r--fs/namespace.c61
-rw-r--r--fs/netfs/Kconfig39
-rw-r--r--fs/netfs/Makefile22
-rw-r--r--fs/netfs/buffered_read.c237
-rw-r--r--fs/netfs/buffered_write.c1257
-rw-r--r--fs/netfs/direct_read.c125
-rw-r--r--fs/netfs/direct_write.c174
-rw-r--r--fs/netfs/fscache_cache.c (renamed from fs/fscache/cache.c)3
-rw-r--r--fs/netfs/fscache_cookie.c (renamed from fs/fscache/cookie.c)0
-rw-r--r--fs/netfs/fscache_internal.h14
-rw-r--r--fs/netfs/fscache_io.c (renamed from fs/fscache/io.c)42
-rw-r--r--fs/netfs/fscache_main.c (renamed from fs/fscache/main.c)25
-rw-r--r--fs/netfs/fscache_proc.c (renamed from fs/fscache/proc.c)23
-rw-r--r--fs/netfs/fscache_stats.c (renamed from fs/fscache/stats.c)13
-rw-r--r--fs/netfs/fscache_volume.c (renamed from fs/fscache/volume.c)0
-rw-r--r--fs/netfs/internal.h284
-rw-r--r--fs/netfs/io.c217
-rw-r--r--fs/netfs/iterator.c97
-rw-r--r--fs/netfs/locking.c216
-rw-r--r--fs/netfs/main.c109
-rw-r--r--fs/netfs/misc.c260
-rw-r--r--fs/netfs/objects.c59
-rw-r--r--fs/netfs/output.c478
-rw-r--r--fs/netfs/stats.c42
-rw-r--r--fs/nfs/Kconfig4
-rw-r--r--fs/nfs/blocklayout/blocklayout.h2
-rw-r--r--fs/nfs/blocklayout/dev.c68
-rw-r--r--fs/nfs/client.c13
-rw-r--r--fs/nfs/delegation.c4
-rw-r--r--fs/nfs/dir.c4
-rw-r--r--fs/nfs/file.c22
-rw-r--r--fs/nfs/fscache.c7
-rw-r--r--fs/nfs/fscache.h2
-rw-r--r--fs/nfs/nfs3proc.c2
-rw-r--r--fs/nfs/nfs4_fs.h2
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4proc.c39
-rw-r--r--fs/nfs/nfs4state.c22
-rw-r--r--fs/nfs/nfs4trace.h4
-rw-r--r--fs/nfs/nfs4xdr.c8
-rw-r--r--fs/nfs/write.c8
-rw-r--r--fs/nfsd/filecache.c4
-rw-r--r--fs/nfsd/nfs4callback.c2
-rw-r--r--fs/nfsd/nfs4layouts.c35
-rw-r--r--fs/nfsd/nfs4state.c161
-rw-r--r--fs/nilfs2/file.c8
-rw-r--r--fs/nilfs2/recovery.c7
-rw-r--r--fs/nilfs2/segment.c8
-rw-r--r--fs/nsfs.c121
-rw-r--r--fs/ntfs/Kconfig81
-rw-r--r--fs/ntfs/Makefile15
-rw-r--r--fs/ntfs/aops.c1744
-rw-r--r--fs/ntfs/aops.h88
-rw-r--r--fs/ntfs/attrib.c2624
-rw-r--r--fs/ntfs/attrib.h102
-rw-r--r--fs/ntfs/bitmap.c179
-rw-r--r--fs/ntfs/bitmap.h104
-rw-r--r--fs/ntfs/collate.c110
-rw-r--r--fs/ntfs/collate.h36
-rw-r--r--fs/ntfs/compress.c950
-rw-r--r--fs/ntfs/debug.c159
-rw-r--r--fs/ntfs/debug.h57
-rw-r--r--fs/ntfs/dir.c1540
-rw-r--r--fs/ntfs/dir.h34
-rw-r--r--fs/ntfs/endian.h79
-rw-r--r--fs/ntfs/file.c1997
-rw-r--r--fs/ntfs/index.c440
-rw-r--r--fs/ntfs/index.h134
-rw-r--r--fs/ntfs/inode.c3102
-rw-r--r--fs/ntfs/inode.h310
-rw-r--r--fs/ntfs/layout.h2421
-rw-r--r--fs/ntfs/lcnalloc.c1000
-rw-r--r--fs/ntfs/lcnalloc.h131
-rw-r--r--fs/ntfs/logfile.c849
-rw-r--r--fs/ntfs/logfile.h295
-rw-r--r--fs/ntfs/malloc.h77
-rw-r--r--fs/ntfs/mft.c2907
-rw-r--r--fs/ntfs/mft.h110
-rw-r--r--fs/ntfs/mst.c189
-rw-r--r--fs/ntfs/namei.c392
-rw-r--r--fs/ntfs/ntfs.h150
-rw-r--r--fs/ntfs/quota.c103
-rw-r--r--fs/ntfs/quota.h21
-rw-r--r--fs/ntfs/runlist.c1893
-rw-r--r--fs/ntfs/runlist.h88
-rw-r--r--fs/ntfs/super.c3202
-rw-r--r--fs/ntfs/sysctl.c58
-rw-r--r--fs/ntfs/sysctl.h27
-rw-r--r--fs/ntfs/time.h89
-rw-r--r--fs/ntfs/types.h55
-rw-r--r--fs/ntfs/unistr.c384
-rw-r--r--fs/ntfs/upcase.c73
-rw-r--r--fs/ntfs/usnjrnl.c70
-rw-r--r--fs/ntfs/usnjrnl.h191
-rw-r--r--fs/ntfs/volume.h164
-rw-r--r--fs/ntfs3/attrib.c45
-rw-r--r--fs/ntfs3/attrlist.c12
-rw-r--r--fs/ntfs3/bitmap.c4
-rw-r--r--fs/ntfs3/dir.c48
-rw-r--r--fs/ntfs3/file.c76
-rw-r--r--fs/ntfs3/frecord.c19
-rw-r--r--fs/ntfs3/fslog.c232
-rw-r--r--fs/ntfs3/fsntfs.c29
-rw-r--r--fs/ntfs3/index.c8
-rw-r--r--fs/ntfs3/inode.c32
-rw-r--r--fs/ntfs3/namei.c14
-rw-r--r--fs/ntfs3/ntfs.h4
-rw-r--r--fs/ntfs3/ntfs_fs.h29
-rw-r--r--fs/ntfs3/record.c18
-rw-r--r--fs/ntfs3/super.c54
-rw-r--r--fs/ntfs3/xattr.c6
-rw-r--r--fs/ocfs2/cluster/heartbeat.c32
-rw-r--r--fs/ocfs2/locks.c12
-rw-r--r--fs/ocfs2/stack_user.c2
-rw-r--r--fs/ocfs2/super.c4
-rw-r--r--fs/open.c55
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/overlayfs/copy_up.c14
-rw-r--r--fs/overlayfs/namei.c43
-rw-r--r--fs/overlayfs/overlayfs.h23
-rw-r--r--fs/overlayfs/ovl_entry.h4
-rw-r--r--fs/overlayfs/params.c14
-rw-r--r--fs/overlayfs/readdir.c7
-rw-r--r--fs/overlayfs/super.c67
-rw-r--r--fs/overlayfs/util.c71
-rw-r--r--fs/pidfs.c290
-rw-r--r--fs/pipe.c81
-rw-r--r--fs/posix_acl.c4
-rw-r--r--fs/proc/array.c66
-rw-r--r--fs/proc/base.c2
-rw-r--r--fs/proc/inode.c21
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/task_mmu.c24
-rw-r--r--fs/qnx4/inode.c47
-rw-r--r--fs/qnx6/inode.c2
-rw-r--r--fs/reiserfs/journal.c38
-rw-r--r--fs/reiserfs/procfs.c2
-rw-r--r--fs/reiserfs/reiserfs.h8
-rw-r--r--fs/reiserfs/super.c1
-rw-r--r--fs/remap_range.c31
-rw-r--r--fs/romfs/super.c6
-rw-r--r--fs/select.c15
-rw-r--r--fs/smb/client/cached_dir.c25
-rw-r--r--fs/smb/client/cifs_debug.c6
-rw-r--r--fs/smb/client/cifsencrypt.c2
-rw-r--r--fs/smb/client/cifsfs.c36
-rw-r--r--fs/smb/client/cifsglob.h24
-rw-r--r--fs/smb/client/cifssmb.c8
-rw-r--r--fs/smb/client/connect.c29
-rw-r--r--fs/smb/client/dfs.c7
-rw-r--r--fs/smb/client/file.c118
-rw-r--r--fs/smb/client/fs_context.c19
-rw-r--r--fs/smb/client/fs_context.h2
-rw-r--r--fs/smb/client/fscache.c2
-rw-r--r--fs/smb/client/inode.c37
-rw-r--r--fs/smb/client/misc.c1
-rw-r--r--fs/smb/client/namespace.c16
-rw-r--r--fs/smb/client/readdir.c29
-rw-r--r--fs/smb/client/sess.c7
-rw-r--r--fs/smb/client/smb2file.c2
-rw-r--r--fs/smb/client/smb2inode.c267
-rw-r--r--fs/smb/client/smb2maperror.c2
-rw-r--r--fs/smb/client/smb2ops.c165
-rw-r--r--fs/smb/client/smb2pdu.c422
-rw-r--r--fs/smb/client/smb2proto.h9
-rw-r--r--fs/smb/client/smb2status.h2
-rw-r--r--fs/smb/client/smbencrypt.c7
-rw-r--r--fs/smb/client/transport.c18
-rw-r--r--fs/smb/server/asn1.c5
-rw-r--r--fs/smb/server/connection.c6
-rw-r--r--fs/smb/server/connection.h2
-rw-r--r--fs/smb/server/ksmbd_netlink.h3
-rw-r--r--fs/smb/server/misc.c1
-rw-r--r--fs/smb/server/oplock.c6
-rw-r--r--fs/smb/server/smb2pdu.c74
-rw-r--r--fs/smb/server/transport_ipc.c4
-rw-r--r--fs/smb/server/transport_rdma.c11
-rw-r--r--fs/smb/server/transport_tcp.c15
-rw-r--r--fs/smb/server/vfs.c14
-rw-r--r--fs/super.c31
-rw-r--r--fs/sysfs/dir.c2
-rw-r--r--fs/sysv/inode.c2
-rw-r--r--fs/sysv/itree.c10
-rw-r--r--fs/tracefs/event_inode.c905
-rw-r--r--fs/tracefs/inode.c281
-rw-r--r--fs/tracefs/internal.h48
-rw-r--r--fs/ubifs/auth.c21
-rw-r--r--fs/ubifs/commit.c13
-rw-r--r--fs/ubifs/dir.c3
-rw-r--r--fs/ubifs/file.c30
-rw-r--r--fs/ubifs/replay.c2
-rw-r--r--fs/ubifs/super.c3
-rw-r--r--fs/userfaultfd.c5
-rw-r--r--fs/xfs/libxfs/xfs_attr.c6
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c14
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.h16
-rw-r--r--fs/xfs/libxfs/xfs_sb.c14
-rw-r--r--fs/xfs/libxfs/xfs_sb.h2
-rw-r--r--fs/xfs/libxfs/xfs_types.h12
-rw-r--r--fs/xfs/scrub/rtbitmap.c1
-rw-r--r--fs/xfs/scrub/rtsummary.c1
-rw-r--r--fs/xfs/xfs_aops.c9
-rw-r--r--fs/xfs/xfs_buf.c10
-rw-r--r--fs/xfs/xfs_buf.h4
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_super.c72
-rw-r--r--fs/zonefs/file.c45
-rw-r--r--fs/zonefs/super.c68
479 files changed, 11922 insertions, 36938 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 731e3d14b67d..0e8418066a48 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -42,6 +42,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb);
void v9fs_free_inode(struct inode *inode);
struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode,
dev_t rdev);
+void v9fs_set_netfs_context(struct inode *inode);
int v9fs_init_inode(struct v9fs_session_info *v9ses,
struct inode *inode, umode_t mode, dev_t rdev);
void v9fs_evict_inode(struct inode *inode);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 8a635999a7d6..047855033d32 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -19,12 +19,45 @@
#include <linux/netfs.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
+#include <trace/events/netfs.h>
#include "v9fs.h"
#include "v9fs_vfs.h"
#include "cache.h"
#include "fid.h"
+static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq)
+{
+ struct p9_fid *fid = subreq->rreq->netfs_priv;
+ int err, len;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
+ netfs_write_subrequest_terminated(subreq, len ?: err, false);
+}
+
+static void v9fs_upload_to_server_worker(struct work_struct *work)
+{
+ struct netfs_io_subrequest *subreq =
+ container_of(work, struct netfs_io_subrequest, work);
+
+ v9fs_upload_to_server(subreq);
+}
+
+/*
+ * Set up write requests for a writeback slice. We need to add a write request
+ * for each write we want to make.
+ */
+static void v9fs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len)
+{
+ struct netfs_io_subrequest *subreq;
+
+ subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER,
+ start, len, v9fs_upload_to_server_worker);
+ if (subreq)
+ netfs_queue_write_request(subreq);
+}
+
/**
* v9fs_issue_read - Issue a read from 9P
* @subreq: The read to make
@@ -33,14 +66,10 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
struct p9_fid *fid = rreq->netfs_priv;
- struct iov_iter to;
- loff_t pos = subreq->start + subreq->transferred;
- size_t len = subreq->len - subreq->transferred;
int total, err;
- iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len);
-
- total = p9_client_read(fid, pos, &to, &err);
+ total = p9_client_read(fid, subreq->start + subreq->transferred,
+ &subreq->io_iter, &err);
/* if we just extended the file size, any portion not in
* cache won't be on server and is zeroes */
@@ -50,25 +79,42 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
}
/**
- * v9fs_init_request - Initialise a read request
+ * v9fs_init_request - Initialise a request
* @rreq: The read request
* @file: The file being read from
*/
static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
{
- struct p9_fid *fid = file->private_data;
-
- BUG_ON(!fid);
+ struct p9_fid *fid;
+ bool writing = (rreq->origin == NETFS_READ_FOR_WRITE ||
+ rreq->origin == NETFS_WRITEBACK ||
+ rreq->origin == NETFS_WRITETHROUGH ||
+ rreq->origin == NETFS_LAUNDER_WRITE ||
+ rreq->origin == NETFS_UNBUFFERED_WRITE ||
+ rreq->origin == NETFS_DIO_WRITE);
+
+ if (file) {
+ fid = file->private_data;
+ if (!fid)
+ goto no_fid;
+ p9_fid_get(fid);
+ } else {
+ fid = v9fs_fid_find_inode(rreq->inode, writing, INVALID_UID, true);
+ if (!fid)
+ goto no_fid;
+ }
/* we might need to read from a fid that was opened write-only
* for read-modify-write of page cache, use the writeback fid
* for that */
- WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE &&
- !(fid->mode & P9_ORDWR));
-
- p9_fid_get(fid);
+ WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && !(fid->mode & P9_ORDWR));
rreq->netfs_priv = fid;
return 0;
+
+no_fid:
+ WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n",
+ rreq->inode->i_ino);
+ return -EINVAL;
}
/**
@@ -82,281 +128,20 @@ static void v9fs_free_request(struct netfs_io_request *rreq)
p9_fid_put(fid);
}
-/**
- * v9fs_begin_cache_operation - Begin a cache operation for a read
- * @rreq: The read request
- */
-static int v9fs_begin_cache_operation(struct netfs_io_request *rreq)
-{
-#ifdef CONFIG_9P_FSCACHE
- struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode));
-
- return fscache_begin_read_operation(&rreq->cache_resources, cookie);
-#else
- return -ENOBUFS;
-#endif
-}
-
const struct netfs_request_ops v9fs_req_ops = {
.init_request = v9fs_init_request,
.free_request = v9fs_free_request,
- .begin_cache_operation = v9fs_begin_cache_operation,
.issue_read = v9fs_issue_read,
+ .create_write_requests = v9fs_create_write_requests,
};
-/**
- * v9fs_release_folio - release the private state associated with a folio
- * @folio: The folio to be released
- * @gfp: The caller's allocation restrictions
- *
- * Returns true if the page can be released, false otherwise.
- */
-
-static bool v9fs_release_folio(struct folio *folio, gfp_t gfp)
-{
- if (folio_test_private(folio))
- return false;
-#ifdef CONFIG_9P_FSCACHE
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
- fscache_note_page_release(v9fs_inode_cookie(V9FS_I(folio_inode(folio))));
-#endif
- return true;
-}
-
-static void v9fs_invalidate_folio(struct folio *folio, size_t offset,
- size_t length)
-{
- folio_wait_fscache(folio);
-}
-
-#ifdef CONFIG_9P_FSCACHE
-static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct v9fs_inode *v9inode = priv;
- __le32 version;
-
- if (IS_ERR_VALUE(transferred_or_error) &&
- transferred_or_error != -ENOBUFS) {
- version = cpu_to_le32(v9inode->qid.version);
- fscache_invalidate(v9fs_inode_cookie(v9inode), &version,
- i_size_read(&v9inode->netfs.inode), 0);
- }
-}
-#endif
-
-static int v9fs_vfs_write_folio_locked(struct folio *folio)
-{
- struct inode *inode = folio_inode(folio);
- loff_t start = folio_pos(folio);
- loff_t i_size = i_size_read(inode);
- struct iov_iter from;
- size_t len = folio_size(folio);
- struct p9_fid *writeback_fid;
- int err;
- struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode);
- struct fscache_cookie __maybe_unused *cookie = v9fs_inode_cookie(v9inode);
-
- if (start >= i_size)
- return 0; /* Simultaneous truncation occurred */
-
- len = min_t(loff_t, i_size - start, len);
-
- iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len);
-
- writeback_fid = v9fs_fid_find_inode(inode, true, INVALID_UID, true);
- if (!writeback_fid) {
- WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n",
- inode->i_private);
- return -EINVAL;
- }
-
- folio_wait_fscache(folio);
- folio_start_writeback(folio);
-
- p9_client_write(writeback_fid, start, &from, &err);
-
-#ifdef CONFIG_9P_FSCACHE
- if (err == 0 &&
- fscache_cookie_enabled(cookie) &&
- test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) {
- folio_start_fscache(folio);
- fscache_write_to_cache(v9fs_inode_cookie(v9inode),
- folio_mapping(folio), start, len, i_size,
- v9fs_write_to_cache_done, v9inode,
- true);
- }
-#endif
-
- folio_end_writeback(folio);
- p9_fid_put(writeback_fid);
-
- return err;
-}
-
-static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct folio *folio = page_folio(page);
- int retval;
-
- p9_debug(P9_DEBUG_VFS, "folio %p\n", folio);
-
- retval = v9fs_vfs_write_folio_locked(folio);
- if (retval < 0) {
- if (retval == -EAGAIN) {
- folio_redirty_for_writepage(wbc, folio);
- retval = 0;
- } else {
- mapping_set_error(folio_mapping(folio), retval);
- }
- } else
- retval = 0;
-
- folio_unlock(folio);
- return retval;
-}
-
-static int v9fs_launder_folio(struct folio *folio)
-{
- int retval;
-
- if (folio_clear_dirty_for_io(folio)) {
- retval = v9fs_vfs_write_folio_locked(folio);
- if (retval)
- return retval;
- }
- folio_wait_fscache(folio);
- return 0;
-}
-
-/**
- * v9fs_direct_IO - 9P address space operation for direct I/O
- * @iocb: target I/O control block
- * @iter: The data/buffer to use
- *
- * The presence of v9fs_direct_IO() in the address space ops vector
- * allowes open() O_DIRECT flags which would have failed otherwise.
- *
- * In the non-cached mode, we shunt off direct read and write requests before
- * the VFS gets them, so this method should never be called.
- *
- * Direct IO is not 'yet' supported in the cached mode. Hence when
- * this routine is called through generic_file_aio_read(), the read/write fails
- * with an error.
- *
- */
-static ssize_t
-v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- loff_t pos = iocb->ki_pos;
- ssize_t n;
- int err = 0;
-
- if (iov_iter_rw(iter) == WRITE) {
- n = p9_client_write(file->private_data, pos, iter, &err);
- if (n) {
- struct inode *inode = file_inode(file);
- loff_t i_size = i_size_read(inode);
-
- if (pos + n > i_size)
- inode_add_bytes(inode, pos + n - i_size);
- }
- } else {
- n = p9_client_read(file->private_data, pos, iter, &err);
- }
- return n ? n : err;
-}
-
-static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned int len,
- struct page **subpagep, void **fsdata)
-{
- int retval;
- struct folio *folio;
- struct v9fs_inode *v9inode = V9FS_I(mapping->host);
-
- p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
-
- /* Prefetch area to be written into the cache if we're caching this
- * file. We need to do this before we get a lock on the page in case
- * there's more than one writer competing for the same cache block.
- */
- retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata);
- if (retval < 0)
- return retval;
-
- *subpagep = &folio->page;
- return retval;
-}
-
-static int v9fs_write_end(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int copied,
- struct page *subpage, void *fsdata)
-{
- loff_t last_pos = pos + copied;
- struct folio *folio = page_folio(subpage);
- struct inode *inode = mapping->host;
-
- p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
-
- if (!folio_test_uptodate(folio)) {
- if (unlikely(copied < len)) {
- copied = 0;
- goto out;
- }
-
- folio_mark_uptodate(folio);
- }
-
- /*
- * No need to use i_size_read() here, the i_size
- * cannot change under us because we hold the i_mutex.
- */
- if (last_pos > inode->i_size) {
- inode_add_bytes(inode, last_pos - inode->i_size);
- i_size_write(inode, last_pos);
-#ifdef CONFIG_9P_FSCACHE
- fscache_update_cookie(v9fs_inode_cookie(V9FS_I(inode)), NULL,
- &last_pos);
-#endif
- }
- folio_mark_dirty(folio);
-out:
- folio_unlock(folio);
- folio_put(folio);
-
- return copied;
-}
-
-#ifdef CONFIG_9P_FSCACHE
-/*
- * Mark a page as having been made dirty and thus needing writeback. We also
- * need to pin the cache object to write back to.
- */
-static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio)
-{
- struct v9fs_inode *v9inode = V9FS_I(mapping->host);
-
- return fscache_dirty_folio(mapping, folio, v9fs_inode_cookie(v9inode));
-}
-#else
-#define v9fs_dirty_folio filemap_dirty_folio
-#endif
-
const struct address_space_operations v9fs_addr_operations = {
- .read_folio = netfs_read_folio,
- .readahead = netfs_readahead,
- .dirty_folio = v9fs_dirty_folio,
- .writepage = v9fs_vfs_writepage,
- .write_begin = v9fs_write_begin,
- .write_end = v9fs_write_end,
- .release_folio = v9fs_release_folio,
- .invalidate_folio = v9fs_invalidate_folio,
- .launder_folio = v9fs_launder_folio,
- .direct_IO = v9fs_direct_IO,
+ .read_folio = netfs_read_folio,
+ .readahead = netfs_readahead,
+ .dirty_folio = netfs_dirty_folio,
+ .release_folio = netfs_release_folio,
+ .invalidate_folio = netfs_invalidate_folio,
+ .launder_folio = netfs_launder_folio,
+ .direct_IO = noop_direct_IO,
+ .writepages = netfs_writepages,
};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 11cd8d23f6f2..abdbbaee5184 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -107,7 +107,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
- if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+ if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) {
filemap_write_and_wait(inode->i_mapping);
invalidate_mapping_pages(&inode->i_data, 0, -1);
}
@@ -121,13 +121,12 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
struct p9_fid *fid;
uint8_t status = P9_LOCK_ERROR;
int res = 0;
- unsigned char fl_type;
struct v9fs_session_info *v9ses;
fid = filp->private_data;
BUG_ON(fid == NULL);
- BUG_ON((fl->fl_flags & FL_POSIX) != FL_POSIX);
+ BUG_ON((fl->c.flc_flags & FL_POSIX) != FL_POSIX);
res = locks_lock_file_wait(filp, fl);
if (res < 0)
@@ -136,7 +135,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
/* convert posix lock to p9 tlock args */
memset(&flock, 0, sizeof(flock));
/* map the lock type */
- switch (fl->fl_type) {
+ switch (fl->c.flc_type) {
case F_RDLCK:
flock.type = P9_LOCK_TYPE_RDLCK;
break;
@@ -152,7 +151,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
flock.length = 0;
else
flock.length = fl->fl_end - fl->fl_start + 1;
- flock.proc_id = fl->fl_pid;
+ flock.proc_id = fl->c.flc_pid;
flock.client_id = fid->clnt->name;
if (IS_SETLKW(cmd))
flock.flags = P9_LOCK_FLAGS_BLOCK;
@@ -207,12 +206,13 @@ out_unlock:
* incase server returned error for lock request, revert
* it locally
*/
- if (res < 0 && fl->fl_type != F_UNLCK) {
- fl_type = fl->fl_type;
- fl->fl_type = F_UNLCK;
+ if (res < 0 && fl->c.flc_type != F_UNLCK) {
+ unsigned char type = fl->c.flc_type;
+
+ fl->c.flc_type = F_UNLCK;
/* Even if this fails we want to return the remote error */
locks_lock_file_wait(filp, fl);
- fl->fl_type = fl_type;
+ fl->c.flc_type = type;
}
if (flock.client_id != fid->clnt->name)
kfree(flock.client_id);
@@ -234,7 +234,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
* if we have a conflicting lock locally, no need to validate
* with server
*/
- if (fl->fl_type != F_UNLCK)
+ if (fl->c.flc_type != F_UNLCK)
return res;
/* convert posix lock to p9 tgetlock args */
@@ -245,7 +245,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
glock.length = 0;
else
glock.length = fl->fl_end - fl->fl_start + 1;
- glock.proc_id = fl->fl_pid;
+ glock.proc_id = fl->c.flc_pid;
glock.client_id = fid->clnt->name;
res = p9_client_getlock_dotl(fid, &glock);
@@ -254,13 +254,13 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
/* map 9p lock type to os lock type */
switch (glock.type) {
case P9_LOCK_TYPE_RDLCK:
- fl->fl_type = F_RDLCK;
+ fl->c.flc_type = F_RDLCK;
break;
case P9_LOCK_TYPE_WRLCK:
- fl->fl_type = F_WRLCK;
+ fl->c.flc_type = F_WRLCK;
break;
case P9_LOCK_TYPE_UNLCK:
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
break;
}
if (glock.type != P9_LOCK_TYPE_UNLCK) {
@@ -269,7 +269,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
fl->fl_end = OFFSET_MAX;
else
fl->fl_end = glock.start + glock.length - 1;
- fl->fl_pid = -glock.proc_id;
+ fl->c.flc_pid = -glock.proc_id;
}
out:
if (glock.client_id != fid->clnt->name)
@@ -293,7 +293,7 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
filp, cmd, fl, filp);
- if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+ if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) {
filemap_write_and_wait(inode->i_mapping);
invalidate_mapping_pages(&inode->i_data, 0, -1);
}
@@ -324,16 +324,16 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
filp, cmd, fl, filp);
- if (!(fl->fl_flags & FL_FLOCK))
+ if (!(fl->c.flc_flags & FL_FLOCK))
goto out_err;
- if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+ if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) {
filemap_write_and_wait(inode->i_mapping);
invalidate_mapping_pages(&inode->i_data, 0, -1);
}
/* Convert flock to posix lock */
- fl->fl_flags |= FL_POSIX;
- fl->fl_flags ^= FL_FLOCK;
+ fl->c.flc_flags |= FL_POSIX;
+ fl->c.flc_flags ^= FL_FLOCK;
if (IS_SETLK(cmd) | IS_SETLKW(cmd))
ret = v9fs_file_do_lock(filp, cmd, fl);
@@ -353,25 +353,15 @@ static ssize_t
v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct p9_fid *fid = iocb->ki_filp->private_data;
- int ret, err = 0;
p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n",
fid->fid, iov_iter_count(to), iocb->ki_pos);
- if (!(fid->mode & P9L_DIRECT)) {
- p9_debug(P9_DEBUG_VFS, "(cached)\n");
- return generic_file_read_iter(iocb, to);
- }
-
- if (iocb->ki_filp->f_flags & O_NONBLOCK)
- ret = p9_client_read_once(fid, iocb->ki_pos, to, &err);
- else
- ret = p9_client_read(fid, iocb->ki_pos, to, &err);
- if (!ret)
- return err;
+ if (fid->mode & P9L_DIRECT)
+ return netfs_unbuffered_read_iter(iocb, to);
- iocb->ki_pos += ret;
- return ret;
+ p9_debug(P9_DEBUG_VFS, "(cached)\n");
+ return netfs_file_read_iter(iocb, to);
}
/*
@@ -407,46 +397,14 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct p9_fid *fid = file->private_data;
- ssize_t retval;
- loff_t origin;
- int err = 0;
p9_debug(P9_DEBUG_VFS, "fid %d\n", fid->fid);
- if (!(fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))) {
- p9_debug(P9_DEBUG_CACHE, "(cached)\n");
- return generic_file_write_iter(iocb, from);
- }
-
- retval = generic_write_checks(iocb, from);
- if (retval <= 0)
- return retval;
+ if (fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))
+ return netfs_unbuffered_write_iter(iocb, from);
- origin = iocb->ki_pos;
- retval = p9_client_write(file->private_data, iocb->ki_pos, from, &err);
- if (retval > 0) {
- struct inode *inode = file_inode(file);
- loff_t i_size;
- unsigned long pg_start, pg_end;
-
- pg_start = origin >> PAGE_SHIFT;
- pg_end = (origin + retval - 1) >> PAGE_SHIFT;
- if (inode->i_mapping && inode->i_mapping->nrpages)
- invalidate_inode_pages2_range(inode->i_mapping,
- pg_start, pg_end);
- iocb->ki_pos += retval;
- i_size = i_size_read(inode);
- if (iocb->ki_pos > i_size) {
- inode_add_bytes(inode, iocb->ki_pos - i_size);
- /*
- * Need to serialize against i_size_write() in
- * v9fs_stat2inode()
- */
- v9fs_i_size_write(inode, iocb->ki_pos);
- }
- return retval;
- }
- return err;
+ p9_debug(P9_DEBUG_CACHE, "(cached)\n");
+ return netfs_file_write_iter(iocb, from);
}
static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
@@ -519,36 +477,7 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
static vm_fault_t
v9fs_vm_page_mkwrite(struct vm_fault *vmf)
{
- struct folio *folio = page_folio(vmf->page);
- struct file *filp = vmf->vma->vm_file;
- struct inode *inode = file_inode(filp);
-
-
- p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n",
- folio, (unsigned long)filp->private_data);
-
- /* Wait for the page to be written to the cache before we allow it to
- * be modified. We then assume the entire page will need writing back.
- */
-#ifdef CONFIG_9P_FSCACHE
- if (folio_test_fscache(folio) &&
- folio_wait_fscache_killable(folio) < 0)
- return VM_FAULT_NOPAGE;
-#endif
-
- /* Update file times before taking page lock */
- file_update_time(filp);
-
- if (folio_lock_killable(folio) < 0)
- return VM_FAULT_RETRY;
- if (folio_mapping(folio) != inode->i_mapping)
- goto out_unlock;
- folio_wait_stable(folio);
-
- return VM_FAULT_LOCKED;
-out_unlock:
- folio_unlock(folio);
- return VM_FAULT_NOPAGE;
+ return netfs_page_mkwrite(vmf, NULL);
}
static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b845ee18a80b..32572982f72e 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -246,10 +246,10 @@ void v9fs_free_inode(struct inode *inode)
/*
* Set parameters for the netfs library
*/
-static void v9fs_set_netfs_context(struct inode *inode)
+void v9fs_set_netfs_context(struct inode *inode)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
- netfs_inode_init(&v9inode->netfs, &v9fs_req_ops);
+ netfs_inode_init(&v9inode->netfs, &v9fs_req_ops, true);
}
int v9fs_init_inode(struct v9fs_session_info *v9ses,
@@ -326,8 +326,6 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
err = -EINVAL;
goto error;
}
-
- v9fs_set_netfs_context(inode);
error:
return err;
@@ -359,6 +357,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
iput(inode);
return ERR_PTR(err);
}
+ v9fs_set_netfs_context(inode);
return inode;
}
@@ -374,11 +373,8 @@ void v9fs_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
-#ifdef CONFIG_9P_FSCACHE
version = cpu_to_le32(v9inode->qid.version);
- fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode,
- &version);
-#endif
+ netfs_clear_inode_writeback(inode, &version);
clear_inode(inode);
filemap_fdatawrite(&inode->i_data);
@@ -464,6 +460,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
goto error;
v9fs_stat2inode(st, inode, sb, 0);
+ v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
unlock_new_inode(inode);
return inode;
@@ -1113,7 +1110,7 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
if ((iattr->ia_valid & ATTR_SIZE) &&
iattr->ia_size != i_size_read(inode)) {
truncate_setsize(inode, iattr->ia_size);
- truncate_pagecache(inode, iattr->ia_size);
+ netfs_resize_file(netfs_inode(inode), iattr->ia_size, true);
#ifdef CONFIG_9P_FSCACHE
if (v9ses->cache & CACHE_FSCACHE) {
@@ -1181,6 +1178,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
+ v9inode->netfs.remote_i_size = stat->length;
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
v9fs_i_size_write(inode, stat->length);
/* not real number of blocks, but 512 byte ones ... */
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index c7319af2f471..3505227e1704 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -128,6 +128,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
goto error;
v9fs_stat2inode_dotl(st, inode, 0);
+ v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
retval = v9fs_get_acl(inode, fid);
if (retval)
@@ -598,7 +599,7 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size !=
i_size_read(inode)) {
truncate_setsize(inode, iattr->ia_size);
- truncate_pagecache(inode, iattr->ia_size);
+ netfs_resize_file(netfs_inode(inode), iattr->ia_size, true);
#ifdef CONFIG_9P_FSCACHE
if (v9ses->cache & CACHE_FSCACHE)
@@ -655,6 +656,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
+ v9inode->netfs.remote_i_size = stat->st_size;
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
v9fs_i_size_write(inode, stat->st_size);
inode->i_blocks = stat->st_blocks;
@@ -683,8 +685,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
inode->i_mode = mode;
}
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
- stat->st_result_mask & P9_STATS_SIZE)
+ stat->st_result_mask & P9_STATS_SIZE) {
+ v9inode->netfs.remote_i_size = stat->st_size;
v9fs_i_size_write(inode, stat->st_size);
+ }
if (stat->st_result_mask & P9_STATS_BLOCKS)
inode->i_blocks = stat->st_blocks;
}
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 73db55c050bf..941f7d0e0bfa 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -289,31 +289,21 @@ static int v9fs_drop_inode(struct inode *inode)
static int v9fs_write_inode(struct inode *inode,
struct writeback_control *wbc)
{
- struct v9fs_inode *v9inode;
-
/*
* send an fsync request to server irrespective of
* wbc->sync_mode.
*/
p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
-
- v9inode = V9FS_I(inode);
- fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
-
- return 0;
+ return netfs_unpin_writeback(inode, wbc);
}
static int v9fs_write_inode_dotl(struct inode *inode,
struct writeback_control *wbc)
{
- struct v9fs_inode *v9inode;
- v9inode = V9FS_I(inode);
p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
- fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
-
- return 0;
+ return netfs_unpin_writeback(inode, wbc);
}
static const struct super_operations v9fs_super_ops = {
diff --git a/fs/Kconfig b/fs/Kconfig
index a3159831ba98..4bc7dd420874 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -144,7 +144,6 @@ source "fs/overlayfs/Kconfig"
menu "Caches"
source "fs/netfs/Kconfig"
-source "fs/fscache/Kconfig"
source "fs/cachefiles/Kconfig"
endmenu
@@ -163,7 +162,6 @@ menu "DOS/FAT/EXFAT/NT Filesystems"
source "fs/fat/Kconfig"
source "fs/exfat/Kconfig"
-source "fs/ntfs/Kconfig"
source "fs/ntfs3/Kconfig"
endmenu
@@ -175,6 +173,13 @@ source "fs/proc/Kconfig"
source "fs/kernfs/Kconfig"
source "fs/sysfs/Kconfig"
+config FS_PID
+ bool "Pseudo filesystem for process file descriptors"
+ depends on 64BIT
+ default y
+ help
+ Pidfs implements advanced features for process file descriptors.
+
config TMPFS
bool "Tmpfs virtual memory file system support (former shm fs)"
depends on SHMEM
diff --git a/fs/Makefile b/fs/Makefile
index a6962c588962..6ecc9b0a53f2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -15,7 +15,7 @@ obj-y := open.o read_write.o file_table.o super.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
- kernel_read_file.o mnt_idmapping.o remap_range.o
+ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o
obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
obj-$(CONFIG_PROC_FS) += proc_namespace.o
@@ -61,7 +61,6 @@ obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_NETFS_SUPPORT) += netfs/
-obj-$(CONFIG_FSCACHE) += fscache/
obj-$(CONFIG_REISERFS_FS) += reiserfs/
obj-$(CONFIG_EXT4_FS) += ext4/
# We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the
@@ -92,7 +91,6 @@ obj-y += unicode/
obj-$(CONFIG_SYSV_FS) += sysv/
obj-$(CONFIG_SMBFS) += smb/
obj-$(CONFIG_HPFS_FS) += hpfs/
-obj-$(CONFIG_NTFS_FS) += ntfs/
obj-$(CONFIG_NTFS3_FS) += ntfs3/
obj-$(CONFIG_UFS_FS) += ufs/
obj-$(CONFIG_EFS_FS) += efs/
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 60685ec76d98..2e612834329a 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -105,6 +105,7 @@ struct affs_sb_info {
int work_queued; /* non-zero delayed work is queued */
struct delayed_work sb_work; /* superblock flush delayed work */
spinlock_t work_lock; /* protects sb_work and work_queued */
+ struct rcu_head rcu;
};
#define AFFS_MOUNT_SF_INTL 0x0001 /* International filesystem. */
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 58b391446ae1..b56a95cf414a 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -640,7 +640,7 @@ static void affs_kill_sb(struct super_block *sb)
affs_brelse(sbi->s_root_bh);
kfree(sbi->s_prefix);
mutex_destroy(&sbi->s_bmlock);
- kfree(sbi);
+ kfree_rcu(sbi, rcu);
}
}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index c14533ef108f..8a67fc427e74 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -124,7 +124,7 @@ static void afs_dir_read_cleanup(struct afs_read *req)
if (xas_retry(&xas, folio))
continue;
BUG_ON(xa_is_value(folio));
- ASSERTCMP(folio_file_mapping(folio), ==, mapping);
+ ASSERTCMP(folio->mapping, ==, mapping);
folio_put(folio);
}
@@ -202,12 +202,12 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
if (xas_retry(&xas, folio))
continue;
- BUG_ON(folio_file_mapping(folio) != mapping);
+ BUG_ON(folio->mapping != mapping);
size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio));
for (offset = 0; offset < size; offset += sizeof(*block)) {
block = kmap_local_folio(folio, offset);
- pr_warn("[%02lx] %32phN\n", folio_index(folio) + offset, block);
+ pr_warn("[%02lx] %32phN\n", folio->index + offset, block);
kunmap_local(block);
}
}
@@ -233,7 +233,7 @@ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
if (xas_retry(&xas, folio))
continue;
- BUG_ON(folio_file_mapping(folio) != mapping);
+ BUG_ON(folio->mapping != mapping);
if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) {
afs_dir_dump(dvnode, req);
@@ -474,6 +474,16 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
continue;
}
+ /* Don't expose silly rename entries to userspace. */
+ if (nlen > 6 &&
+ dire->u.name[0] == '.' &&
+ ctx->actor != afs_lookup_filldir &&
+ ctx->actor != afs_lookup_one_filldir &&
+ memcmp(dire->u.name, ".__afs", 6) == 0) {
+ ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent);
+ continue;
+ }
+
/* found the next entry */
if (!dir_emit(ctx, dire->u.name, nlen,
ntohl(dire->u.vnode),
@@ -708,6 +718,8 @@ static void afs_do_lookup_success(struct afs_operation *op)
break;
}
+ if (vp->scb.status.abort_code)
+ trace_afs_bulkstat_error(op, &vp->fid, i, vp->scb.status.abort_code);
if (!vp->scb.have_status && !vp->scb.have_error)
continue;
@@ -897,12 +909,16 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
afs_begin_vnode_operation(op);
afs_wait_for_operation(op);
}
- inode = ERR_PTR(afs_op_error(op));
out_op:
if (!afs_op_error(op)) {
- inode = &op->file[1].vnode->netfs.inode;
- op->file[1].vnode = NULL;
+ if (op->file[1].scb.status.abort_code) {
+ afs_op_accumulate_error(op, -ECONNABORTED,
+ op->file[1].scb.status.abort_code);
+ } else {
+ inode = &op->file[1].vnode->netfs.inode;
+ op->file[1].vnode = NULL;
+ }
}
if (op->file[0].scb.have_status)
@@ -2022,7 +2038,7 @@ static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags)
{
struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
- _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio));
+ _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio->index);
folio_detach_private(folio);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 2cd40ba601f1..c4d2711e20ad 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
/* there shouldn't be an existing inode */
BUG_ON(!(inode->i_state & I_NEW));
- netfs_inode_init(&vnode->netfs, NULL);
+ netfs_inode_init(&vnode->netfs, NULL, false);
inode->i_size = 0;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
if (root) {
@@ -258,16 +258,7 @@ const struct inode_operations afs_dynroot_inode_operations = {
.lookup = afs_dynroot_lookup,
};
-/*
- * Dirs in the dynamic root don't need revalidation.
- */
-static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags)
-{
- return 1;
-}
-
const struct dentry_operations afs_dynroot_dentry_operations = {
- .d_revalidate = afs_dynroot_d_revalidate,
.d_delete = always_delete_dentry,
.d_release = afs_d_release,
.d_automount = afs_d_automount,
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 30914e0d9cb2..ef2cc8f565d2 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -20,9 +20,6 @@
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
static int afs_symlink_read_folio(struct file *file, struct folio *folio);
-static void afs_invalidate_folio(struct folio *folio, size_t offset,
- size_t length);
-static bool afs_release_folio(struct folio *folio, gfp_t gfp_flags);
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
@@ -37,7 +34,7 @@ const struct file_operations afs_file_operations = {
.release = afs_release,
.llseek = generic_file_llseek,
.read_iter = afs_file_read_iter,
- .write_iter = afs_file_write,
+ .write_iter = netfs_file_write_iter,
.mmap = afs_file_mmap,
.splice_read = afs_file_splice_read,
.splice_write = iter_file_splice_write,
@@ -53,22 +50,21 @@ const struct inode_operations afs_file_inode_operations = {
};
const struct address_space_operations afs_file_aops = {
+ .direct_IO = noop_direct_IO,
.read_folio = netfs_read_folio,
.readahead = netfs_readahead,
- .dirty_folio = afs_dirty_folio,
- .launder_folio = afs_launder_folio,
- .release_folio = afs_release_folio,
- .invalidate_folio = afs_invalidate_folio,
- .write_begin = afs_write_begin,
- .write_end = afs_write_end,
- .writepages = afs_writepages,
+ .dirty_folio = netfs_dirty_folio,
+ .launder_folio = netfs_launder_folio,
+ .release_folio = netfs_release_folio,
+ .invalidate_folio = netfs_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
+ .writepages = afs_writepages,
};
const struct address_space_operations afs_symlink_aops = {
.read_folio = afs_symlink_read_folio,
- .release_folio = afs_release_folio,
- .invalidate_folio = afs_invalidate_folio,
+ .release_folio = netfs_release_folio,
+ .invalidate_folio = netfs_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
};
@@ -323,11 +319,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)
fsreq->len = subreq->len - subreq->transferred;
fsreq->key = key_get(subreq->rreq->netfs_priv);
fsreq->vnode = vnode;
- fsreq->iter = &fsreq->def_iter;
-
- iov_iter_xarray(&fsreq->def_iter, ITER_DEST,
- &fsreq->vnode->netfs.inode.i_mapping->i_pages,
- fsreq->pos, fsreq->len);
+ fsreq->iter = &subreq->io_iter;
afs_fetch_data(fsreq->vnode, fsreq);
afs_put_read(fsreq);
@@ -359,22 +351,13 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio)
static int afs_init_request(struct netfs_io_request *rreq, struct file *file)
{
- rreq->netfs_priv = key_get(afs_file_key(file));
+ if (file)
+ rreq->netfs_priv = key_get(afs_file_key(file));
+ rreq->rsize = 256 * 1024;
+ rreq->wsize = 256 * 1024;
return 0;
}
-static int afs_begin_cache_operation(struct netfs_io_request *rreq)
-{
-#ifdef CONFIG_AFS_FSCACHE
- struct afs_vnode *vnode = AFS_FS_I(rreq->inode);
-
- return fscache_begin_read_operation(&rreq->cache_resources,
- afs_vnode_cache(vnode));
-#else
- return -ENOBUFS;
-#endif
-}
-
static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
struct folio **foliop, void **_fsdata)
{
@@ -388,128 +371,37 @@ static void afs_free_request(struct netfs_io_request *rreq)
key_put(rreq->netfs_priv);
}
-const struct netfs_request_ops afs_req_ops = {
- .init_request = afs_init_request,
- .free_request = afs_free_request,
- .begin_cache_operation = afs_begin_cache_operation,
- .check_write_begin = afs_check_write_begin,
- .issue_read = afs_issue_read,
-};
-
-int afs_write_inode(struct inode *inode, struct writeback_control *wbc)
+static void afs_update_i_size(struct inode *inode, loff_t new_i_size)
{
- fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode)));
- return 0;
-}
-
-/*
- * Adjust the dirty region of the page on truncation or full invalidation,
- * getting rid of the markers altogether if the region is entirely invalidated.
- */
-static void afs_invalidate_dirty(struct folio *folio, size_t offset,
- size_t length)
-{
- struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
- unsigned long priv;
- unsigned int f, t, end = offset + length;
-
- priv = (unsigned long)folio_get_private(folio);
-
- /* we clean up only if the entire page is being invalidated */
- if (offset == 0 && length == folio_size(folio))
- goto full_invalidate;
-
- /* If the page was dirtied by page_mkwrite(), the PTE stays writable
- * and we don't get another notification to tell us to expand it
- * again.
- */
- if (afs_is_folio_dirty_mmapped(priv))
- return;
-
- /* We may need to shorten the dirty region */
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
-
- if (t <= offset || f >= end)
- return; /* Doesn't overlap */
-
- if (f < offset && t > end)
- return; /* Splits the dirty region - just absorb it */
-
- if (f >= offset && t <= end)
- goto undirty;
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+ loff_t i_size;
- if (f < offset)
- t = offset;
- else
- f = end;
- if (f == t)
- goto undirty;
-
- priv = afs_folio_dirty(folio, f, t);
- folio_change_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("trunc"), folio);
- return;
-
-undirty:
- trace_afs_folio_dirty(vnode, tracepoint_string("undirty"), folio);
- folio_clear_dirty_for_io(folio);
-full_invalidate:
- trace_afs_folio_dirty(vnode, tracepoint_string("inval"), folio);
- folio_detach_private(folio);
+ write_seqlock(&vnode->cb_lock);
+ i_size = i_size_read(&vnode->netfs.inode);
+ if (new_i_size > i_size) {
+ i_size_write(&vnode->netfs.inode, new_i_size);
+ inode_set_bytes(&vnode->netfs.inode, new_i_size);
+ }
+ write_sequnlock(&vnode->cb_lock);
+ fscache_update_cookie(afs_vnode_cache(vnode), NULL, &new_i_size);
}
-/*
- * invalidate part or all of a page
- * - release a page and clean up its private data if offset is 0 (indicating
- * the entire page)
- */
-static void afs_invalidate_folio(struct folio *folio, size_t offset,
- size_t length)
+static void afs_netfs_invalidate_cache(struct netfs_io_request *wreq)
{
- _enter("{%lu},%zu,%zu", folio->index, offset, length);
-
- BUG_ON(!folio_test_locked(folio));
+ struct afs_vnode *vnode = AFS_FS_I(wreq->inode);
- if (folio_get_private(folio))
- afs_invalidate_dirty(folio, offset, length);
-
- folio_wait_fscache(folio);
- _leave("");
+ afs_invalidate_cache(vnode, 0);
}
-/*
- * release a page and clean up its private state if it's not busy
- * - return true if the page can now be released, false if not
- */
-static bool afs_release_folio(struct folio *folio, gfp_t gfp)
-{
- struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
-
- _enter("{{%llx:%llu}[%lu],%lx},%x",
- vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags,
- gfp);
-
- /* deny if folio is being written to the cache and the caller hasn't
- * elected to wait */
-#ifdef CONFIG_AFS_FSCACHE
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
- fscache_note_page_release(afs_vnode_cache(vnode));
-#endif
-
- if (folio_test_private(folio)) {
- trace_afs_folio_dirty(vnode, tracepoint_string("rel"), folio);
- folio_detach_private(folio);
- }
-
- /* Indicate that the folio can be released */
- _leave(" = T");
- return true;
-}
+const struct netfs_request_ops afs_req_ops = {
+ .init_request = afs_init_request,
+ .free_request = afs_free_request,
+ .check_write_begin = afs_check_write_begin,
+ .issue_read = afs_issue_read,
+ .update_i_size = afs_update_i_size,
+ .invalidate_cache = afs_netfs_invalidate_cache,
+ .create_write_requests = afs_create_write_requests,
+};
static void afs_add_open_mmap(struct afs_vnode *vnode)
{
@@ -525,13 +417,17 @@ static void afs_add_open_mmap(struct afs_vnode *vnode)
static void afs_drop_open_mmap(struct afs_vnode *vnode)
{
- if (!atomic_dec_and_test(&vnode->cb_nr_mmap))
+ if (atomic_add_unless(&vnode->cb_nr_mmap, -1, 1))
return;
down_write(&vnode->volume->open_mmaps_lock);
- if (atomic_read(&vnode->cb_nr_mmap) == 0)
+ read_seqlock_excl(&vnode->cb_lock);
+ // the only place where ->cb_nr_mmap may hit 0
+ // see __afs_break_callback() for the other side...
+ if (atomic_dec_and_test(&vnode->cb_nr_mmap))
list_del_init(&vnode->cb_mmap_link);
+ read_sequnlock_excl(&vnode->cb_lock);
up_write(&vnode->volume->open_mmaps_lock);
flush_work(&vnode->cb_work);
@@ -576,28 +472,39 @@ static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pg
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct afs_vnode *vnode = AFS_FS_I(inode);
struct afs_file *af = iocb->ki_filp->private_data;
- int ret;
+ ssize_t ret;
- ret = afs_validate(vnode, af->key);
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return netfs_unbuffered_read_iter(iocb, iter);
+
+ ret = netfs_start_io_read(inode);
if (ret < 0)
return ret;
-
- return generic_file_read_iter(iocb, iter);
+ ret = afs_validate(vnode, af->key);
+ if (ret == 0)
+ ret = filemap_read(iocb, iter, 0);
+ netfs_end_io_read(inode);
+ return ret;
}
static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len, unsigned int flags)
{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(in));
+ struct inode *inode = file_inode(in);
+ struct afs_vnode *vnode = AFS_FS_I(inode);
struct afs_file *af = in->private_data;
- int ret;
+ ssize_t ret;
- ret = afs_validate(vnode, af->key);
+ ret = netfs_start_io_read(inode);
if (ret < 0)
return ret;
-
- return filemap_splice_read(in, ppos, pipe, len, flags);
+ ret = afs_validate(vnode, af->key);
+ if (ret == 0)
+ ret = filemap_splice_read(in, ppos, pipe, len, flags);
+ netfs_end_io_read(inode);
+ return ret;
}
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 9c6dea3139f5..f0e96a35093f 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -93,13 +93,13 @@ static void afs_grant_locks(struct afs_vnode *vnode)
bool exclusive = (vnode->lock_type == AFS_LOCK_WRITE);
list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) {
- if (!exclusive && p->fl_type == F_WRLCK)
+ if (!exclusive && lock_is_write(p))
continue;
list_move_tail(&p->fl_u.afs.link, &vnode->granted_locks);
p->fl_u.afs.state = AFS_LOCK_GRANTED;
trace_afs_flock_op(vnode, p, afs_flock_op_grant);
- wake_up(&p->fl_wait);
+ locks_wake_up(p);
}
}
@@ -112,25 +112,24 @@ static void afs_next_locker(struct afs_vnode *vnode, int error)
{
struct file_lock *p, *_p, *next = NULL;
struct key *key = vnode->lock_key;
- unsigned int fl_type = F_RDLCK;
+ unsigned int type = F_RDLCK;
_enter("");
if (vnode->lock_type == AFS_LOCK_WRITE)
- fl_type = F_WRLCK;
+ type = F_WRLCK;
list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) {
if (error &&
- p->fl_type == fl_type &&
- afs_file_key(p->fl_file) == key) {
+ p->c.flc_type == type &&
+ afs_file_key(p->c.flc_file) == key) {
list_del_init(&p->fl_u.afs.link);
p->fl_u.afs.state = error;
- wake_up(&p->fl_wait);
+ locks_wake_up(p);
}
/* Select the next locker to hand off to. */
- if (next &&
- (next->fl_type == F_WRLCK || p->fl_type == F_RDLCK))
+ if (next && (lock_is_write(next) || lock_is_read(p)))
continue;
next = p;
}
@@ -142,7 +141,7 @@ static void afs_next_locker(struct afs_vnode *vnode, int error)
afs_set_lock_state(vnode, AFS_VNODE_LOCK_SETTING);
next->fl_u.afs.state = AFS_LOCK_YOUR_TRY;
trace_afs_flock_op(vnode, next, afs_flock_op_wake);
- wake_up(&next->fl_wait);
+ locks_wake_up(next);
} else {
afs_set_lock_state(vnode, AFS_VNODE_LOCK_NONE);
trace_afs_flock_ev(vnode, NULL, afs_flock_no_lockers, 0);
@@ -166,7 +165,7 @@ static void afs_kill_lockers_enoent(struct afs_vnode *vnode)
struct file_lock, fl_u.afs.link);
list_del_init(&p->fl_u.afs.link);
p->fl_u.afs.state = -ENOENT;
- wake_up(&p->fl_wait);
+ locks_wake_up(p);
}
key_put(vnode->lock_key);
@@ -464,14 +463,14 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
_enter("{%llx:%llu},%llu-%llu,%u,%u",
vnode->fid.vid, vnode->fid.vnode,
- fl->fl_start, fl->fl_end, fl->fl_type, mode);
+ fl->fl_start, fl->fl_end, fl->c.flc_type, mode);
fl->fl_ops = &afs_lock_ops;
INIT_LIST_HEAD(&fl->fl_u.afs.link);
fl->fl_u.afs.state = AFS_LOCK_PENDING;
partial = (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX);
- type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
+ type = lock_is_read(fl) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
if (mode == afs_flock_mode_write && partial)
type = AFS_LOCK_WRITE;
@@ -524,7 +523,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
}
if (vnode->lock_state == AFS_VNODE_LOCK_NONE &&
- !(fl->fl_flags & FL_SLEEP)) {
+ !(fl->c.flc_flags & FL_SLEEP)) {
ret = -EAGAIN;
if (type == AFS_LOCK_READ) {
if (vnode->status.lock_count == -1)
@@ -621,7 +620,7 @@ skip_server_lock:
return 0;
lock_is_contended:
- if (!(fl->fl_flags & FL_SLEEP)) {
+ if (!(fl->c.flc_flags & FL_SLEEP)) {
list_del_init(&fl->fl_u.afs.link);
afs_next_locker(vnode, 0);
ret = -EAGAIN;
@@ -641,7 +640,7 @@ need_to_wait:
spin_unlock(&vnode->lock);
trace_afs_flock_ev(vnode, fl, afs_flock_waiting, 0);
- ret = wait_event_interruptible(fl->fl_wait,
+ ret = wait_event_interruptible(fl->c.flc_wait,
fl->fl_u.afs.state != AFS_LOCK_PENDING);
trace_afs_flock_ev(vnode, fl, afs_flock_waited, ret);
@@ -704,7 +703,8 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl)
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
int ret;
- _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+ _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode,
+ fl->c.flc_type);
trace_afs_flock_op(vnode, fl, afs_flock_op_unlock);
@@ -730,11 +730,11 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
if (vnode->lock_state == AFS_VNODE_LOCK_DELETED)
return -ENOENT;
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
/* check local lock records first */
posix_test_lock(file, fl);
- if (fl->fl_type == F_UNLCK) {
+ if (lock_is_unlock(fl)) {
/* no local locks; consult the server */
ret = afs_fetch_status(vnode, key, false, NULL);
if (ret < 0)
@@ -743,18 +743,18 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
lock_count = READ_ONCE(vnode->status.lock_count);
if (lock_count != 0) {
if (lock_count > 0)
- fl->fl_type = F_RDLCK;
+ fl->c.flc_type = F_RDLCK;
else
- fl->fl_type = F_WRLCK;
+ fl->c.flc_type = F_WRLCK;
fl->fl_start = 0;
fl->fl_end = OFFSET_MAX;
- fl->fl_pid = 0;
+ fl->c.flc_pid = 0;
}
}
ret = 0;
error:
- _leave(" = %d [%hd]", ret, fl->fl_type);
+ _leave(" = %d [%hd]", ret, fl->c.flc_type);
return ret;
}
@@ -769,7 +769,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
_enter("{%llx:%llu},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
vnode->fid.vid, vnode->fid.vnode, cmd,
- fl->fl_type, fl->fl_flags,
+ fl->c.flc_type, fl->c.flc_flags,
(long long) fl->fl_start, (long long) fl->fl_end);
if (IS_GETLK(cmd))
@@ -778,7 +778,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id);
trace_afs_flock_op(vnode, fl, afs_flock_op_lock);
- if (fl->fl_type == F_UNLCK)
+ if (lock_is_unlock(fl))
ret = afs_do_unlk(file, fl);
else
ret = afs_do_setlk(file, fl);
@@ -804,7 +804,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
_enter("{%llx:%llu},%d,{t=%x,fl=%x}",
vnode->fid.vid, vnode->fid.vnode, cmd,
- fl->fl_type, fl->fl_flags);
+ fl->c.flc_type, fl->c.flc_flags);
/*
* No BSD flocks over NFS allowed.
@@ -813,14 +813,14 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
* Not sure whether that would be unique, though, or whether
* that would break in other places.
*/
- if (!(fl->fl_flags & FL_FLOCK))
+ if (!(fl->c.flc_flags & FL_FLOCK))
return -ENOLCK;
fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id);
trace_afs_flock_op(vnode, fl, afs_flock_op_flock);
/* we're simulating flock() locks using posix locks on the server */
- if (fl->fl_type == F_UNLCK)
+ if (lock_is_unlock(fl))
ret = afs_do_unlk(file, fl);
else
ret = afs_do_setlk(file, fl);
@@ -843,7 +843,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
*/
static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file));
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->c.flc_file));
_enter("");
@@ -861,7 +861,7 @@ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
*/
static void afs_fl_release_private(struct file_lock *fl)
{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file));
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->c.flc_file));
_enter("");
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 4f04f6f33f46..94fc049aff58 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
*/
static void afs_set_netfs_context(struct afs_vnode *vnode)
{
- netfs_inode_init(&vnode->netfs, &afs_req_ops);
+ netfs_inode_init(&vnode->netfs, &afs_req_ops, true);
}
/*
@@ -166,6 +166,7 @@ static void afs_apply_status(struct afs_operation *op,
struct inode *inode = &vnode->netfs.inode;
struct timespec64 t;
umode_t mode;
+ bool unexpected_jump = false;
bool data_changed = false;
bool change_size = vp->set_size;
@@ -230,6 +231,7 @@ static void afs_apply_status(struct afs_operation *op,
}
change_size = true;
data_changed = true;
+ unexpected_jump = true;
} else if (vnode->status.type == AFS_FTYPE_DIR) {
/* Expected directory change is handled elsewhere so
* that we can locally edit the directory and save on a
@@ -249,8 +251,10 @@ static void afs_apply_status(struct afs_operation *op,
* what's on the server.
*/
vnode->netfs.remote_i_size = status->size;
- if (change_size) {
+ if (change_size || status->size > i_size_read(inode)) {
afs_set_i_size(vnode, status->size);
+ if (unexpected_jump)
+ vnode->netfs.zero_point = status->size;
inode_set_ctime_to_ts(inode, t);
inode_set_atime_to_ts(inode, t);
}
@@ -647,7 +651,7 @@ void afs_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
afs_set_cache_aux(vnode, &aux);
- fscache_clear_inode_writeback(afs_vnode_cache(vnode), inode, &aux);
+ netfs_clear_inode_writeback(inode, &aux);
clear_inode(inode);
while (!list_empty(&vnode->wb_keys)) {
@@ -689,17 +693,17 @@ static void afs_setattr_success(struct afs_operation *op)
static void afs_setattr_edit_file(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
- struct inode *inode = &vp->vnode->netfs.inode;
+ struct afs_vnode *vnode = vp->vnode;
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
loff_t size = op->setattr.attr->ia_size;
loff_t i_size = op->setattr.old_i_size;
- if (size < i_size)
- truncate_pagecache(inode, size);
- if (size != i_size)
- fscache_resize_cookie(afs_vnode_cache(vp->vnode),
- vp->scb.status.size);
+ if (size != i_size) {
+ truncate_setsize(&vnode->netfs.inode, size);
+ netfs_resize_file(&vnode->netfs, size, true);
+ fscache_resize_cookie(afs_vnode_cache(vnode), size);
+ }
}
}
@@ -767,11 +771,11 @@ int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
*/
if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) &&
attr->ia_size < i_size &&
- attr->ia_size > vnode->status.size) {
- truncate_pagecache(inode, attr->ia_size);
+ attr->ia_size > vnode->netfs.remote_i_size) {
+ truncate_setsize(inode, attr->ia_size);
+ netfs_resize_file(&vnode->netfs, size, false);
fscache_resize_cookie(afs_vnode_cache(vnode),
attr->ia_size);
- i_size_write(inode, attr->ia_size);
ret = 0;
goto out_unlock;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 55aa0679d8ce..6ce5a612937c 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -321,8 +321,7 @@ struct afs_net {
struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */
struct hlist_head fs_proc; /* procfs servers list */
- struct hlist_head fs_addresses4; /* afs_server (by lowest IPv4 addr) */
- struct hlist_head fs_addresses6; /* afs_server (by lowest IPv6 addr) */
+ struct hlist_head fs_addresses; /* afs_server (by lowest IPv6 addr) */
seqlock_t fs_addr_lock; /* For fs_addresses[46] */
struct work_struct fs_manager;
@@ -561,8 +560,7 @@ struct afs_server {
struct afs_server __rcu *uuid_next; /* Next server with same UUID */
struct afs_server *uuid_prev; /* Previous server with same UUID */
struct list_head probe_link; /* Link in net->fs_probe_list */
- struct hlist_node addr4_link; /* Link in net->fs_addresses4 */
- struct hlist_node addr6_link; /* Link in net->fs_addresses6 */
+ struct hlist_node addr_link; /* Link in net->fs_addresses6 */
struct hlist_node proc_link; /* Link in net->fs_proc */
struct list_head volumes; /* RCU list of afs_server_entry objects */
struct afs_server *gc_next; /* Next server in manager's list */
@@ -985,62 +983,6 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl
i_size_read(&vnode->netfs.inode), flags);
}
-/*
- * We use folio->private to hold the amount of the folio that we've written to,
- * splitting the field into two parts. However, we need to represent a range
- * 0...FOLIO_SIZE, so we reduce the resolution if the size of the folio
- * exceeds what we can encode.
- */
-#ifdef CONFIG_64BIT
-#define __AFS_FOLIO_PRIV_MASK 0x7fffffffUL
-#define __AFS_FOLIO_PRIV_SHIFT 32
-#define __AFS_FOLIO_PRIV_MMAPPED 0x80000000UL
-#else
-#define __AFS_FOLIO_PRIV_MASK 0x7fffUL
-#define __AFS_FOLIO_PRIV_SHIFT 16
-#define __AFS_FOLIO_PRIV_MMAPPED 0x8000UL
-#endif
-
-static inline unsigned int afs_folio_dirty_resolution(struct folio *folio)
-{
- int shift = folio_shift(folio) - (__AFS_FOLIO_PRIV_SHIFT - 1);
- return (shift > 0) ? shift : 0;
-}
-
-static inline size_t afs_folio_dirty_from(struct folio *folio, unsigned long priv)
-{
- unsigned long x = priv & __AFS_FOLIO_PRIV_MASK;
-
- /* The lower bound is inclusive */
- return x << afs_folio_dirty_resolution(folio);
-}
-
-static inline size_t afs_folio_dirty_to(struct folio *folio, unsigned long priv)
-{
- unsigned long x = (priv >> __AFS_FOLIO_PRIV_SHIFT) & __AFS_FOLIO_PRIV_MASK;
-
- /* The upper bound is immediately beyond the region */
- return (x + 1) << afs_folio_dirty_resolution(folio);
-}
-
-static inline unsigned long afs_folio_dirty(struct folio *folio, size_t from, size_t to)
-{
- unsigned int res = afs_folio_dirty_resolution(folio);
- from >>= res;
- to = (to - 1) >> res;
- return (to << __AFS_FOLIO_PRIV_SHIFT) | from;
-}
-
-static inline unsigned long afs_folio_dirty_mmapped(unsigned long priv)
-{
- return priv | __AFS_FOLIO_PRIV_MMAPPED;
-}
-
-static inline bool afs_is_folio_dirty_mmapped(unsigned long priv)
-{
- return priv & __AFS_FOLIO_PRIV_MMAPPED;
-}
-
#include <trace/events/afs.h>
/*****************************************************************************/
@@ -1167,7 +1109,6 @@ extern int afs_release(struct inode *, struct file *);
extern int afs_fetch_data(struct afs_vnode *, struct afs_read *);
extern struct afs_read *afs_alloc_read(gfp_t);
extern void afs_put_read(struct afs_read *);
-extern int afs_write_inode(struct inode *, struct writeback_control *);
static inline struct afs_read *afs_get_read(struct afs_read *req)
{
@@ -1658,24 +1599,11 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
/*
* write.c
*/
-#ifdef CONFIG_AFS_FSCACHE
-bool afs_dirty_folio(struct address_space *, struct folio *);
-#else
-#define afs_dirty_folio filemap_dirty_folio
-#endif
-extern int afs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct page **pagep, void **fsdata);
-extern int afs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata);
-extern int afs_writepage(struct page *, struct writeback_control *);
extern int afs_writepages(struct address_space *, struct writeback_control *);
-extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
extern int afs_fsync(struct file *, loff_t, loff_t, int);
extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf);
extern void afs_prune_wb_keys(struct afs_vnode *);
-int afs_launder_folio(struct folio *);
+void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len);
/*
* xattr.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 1b3bd21c168a..a14f6013e316 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -90,8 +90,7 @@ static int __net_init afs_net_init(struct net *net_ns)
INIT_LIST_HEAD(&net->fs_probe_slow);
INIT_HLIST_HEAD(&net->fs_proc);
- INIT_HLIST_HEAD(&net->fs_addresses4);
- INIT_HLIST_HEAD(&net->fs_addresses6);
+ INIT_HLIST_HEAD(&net->fs_addresses);
seqlock_init(&net->fs_addr_lock);
INIT_WORK(&net->fs_manager, afs_manage_servers);
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 3bd02571f30d..15eab053af6d 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -166,7 +166,7 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v)
if (!preflist) {
seq_puts(m, "NO PREFS\n");
- return 0;
+ goto out;
}
seq_printf(m, "PROT SUBNET PRIOR (v=%u n=%u/%u/%u)\n",
@@ -191,7 +191,8 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v)
}
}
- rcu_read_lock();
+out:
+ rcu_read_unlock();
return 0;
}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index e169121f603e..038f9d0ae3af 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -38,7 +38,7 @@ struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer
seq++; /* 2 on the 1st/lockless path, otherwise odd */
read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
- hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
+ hlist_for_each_entry_rcu(server, &net->fs_addresses, addr_link) {
estate = rcu_dereference(server->endpoint_state);
alist = estate->addresses;
for (i = 0; i < alist->nr_addrs; i++)
@@ -177,10 +177,8 @@ added_dup:
* bit, but anything we might want to do gets messy and memory
* intensive.
*/
- if (alist->nr_ipv4 > 0)
- hlist_add_head_rcu(&server->addr4_link, &net->fs_addresses4);
- if (alist->nr_addrs > alist->nr_ipv4)
- hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6);
+ if (alist->nr_addrs > 0)
+ hlist_add_head_rcu(&server->addr_link, &net->fs_addresses);
write_sequnlock(&net->fs_addr_lock);
@@ -511,10 +509,8 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
list_del(&server->probe_link);
hlist_del_rcu(&server->proc_link);
- if (!hlist_unhashed(&server->addr4_link))
- hlist_del_rcu(&server->addr4_link);
- if (!hlist_unhashed(&server->addr6_link))
- hlist_del_rcu(&server->addr6_link);
+ if (!hlist_unhashed(&server->addr_link))
+ hlist_del_rcu(&server->addr_link);
}
write_sequnlock(&net->fs_lock);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index ae2d66a52add..f3ba1c3e72f5 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -55,7 +55,7 @@ int afs_net_id;
static const struct super_operations afs_super_ops = {
.statfs = afs_statfs,
.alloc_inode = afs_alloc_inode,
- .write_inode = afs_write_inode,
+ .write_inode = netfs_unpin_writeback,
.drop_inode = afs_drop_inode,
.destroy_inode = afs_destroy_inode,
.free_inode = afs_free_inode,
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 020ecd45e476..af3a3f57c1b3 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -353,7 +353,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
{
struct afs_server_list *new, *old, *discard;
struct afs_vldb_entry *vldb;
- char idbuf[16];
+ char idbuf[24];
int ret, idsz;
_enter("");
@@ -361,7 +361,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
/* We look up an ID by passing it as a decimal string in the
* operation's name parameter.
*/
- idsz = sprintf(idbuf, "%llu", volume->vid);
+ idsz = snprintf(idbuf, sizeof(idbuf), "%llu", volume->vid);
vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz);
if (IS_ERR(vldb)) {
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 61d34ad2ca7d..74402d95a884 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -12,309 +12,17 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/netfs.h>
+#include <trace/events/netfs.h>
#include "internal.h"
-static int afs_writepages_region(struct address_space *mapping,
- struct writeback_control *wbc,
- loff_t start, loff_t end, loff_t *_next,
- bool max_one_loop);
-
-static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len,
- loff_t i_size, bool caching);
-
-#ifdef CONFIG_AFS_FSCACHE
-/*
- * Mark a page as having been made dirty and thus needing writeback. We also
- * need to pin the cache object to write back to.
- */
-bool afs_dirty_folio(struct address_space *mapping, struct folio *folio)
-{
- return fscache_dirty_folio(mapping, folio,
- afs_vnode_cache(AFS_FS_I(mapping->host)));
-}
-static void afs_folio_start_fscache(bool caching, struct folio *folio)
-{
- if (caching)
- folio_start_fscache(folio);
-}
-#else
-static void afs_folio_start_fscache(bool caching, struct folio *folio)
-{
-}
-#endif
-
-/*
- * Flush out a conflicting write. This may extend the write to the surrounding
- * pages if also dirty and contiguous to the conflicting region..
- */
-static int afs_flush_conflicting_write(struct address_space *mapping,
- struct folio *folio)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .range_start = folio_pos(folio),
- .range_end = LLONG_MAX,
- };
- loff_t next;
-
- return afs_writepages_region(mapping, &wbc, folio_pos(folio), LLONG_MAX,
- &next, true);
-}
-
-/*
- * prepare to perform part of a write to a page
- */
-int afs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct page **_page, void **fsdata)
-{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
- struct folio *folio;
- unsigned long priv;
- unsigned f, from;
- unsigned t, to;
- pgoff_t index;
- int ret;
-
- _enter("{%llx:%llu},%llx,%x",
- vnode->fid.vid, vnode->fid.vnode, pos, len);
-
- /* Prefetch area to be written into the cache if we're caching this
- * file. We need to do this before we get a lock on the page in case
- * there's more than one writer competing for the same cache block.
- */
- ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata);
- if (ret < 0)
- return ret;
-
- index = folio_index(folio);
- from = pos - index * PAGE_SIZE;
- to = from + len;
-
-try_again:
- /* See if this page is already partially written in a way that we can
- * merge the new write with.
- */
- if (folio_test_private(folio)) {
- priv = (unsigned long)folio_get_private(folio);
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- ASSERTCMP(f, <=, t);
-
- if (folio_test_writeback(folio)) {
- trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio);
- folio_unlock(folio);
- goto wait_for_writeback;
- }
- /* If the file is being filled locally, allow inter-write
- * spaces to be merged into writes. If it's not, only write
- * back what the user gives us.
- */
- if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) &&
- (to < f || from > t))
- goto flush_conflicting_write;
- }
-
- *_page = folio_file_page(folio, pos / PAGE_SIZE);
- _leave(" = 0");
- return 0;
-
- /* The previous write and this write aren't adjacent or overlapping, so
- * flush the page out.
- */
-flush_conflicting_write:
- trace_afs_folio_dirty(vnode, tracepoint_string("confl"), folio);
- folio_unlock(folio);
-
- ret = afs_flush_conflicting_write(mapping, folio);
- if (ret < 0)
- goto error;
-
-wait_for_writeback:
- ret = folio_wait_writeback_killable(folio);
- if (ret < 0)
- goto error;
-
- ret = folio_lock_killable(folio);
- if (ret < 0)
- goto error;
- goto try_again;
-
-error:
- folio_put(folio);
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
- * finalise part of a write to a page
- */
-int afs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *subpage, void *fsdata)
-{
- struct folio *folio = page_folio(subpage);
- struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
- unsigned long priv;
- unsigned int f, from = offset_in_folio(folio, pos);
- unsigned int t, to = from + copied;
- loff_t i_size, write_end_pos;
-
- _enter("{%llx:%llu},{%lx}",
- vnode->fid.vid, vnode->fid.vnode, folio_index(folio));
-
- if (!folio_test_uptodate(folio)) {
- if (copied < len) {
- copied = 0;
- goto out;
- }
-
- folio_mark_uptodate(folio);
- }
-
- if (copied == 0)
- goto out;
-
- write_end_pos = pos + copied;
-
- i_size = i_size_read(&vnode->netfs.inode);
- if (write_end_pos > i_size) {
- write_seqlock(&vnode->cb_lock);
- i_size = i_size_read(&vnode->netfs.inode);
- if (write_end_pos > i_size)
- afs_set_i_size(vnode, write_end_pos);
- write_sequnlock(&vnode->cb_lock);
- fscache_update_cookie(afs_vnode_cache(vnode), NULL, &write_end_pos);
- }
-
- if (folio_test_private(folio)) {
- priv = (unsigned long)folio_get_private(folio);
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- if (from < f)
- f = from;
- if (to > t)
- t = to;
- priv = afs_folio_dirty(folio, f, t);
- folio_change_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("dirty+"), folio);
- } else {
- priv = afs_folio_dirty(folio, from, to);
- folio_attach_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("dirty"), folio);
- }
-
- if (folio_mark_dirty(folio))
- _debug("dirtied %lx", folio_index(folio));
-
-out:
- folio_unlock(folio);
- folio_put(folio);
- return copied;
-}
-
-/*
- * kill all the pages in the given range
- */
-static void afs_kill_pages(struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("{%llx:%llu},%llx @%llx",
- vnode->fid.vid, vnode->fid.vnode, len, start);
-
- do {
- _debug("kill %lx (to %lx)", index, last);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = folio_next_index(folio);
-
- folio_clear_uptodate(folio);
- folio_end_writeback(folio);
- folio_lock(folio);
- generic_error_remove_folio(mapping, folio);
- folio_unlock(folio);
- folio_put(folio);
-
- } while (index = next, index <= last);
-
- _leave("");
-}
-
-/*
- * Redirty all the pages in a given range.
- */
-static void afs_redirty_pages(struct writeback_control *wbc,
- struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("{%llx:%llu},%llx @%llx",
- vnode->fid.vid, vnode->fid.vnode, len, start);
-
- do {
- _debug("redirty %llx @%llx", len, start);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = index + folio_nr_pages(folio);
- folio_redirty_for_writepage(wbc, folio);
- folio_end_writeback(folio);
- folio_put(folio);
- } while (index = next, index <= last);
-
- _leave("");
-}
-
/*
* completion of write to server
*/
static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len)
{
- struct address_space *mapping = vnode->netfs.inode.i_mapping;
- struct folio *folio;
- pgoff_t end;
-
- XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
-
_enter("{%llx:%llu},{%x @%llx}",
vnode->fid.vid, vnode->fid.vnode, len, start);
- rcu_read_lock();
-
- end = (start + len - 1) / PAGE_SIZE;
- xas_for_each(&xas, folio, end) {
- if (!folio_test_writeback(folio)) {
- kdebug("bad %x @%llx page %lx %lx",
- len, start, folio_index(folio), end);
- ASSERT(folio_test_writeback(folio));
- }
-
- trace_afs_folio_dirty(vnode, tracepoint_string("clear"), folio);
- folio_detach_private(folio);
- folio_end_writeback(folio);
- }
-
- rcu_read_unlock();
-
afs_prune_wb_keys(vnode);
_leave("");
}
@@ -451,363 +159,53 @@ try_next_key:
return afs_put_operation(op);
}
-/*
- * Extend the region to be written back to include subsequent contiguously
- * dirty pages if possible, but don't sleep while doing so.
- *
- * If this page holds new content, then we can include filler zeros in the
- * writeback.
- */
-static void afs_extend_writeback(struct address_space *mapping,
- struct afs_vnode *vnode,
- long *_count,
- loff_t start,
- loff_t max_len,
- bool new_content,
- bool caching,
- unsigned int *_len)
+static void afs_upload_to_server(struct netfs_io_subrequest *subreq)
{
- struct folio_batch fbatch;
- struct folio *folio;
- unsigned long priv;
- unsigned int psize, filler = 0;
- unsigned int f, t;
- loff_t len = *_len;
- pgoff_t index = (start + len) / PAGE_SIZE;
- bool stop = true;
- unsigned int i;
-
- XA_STATE(xas, &mapping->i_pages, index);
- folio_batch_init(&fbatch);
-
- do {
- /* Firstly, we gather up a batch of contiguous dirty pages
- * under the RCU read lock - but we can't clear the dirty flags
- * there if any of those pages are mapped.
- */
- rcu_read_lock();
-
- xas_for_each(&xas, folio, ULONG_MAX) {
- stop = true;
- if (xas_retry(&xas, folio))
- continue;
- if (xa_is_value(folio))
- break;
- if (folio_index(folio) != index)
- break;
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(&xas);
- continue;
- }
-
- /* Has the page moved or been split? */
- if (unlikely(folio != xas_reload(&xas))) {
- folio_put(folio);
- break;
- }
-
- if (!folio_trylock(folio)) {
- folio_put(folio);
- break;
- }
- if (!folio_test_dirty(folio) ||
- folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- break;
- }
-
- psize = folio_size(folio);
- priv = (unsigned long)folio_get_private(folio);
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- if (f != 0 && !new_content) {
- folio_unlock(folio);
- folio_put(folio);
- break;
- }
-
- len += filler + t;
- filler = psize - t;
- if (len >= max_len || *_count <= 0)
- stop = true;
- else if (t == psize || new_content)
- stop = false;
-
- index += folio_nr_pages(folio);
- if (!folio_batch_add(&fbatch, folio))
- break;
- if (stop)
- break;
- }
-
- if (!stop)
- xas_pause(&xas);
- rcu_read_unlock();
-
- /* Now, if we obtained any folios, we can shift them to being
- * writable and mark them for caching.
- */
- if (!folio_batch_count(&fbatch))
- break;
-
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- folio = fbatch.folios[i];
- trace_afs_folio_dirty(vnode, tracepoint_string("store+"), folio);
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- folio_start_writeback(folio);
- afs_folio_start_fscache(caching, folio);
-
- *_count -= folio_nr_pages(folio);
- folio_unlock(folio);
- }
+ struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
+ ssize_t ret;
- folio_batch_release(&fbatch);
- cond_resched();
- } while (!stop);
+ _enter("%x[%x],%zx",
+ subreq->rreq->debug_id, subreq->debug_index, subreq->io_iter.count);
- *_len = len;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ ret = afs_store_data(vnode, &subreq->io_iter, subreq->start,
+ subreq->rreq->origin == NETFS_LAUNDER_WRITE);
+ netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len,
+ false);
}
-/*
- * Synchronously write back the locked page and any subsequent non-locked dirty
- * pages.
- */
-static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
- struct writeback_control *wbc,
- struct folio *folio,
- loff_t start, loff_t end)
+static void afs_upload_to_server_worker(struct work_struct *work)
{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct iov_iter iter;
- unsigned long priv;
- unsigned int offset, to, len, max_len;
- loff_t i_size = i_size_read(&vnode->netfs.inode);
- bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
- bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode));
- long count = wbc->nr_to_write;
- int ret;
-
- _enter(",%lx,%llx-%llx", folio_index(folio), start, end);
-
- folio_start_writeback(folio);
- afs_folio_start_fscache(caching, folio);
-
- count -= folio_nr_pages(folio);
-
- /* Find all consecutive lockable dirty pages that have contiguous
- * written regions, stopping when we find a page that is not
- * immediately lockable, is not dirty or is missing, or we reach the
- * end of the range.
- */
- priv = (unsigned long)folio_get_private(folio);
- offset = afs_folio_dirty_from(folio, priv);
- to = afs_folio_dirty_to(folio, priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("store"), folio);
-
- len = to - offset;
- start += offset;
- if (start < i_size) {
- /* Trim the write to the EOF; the extra data is ignored. Also
- * put an upper limit on the size of a single storedata op.
- */
- max_len = 65536 * 4096;
- max_len = min_t(unsigned long long, max_len, end - start + 1);
- max_len = min_t(unsigned long long, max_len, i_size - start);
-
- if (len < max_len &&
- (to == folio_size(folio) || new_content))
- afs_extend_writeback(mapping, vnode, &count,
- start, max_len, new_content,
- caching, &len);
- len = min_t(loff_t, len, max_len);
- }
-
- /* We now have a contiguous set of dirty pages, each with writeback
- * set; the first page is still locked at this point, but all the rest
- * have been unlocked.
- */
- folio_unlock(folio);
-
- if (start < i_size) {
- _debug("write back %x @%llx [%llx]", len, start, i_size);
-
- /* Speculatively write to the cache. We have to fix this up
- * later if the store fails.
- */
- afs_write_to_cache(vnode, start, len, i_size, caching);
-
- iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len);
- ret = afs_store_data(vnode, &iter, start, false);
- } else {
- _debug("write discard %x @%llx [%llx]", len, start, i_size);
-
- /* The dirty region was entirely beyond the EOF. */
- fscache_clear_page_bits(mapping, start, len, caching);
- afs_pages_written_back(vnode, start, len);
- ret = 0;
- }
-
- switch (ret) {
- case 0:
- wbc->nr_to_write = count;
- ret = len;
- break;
+ struct netfs_io_subrequest *subreq =
+ container_of(work, struct netfs_io_subrequest, work);
- default:
- pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret);
- fallthrough;
- case -EACCES:
- case -EPERM:
- case -ENOKEY:
- case -EKEYEXPIRED:
- case -EKEYREJECTED:
- case -EKEYREVOKED:
- case -ENETRESET:
- afs_redirty_pages(wbc, mapping, start, len);
- mapping_set_error(mapping, ret);
- break;
-
- case -EDQUOT:
- case -ENOSPC:
- afs_redirty_pages(wbc, mapping, start, len);
- mapping_set_error(mapping, -ENOSPC);
- break;
-
- case -EROFS:
- case -EIO:
- case -EREMOTEIO:
- case -EFBIG:
- case -ENOENT:
- case -ENOMEDIUM:
- case -ENXIO:
- trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail);
- afs_kill_pages(mapping, start, len);
- mapping_set_error(mapping, ret);
- break;
- }
-
- _leave(" = %d", ret);
- return ret;
+ afs_upload_to_server(subreq);
}
/*
- * write a region of pages back to the server
+ * Set up write requests for a writeback slice. We need to add a write request
+ * for each write we want to make.
*/
-static int afs_writepages_region(struct address_space *mapping,
- struct writeback_control *wbc,
- loff_t start, loff_t end, loff_t *_next,
- bool max_one_loop)
+void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len)
{
- struct folio *folio;
- struct folio_batch fbatch;
- ssize_t ret;
- unsigned int i;
- int n, skips = 0;
-
- _enter("%llx,%llx,", start, end);
- folio_batch_init(&fbatch);
-
- do {
- pgoff_t index = start / PAGE_SIZE;
-
- n = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE,
- PAGECACHE_TAG_DIRTY, &fbatch);
-
- if (!n)
- break;
- for (i = 0; i < n; i++) {
- folio = fbatch.folios[i];
- start = folio_pos(folio); /* May regress with THPs */
-
- _debug("wback %lx", folio_index(folio));
-
- /* At this point we hold neither the i_pages lock nor the
- * page lock: the page may be truncated or invalidated
- * (changing page->mapping to NULL), or even swizzled
- * back from swapper_space to tmpfs file mapping
- */
-try_again:
- if (wbc->sync_mode != WB_SYNC_NONE) {
- ret = folio_lock_killable(folio);
- if (ret < 0) {
- folio_batch_release(&fbatch);
- return ret;
- }
- } else {
- if (!folio_trylock(folio))
- continue;
- }
-
- if (folio->mapping != mapping ||
- !folio_test_dirty(folio)) {
- start += folio_size(folio);
- folio_unlock(folio);
- continue;
- }
-
- if (folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- if (wbc->sync_mode != WB_SYNC_NONE) {
- folio_wait_writeback(folio);
-#ifdef CONFIG_AFS_FSCACHE
- folio_wait_fscache(folio);
-#endif
- goto try_again;
- }
-
- start += folio_size(folio);
- if (wbc->sync_mode == WB_SYNC_NONE) {
- if (skips >= 5 || need_resched()) {
- *_next = start;
- folio_batch_release(&fbatch);
- _leave(" = 0 [%llx]", *_next);
- return 0;
- }
- skips++;
- }
- continue;
- }
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- ret = afs_write_back_from_locked_folio(mapping, wbc,
- folio, start, end);
- if (ret < 0) {
- _leave(" = %zd", ret);
- folio_batch_release(&fbatch);
- return ret;
- }
-
- start += ret;
- }
+ struct netfs_io_subrequest *subreq;
- folio_batch_release(&fbatch);
- cond_resched();
- } while (wbc->nr_to_write > 0);
+ _enter("%x,%llx-%llx", wreq->debug_id, start, start + len);
- *_next = start;
- _leave(" = 0 [%llx]", *_next);
- return 0;
+ subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER,
+ start, len, afs_upload_to_server_worker);
+ if (subreq)
+ netfs_queue_write_request(subreq);
}
/*
* write some of the pending data back to the server
*/
-int afs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+int afs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- loff_t start, next;
int ret;
- _enter("");
-
/* We have to be careful as we can end up racing with setattr()
* truncating the pagecache since the caller doesn't take a lock here
* to prevent it.
@@ -817,69 +215,12 @@ int afs_writepages(struct address_space *mapping,
else if (!down_read_trylock(&vnode->validate_lock))
return 0;
- if (wbc->range_cyclic) {
- start = mapping->writeback_index * PAGE_SIZE;
- ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX,
- &next, false);
- if (ret == 0) {
- mapping->writeback_index = next / PAGE_SIZE;
- if (start > 0 && wbc->nr_to_write > 0) {
- ret = afs_writepages_region(mapping, wbc, 0,
- start, &next, false);
- if (ret == 0)
- mapping->writeback_index =
- next / PAGE_SIZE;
- }
- }
- } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX,
- &next, false);
- if (wbc->nr_to_write > 0 && ret == 0)
- mapping->writeback_index = next / PAGE_SIZE;
- } else {
- ret = afs_writepages_region(mapping, wbc,
- wbc->range_start, wbc->range_end,
- &next, false);
- }
-
+ ret = netfs_writepages(mapping, wbc);
up_read(&vnode->validate_lock);
- _leave(" = %d", ret);
return ret;
}
/*
- * write to an AFS file
- */
-ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
-{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
- struct afs_file *af = iocb->ki_filp->private_data;
- ssize_t result;
- size_t count = iov_iter_count(from);
-
- _enter("{%llx:%llu},{%zu},",
- vnode->fid.vid, vnode->fid.vnode, count);
-
- if (IS_SWAPFILE(&vnode->netfs.inode)) {
- printk(KERN_INFO
- "AFS: Attempt to write to active swap file!\n");
- return -EBUSY;
- }
-
- if (!count)
- return 0;
-
- result = afs_validate(vnode, af->key);
- if (result < 0)
- return result;
-
- result = generic_file_write_iter(iocb, from);
-
- _leave(" = %zd", result);
- return result;
-}
-
-/*
* flush any dirty pages for this process, and check for write errors.
* - the return status from this call provides a reliable indication of
* whether any write errors occurred for this process.
@@ -907,59 +248,11 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
{
- struct folio *folio = page_folio(vmf->page);
struct file *file = vmf->vma->vm_file;
- struct inode *inode = file_inode(file);
- struct afs_vnode *vnode = AFS_FS_I(inode);
- struct afs_file *af = file->private_data;
- unsigned long priv;
- vm_fault_t ret = VM_FAULT_RETRY;
-
- _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio));
-
- afs_validate(vnode, af->key);
- sb_start_pagefault(inode->i_sb);
-
- /* Wait for the page to be written to the cache before we allow it to
- * be modified. We then assume the entire page will need writing back.
- */
-#ifdef CONFIG_AFS_FSCACHE
- if (folio_test_fscache(folio) &&
- folio_wait_fscache_killable(folio) < 0)
- goto out;
-#endif
-
- if (folio_wait_writeback_killable(folio))
- goto out;
-
- if (folio_lock_killable(folio) < 0)
- goto out;
-
- /* We mustn't change folio->private until writeback is complete as that
- * details the portion of the page we need to write back and we might
- * need to redirty the page if there's a problem.
- */
- if (folio_wait_writeback_killable(folio) < 0) {
- folio_unlock(folio);
- goto out;
- }
-
- priv = afs_folio_dirty(folio, 0, folio_size(folio));
- priv = afs_folio_dirty_mmapped(priv);
- if (folio_test_private(folio)) {
- folio_change_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite+"), folio);
- } else {
- folio_attach_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite"), folio);
- }
- file_update_time(file);
-
- ret = VM_FAULT_LOCKED;
-out:
- sb_end_pagefault(inode->i_sb);
- return ret;
+ if (afs_validate(AFS_FS_I(file_inode(file)), afs_file_key(file)) < 0)
+ return VM_FAULT_SIGBUS;
+ return netfs_page_mkwrite(vmf, NULL);
}
/*
@@ -989,64 +282,3 @@ void afs_prune_wb_keys(struct afs_vnode *vnode)
afs_put_wb_key(wbk);
}
}
-
-/*
- * Clean up a page during invalidation.
- */
-int afs_launder_folio(struct folio *folio)
-{
- struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
- struct iov_iter iter;
- struct bio_vec bv;
- unsigned long priv;
- unsigned int f, t;
- int ret = 0;
-
- _enter("{%lx}", folio->index);
-
- priv = (unsigned long)folio_get_private(folio);
- if (folio_clear_dirty_for_io(folio)) {
- f = 0;
- t = folio_size(folio);
- if (folio_test_private(folio)) {
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- }
-
- bvec_set_folio(&bv, folio, t - f, f);
- iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, bv.bv_len);
-
- trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio);
- ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true);
- }
-
- trace_afs_folio_dirty(vnode, tracepoint_string("laundered"), folio);
- folio_detach_private(folio);
- folio_wait_fscache(folio);
- return ret;
-}
-
-/*
- * Deal with the completion of writing the data to the cache.
- */
-static void afs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct afs_vnode *vnode = priv;
-
- if (IS_ERR_VALUE(transferred_or_error) &&
- transferred_or_error != -ENOBUFS)
- afs_invalidate_cache(vnode, 0);
-}
-
-/*
- * Save the write to the cache also.
- */
-static void afs_write_to_cache(struct afs_vnode *vnode,
- loff_t start, size_t len, loff_t i_size,
- bool caching)
-{
- fscache_write_to_cache(afs_vnode_cache(vnode),
- vnode->netfs.inode.i_mapping, start, len, i_size,
- afs_write_to_cache_done, vnode, caching);
-}
diff --git a/fs/aio.c b/fs/aio.c
index bb2ff48991f3..9cdaa2faa536 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -589,13 +589,24 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
- struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
- struct kioctx *ctx = req->ki_ctx;
+ struct aio_kiocb *req;
+ struct kioctx *ctx;
unsigned long flags;
+ /*
+ * kiocb didn't come from aio or is neither a read nor a write, hence
+ * ignore it.
+ */
+ if (!(iocb->ki_flags & IOCB_AIO_RW))
+ return;
+
+ req = container_of(iocb, struct aio_kiocb, rw);
+
if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
return;
+ ctx = req->ki_ctx;
+
spin_lock_irqsave(&ctx->ctx_lock, flags);
list_add_tail(&req->ki_list, &ctx->active_reqs);
req->ki_cancel = cancel;
@@ -1509,7 +1520,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
- req->ki_flags = req->ki_filp->f_iocb_flags;
+ req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW;
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index d26222b7eefe..0496cb5b6eab 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -79,7 +79,7 @@ static struct file *__anon_inode_getfile(const char *name,
const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode,
- bool secure)
+ bool make_inode)
{
struct inode *inode;
struct file *file;
@@ -87,7 +87,7 @@ static struct file *__anon_inode_getfile(const char *name,
if (fops->owner && !try_module_get(fops->owner))
return ERR_PTR(-ENOENT);
- if (secure) {
+ if (make_inode) {
inode = anon_inode_make_secure_inode(name, context_inode);
if (IS_ERR(inode)) {
file = ERR_CAST(inode);
@@ -149,13 +149,10 @@ struct file *anon_inode_getfile(const char *name,
EXPORT_SYMBOL_GPL(anon_inode_getfile);
/**
- * anon_inode_getfile_secure - Like anon_inode_getfile(), but creates a new
+ * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new
* !S_PRIVATE anon inode rather than reuse the
* singleton anon inode and calls the
- * inode_init_security_anon() LSM hook. This
- * allows for both the inode to have its own
- * security context and for the LSM to enforce
- * policy on the inode's creation.
+ * inode_init_security_anon() LSM hook.
*
* @name: [in] name of the "class" of the new file
* @fops: [in] file operations for the new file
@@ -164,11 +161,21 @@ EXPORT_SYMBOL_GPL(anon_inode_getfile);
* @context_inode:
* [in] the logical relationship with the new inode (optional)
*
+ * Create a new anonymous inode and file pair. This can be done for two
+ * reasons:
+ *
+ * - for the inode to have its own security context, so that LSMs can enforce
+ * policy on the inode's creation;
+ *
+ * - if the caller needs a unique inode, for example in order to customize
+ * the size returned by fstat()
+ *
* The LSM may use @context_inode in inode_init_security_anon(), but a
- * reference to it is not held. Returns the newly created file* or an error
- * pointer. See the anon_inode_getfile() documentation for more information.
+ * reference to it is not held.
+ *
+ * Returns the newly created file* or an error pointer.
*/
-struct file *anon_inode_getfile_secure(const char *name,
+struct file *anon_inode_create_getfile(const char *name,
const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode)
@@ -176,12 +183,13 @@ struct file *anon_inode_getfile_secure(const char *name,
return __anon_inode_getfile(name, fops, priv, flags,
context_inode, true);
}
+EXPORT_SYMBOL_GPL(anon_inode_create_getfile);
static int __anon_inode_getfd(const char *name,
const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode,
- bool secure)
+ bool make_inode)
{
int error, fd;
struct file *file;
@@ -192,7 +200,7 @@ static int __anon_inode_getfd(const char *name,
fd = error;
file = __anon_inode_getfile(name, fops, priv, flags, context_inode,
- secure);
+ make_inode);
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto err_put_unused_fd;
@@ -231,10 +239,9 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
EXPORT_SYMBOL_GPL(anon_inode_getfd);
/**
- * anon_inode_getfd_secure - Like anon_inode_getfd(), but creates a new
+ * anon_inode_create_getfd - Like anon_inode_getfd(), but creates a new
* !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls
- * the inode_init_security_anon() LSM hook. This allows the inode to have its
- * own security context and for a LSM to reject creation of the inode.
+ * the inode_init_security_anon() LSM hook.
*
* @name: [in] name of the "class" of the new file
* @fops: [in] file operations for the new file
@@ -243,16 +250,26 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd);
* @context_inode:
* [in] the logical relationship with the new inode (optional)
*
+ * Create a new anonymous inode and file pair. This can be done for two
+ * reasons:
+ *
+ * - for the inode to have its own security context, so that LSMs can enforce
+ * policy on the inode's creation;
+ *
+ * - if the caller needs a unique inode, for example in order to customize
+ * the size returned by fstat()
+ *
* The LSM may use @context_inode in inode_init_security_anon(), but a
* reference to it is not held.
+ *
+ * Returns a newly created file descriptor or an error code.
*/
-int anon_inode_getfd_secure(const char *name, const struct file_operations *fops,
+int anon_inode_create_getfd(const char *name, const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode)
{
return __anon_inode_getfd(name, fops, priv, flags, context_inode, true);
}
-EXPORT_SYMBOL_GPL(anon_inode_getfd_secure);
static int __init anon_inode_init(void)
{
diff --git a/fs/attr.c b/fs/attr.c
index 5a13f0c8495f..49d23b5dbab4 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -352,7 +352,7 @@ int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
EXPORT_SYMBOL(may_setattr);
/**
- * notify_change - modify attributes of a filesytem object
+ * notify_change - modify attributes of a filesystem object
* @idmap: idmap of the mount the inode was found from
* @dentry: object affected
* @attr: new attributes
diff --git a/fs/backing-file.c b/fs/backing-file.c
index a681f38d84d8..740185198db3 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -325,9 +325,7 @@ EXPORT_SYMBOL_GPL(backing_file_mmap);
static int __init backing_aio_init(void)
{
- backing_aio_cachep = kmem_cache_create("backing_aio",
- sizeof(struct backing_aio),
- 0, SLAB_HWCACHE_ALIGN, NULL);
+ backing_aio_cachep = KMEM_CACHE(backing_aio, SLAB_HWCACHE_ALIGN);
if (!backing_aio_cachep)
return -ENOMEM;
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 7423a3557c68..1a05cecda7cc 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -27,7 +27,6 @@ bcachefs-y := \
checksum.o \
clock.o \
compress.o \
- counters.o \
darray.o \
debug.o \
dirent.o \
@@ -71,6 +70,7 @@ bcachefs-y := \
reflink.o \
replicas.o \
sb-clean.o \
+ sb-counters.o \
sb-downgrade.o \
sb-errors.o \
sb-members.o \
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index a09b9d00226a..fd3e175d8342 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -273,7 +273,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
c, err, alloc_key_dirty_sectors_0,
"data_type %s but dirty_sectors==0",
- bch2_data_types[a.v->data_type]);
+ bch2_data_type_str(a.v->data_type));
break;
case BCH_DATA_cached:
bkey_fsck_err_on(!a.v->cached_sectors ||
@@ -321,16 +321,12 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
{
struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
- unsigned i;
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_printf(out, "gen %u oldest_gen %u data_type %s",
- a->gen, a->oldest_gen,
- a->data_type < BCH_DATA_NR
- ? bch2_data_types[a->data_type]
- : "(invalid data type)");
+ prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
+ bch2_prt_data_type(out, a->data_type);
prt_newline(out);
prt_printf(out, "journal_seq %llu", a->journal_seq);
prt_newline(out);
@@ -353,23 +349,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, "fragmentation %llu", a->fragmentation_lru);
prt_newline(out);
prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
- prt_newline(out);
-
- if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
- struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
- const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);
-
- prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
- printbuf_indent_add(out, 2);
-
- for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
- prt_newline(out);
- bch2_backpointer_to_text(out, &bps[i]);
- }
-
- printbuf_indent_sub(out, 2);
- }
-
printbuf_indent_sub(out, 2);
}
@@ -839,7 +818,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
}
}
- if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) {
+ if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
u64 journal_seq = trans->journal_res.seq;
u64 bucket_journal_seq = new_a->journal_seq;
@@ -1625,13 +1604,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
return ret;
}
+struct discard_buckets_state {
+ u64 seen;
+ u64 open;
+ u64 need_journal_commit;
+ u64 discarded;
+ struct bch_dev *ca;
+ u64 need_journal_commit_this_dev;
+};
+
+static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
+{
+ if (s->ca == ca)
+ return;
+
+ if (s->ca && s->need_journal_commit_this_dev >
+ bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
+ bch2_journal_flush_async(&c->journal, NULL);
+
+ if (s->ca)
+ percpu_ref_put(&s->ca->ref);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ s->ca = ca;
+ s->need_journal_commit_this_dev = 0;
+}
+
static int bch2_discard_one_bucket(struct btree_trans *trans,
struct btree_iter *need_discard_iter,
struct bpos *discard_pos_done,
- u64 *seen,
- u64 *open,
- u64 *need_journal_commit,
- u64 *discarded)
+ struct discard_buckets_state *s)
{
struct bch_fs *c = trans->c;
struct bpos pos = need_discard_iter->pos;
@@ -1643,20 +1645,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
int ret = 0;
ca = bch_dev_bkey_exists(c, pos.inode);
+
if (!percpu_ref_tryget(&ca->io_ref)) {
bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
return 0;
}
+ discard_buckets_next_dev(c, s, ca);
+
if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
- (*open)++;
+ s->open++;
goto out;
}
if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
c->journal.flushed_seq_ondisk,
pos.inode, pos.offset)) {
- (*need_journal_commit)++;
+ s->need_journal_commit++;
+ s->need_journal_commit_this_dev++;
goto out;
}
@@ -1709,7 +1715,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
* This works without any other locks because this is the only
* thread that removes items from the need_discard tree
*/
- bch2_trans_unlock(trans);
+ bch2_trans_unlock_long(trans);
blkdev_issue_discard(ca->disk_sb.bdev,
k.k->p.offset * ca->mi.bucket_size,
ca->mi.bucket_size,
@@ -1732,9 +1738,9 @@ write:
goto out;
count_event(c, bucket_discard);
- (*discarded)++;
+ s->discarded++;
out:
- (*seen)++;
+ s->seen++;
bch2_trans_iter_exit(trans, &iter);
percpu_ref_put(&ca->io_ref);
printbuf_exit(&buf);
@@ -1744,7 +1750,7 @@ out:
static void bch2_do_discards_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
- u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
+ struct discard_buckets_state s = {};
struct bpos discard_pos_done = POS_MAX;
int ret;
@@ -1756,19 +1762,14 @@ static void bch2_do_discards_work(struct work_struct *work)
ret = bch2_trans_run(c,
for_each_btree_key(trans, iter,
BTREE_ID_need_discard, POS_MIN, 0, k,
- bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
- &seen,
- &open,
- &need_journal_commit,
- &discarded)));
-
- if (need_journal_commit * 2 > seen)
- bch2_journal_flush_async(&c->journal, NULL);
+ bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ discard_buckets_next_dev(c, &s, NULL);
- trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
+ trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
void bch2_do_discards(struct bch_fs *c)
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
new file mode 100644
index 000000000000..b4ec20be93b8
--- /dev/null
+++ b/fs/bcachefs/alloc_background_format.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
+#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
+
+struct bch_alloc {
+ struct bch_val v;
+ __u8 fields;
+ __u8 gen;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V1() \
+ x(read_time, 16) \
+ x(write_time, 16) \
+ x(data_type, 8) \
+ x(dirty_sectors, 16) \
+ x(cached_sectors, 16) \
+ x(oldest_gen, 8) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+enum {
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
+struct bch_alloc_v2 {
+ struct bch_val v;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V2() \
+ x(read_time, 64) \
+ x(write_time, 64) \
+ x(dirty_sectors, 32) \
+ x(cached_sectors, 32) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+struct bch_alloc_v3 {
+ struct bch_val v;
+ __le64 journal_seq;
+ __le32 flags;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
+
+struct bch_alloc_v4 {
+ struct bch_val v;
+ __u64 journal_seq;
+ __u32 flags;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 stripe_redundancy;
+ __u32 dirty_sectors;
+ __u32 cached_sectors;
+ __u64 io_time[2];
+ __u32 stripe;
+ __u32 nr_external_backpointers;
+ __u64 fragmentation_lru;
+} __packed __aligned(8);
+
+#define BCH_ALLOC_V4_U64s_V0 6
+#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
+
+BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
+BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
+BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
+
+#define KEY_TYPE_BUCKET_GENS_BITS 8
+#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
+#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
+
+struct bch_bucket_gens {
+ struct bch_val v;
+ u8 gens[KEY_TYPE_BUCKET_GENS_NR];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index b0ff47998a94..633d3223b353 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -1525,10 +1525,11 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str
unsigned data_type = ob->data_type;
barrier(); /* READ_ONCE() doesn't work on bitfields */
- prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
+ prt_printf(out, "%zu ref %u ",
ob - c->open_buckets,
- atomic_read(&ob->pin),
- data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
+ atomic_read(&ob->pin));
+ bch2_prt_data_type(out, data_type);
+ prt_printf(out, " %u:%llu gen %u allocated %u/%u",
ob->dev, ob->bucket, ob->gen,
ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
if (ob->ec)
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index e358a2ffffde..569b97904da4 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -68,9 +68,11 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer
void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
- prt_str(out, "bucket=");
- bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
- prt_str(out, " ");
+ if (bch2_dev_exists2(c, k.k->p.inode)) {
+ prt_str(out, "bucket=");
+ bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
+ prt_str(out, " ");
+ }
bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
}
@@ -400,13 +402,24 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
return ret;
}
+static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
+{
+ return bpos_eq(l.k->p, r.k->p) &&
+ bkey_bytes(l.k) == bkey_bytes(r.k) &&
+ !memcmp(l.v, r.v, bkey_val_bytes(l.k));
+}
+
+struct extents_to_bp_state {
+ struct bpos bucket_start;
+ struct bpos bucket_end;
+ struct bkey_buf last_flushed;
+};
+
static int check_bp_exists(struct btree_trans *trans,
+ struct extents_to_bp_state *s,
struct bpos bucket,
struct bch_backpointer bp,
- struct bkey_s_c orig_k,
- struct bpos bucket_start,
- struct bpos bucket_end,
- struct bkey_buf *last_flushed)
+ struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
struct btree_iter bp_iter = { NULL };
@@ -417,8 +430,8 @@ static int check_bp_exists(struct btree_trans *trans,
bch2_bkey_buf_init(&tmp);
- if (bpos_lt(bucket, bucket_start) ||
- bpos_gt(bucket, bucket_end))
+ if (bpos_lt(bucket, s->bucket_start) ||
+ bpos_gt(bucket, s->bucket_end))
return 0;
if (!bch2_dev_bucket_exists(c, bucket))
@@ -433,11 +446,9 @@ static int check_bp_exists(struct btree_trans *trans,
if (bp_k.k->type != KEY_TYPE_backpointer ||
memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
- if (!bpos_eq(orig_k.k->p, last_flushed->k->k.p) ||
- bkey_bytes(orig_k.k) != bkey_bytes(&last_flushed->k->k) ||
- memcmp(orig_k.v, &last_flushed->k->v, bkey_val_bytes(orig_k.k))) {
- bch2_bkey_buf_reassemble(&tmp, c, orig_k);
+ bch2_bkey_buf_reassemble(&tmp, c, orig_k);
+ if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) {
if (bp.level) {
bch2_trans_unlock(trans);
bch2_btree_interior_updates_flush(c);
@@ -447,7 +458,7 @@ static int check_bp_exists(struct btree_trans *trans,
if (ret)
goto err;
- bch2_bkey_buf_copy(last_flushed, c, tmp.k);
+ bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k);
ret = -BCH_ERR_transaction_restart_write_buffer_flush;
goto out;
}
@@ -475,10 +486,8 @@ missing:
}
static int check_extent_to_backpointers(struct btree_trans *trans,
+ struct extents_to_bp_state *s,
enum btree_id btree, unsigned level,
- struct bpos bucket_start,
- struct bpos bucket_end,
- struct bkey_buf *last_flushed,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
@@ -498,9 +507,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
bch2_extent_ptr_to_bp(c, btree, level,
k, p, &bucket_pos, &bp);
- ret = check_bp_exists(trans, bucket_pos, bp, k,
- bucket_start, bucket_end,
- last_flushed);
+ ret = check_bp_exists(trans, s, bucket_pos, bp, k);
if (ret)
return ret;
}
@@ -509,10 +516,8 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
}
static int check_btree_root_to_backpointers(struct btree_trans *trans,
+ struct extents_to_bp_state *s,
enum btree_id btree_id,
- struct bpos bucket_start,
- struct bpos bucket_end,
- struct bkey_buf *last_flushed,
int *level)
{
struct bch_fs *c = trans->c;
@@ -536,9 +541,7 @@ retry:
*level = b->c.level;
k = bkey_i_to_s_c(&b->key);
- ret = check_extent_to_backpointers(trans, btree_id, b->c.level + 1,
- bucket_start, bucket_end,
- last_flushed, k);
+ ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -559,7 +562,7 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
si_meminfo(&i);
mem_bytes = i.totalram * i.mem_unit;
- return div_u64(mem_bytes >> 1, btree_bytes(c));
+ return div_u64(mem_bytes >> 1, c->opts.btree_node_size);
}
static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
@@ -610,43 +613,35 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
}
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
- struct bpos bucket_start,
- struct bpos bucket_end)
+ struct extents_to_bp_state *s)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- enum btree_id btree_id;
- struct bkey_s_c k;
- struct bkey_buf last_flushed;
int ret = 0;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) {
+ for (enum btree_id btree_id = 0;
+ btree_id < btree_id_nr_alive(c);
+ btree_id++) {
int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
- check_btree_root_to_backpointers(trans, btree_id,
- bucket_start, bucket_end,
- &last_flushed, &level));
+ check_btree_root_to_backpointers(trans, s, btree_id, &level));
if (ret)
return ret;
while (level >= depth) {
+ struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
level,
BTREE_ITER_PREFETCH);
while (1) {
bch2_trans_begin(trans);
- k = bch2_btree_iter_peek(&iter);
+
+ struct bkey_s_c k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
ret = bkey_err(k) ?:
- check_extent_to_backpointers(trans, btree_id, level,
- bucket_start, bucket_end,
- &last_flushed, k) ?:
+ check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
@@ -668,7 +663,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
}
}
- bch2_bkey_buf_exit(&last_flushed, c);
return 0;
}
@@ -731,37 +725,43 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct bpos start = POS_MIN, end;
+ struct extents_to_bp_state s = { .bucket_start = POS_MIN };
int ret;
+ bch2_bkey_buf_init(&s.last_flushed);
+ bkey_init(&s.last_flushed.k->k);
+
while (1) {
- ret = bch2_get_alloc_in_memory_pos(trans, start, &end);
+ ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end);
if (ret)
break;
- if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX))
+ if ( bpos_eq(s.bucket_start, POS_MIN) &&
+ !bpos_eq(s.bucket_end, SPOS_MAX))
bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
__func__, btree_nodes_fit_in_ram(c));
- if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) {
+ if (!bpos_eq(s.bucket_start, POS_MIN) ||
+ !bpos_eq(s.bucket_end, SPOS_MAX)) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "check_extents_to_backpointers(): ");
- bch2_bpos_to_text(&buf, start);
+ bch2_bpos_to_text(&buf, s.bucket_start);
prt_str(&buf, "-");
- bch2_bpos_to_text(&buf, end);
+ bch2_bpos_to_text(&buf, s.bucket_end);
bch_verbose(c, "%s", buf.buf);
printbuf_exit(&buf);
}
- ret = bch2_check_extents_to_backpointers_pass(trans, start, end);
- if (ret || bpos_eq(end, SPOS_MAX))
+ ret = bch2_check_extents_to_backpointers_pass(trans, &s);
+ if (ret || bpos_eq(s.bucket_end, SPOS_MAX))
break;
- start = bpos_successor(end);
+ s.bucket_start = bpos_successor(s.bucket_end);
}
bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&s.last_flushed, c);
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 737e2396ade7..327365a9feac 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index dac383e37181..69d0d60d50e3 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -1204,11 +1204,6 @@ static inline unsigned block_sectors(const struct bch_fs *c)
return c->opts.block_size >> 9;
}
-static inline size_t btree_sectors(const struct bch_fs *c)
-{
- return c->opts.btree_node_size >> 9;
-}
-
static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
{
return c->btree_key_cache_btrees & (1U << btree);
@@ -1254,6 +1249,18 @@ static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
return stdio;
}
+static inline unsigned metadata_replicas_required(struct bch_fs *c)
+{
+ return min(c->opts.metadata_replicas,
+ c->opts.metadata_replicas_required);
+}
+
+static inline unsigned data_replicas_required(struct bch_fs *c)
+{
+ return min(c->opts.data_replicas,
+ c->opts.data_replicas_required);
+}
+
#define BKEY_PADDED_ONSTACK(key, pad) \
struct { struct bkey_i key; __u64 key ## _pad[pad]; }
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 0d5ac4184fbc..0668b682a21c 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -417,600 +417,12 @@ struct bch_set {
struct bch_val v;
};
-/* Extents */
-
-/*
- * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
- * preceded by checksum/compression information (bch_extent_crc32 or
- * bch_extent_crc64).
- *
- * One major determining factor in the format of extents is how we handle and
- * represent extents that have been partially overwritten and thus trimmed:
- *
- * If an extent is not checksummed or compressed, when the extent is trimmed we
- * don't have to remember the extent we originally allocated and wrote: we can
- * merely adjust ptr->offset to point to the start of the data that is currently
- * live. The size field in struct bkey records the current (live) size of the
- * extent, and is also used to mean "size of region on disk that we point to" in
- * this case.
- *
- * Thus an extent that is not checksummed or compressed will consist only of a
- * list of bch_extent_ptrs, with none of the fields in
- * bch_extent_crc32/bch_extent_crc64.
- *
- * When an extent is checksummed or compressed, it's not possible to read only
- * the data that is currently live: we have to read the entire extent that was
- * originally written, and then return only the part of the extent that is
- * currently live.
- *
- * Thus, in addition to the current size of the extent in struct bkey, we need
- * to store the size of the originally allocated space - this is the
- * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
- * when the extent is trimmed, instead of modifying the offset field of the
- * pointer, we keep a second smaller offset field - "offset into the original
- * extent of the currently live region".
- *
- * The other major determining factor is replication and data migration:
- *
- * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
- * write, we will initially write all the replicas in the same format, with the
- * same checksum type and compression format - however, when copygc runs later (or
- * tiering/cache promotion, anything that moves data), it is not in general
- * going to rewrite all the pointers at once - one of the replicas may be in a
- * bucket on one device that has very little fragmentation while another lives
- * in a bucket that has become heavily fragmented, and thus is being rewritten
- * sooner than the rest.
- *
- * Thus it will only move a subset of the pointers (or in the case of
- * tiering/cache promotion perhaps add a single pointer without dropping any
- * current pointers), and if the extent has been partially overwritten it must
- * write only the currently live portion (or copygc would not be able to reduce
- * fragmentation!) - which necessitates a different bch_extent_crc format for
- * the new pointer.
- *
- * But in the interests of space efficiency, we don't want to store one
- * bch_extent_crc for each pointer if we don't have to.
- *
- * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
- * bch_extent_ptrs appended arbitrarily one after the other. We determine the
- * type of a given entry with a scheme similar to utf8 (except we're encoding a
- * type, not a size), encoding the type in the position of the first set bit:
- *
- * bch_extent_crc32 - 0b1
- * bch_extent_ptr - 0b10
- * bch_extent_crc64 - 0b100
- *
- * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
- * bch_extent_crc64 is the least constrained).
- *
- * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
- * until the next bch_extent_crc32/64.
- *
- * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
- * is neither checksummed nor compressed.
- */
-
/* 128 bits, sufficient for cryptographic MACs: */
struct bch_csum {
__le64 lo;
__le64 hi;
} __packed __aligned(8);
-#define BCH_EXTENT_ENTRY_TYPES() \
- x(ptr, 0) \
- x(crc32, 1) \
- x(crc64, 2) \
- x(crc128, 3) \
- x(stripe_ptr, 4) \
- x(rebalance, 5)
-#define BCH_EXTENT_ENTRY_MAX 6
-
-enum bch_extent_entry_type {
-#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-/* Compressed/uncompressed size are stored biased by 1: */
-struct bch_extent_crc32 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u32 type:2,
- _compressed_size:7,
- _uncompressed_size:7,
- offset:7,
- _unused:1,
- csum_type:4,
- compression_type:4;
- __u32 csum;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u32 csum;
- __u32 compression_type:4,
- csum_type:4,
- _unused:1,
- offset:7,
- _uncompressed_size:7,
- _compressed_size:7,
- type:2;
-#endif
-} __packed __aligned(8);
-
-#define CRC32_SIZE_MAX (1U << 7)
-#define CRC32_NONCE_MAX 0
-
-struct bch_extent_crc64 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:3,
- _compressed_size:9,
- _uncompressed_size:9,
- offset:9,
- nonce:10,
- csum_type:4,
- compression_type:4,
- csum_hi:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 csum_hi:16,
- compression_type:4,
- csum_type:4,
- nonce:10,
- offset:9,
- _uncompressed_size:9,
- _compressed_size:9,
- type:3;
-#endif
- __u64 csum_lo;
-} __packed __aligned(8);
-
-#define CRC64_SIZE_MAX (1U << 9)
-#define CRC64_NONCE_MAX ((1U << 10) - 1)
-
-struct bch_extent_crc128 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:4,
- _compressed_size:13,
- _uncompressed_size:13,
- offset:13,
- nonce:13,
- csum_type:4,
- compression_type:4;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 compression_type:4,
- csum_type:4,
- nonce:13,
- offset:13,
- _uncompressed_size:13,
- _compressed_size:13,
- type:4;
-#endif
- struct bch_csum csum;
-} __packed __aligned(8);
-
-#define CRC128_SIZE_MAX (1U << 13)
-#define CRC128_NONCE_MAX ((1U << 13) - 1)
-
-/*
- * @reservation - pointer hasn't been written to, just reserved
- */
-struct bch_extent_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:1,
- cached:1,
- unused:1,
- unwritten:1,
- offset:44, /* 8 petabytes */
- dev:8,
- gen:8;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 gen:8,
- dev:8,
- offset:44,
- unwritten:1,
- unused:1,
- cached:1,
- type:1;
-#endif
-} __packed __aligned(8);
-
-struct bch_extent_stripe_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:5,
- block:8,
- redundancy:4,
- idx:47;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 idx:47,
- redundancy:4,
- block:8,
- type:5;
-#endif
-};
-
-struct bch_extent_rebalance {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:6,
- unused:34,
- compression:8, /* enum bch_compression_opt */
- target:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 target:16,
- compression:8,
- unused:34,
- type:6;
-#endif
-};
-
-union bch_extent_entry {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
- unsigned long type;
-#elif __BITS_PER_LONG == 32
- struct {
- unsigned long pad;
- unsigned long type;
- };
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define x(f, n) struct bch_extent_##f f;
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-struct bch_btree_ptr {
- struct bch_val v;
-
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-struct bch_btree_ptr_v2 {
- struct bch_val v;
-
- __u64 mem_ptr;
- __le64 seq;
- __le16 sectors_written;
- __le16 flags;
- struct bpos min_key;
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
-
-struct bch_extent {
- struct bch_val v;
-
- __u64 _data[0];
- union bch_extent_entry start[];
-} __packed __aligned(8);
-
-struct bch_reservation {
- struct bch_val v;
-
- __le32 generation;
- __u8 nr_replicas;
- __u8 pad[3];
-} __packed __aligned(8);
-
-/* Maximum size (in u64s) a single pointer could be: */
-#define BKEY_EXTENT_PTR_U64s_MAX\
- ((sizeof(struct bch_extent_crc128) + \
- sizeof(struct bch_extent_ptr)) / sizeof(__u64))
-
-/* Maximum possible size of an entire extent value: */
-#define BKEY_EXTENT_VAL_U64s_MAX \
- (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
-
-/* * Maximum possible size of an entire extent, key + value: */
-#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
-
-/* Btree pointers don't carry around checksums: */
-#define BKEY_BTREE_PTR_VAL_U64s_MAX \
- ((sizeof(struct bch_btree_ptr_v2) + \
- sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
-#define BKEY_BTREE_PTR_U64s_MAX \
- (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
-
-/* Inodes */
-
-#define BLOCKDEV_INODE_MAX 4096
-
-#define BCACHEFS_ROOT_INO 4096
-
-struct bch_inode {
- struct bch_val v;
-
- __le64 bi_hash_seed;
- __le32 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v2 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v3 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le64 bi_sectors;
- __le64 bi_size;
- __le64 bi_version;
- __u8 fields[];
-} __packed __aligned(8);
-
-#define INODEv3_FIELDS_START_INITIAL 6
-#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
-
-struct bch_inode_generation {
- struct bch_val v;
-
- __le32 bi_generation;
- __le32 pad;
-} __packed __aligned(8);
-
-/*
- * bi_subvol and bi_parent_subvol are only set for subvolume roots:
- */
-
-#define BCH_INODE_FIELDS_v2() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_size, 64) \
- x(bi_sectors, 64) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32)
-
-#define BCH_INODE_FIELDS_v3() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32) \
- x(bi_nocow, 8)
-
-/* subset of BCH_INODE_FIELDS */
-#define BCH_INODE_OPTS() \
- x(data_checksum, 8) \
- x(compression, 8) \
- x(project, 32) \
- x(background_compression, 8) \
- x(data_replicas, 8) \
- x(promote_target, 16) \
- x(foreground_target, 16) \
- x(background_target, 16) \
- x(erasure_code, 16) \
- x(nocow, 8)
-
-enum inode_opt_id {
-#define x(name, ...) \
- Inode_opt_##name,
- BCH_INODE_OPTS()
-#undef x
- Inode_opt_nr,
-};
-
-#define BCH_INODE_FLAGS() \
- x(sync, 0) \
- x(immutable, 1) \
- x(append, 2) \
- x(nodump, 3) \
- x(noatime, 4) \
- x(i_size_dirty, 5) \
- x(i_sectors_dirty, 6) \
- x(unlinked, 7) \
- x(backptr_untrusted, 8)
-
-/* bits 20+ reserved for packed fields below: */
-
-enum bch_inode_flags {
-#define x(t, n) BCH_INODE_##t = 1U << n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-enum __bch_inode_flags {
-#define x(t, n) __BCH_INODE_##t = n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
-
-LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
-LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
-LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_FIELDS_START,
- struct bch_inode_v3, bi_flags, 31, 36);
-LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
-
-/* Dirents */
-
-/*
- * Dirents (and xattrs) have to implement string lookups; since our b-tree
- * doesn't support arbitrary length strings for the key, we instead index by a
- * 64 bit hash (currently truncated sha1) of the string, stored in the offset
- * field of the key - using linear probing to resolve hash collisions. This also
- * provides us with the readdir cookie posix requires.
- *
- * Linear probing requires us to use whiteouts for deletions, in the event of a
- * collision:
- */
-
-struct bch_dirent {
- struct bch_val v;
-
- /* Target inode number: */
- union {
- __le64 d_inum;
- struct { /* DT_SUBVOL */
- __le32 d_child_subvol;
- __le32 d_parent_subvol;
- };
- };
-
- /*
- * Copy of mode bits 12-15 from the target inode - so userspace can get
- * the filetype without having to do a stat()
- */
- __u8 d_type;
-
- __u8 d_name[];
-} __packed __aligned(8);
-
-#define DT_SUBVOL 16
-#define BCH_DT_MAX 17
-
-#define BCH_NAME_MAX 512
-
-/* Xattrs */
-
-#define KEY_TYPE_XATTR_INDEX_USER 0
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
-#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
-#define KEY_TYPE_XATTR_INDEX_SECURITY 4
-
-struct bch_xattr {
- struct bch_val v;
- __u8 x_type;
- __u8 x_name_len;
- __le16 x_val_len;
- __u8 x_name[];
-} __packed __aligned(8);
-
-/* Bucket/allocation information: */
-
-struct bch_alloc {
- struct bch_val v;
- __u8 fields;
- __u8 gen;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V1() \
- x(read_time, 16) \
- x(write_time, 16) \
- x(data_type, 8) \
- x(dirty_sectors, 16) \
- x(cached_sectors, 16) \
- x(oldest_gen, 8) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
- BCH_ALLOC_FIELDS_V1()
-#undef x
-};
-
-struct bch_alloc_v2 {
- struct bch_val v;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V2() \
- x(read_time, 64) \
- x(write_time, 64) \
- x(dirty_sectors, 32) \
- x(cached_sectors, 32) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-struct bch_alloc_v3 {
- struct bch_val v;
- __le64 journal_seq;
- __le32 flags;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
-LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
-
-struct bch_alloc_v4 {
- struct bch_val v;
- __u64 journal_seq;
- __u32 flags;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 stripe_redundancy;
- __u32 dirty_sectors;
- __u32 cached_sectors;
- __u64 io_time[2];
- __u32 stripe;
- __u32 nr_external_backpointers;
- __u64 fragmentation_lru;
-} __packed __aligned(8);
-
-#define BCH_ALLOC_V4_U64s_V0 6
-#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
-
-BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
-BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
-BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
-BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
-
-#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40
-
struct bch_backpointer {
struct bch_val v;
__u8 btree_id;
@@ -1021,154 +433,6 @@ struct bch_backpointer {
struct bpos pos;
} __packed __aligned(8);
-#define KEY_TYPE_BUCKET_GENS_BITS 8
-#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
-#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
-
-struct bch_bucket_gens {
- struct bch_val v;
- u8 gens[KEY_TYPE_BUCKET_GENS_NR];
-} __packed __aligned(8);
-
-/* Quotas: */
-
-enum quota_types {
- QTYP_USR = 0,
- QTYP_GRP = 1,
- QTYP_PRJ = 2,
- QTYP_NR = 3,
-};
-
-enum quota_counters {
- Q_SPC = 0,
- Q_INO = 1,
- Q_COUNTERS = 2,
-};
-
-struct bch_quota_counter {
- __le64 hardlimit;
- __le64 softlimit;
-};
-
-struct bch_quota {
- struct bch_val v;
- struct bch_quota_counter c[Q_COUNTERS];
-} __packed __aligned(8);
-
-/* Erasure coding */
-
-struct bch_stripe {
- struct bch_val v;
- __le16 sectors;
- __u8 algorithm;
- __u8 nr_blocks;
- __u8 nr_redundant;
-
- __u8 csum_granularity_bits;
- __u8 csum_type;
- __u8 pad;
-
- struct bch_extent_ptr ptrs[];
-} __packed __aligned(8);
-
-/* Reflink: */
-
-struct bch_reflink_p {
- struct bch_val v;
- __le64 idx;
- /*
- * A reflink pointer might point to an indirect extent which is then
- * later split (by copygc or rebalance). If we only pointed to part of
- * the original indirect extent, and then one of the fragments is
- * outside the range we point to, we'd leak a refcount: so when creating
- * reflink pointers, we need to store pad values to remember the full
- * range we were taking a reference on.
- */
- __le32 front_pad;
- __le32 back_pad;
-} __packed __aligned(8);
-
-struct bch_reflink_v {
- struct bch_val v;
- __le64 refcount;
- union bch_extent_entry start[0];
- __u64 _data[];
-} __packed __aligned(8);
-
-struct bch_indirect_inline_data {
- struct bch_val v;
- __le64 refcount;
- u8 data[];
-};
-
-/* Inline data */
-
-struct bch_inline_data {
- struct bch_val v;
- u8 data[];
-};
-
-/* Subvolumes: */
-
-#define SUBVOL_POS_MIN POS(0, 1)
-#define SUBVOL_POS_MAX POS(0, S32_MAX)
-#define BCACHEFS_ROOT_SUBVOL 1
-
-struct bch_subvolume {
- struct bch_val v;
- __le32 flags;
- __le32 snapshot;
- __le64 inode;
- /*
- * Snapshot subvolumes form a tree, separate from the snapshot nodes
- * tree - if this subvolume is a snapshot, this is the ID of the
- * subvolume it was created from:
- */
- __le32 parent;
- __le32 pad;
- bch_le128 otime;
-};
-
-LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
-/*
- * We need to know whether a subvolume is a snapshot so we can know whether we
- * can delete it (or whether it should just be rm -rf'd)
- */
-LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
-LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
-
-/* Snapshots */
-
-struct bch_snapshot {
- struct bch_val v;
- __le32 flags;
- __le32 parent;
- __le32 children[2];
- __le32 subvol;
- /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
- __le32 tree;
- __le32 depth;
- __le32 skip[3];
-};
-
-LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
-
-/* True if a subvolume points to this snapshot node: */
-LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
-
-/*
- * Snapshot trees:
- *
- * The snapshot_trees btree gives us persistent indentifier for each tree of
- * bch_snapshot nodes, and allow us to record and easily find the root/master
- * subvolume that other snapshots were created from:
- */
-struct bch_snapshot_tree {
- struct bch_val v;
- __le32 master_subvol;
- __le32 root_snapshot;
-};
-
/* LRU btree: */
struct bch_lru {
@@ -1178,33 +442,6 @@ struct bch_lru {
#define LRU_ID_STRIPES (1U << 16)
-/* Logged operations btree: */
-
-struct bch_logged_op_truncate {
- struct bch_val v;
- __le32 subvol;
- __le32 pad;
- __le64 inum;
- __le64 new_i_size;
-};
-
-enum logged_op_finsert_state {
- LOGGED_OP_FINSERT_start,
- LOGGED_OP_FINSERT_shift_extents,
- LOGGED_OP_FINSERT_finish,
-};
-
-struct bch_logged_op_finsert {
- struct bch_val v;
- __u8 state;
- __u8 pad[3];
- __le32 subvol;
- __le64 inum;
- __le64 dst_offset;
- __le64 src_offset;
- __le64 pos;
-};
-
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -1230,6 +467,19 @@ struct bch_sb_field {
x(ext, 13) \
x(downgrade, 14)
+#include "alloc_background_format.h"
+#include "extents_format.h"
+#include "reflink_format.h"
+#include "ec_format.h"
+#include "inode_format.h"
+#include "dirent_format.h"
+#include "xattr_format.h"
+#include "quota_format.h"
+#include "logged_ops_format.h"
+#include "snapshot_format.h"
+#include "subvolume_format.h"
+#include "sb-counters_format.h"
+
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
BCH_SB_FIELDS()
@@ -1465,23 +715,6 @@ struct bch_sb_field_replicas {
struct bch_replicas_entry_v1 entries[];
} __packed __aligned(8);
-/* BCH_SB_FIELD_quota: */
-
-struct bch_sb_quota_counter {
- __le32 timelimit;
- __le32 warnlimit;
-};
-
-struct bch_sb_quota_type {
- __le64 flags;
- struct bch_sb_quota_counter c[Q_COUNTERS];
-};
-
-struct bch_sb_field_quota {
- struct bch_sb_field field;
- struct bch_sb_quota_type q[QTYP_NR];
-} __packed __aligned(8);
-
/* BCH_SB_FIELD_disk_groups: */
#define BCH_SB_LABEL_SIZE 32
@@ -1500,101 +733,6 @@ struct bch_sb_field_disk_groups {
struct bch_disk_group entries[];
} __packed __aligned(8);
-/* BCH_SB_FIELD_counters */
-
-#define BCH_PERSISTENT_COUNTERS() \
- x(io_read, 0) \
- x(io_write, 1) \
- x(io_move, 2) \
- x(bucket_invalidate, 3) \
- x(bucket_discard, 4) \
- x(bucket_alloc, 5) \
- x(bucket_alloc_fail, 6) \
- x(btree_cache_scan, 7) \
- x(btree_cache_reap, 8) \
- x(btree_cache_cannibalize, 9) \
- x(btree_cache_cannibalize_lock, 10) \
- x(btree_cache_cannibalize_lock_fail, 11) \
- x(btree_cache_cannibalize_unlock, 12) \
- x(btree_node_write, 13) \
- x(btree_node_read, 14) \
- x(btree_node_compact, 15) \
- x(btree_node_merge, 16) \
- x(btree_node_split, 17) \
- x(btree_node_rewrite, 18) \
- x(btree_node_alloc, 19) \
- x(btree_node_free, 20) \
- x(btree_node_set_root, 21) \
- x(btree_path_relock_fail, 22) \
- x(btree_path_upgrade_fail, 23) \
- x(btree_reserve_get_fail, 24) \
- x(journal_entry_full, 25) \
- x(journal_full, 26) \
- x(journal_reclaim_finish, 27) \
- x(journal_reclaim_start, 28) \
- x(journal_write, 29) \
- x(read_promote, 30) \
- x(read_bounce, 31) \
- x(read_split, 33) \
- x(read_retry, 32) \
- x(read_reuse_race, 34) \
- x(move_extent_read, 35) \
- x(move_extent_write, 36) \
- x(move_extent_finish, 37) \
- x(move_extent_fail, 38) \
- x(move_extent_start_fail, 39) \
- x(copygc, 40) \
- x(copygc_wait, 41) \
- x(gc_gens_end, 42) \
- x(gc_gens_start, 43) \
- x(trans_blocked_journal_reclaim, 44) \
- x(trans_restart_btree_node_reused, 45) \
- x(trans_restart_btree_node_split, 46) \
- x(trans_restart_fault_inject, 47) \
- x(trans_restart_iter_upgrade, 48) \
- x(trans_restart_journal_preres_get, 49) \
- x(trans_restart_journal_reclaim, 50) \
- x(trans_restart_journal_res_get, 51) \
- x(trans_restart_key_cache_key_realloced, 52) \
- x(trans_restart_key_cache_raced, 53) \
- x(trans_restart_mark_replicas, 54) \
- x(trans_restart_mem_realloced, 55) \
- x(trans_restart_memory_allocation_failure, 56) \
- x(trans_restart_relock, 57) \
- x(trans_restart_relock_after_fill, 58) \
- x(trans_restart_relock_key_cache_fill, 59) \
- x(trans_restart_relock_next_node, 60) \
- x(trans_restart_relock_parent_for_fill, 61) \
- x(trans_restart_relock_path, 62) \
- x(trans_restart_relock_path_intent, 63) \
- x(trans_restart_too_many_iters, 64) \
- x(trans_restart_traverse, 65) \
- x(trans_restart_upgrade, 66) \
- x(trans_restart_would_deadlock, 67) \
- x(trans_restart_would_deadlock_write, 68) \
- x(trans_restart_injected, 69) \
- x(trans_restart_key_cache_upgrade, 70) \
- x(trans_traverse_all, 71) \
- x(transaction_commit, 72) \
- x(write_super, 73) \
- x(trans_restart_would_deadlock_recursion_limit, 74) \
- x(trans_restart_write_buffer_flush, 75) \
- x(trans_restart_split_race, 76) \
- x(write_buffer_flush_slowpath, 77) \
- x(write_buffer_flush_sync, 78)
-
-enum bch_persistent_counters {
-#define x(t, n, ...) BCH_COUNTER_##t,
- BCH_PERSISTENT_COUNTERS()
-#undef x
- BCH_COUNTER_NR
-};
-
-struct bch_sb_field_counters {
- struct bch_sb_field field;
- __le64 d[];
-};
-
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index abdb05507d16..76e79a15ba08 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -33,7 +33,7 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out,
next_key_bits -= 64;
}
- bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits));
+ bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits));
if (!next_key_bits)
break;
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 761f5e33b1e6..5e52684764eb 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -63,8 +63,17 @@ static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
+static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k);
+
+ prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie));
+}
+
#define bch2_bkey_ops_cookie ((struct bkey_ops) { \
.key_invalid = key_type_cookie_invalid, \
+ .val_to_text = key_type_cookie_to_text, \
.min_val_size = 8, \
})
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index ee82283722b7..03efe8ee565a 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -83,9 +83,10 @@ enum btree_update_flags {
__BTREE_TRIGGER_NORUN,
__BTREE_TRIGGER_TRANSACTIONAL,
+ __BTREE_TRIGGER_ATOMIC,
+ __BTREE_TRIGGER_GC,
__BTREE_TRIGGER_INSERT,
__BTREE_TRIGGER_OVERWRITE,
- __BTREE_TRIGGER_GC,
__BTREE_TRIGGER_BUCKET_INVALIDATE,
};
@@ -107,6 +108,10 @@ enum btree_update_flags {
* causing us to go emergency read-only)
*/
#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL)
+#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC)
+
+/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */
+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
/* @new is entering the btree */
#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
@@ -114,9 +119,6 @@ enum btree_update_flags {
/* @old is leaving the btree */
#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
-/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */
-#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
-
/* signal from bucket invalidate path to alloc trigger */
#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 74bf8eb90a4c..3fd1085b6c61 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -720,7 +720,7 @@ static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
struct bkey_i min_key, max_key;
- unsigned j, cacheline = 1;
+ unsigned cacheline = 1;
t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
bset_ro_tree_capacity(b, t));
@@ -823,13 +823,12 @@ void bch2_bset_init_first(struct btree *b, struct bset *i)
set_btree_bset(b, t, i);
}
-void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
- struct btree_node_entry *bne)
+void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne)
{
struct bset *i = &bne->keys;
struct bset_tree *t;
- BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
+ BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b));
BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(b->nsets >= MAX_BSETS);
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 632c2b8c5460..79c77baaa383 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -264,8 +264,7 @@ static inline struct bset *bset_next_set(struct btree *b,
void bch2_btree_keys_init(struct btree *);
void bch2_bset_init_first(struct btree *, struct bset *);
-void bch2_bset_init_next(struct bch_fs *, struct btree *,
- struct btree_node_entry *);
+void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
void bch2_bset_insert(struct btree *, struct btree_node_iter *,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 8e2488a4b58d..d7c81beac14a 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
clear_btree_node_just_written(b);
- kvpfree(b->data, btree_bytes(c));
+ kvpfree(b->data, btree_buf_bytes(b));
b->data = NULL;
#ifdef __KERNEL__
kvfree(b->aux_data);
@@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
- b->data = kvpmalloc(btree_bytes(c), gfp);
+ b->data = kvpmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
@@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->aux_data = NULL;
#endif
if (!b->aux_data) {
- kvpfree(b->data, btree_bytes(c));
+ kvpfree(b->data, btree_buf_bytes(b));
b->data = NULL;
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
}
@@ -126,7 +126,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
bkey_btree_ptr_init(&b->key);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
- b->byte_order = ilog2(btree_bytes(c));
+ b->byte_order = ilog2(c->opts.btree_node_size);
return b;
}
@@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (c->verify_data)
list_move(&c->verify_data->list, &bc->live);
- kvpfree(c->verify_ondisk, btree_bytes(c));
+ kvpfree(c->verify_ondisk, c->opts.btree_node_size);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
@@ -1192,7 +1192,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
" failed unpacked %zu\n",
b->unpack_fn_len,
b->nr.live_u64s * sizeof(u64),
- btree_bytes(c) - sizeof(struct btree_node),
+ btree_buf_bytes(b) - sizeof(struct btree_node),
b->nr.live_u64s * 100 / btree_max_u64s(c),
b->sib_u64s[0],
b->sib_u64s[1],
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 4e1af5882052..6d33885fdbde 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -74,22 +74,27 @@ static inline bool btree_node_hashed(struct btree *b)
_iter = 0; _iter < (_tbl)->size; _iter++) \
rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
-static inline size_t btree_bytes(struct bch_fs *c)
+static inline size_t btree_buf_bytes(const struct btree *b)
{
- return c->opts.btree_node_size;
+ return 1UL << b->byte_order;
}
-static inline size_t btree_max_u64s(struct bch_fs *c)
+static inline size_t btree_buf_max_u64s(const struct btree *b)
{
- return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
+ return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64);
}
-static inline size_t btree_pages(struct bch_fs *c)
+static inline size_t btree_max_u64s(const struct bch_fs *c)
{
- return btree_bytes(c) / PAGE_SIZE;
+ return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64);
}
-static inline unsigned btree_blocks(struct bch_fs *c)
+static inline size_t btree_sectors(const struct bch_fs *c)
+{
+ return c->opts.btree_node_size >> SECTOR_SHIFT;
+}
+
+static inline unsigned btree_blocks(const struct bch_fs *c)
{
return btree_sectors(c) >> c->block_bits;
}
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 49b4ade758c3..1102995643b1 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -597,7 +597,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
@@ -615,7 +615,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
@@ -637,7 +637,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
@@ -649,7 +649,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
@@ -664,8 +664,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu different types of data in same bucket: %s, %s\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[g->data_type],
- bch2_data_types[data_type],
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (data_type == BCH_DATA_btree) {
@@ -1238,11 +1238,11 @@ static int bch2_gc_done(struct bch_fs *c,
for (i = 0; i < BCH_DATA_NR; i++) {
copy_dev_field(dev_usage_buckets_wrong,
- d[i].buckets, "%s buckets", bch2_data_types[i]);
+ d[i].buckets, "%s buckets", bch2_data_type_str(i));
copy_dev_field(dev_usage_sectors_wrong,
- d[i].sectors, "%s sectors", bch2_data_types[i]);
+ d[i].sectors, "%s sectors", bch2_data_type_str(i));
copy_dev_field(dev_usage_fragmented_wrong,
- d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+ d[i].fragmented, "%s fragmented", bch2_data_type_str(i));
}
}
@@ -1253,19 +1253,19 @@ static int bch2_gc_done(struct bch_fs *c,
bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
copy_fs_field(fs_usage_hidden_wrong,
- hidden, "hidden");
+ b.hidden, "hidden");
copy_fs_field(fs_usage_btree_wrong,
- btree, "btree");
+ b.btree, "btree");
if (!metadata_only) {
copy_fs_field(fs_usage_data_wrong,
- data, "data");
+ b.data, "data");
copy_fs_field(fs_usage_cached_wrong,
- cached, "cached");
+ b.cached, "cached");
copy_fs_field(fs_usage_reserved_wrong,
- reserved, "reserved");
+ b.reserved, "reserved");
copy_fs_field(fs_usage_nr_inodes_wrong,
- nr_inodes,"nr_inodes");
+ b.nr_inodes,"nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
copy_fs_field(fs_usage_persistent_reserved_wrong,
@@ -1417,8 +1417,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
": got %s, should be %s",
iter->pos.inode, iter->pos.offset,
gc.gen,
- bch2_data_types[new.data_type],
- bch2_data_types[gc.data_type]))
+ bch2_data_type_str(new.data_type),
+ bch2_data_type_str(gc.data_type)))
new.data_type = gc.data_type;
#define copy_bucket_field(_errtype, _f) \
@@ -1428,7 +1428,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
gc.gen, \
- bch2_data_types[gc.data_type], \
+ bch2_data_type_str(gc.data_type), \
new._f, gc._f)) \
new._f = gc._f; \
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 33db48e2153f..aa9b6cbe3226 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -112,7 +112,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
unsigned flags = memalloc_nofs_save();
void *p;
- BUG_ON(size > btree_bytes(c));
+ BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
@@ -174,8 +174,8 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
- for (k = unwritten_whiteouts_start(c, b);
- k != unwritten_whiteouts_end(c, b);
+ for (k = unwritten_whiteouts_start(b);
+ k != unwritten_whiteouts_end(b);
k = bkey_p_next(k))
*--ptrs = k;
@@ -192,7 +192,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
verify_no_dups(b, new_whiteouts,
(void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
- memcpy_u64s(unwritten_whiteouts_start(c, b),
+ memcpy_u64s(unwritten_whiteouts_start(b),
new_whiteouts, b->whiteout_u64s);
btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
@@ -313,7 +313,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
}
bytes = sorting_entire_node
- ? btree_bytes(c)
+ ? btree_buf_bytes(b)
: __vstruct_bytes(struct btree_node, u64s);
out = btree_bounce_alloc(c, bytes, &used_mempool);
@@ -338,7 +338,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
if (sorting_entire_node) {
u64s = le16_to_cpu(out->keys.u64s);
- BUG_ON(bytes != btree_bytes(c));
+ BUG_ON(bytes != btree_buf_bytes(b));
/*
* Our temporary buffer is the same size as the btree node's
@@ -502,7 +502,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(c, b, bne);
+ bch2_bset_init_next(b, bne);
bch2_btree_build_aux_trees(b);
@@ -1160,7 +1160,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ptr_written, b->written);
} else {
for (bne = write_block(b);
- bset_byte_offset(b, bne) < btree_bytes(c);
+ bset_byte_offset(b, bne) < btree_buf_bytes(b);
bne = (void *) bne + block_bytes(c))
btree_err_on(bne->keys.seq == b->data->keys.seq &&
!bch2_journal_seq_is_blacklisted(c,
@@ -1172,7 +1172,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"found bset signature after last bset");
}
- sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
+ sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
sorted->keys.u64s = 0;
set_btree_bset(b, b->set, &b->data->keys);
@@ -1188,7 +1188,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(b->nr.live_u64s != u64s);
- btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
+ btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
if (updated_range)
bch2_btree_node_drop_keys_outside_node(b);
@@ -1284,7 +1284,7 @@ static void btree_node_read_work(struct work_struct *work)
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
- bio->bi_iter.bi_size = btree_bytes(c);
+ bio->bi_iter.bi_size = btree_buf_bytes(b);
if (rb->have_ioref) {
bio_set_dev(bio, ca->disk_sb.bdev);
@@ -1512,7 +1512,7 @@ fsck_err:
}
if (best >= 0) {
- memcpy(b->data, ra->buf[best], btree_bytes(c));
+ memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
} else {
ret = -1;
@@ -1578,7 +1578,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
for (i = 0; i < ra->nr; i++) {
ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
ra->bio[i] = bio_alloc_bioset(NULL,
- buf_pages(ra->buf[i], btree_bytes(c)),
+ buf_pages(ra->buf[i], btree_buf_bytes(b)),
REQ_OP_READ|REQ_SYNC|REQ_META,
GFP_NOFS,
&c->btree_bio);
@@ -1598,7 +1598,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
rb->pick = pick;
rb->bio.bi_iter.bi_sector = pick.ptr.offset;
rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
- bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
+ bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
@@ -1665,7 +1665,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
bio = bio_alloc_bioset(NULL,
- buf_pages(b->data, btree_bytes(c)),
+ buf_pages(b->data, btree_buf_bytes(b)),
REQ_OP_READ|REQ_SYNC|REQ_META,
GFP_NOFS,
&c->btree_bio);
@@ -1679,7 +1679,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_end_io = btree_node_read_endio;
- bch2_bio_map(bio, b->data, btree_bytes(c));
+ bch2_bio_map(bio, b->data, btree_buf_bytes(b));
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
@@ -2074,8 +2074,8 @@ do_write:
i->u64s = 0;
sort_iter_add(&sort_iter.iter,
- unwritten_whiteouts_start(c, b),
- unwritten_whiteouts_end(c, b));
+ unwritten_whiteouts_start(b),
+ unwritten_whiteouts_end(b));
SET_BSET_SEPARATE_WHITEOUTS(i, false);
b->whiteout_u64s = 0;
@@ -2251,7 +2251,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(c, b, bne);
+ bch2_bset_init_next(b, bne);
bch2_btree_build_aux_trees(b);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index fa298289e016..3ef338df82f5 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -1337,7 +1337,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in
if (path->should_be_locked &&
!trans->restarted &&
- (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
+ (!dup || !bch2_btree_path_relock_norestart(trans, dup)))
return;
if (dup) {
@@ -2156,7 +2156,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* isn't monotonically increasing before FILTER_SNAPSHOTS, and
* that's what we check against in extents mode:
*/
- if (k.k->p.inode > end.inode)
+ if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bkey_gt(k.k->p, end)
+ : k.k->p.inode > end.inode))
goto end;
if (iter->update_path &&
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index da2b74fa63fc..24772538e4cc 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -819,6 +819,11 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
+/*
+ * This should not be used in a fastpath, without first trying _do in
+ * nonblocking mode - it will cause excessive transaction restarts and
+ * potentially livelocking:
+ */
#define drop_locks_do(_trans, _do) \
({ \
bch2_trans_unlock(_trans); \
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 2d1c95c42f24..684397442338 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -92,7 +92,7 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
continue;
bch2_btree_trans_to_text(out, i->trans);
- bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1);
+ bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT);
}
}
@@ -227,7 +227,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
prt_printf(&buf, "backtrace:");
prt_newline(&buf);
printbuf_indent_add(&buf, 2);
- bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2);
+ bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
printbuf_indent_sub(&buf, 2);
prt_newline(&buf);
}
@@ -631,8 +631,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans,
}
__flatten
-bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
- struct btree_path *path, unsigned long trace_ip)
+bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
{
struct get_locks_fail f;
@@ -642,7 +641,7 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
int __bch2_btree_path_relock(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
- if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
+ if (!bch2_btree_path_relock_norestart(trans, path)) {
trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
}
@@ -759,12 +758,39 @@ int bch2_trans_relock(struct btree_trans *trans)
if (unlikely(trans->restarted))
return -((int) trans->restarted);
- trans_for_each_path(trans, path, i)
+ trans_for_each_path(trans, path, i) {
+ struct get_locks_fail f;
+
if (path->should_be_locked &&
- !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
- trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
+ !btree_path_get_locks(trans, path, false, &f)) {
+ if (trace_trans_restart_relock_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_printf(&buf, " l=%u seq=%u node seq=",
+ f.l, path->l[f.l].lock_seq);
+ if (IS_ERR_OR_NULL(f.b)) {
+ prt_str(&buf, bch2_err_str(PTR_ERR(f.b)));
+ } else {
+ prt_printf(&buf, "%u", f.b->c.lock.seq);
+
+ struct six_lock_count c =
+ bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l);
+ prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+
+ c = six_lock_counts(&f.b->c.lock);
+ prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+ }
+
+ trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ count_event(trans->c, trans_restart_relock);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
+ }
+
return 0;
}
@@ -778,7 +804,7 @@ int bch2_trans_relock_notrace(struct btree_trans *trans)
trans_for_each_path(trans, path, i)
if (path->should_be_locked &&
- !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+ !bch2_btree_path_relock_norestart(trans, path)) {
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
return 0;
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index cc5500a957a1..4bd72c855da1 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -312,8 +312,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *,
/* relock: */
-bool bch2_btree_path_relock_norestart(struct btree_trans *,
- struct btree_path *, unsigned long);
+bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *);
int __bch2_btree_path_relock(struct btree_trans *,
struct btree_path *, unsigned long);
@@ -353,12 +352,6 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
/* upgrade */
-
-struct get_locks_fail {
- unsigned l;
- struct btree *b;
-};
-
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
struct btree_path *, unsigned,
struct get_locks_fail *);
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 90eb8065ff2d..30d69a6d133e 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -139,8 +139,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
- EBUG_ON(insert->k.u64s >
- bch_btree_keys_u64s_remaining(trans->c, b));
+ EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
k = bch2_btree_node_iter_peek_all(node_iter, b);
@@ -160,7 +159,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
k->type = KEY_TYPE_deleted;
if (k->needs_whiteout)
- push_whiteout(trans->c, b, insert->k.p);
+ push_whiteout(b, insert->k.p);
k->needs_whiteout = false;
if (k >= btree_bset_last(b)->start) {
@@ -348,9 +347,7 @@ static noinline void journal_transaction_name(struct btree_trans *trans)
static inline int btree_key_can_insert(struct btree_trans *trans,
struct btree *b, unsigned u64s)
{
- struct bch_fs *c = trans->c;
-
- if (!bch2_btree_node_insert_fits(c, b, u64s))
+ if (!bch2_btree_node_insert_fits(b, u64s))
return -BCH_ERR_btree_insert_btree_node_full;
return 0;
@@ -418,7 +415,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
return 0;
new_u64s = roundup_pow_of_two(u64s);
- new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_k))
return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
@@ -448,9 +445,6 @@ static int run_one_mem_trigger(struct btree_trans *trans,
if (unlikely(flags & BTREE_TRIGGER_NORUN))
return 0;
- if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
- return 0;
-
if (old_ops->trigger == new_ops->trigger) {
ret = bch2_key_trigger(trans, i->btree_id, i->level,
old, bkey_i_to_s(new),
@@ -586,9 +580,6 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
trans_for_each_update(trans, i) {
/*
* XXX: synchronization of cached update triggers with gc
@@ -596,14 +587,15 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
*/
BUG_ON(i->cached || i->level);
- if (gc_visited(c, gc_pos_btree_node(insert_l(trans, i)->b))) {
- ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
+ if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) &&
+ gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) {
+ int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
if (ret)
- break;
+ return ret;
}
}
- return ret;
+ return 0;
}
static inline int
@@ -680,6 +672,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
return -BCH_ERR_btree_insert_need_mark_replicas;
+ /* XXX: we only want to run this if deltas are nonzero */
+ bch2_trans_account_disk_usage_change(trans);
+
h = trans->hooks;
while (h) {
ret = h->fn(trans, h);
@@ -689,8 +684,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
}
trans_for_each_update(trans, i)
- if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
- ret = run_one_mem_trigger(trans, i, i->flags);
+ if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) {
+ ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags);
if (ret)
goto fatal_err;
}
@@ -994,6 +989,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
!trans->journal_entries_u64s)
goto out_reset;
+ memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
+
ret = bch2_trans_commit_run_triggers(trans);
if (ret)
goto out_reset;
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index d530307046f4..4a5a64499eb7 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -430,6 +430,9 @@ struct btree_trans {
struct journal_res journal_res;
u64 *journal_seq;
struct disk_reservation *disk_res;
+
+ struct bch_fs_usage_base fs_usage_delta;
+
unsigned journal_u64s;
unsigned extra_disk_res; /* XXX kill */
struct replicas_delta_list *fs_usage_deltas;
@@ -653,7 +656,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
BIT_ULL(BKEY_TYPE_reflink)| \
BIT_ULL(BKEY_TYPE_btree))
-#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
+#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \
(BIT_ULL(BKEY_TYPE_alloc)| \
BIT_ULL(BKEY_TYPE_inodes)| \
BIT_ULL(BKEY_TYPE_stripes)| \
@@ -661,7 +664,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
#define BTREE_NODE_TYPE_HAS_TRIGGERS \
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
- BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+ BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
@@ -738,4 +741,9 @@ enum btree_node_sibling {
btree_next_sib,
};
+struct get_locks_fail {
+ unsigned l;
+ struct btree *b;
+};
+
#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 44f9dfa28a09..4530b14ff2c3 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -159,7 +159,7 @@ static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
{
size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
- return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
+ return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
}
/* Btree node freeing/allocation: */
@@ -280,7 +280,8 @@ retry:
writepoint_ptr(&c->btree_write_point),
&devs_have,
res->nr_replicas,
- c->opts.metadata_replicas_required,
+ min(res->nr_replicas,
+ c->opts.metadata_replicas_required),
watermark, 0, cl, &wp);
if (unlikely(ret))
return ERR_PTR(ret);
@@ -1097,7 +1098,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* Always check for space for two keys, even if we won't have to
* split at prior level - it might have been a merge instead:
*/
- if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
+ if (bch2_btree_node_insert_fits(path->l[update_level].b,
BKEY_BTREE_PTR_U64s_MAX * 2))
break;
@@ -1401,7 +1402,7 @@ static void __btree_split_node(struct btree_update *as,
unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
nr_keys[i].val_u64s;
- if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c))
+ if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
n[i]->data->format = b->format;
btree_node_set_format(n[i], n[i]->data->format);
@@ -1703,7 +1704,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_btree_node_prep_for_write(trans, path, b);
- if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
+ if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
bch2_btree_node_unlock_write(trans, path, b);
goto split;
}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index adfc62083844..c593c925d1e3 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -184,21 +184,19 @@ static inline void btree_node_reset_sib_u64s(struct btree *b)
b->sib_u64s[1] = b->nr.live_u64s;
}
-static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
+static inline void *btree_data_end(struct btree *b)
{
- return (void *) b->data + btree_bytes(c);
+ return (void *) b->data + btree_buf_bytes(b);
}
-static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
- struct btree *b)
+static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
{
- return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+ return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
}
-static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
- struct btree *b)
+static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
{
- return btree_data_end(c, b);
+ return btree_data_end(b);
}
static inline void *write_block(struct btree *b)
@@ -221,13 +219,11 @@ static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
return __btree_addr_written(b, k);
}
-static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
- struct btree *b,
- void *end)
+static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
{
ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
b->whiteout_u64s;
- ssize_t total = c->opts.btree_node_size >> 3;
+ ssize_t total = btree_buf_bytes(b) >> 3;
/* Always leave one extra u64 for bch2_varint_decode: */
used++;
@@ -235,10 +231,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
return total - used;
}
-static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
- struct btree *b)
+static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
{
- ssize_t remaining = __bch_btree_u64s_remaining(c, b,
+ ssize_t remaining = __bch2_btree_u64s_remaining(b,
btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(remaining < 0);
@@ -260,14 +255,13 @@ static inline unsigned btree_write_set_buffer(struct btree *b)
return 8 << BTREE_WRITE_SET_U64s_BITS;
}
-static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
- struct btree *b)
+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
{
struct bset_tree *t = bset_tree_last(b);
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
ssize_t remaining_space =
- __bch_btree_u64s_remaining(c, b, bne->keys.start);
+ __bch2_btree_u64s_remaining(b, bne->keys.start);
if (unlikely(bset_written(b, bset(b, t)))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
@@ -281,12 +275,11 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
return NULL;
}
-static inline void push_whiteout(struct bch_fs *c, struct btree *b,
- struct bpos pos)
+static inline void push_whiteout(struct btree *b, struct bpos pos)
{
struct bkey_packed k;
- BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
+ BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
EBUG_ON(btree_node_just_written(b));
if (!bkey_pack_pos(&k, pos, b)) {
@@ -299,20 +292,19 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
k.needs_whiteout = true;
b->whiteout_u64s += k.u64s;
- bkey_p_copy(unwritten_whiteouts_start(c, b), &k);
+ bkey_p_copy(unwritten_whiteouts_start(b), &k);
}
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)
*/
-static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
- struct btree *b, unsigned u64s)
+static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
{
if (unlikely(btree_node_need_rewrite(b)))
return false;
- return u64s <= bch_btree_keys_u64s_remaining(c, b);
+ return u64s <= bch2_btree_keys_u64s_remaining(b);
}
void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 5c1169c78daf..ac7844861966 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -125,13 +125,12 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
struct btree_write_buffered_key *wb,
bool *write_locked, size_t *fast)
{
- struct bch_fs *c = trans->c;
struct btree_path *path;
int ret;
EBUG_ON(!wb->journal_seq);
- EBUG_ON(!c->btree_write_buffer.flushing.pin.seq);
- EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
+ EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
+ EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
ret = bch2_btree_iter_traverse(iter);
if (ret)
@@ -155,7 +154,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
*write_locked = true;
}
- if (unlikely(!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s))) {
+ if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) {
*write_locked = false;
return wb_flush_one_slowpath(trans, iter, wb);
}
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index d83ea0e53df3..54f7826ac498 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -25,7 +25,7 @@
#include <linux/preempt.h>
-static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
enum bch_data_type data_type,
s64 sectors)
{
@@ -54,20 +54,20 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
bch2_fs_usage_acc_to_base(c, i);
for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++)
- usage->reserved += usage->persistent_reserved[i];
+ usage->b.reserved += usage->persistent_reserved[i];
for (unsigned i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
- fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
+ fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]);
}
for_each_member_device(c, ca) {
struct bch_dev_usage dev = bch2_dev_usage_read(ca);
- usage->hidden += (dev.d[BCH_DATA_sb].buckets +
- dev.d[BCH_DATA_journal].buckets) *
+ usage->b.hidden += (dev.d[BCH_DATA_sb].buckets +
+ dev.d[BCH_DATA_journal].buckets) *
ca->mi.bucket_size;
}
@@ -188,15 +188,15 @@ void bch2_fs_usage_to_text(struct printbuf *out,
prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
prt_printf(out, "hidden:\t\t\t\t%llu\n",
- fs_usage->u.hidden);
+ fs_usage->u.b.hidden);
prt_printf(out, "data:\t\t\t\t%llu\n",
- fs_usage->u.data);
+ fs_usage->u.b.data);
prt_printf(out, "cached:\t\t\t\t%llu\n",
- fs_usage->u.cached);
+ fs_usage->u.b.cached);
prt_printf(out, "reserved:\t\t\t%llu\n",
- fs_usage->u.reserved);
+ fs_usage->u.b.reserved);
prt_printf(out, "nr_inodes:\t\t\t%llu\n",
- fs_usage->u.nr_inodes);
+ fs_usage->u.b.nr_inodes);
prt_printf(out, "online reserved:\t\t%llu\n",
fs_usage->online_reserved);
@@ -225,10 +225,10 @@ static u64 reserve_factor(u64 r)
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
{
- return min(fs_usage->u.hidden +
- fs_usage->u.btree +
- fs_usage->u.data +
- reserve_factor(fs_usage->u.reserved +
+ return min(fs_usage->u.b.hidden +
+ fs_usage->u.b.btree +
+ fs_usage->u.b.data +
+ reserve_factor(fs_usage->u.b.reserved +
fs_usage->online_reserved),
c->capacity);
}
@@ -240,17 +240,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c)
u64 data, reserved;
ret.capacity = c->capacity -
- bch2_fs_usage_read_one(c, &c->usage_base->hidden);
+ bch2_fs_usage_read_one(c, &c->usage_base->b.hidden);
- data = bch2_fs_usage_read_one(c, &c->usage_base->data) +
- bch2_fs_usage_read_one(c, &c->usage_base->btree);
- reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
+ data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) +
+ bch2_fs_usage_read_one(c, &c->usage_base->b.btree);
+ reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) +
percpu_u64_get(c->online_reserved);
ret.used = min(ret.capacity, data + reserve_factor(reserved));
ret.free = ret.capacity - ret.used;
- ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
+ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes);
return ret;
}
@@ -284,7 +284,7 @@ void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
prt_newline(out);
for (unsigned i = 0; i < BCH_DATA_NR; i++) {
- prt_str(out, bch2_data_types[i]);
+ bch2_prt_data_type(out, i);
prt_tab(out);
prt_u64(out, usage->d[i].buckets);
prt_tab_rjust(out);
@@ -308,9 +308,9 @@ void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
fs_usage = fs_usage_ptr(c, journal_seq, gc);
if (data_type_is_hidden(old->data_type))
- fs_usage->hidden -= ca->mi.bucket_size;
+ fs_usage->b.hidden -= ca->mi.bucket_size;
if (data_type_is_hidden(new->data_type))
- fs_usage->hidden += ca->mi.bucket_size;
+ fs_usage->b.hidden += ca->mi.bucket_size;
u = dev_usage_ptr(ca, journal_seq, gc);
@@ -359,7 +359,7 @@ static inline int __update_replicas(struct bch_fs *c,
if (idx < 0)
return -1;
- fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+ fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
return 0;
}
@@ -394,7 +394,7 @@ int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k,
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
- fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+ fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
preempt_enable();
err:
@@ -523,8 +523,8 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (bch2_fs_inconsistent_on(g->data_type &&
g->data_type != data_type, c,
"different types of data in same bucket: %s, %s",
- bch2_data_types[g->data_type],
- bch2_data_types[data_type])) {
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type))) {
ret = -EIO;
goto err;
}
@@ -532,7 +532,7 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
ca->dev_idx, b, g->gen,
- bch2_data_types[g->data_type ?: data_type],
+ bch2_data_type_str(g->data_type ?: data_type),
g->dirty_sectors, sectors)) {
ret = -EIO;
goto err;
@@ -575,7 +575,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EIO;
@@ -588,7 +588,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -603,7 +603,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
"while marking %s",
ptr->dev, bucket_nr, b_gen,
*bucket_gen(ca, bucket_nr),
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -624,8 +624,8 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type],
- bch2_data_types[ptr_data_type],
+ bch2_data_type_str(bucket_data_type),
+ bch2_data_type_str(ptr_data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EIO;
@@ -638,7 +638,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
bucket_sectors, sectors,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -677,11 +677,11 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans,
BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
}
- dst->nr_inodes -= deltas->nr_inodes;
+ dst->b.nr_inodes -= deltas->nr_inodes;
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
added -= deltas->persistent_reserved[i];
- dst->reserved -= deltas->persistent_reserved[i];
+ dst->b.reserved -= deltas->persistent_reserved[i];
dst->persistent_reserved[i] -= deltas->persistent_reserved[i];
}
@@ -694,48 +694,25 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans,
percpu_up_read(&c->mark_lock);
}
-int bch2_trans_fs_usage_apply(struct btree_trans *trans,
- struct replicas_delta_list *deltas)
+void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
+ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
static int warned_disk_usage = 0;
bool warn = false;
- u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
- struct replicas_delta *d, *d2;
- struct replicas_delta *top = (void *) deltas->d + deltas->used;
- struct bch_fs_usage *dst;
- s64 added = 0, should_not_have_added;
- unsigned i;
percpu_down_read(&c->mark_lock);
preempt_disable();
- dst = fs_usage_ptr(c, trans->journal_res.seq, false);
-
- for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
- switch (d->r.data_type) {
- case BCH_DATA_btree:
- case BCH_DATA_user:
- case BCH_DATA_parity:
- added += d->delta;
- }
+ struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b;
+ struct bch_fs_usage_base *src = &trans->fs_usage_delta;
- if (__update_replicas(c, dst, &d->r, d->delta))
- goto need_mark;
- }
-
- dst->nr_inodes += deltas->nr_inodes;
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
- added += deltas->persistent_reserved[i];
- dst->reserved += deltas->persistent_reserved[i];
- dst->persistent_reserved[i] += deltas->persistent_reserved[i];
- }
+ s64 added = src->btree + src->data + src->reserved;
/*
* Not allowed to reduce sectors_available except by getting a
* reservation:
*/
- should_not_have_added = added - (s64) disk_res_sectors;
+ s64 should_not_have_added = added - (s64) disk_res_sectors;
if (unlikely(should_not_have_added > 0)) {
u64 old, new, v = atomic64_read(&c->sectors_available);
@@ -754,6 +731,13 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
this_cpu_sub(*c->online_reserved, added);
}
+ dst->hidden += src->hidden;
+ dst->btree += src->btree;
+ dst->data += src->data;
+ dst->cached += src->cached;
+ dst->reserved += src->reserved;
+ dst->nr_inodes += src->nr_inodes;
+
preempt_enable();
percpu_up_read(&c->mark_lock);
@@ -761,6 +745,34 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
bch2_trans_inconsistent(trans,
"disk usage increased %lli more than %llu sectors reserved)",
should_not_have_added, disk_res_sectors);
+}
+
+int bch2_trans_fs_usage_apply(struct btree_trans *trans,
+ struct replicas_delta_list *deltas)
+{
+ struct bch_fs *c = trans->c;
+ struct replicas_delta *d, *d2;
+ struct replicas_delta *top = (void *) deltas->d + deltas->used;
+ struct bch_fs_usage *dst;
+ unsigned i;
+
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+ dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+ for (d = deltas->d; d != top; d = replicas_delta_next(d))
+ if (__update_replicas(c, dst, &d->r, d->delta))
+ goto need_mark;
+
+ dst->b.nr_inodes += deltas->nr_inodes;
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ dst->b.reserved += deltas->persistent_reserved[i];
+ dst->persistent_reserved[i] += deltas->persistent_reserved[i];
+ }
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
return 0;
need_mark:
/* revert changes: */
@@ -1084,7 +1096,7 @@ static int __trigger_reservation(struct btree_trans *trans,
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc);
replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved));
- fs_usage->reserved += sectors;
+ fs_usage->b.reserved += sectors;
fs_usage->persistent_reserved[replicas - 1] += sectors;
preempt_enable();
@@ -1130,9 +1142,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_types[a->v.data_type],
- bch2_data_types[type],
- bch2_data_types[type]);
+ bch2_data_type_str(a->v.data_type),
+ bch2_data_type_str(type),
+ bch2_data_type_str(type));
ret = -EIO;
goto err;
}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 2c95cc5d86be..6387e039f789 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -356,6 +356,8 @@ int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
ret; \
})
+void bch2_trans_account_disk_usage_change(struct btree_trans *);
+
void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
@@ -385,6 +387,21 @@ static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
return false;
}
+static inline const char *bch2_data_type_str(enum bch_data_type type)
+{
+ return type < BCH_DATA_NR
+ ? __bch2_data_types[type]
+ : "(invalid data type)";
+}
+
+static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type)
+{
+ if (type < BCH_DATA_NR)
+ prt_str(out, __bch2_data_types[type]);
+ else
+ prt_printf(out, "(invalid data type %u)", type);
+}
+
/* disk reservations: */
static inline void bch2_disk_reservation_put(struct bch_fs *c,
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 783f71017204..6a31740222a7 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -45,23 +45,18 @@ struct bch_dev_usage {
} d[BCH_DATA_NR];
};
-struct bch_fs_usage {
- /* all fields are in units of 512 byte sectors: */
+struct bch_fs_usage_base {
u64 hidden;
u64 btree;
u64 data;
u64 cached;
u64 reserved;
u64 nr_inodes;
+};
- /* XXX: add stats for compression ratio */
-#if 0
- u64 uncompressed;
- u64 compressed;
-#endif
-
- /* broken out: */
-
+struct bch_fs_usage {
+ /* all fields are in units of 512 byte sectors: */
+ struct bch_fs_usage_base b;
u64 persistent_reserved[BCH_REPLICAS_MAX];
u64 replicas[];
};
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index f41889093a2c..363644451106 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -109,7 +109,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
- while (1) {
+ do {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread && kthread_should_stop())
break;
@@ -119,7 +119,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
schedule();
try_to_freeze();
- }
+ } while (0);
__set_current_state(TASK_RUNNING);
del_timer_sync(&wait.cpu_timer);
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
index 607fd5e232c9..58c2eb45570f 100644
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
@@ -47,6 +47,14 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
}
+static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type)
+{
+ if (type < BCH_COMPRESSION_TYPE_NR)
+ prt_str(out, __bch2_compression_types[type]);
+ else
+ prt_printf(out, "(invalid compression type %u)", type);
+}
+
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 6f13477ff652..4150feca42a2 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -285,9 +285,7 @@ restart_drop_extra_replicas:
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p) ?:
- bch2_bkey_set_needs_rebalance(c, insert,
- op->opts.background_target,
- op->opts.background_compression) ?:
+ bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?:
bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
@@ -529,7 +527,7 @@ int bch2_data_update_init(struct btree_trans *trans,
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_MOVE|
m->data_opts.write_flags;
- m->op.compression_opt = io_opts.background_compression ?: io_opts.compression;
+ m->op.compression_opt = background_compression(io_opts);
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
bkey_for_each_ptr(ptrs, ptr)
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index d6418948495f..7bdba8507fc9 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -44,19 +44,19 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
return false;
bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_sorted, btree_bytes(c)),
+ buf_pages(n_sorted, btree_buf_bytes(b)),
REQ_OP_READ|REQ_META,
GFP_NOFS,
&c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_sorted, btree_bytes(c));
+ bch2_bio_map(bio, n_sorted, btree_buf_bytes(b));
submit_bio_wait(bio);
bio_put(bio);
percpu_ref_put(&ca->io_ref);
- memcpy(n_ondisk, n_sorted, btree_bytes(c));
+ memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
v->written = 0;
if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
@@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
mutex_lock(&c->verify_lock);
if (!c->verify_ondisk) {
- c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+ c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!c->verify_ondisk)
goto out;
}
@@ -199,19 +199,19 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
- n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+ n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!n_ondisk) {
prt_printf(out, "memory allocation failure\n");
goto out;
}
bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_ondisk, btree_bytes(c)),
+ buf_pages(n_ondisk, btree_buf_bytes(b)),
REQ_OP_READ|REQ_META,
GFP_NOFS,
&c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_ondisk, btree_bytes(c));
+ bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b));
ret = submit_bio_wait(bio);
if (ret) {
@@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
out:
if (bio)
bio_put(bio);
- kvpfree(n_ondisk, btree_bytes(c));
+ kvpfree(n_ondisk, btree_buf_bytes(b));
percpu_ref_put(&ca->io_ref);
}
@@ -627,7 +627,7 @@ restart:
prt_printf(&i->buf, "backtrace:");
prt_newline(&i->buf);
printbuf_indent_add(&i->buf, 2);
- bch2_prt_task_backtrace(&i->buf, task, 0);
+ bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL);
printbuf_indent_sub(&i->buf, 2);
prt_newline(&i->buf);
diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h
new file mode 100644
index 000000000000..5e116b88e814
--- /dev/null
+++ b/fs/bcachefs/dirent_format.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DIRENT_FORMAT_H
+#define _BCACHEFS_DIRENT_FORMAT_H
+
+/*
+ * Dirents (and xattrs) have to implement string lookups; since our b-tree
+ * doesn't support arbitrary length strings for the key, we instead index by a
+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset
+ * field of the key - using linear probing to resolve hash collisions. This also
+ * provides us with the readdir cookie posix requires.
+ *
+ * Linear probing requires us to use whiteouts for deletions, in the event of a
+ * collision:
+ */
+
+struct bch_dirent {
+ struct bch_val v;
+
+ /* Target inode number: */
+ union {
+ __le64 d_inum;
+ struct { /* DT_SUBVOL */
+ __le32 d_child_subvol;
+ __le32 d_parent_subvol;
+ };
+ };
+
+ /*
+ * Copy of mode bits 12-15 from the target inode - so userspace can get
+ * the filetype without having to do a stat()
+ */
+ __u8 d_type;
+
+ __u8 d_name[];
+} __packed __aligned(8);
+
+#define DT_SUBVOL 16
+#define BCH_DT_MAX 17
+
+#define BCH_NAME_MAX 512
+
+#endif /* _BCACHEFS_DIRENT_FORMAT_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index d802bc63c8d0..d503af270024 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -190,7 +190,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
a->v.stripe_redundancy, trans,
"bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_types[a->v.data_type],
+ bch2_data_type_str(a->v.data_type),
a->v.dirty_sectors,
a->v.stripe, s.k->p.offset)) {
ret = -EIO;
@@ -200,7 +200,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
"bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_types[a->v.data_type],
+ bch2_data_type_str(a->v.data_type),
a->v.dirty_sectors,
s.k->p.offset)) {
ret = -EIO;
@@ -367,7 +367,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
}
}
- if (!(flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC))) {
+ if (flags & BTREE_TRIGGER_ATOMIC) {
struct stripe *m = genradix_ptr(&c->stripes, idx);
if (!m) {
diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h
new file mode 100644
index 000000000000..44ce88ba08d7
--- /dev/null
+++ b/fs/bcachefs/ec_format.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_FORMAT_H
+#define _BCACHEFS_EC_FORMAT_H
+
+struct bch_stripe {
+ struct bch_val v;
+ __le16 sectors;
+ __u8 algorithm;
+ __u8 nr_blocks;
+ __u8 nr_redundant;
+
+ __u8 csum_granularity_bits;
+ __u8 csum_type;
+ __u8 pad;
+
+ struct bch_extent_ptr ptrs[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_EC_FORMAT_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 82ec056f4cdb..61395b113df9 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -8,6 +8,7 @@
#include "bcachefs.h"
#include "bkey_methods.h"
+#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
@@ -1018,12 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ",
crc.compressed_size,
crc.uncompressed_size,
crc.offset, crc.nonce,
- bch2_csum_types[crc.csum_type],
- bch2_compression_types[crc.compression_type]);
+ bch2_csum_types[crc.csum_type]);
+ bch2_prt_compression_type(out, crc.compression_type);
break;
}
case BCH_EXTENT_ENTRY_stripe_ptr: {
@@ -1334,10 +1335,12 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
}
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
- unsigned target, unsigned compression)
+ struct bch_io_opts *opts)
{
struct bkey_s k = bkey_i_to_s(_k);
struct bch_extent_rebalance *r;
+ unsigned target = opts->background_target;
+ unsigned compression = background_compression(*opts);
bool needs_rebalance;
if (!bkey_extent_is_direct_data(k.k))
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index a855c94d43dd..6bf839d69e84 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -708,7 +708,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
- unsigned, unsigned);
+ struct bch_io_opts *);
/* Generic extent code: */
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
new file mode 100644
index 000000000000..3bd2fdbb0817
--- /dev/null
+++ b/fs/bcachefs/extents_format.h
@@ -0,0 +1,295 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_FORMAT_H
+#define _BCACHEFS_EXTENTS_FORMAT_H
+
+/*
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the data that is currently
+ * live. The size field in struct bkey records the current (live) size of the
+ * extent, and is also used to mean "size of region on disk that we point to" in
+ * this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8 (except we're encoding a
+ * type, not a size), encoding the type in the position of the first set bit:
+ *
+ * bch_extent_crc32 - 0b1
+ * bch_extent_ptr - 0b10
+ * bch_extent_crc64 - 0b100
+ *
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
+ * bch_extent_crc64 is the least constrained).
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
+ */
+
+#define BCH_EXTENT_ENTRY_TYPES() \
+ x(ptr, 0) \
+ x(crc32, 1) \
+ x(crc64, 2) \
+ x(crc128, 3) \
+ x(stripe_ptr, 4) \
+ x(rebalance, 5)
+#define BCH_EXTENT_ENTRY_MAX 6
+
+enum bch_extent_entry_type {
+#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+/* Compressed/uncompressed size are stored biased by 1: */
+struct bch_extent_crc32 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u32 type:2,
+ _compressed_size:7,
+ _uncompressed_size:7,
+ offset:7,
+ _unused:1,
+ csum_type:4,
+ compression_type:4;
+ __u32 csum;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u32 csum;
+ __u32 compression_type:4,
+ csum_type:4,
+ _unused:1,
+ offset:7,
+ _uncompressed_size:7,
+ _compressed_size:7,
+ type:2;
+#endif
+} __packed __aligned(8);
+
+#define CRC32_SIZE_MAX (1U << 7)
+#define CRC32_NONCE_MAX 0
+
+struct bch_extent_crc64 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:3,
+ _compressed_size:9,
+ _uncompressed_size:9,
+ offset:9,
+ nonce:10,
+ csum_type:4,
+ compression_type:4,
+ csum_hi:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 csum_hi:16,
+ compression_type:4,
+ csum_type:4,
+ nonce:10,
+ offset:9,
+ _uncompressed_size:9,
+ _compressed_size:9,
+ type:3;
+#endif
+ __u64 csum_lo;
+} __packed __aligned(8);
+
+#define CRC64_SIZE_MAX (1U << 9)
+#define CRC64_NONCE_MAX ((1U << 10) - 1)
+
+struct bch_extent_crc128 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:4,
+ _compressed_size:13,
+ _uncompressed_size:13,
+ offset:13,
+ nonce:13,
+ csum_type:4,
+ compression_type:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 compression_type:4,
+ csum_type:4,
+ nonce:13,
+ offset:13,
+ _uncompressed_size:13,
+ _compressed_size:13,
+ type:4;
+#endif
+ struct bch_csum csum;
+} __packed __aligned(8);
+
+#define CRC128_SIZE_MAX (1U << 13)
+#define CRC128_NONCE_MAX ((1U << 13) - 1)
+
+/*
+ * @reservation - pointer hasn't been written to, just reserved
+ */
+struct bch_extent_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:1,
+ cached:1,
+ unused:1,
+ unwritten:1,
+ offset:44, /* 8 petabytes */
+ dev:8,
+ gen:8;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 gen:8,
+ dev:8,
+ offset:44,
+ unwritten:1,
+ unused:1,
+ cached:1,
+ type:1;
+#endif
+} __packed __aligned(8);
+
+struct bch_extent_stripe_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:5,
+ block:8,
+ redundancy:4,
+ idx:47;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 idx:47,
+ redundancy:4,
+ block:8,
+ type:5;
+#endif
+};
+
+struct bch_extent_rebalance {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:6,
+ unused:34,
+ compression:8, /* enum bch_compression_opt */
+ target:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 target:16,
+ compression:8,
+ unused:34,
+ type:6;
+#endif
+};
+
+union bch_extent_entry {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
+ unsigned long type;
+#elif __BITS_PER_LONG == 32
+ struct {
+ unsigned long pad;
+ unsigned long type;
+ };
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define x(f, n) struct bch_extent_##f f;
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+struct bch_btree_ptr {
+ struct bch_val v;
+
+ __u64 _data[0];
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
+
+struct bch_btree_ptr_v2 {
+ struct bch_val v;
+
+ __u64 mem_ptr;
+ __le64 seq;
+ __le16 sectors_written;
+ __le16 flags;
+ struct bpos min_key;
+ __u64 _data[0];
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
+
+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
+
+struct bch_extent {
+ struct bch_val v;
+
+ __u64 _data[0];
+ union bch_extent_entry start[];
+} __packed __aligned(8);
+
+/* Maximum size (in u64s) a single pointer could be: */
+#define BKEY_EXTENT_PTR_U64s_MAX\
+ ((sizeof(struct bch_extent_crc128) + \
+ sizeof(struct bch_extent_ptr)) / sizeof(__u64))
+
+/* Maximum possible size of an entire extent value: */
+#define BKEY_EXTENT_VAL_U64s_MAX \
+ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
+
+/* * Maximum possible size of an entire extent, key + value: */
+#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
+
+/* Btree pointers don't carry around checksums: */
+#define BKEY_BTREE_PTR_VAL_U64s_MAX \
+ ((sizeof(struct bch_btree_ptr_v2) + \
+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
+#define BKEY_BTREE_PTR_U64s_MAX \
+ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
+
+struct bch_reservation {
+ struct bch_val v;
+
+ __le32 generation;
+ __u8 nr_replicas;
+ __u8 pad[3];
+} __packed __aligned(8);
+
+struct bch_inline_data {
+ struct bch_val v;
+ u8 data[];
+};
+
+#endif /* _BCACHEFS_EXTENTS_FORMAT_H */
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index 9637f636e32d..b04750dbf870 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -156,7 +156,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
}
#define eytzinger1_for_each(_i, _size) \
- for ((_i) = eytzinger1_first((_size)); \
+ for (unsigned (_i) = eytzinger1_first((_size)); \
(_i) != 0; \
(_i) = eytzinger1_next((_i), (_size)))
@@ -227,7 +227,7 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
}
#define eytzinger0_for_each(_i, _size) \
- for ((_i) = eytzinger0_first((_size)); \
+ for (unsigned (_i) = eytzinger0_first((_size)); \
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 73c12e565af5..27710cdd5710 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -303,18 +303,6 @@ void bch2_readahead(struct readahead_control *ractl)
darray_exit(&readpages_iter.folios);
}
-static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
- subvol_inum inum, struct folio *folio)
-{
- bch2_folio_create(folio, __GFP_NOFAIL);
-
- rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
- rbio->bio.bi_iter.bi_sector = folio_sector(folio);
- BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
- bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0));
-}
-
static void bch2_read_single_folio_end_io(struct bio *bio)
{
complete(bio->bi_private);
@@ -329,6 +317,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
int ret;
DECLARE_COMPLETION_ONSTACK(done);
+ if (!bch2_folio_create(folio, GFP_KERNEL))
+ return -ENOMEM;
+
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
@@ -336,7 +327,11 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
rbio->bio.bi_private = &done;
rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
- __bchfs_readfolio(c, rbio, inode_inum(inode), folio);
+ rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
+ rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+ BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+
+ bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
wait_for_completion(&done);
ret = blk_status_to_errno(rbio->bio.bi_status);
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index fdd57c5785c9..33cb6da3a5ad 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -77,6 +77,10 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+ /* bios must be 512 byte aligned: */
+ if ((offset|iter->count) & (SECTOR_SIZE - 1))
+ return -EINVAL;
+
ret = min_t(loff_t, iter->count,
max_t(loff_t, 0, i_size_read(&inode->v) - offset));
@@ -84,6 +88,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
return ret;
shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+ if (shorten >= iter->count)
+ shorten = 0;
iter->count -= shorten;
bio = bio_alloc_bioset(NULL,
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index ff664fd0d8ef..d359aa9b33b8 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -309,39 +309,49 @@ void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
}
}
-void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
- u64 start, u64 end)
+int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
+ u64 *start, u64 end,
+ bool nonblocking)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t index = *start >> PAGE_SECTORS_SHIFT;
pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
struct folio_batch fbatch;
s64 i_sectors_delta = 0;
- unsigned i, j;
+ int ret = 0;
- if (end <= start)
- return;
+ if (end <= *start)
+ return 0;
folio_batch_init(&fbatch);
while (filemap_get_folios(inode->v.i_mapping,
&index, end_index, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
+
+ if (!nonblocking)
+ folio_lock(folio);
+ else if (!folio_trylock(folio)) {
+ folio_batch_release(&fbatch);
+ ret = -EAGAIN;
+ break;
+ }
+
u64 folio_start = folio_sector(folio);
u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(start, folio_start) - folio_start;
- unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
- struct bch_folio *s;
BUG_ON(end <= folio_start);
- folio_lock(folio);
- s = bch2_folio(folio);
+ *start = min(end, folio_end);
+ struct bch_folio *s = bch2_folio(folio);
if (s) {
+ unsigned folio_offset = max(*start, folio_start) - folio_start;
+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+
spin_lock(&s->lock);
- for (j = folio_offset; j < folio_offset + folio_len; j++) {
+ for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
i_sectors_delta -= s->s[j].state == SECTOR_dirty;
bch2_folio_sector_set(folio, s, j,
folio_sector_reserve(s->s[j].state));
@@ -356,6 +366,7 @@ void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
}
bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+ return ret;
}
static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
index 27f712ae37a6..8cbaba6565b4 100644
--- a/fs/bcachefs/fs-io-pagecache.h
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -143,7 +143,7 @@ int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
-void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64);
+int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool);
int bch2_get_folio_disk_reservation(struct bch_fs *,
struct bch_inode_info *,
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 98bd5babab19..8c70123b6a0c 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -79,7 +79,7 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
continue;
bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
- REQ_OP_FLUSH,
+ REQ_OP_WRITE|REQ_PREFLUSH,
GFP_KERNEL,
&c->nocow_flush_bioset),
struct nocow_flush, bio);
@@ -675,8 +675,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
- drop_locks_do(trans,
- (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
+ if (bch2_mark_pagecache_reserved(inode, &hole_start,
+ iter.pos.offset, true))
+ drop_locks_do(trans,
+ bch2_mark_pagecache_reserved(inode, &hole_start,
+ iter.pos.offset, false));
bkey_err:
bch2_quota_reservation_put(c, inode, &quota_res);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 1cbc5807bc80..3dc8630ff9fe 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -337,11 +337,12 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
create_flags |= BCH_CREATE_SNAPSHOT_RO;
- /* why do we need this lock? */
- down_read(&c->vfs_sb->s_umount);
-
- if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) {
+ /* sync_inodes_sb enforce s_umount is locked */
+ down_read(&c->vfs_sb->s_umount);
sync_inodes_sb(c->vfs_sb);
+ up_read(&c->vfs_sb->s_umount);
+ }
retry:
if (arg.src_ptr) {
error = user_path_at(arg.dirfd,
@@ -425,8 +426,6 @@ err2:
goto retry;
}
err1:
- up_read(&c->vfs_sb->s_umount);
-
return error;
}
@@ -456,6 +455,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
if (IS_ERR(victim))
return PTR_ERR(victim);
+ dir = d_inode(path.dentry);
if (victim->d_sb->s_fs_info != c) {
ret = -EXDEV;
goto err;
@@ -464,14 +464,13 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
ret = -ENOENT;
goto err;
}
- dir = d_inode(path.dentry);
ret = __bch2_unlink(dir, victim, true);
if (!ret) {
fsnotify_rmdir(dir, victim);
d_delete(victim);
}
- inode_unlock(dir);
err:
+ inode_unlock(dir);
dput(victim);
path_put(&path);
return ret;
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index ec419b8e2c43..77ae65542db9 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -435,7 +435,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
bch2_subvol_is_ro(c, inode->ei_subvol) ?:
__bch2_link(c, inode, dir, dentry);
if (unlikely(ret))
- return ret;
+ return bch2_err_class(ret);
ihold(&inode->v);
d_instantiate(dentry, &inode->v);
@@ -487,8 +487,9 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
struct bch_inode_info *dir= to_bch_ei(vdir);
struct bch_fs *c = dir->v.i_sb->s_fs_info;
- return bch2_subvol_is_ro(c, dir->ei_subvol) ?:
+ int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
__bch2_unlink(vdir, dentry, false);
+ return bch2_err_class(ret);
}
static int bch2_symlink(struct mnt_idmap *idmap,
@@ -523,7 +524,7 @@ static int bch2_symlink(struct mnt_idmap *idmap,
return 0;
err:
iput(&inode->v);
- return ret;
+ return bch2_err_class(ret);
}
static int bch2_mkdir(struct mnt_idmap *idmap,
@@ -641,7 +642,7 @@ err:
src_inode,
dst_inode);
- return ret;
+ return bch2_err_class(ret);
}
static void bch2_setattr_copy(struct mnt_idmap *idmap,
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 4f0ecd605675..6a760777bafb 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -119,22 +119,19 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
if (!ret)
*snapshot = iter.pos.snapshot;
err:
- bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
-static int __lookup_dirent(struct btree_trans *trans,
+static int lookup_dirent_in_snapshot(struct btree_trans *trans,
struct bch_hash_info hash_info,
subvol_inum dir, struct qstr *name,
- u64 *target, unsigned *type)
+ u64 *target, unsigned *type, u32 snapshot)
{
struct btree_iter iter;
struct bkey_s_c_dirent d;
- int ret;
-
- ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
- &hash_info, dir, name, 0);
+ int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
+ &hash_info, dir, name, 0, snapshot);
if (ret)
return ret;
@@ -225,15 +222,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
struct bch_inode_unpacked root_inode;
struct bch_hash_info root_hash_info;
- ret = lookup_inode(trans, root_inum.inum, &root_inode, &snapshot);
+ u32 root_inode_snapshot = snapshot;
+ ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
bch_err_msg(c, ret, "looking up root inode");
if (ret)
return ret;
root_hash_info = bch2_hash_info_init(c, &root_inode);
- ret = __lookup_dirent(trans, root_hash_info, root_inum,
- &lostfound_str, &inum, &d_type);
+ ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
+ &lostfound_str, &inum, &d_type, snapshot);
if (bch2_err_matches(ret, ENOENT))
goto create_lostfound;
@@ -250,7 +248,10 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
* The bch2_check_dirents pass has already run, dangling dirents
* shouldn't exist here:
*/
- return lookup_inode(trans, inum, lostfound, &snapshot);
+ ret = lookup_inode(trans, inum, lostfound, &snapshot);
+ bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
+ inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
+ return ret;
create_lostfound:
/*
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 37dce96f48ac..086f0090b03a 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -506,22 +506,33 @@ fsck_err:
static void __bch2_inode_unpacked_to_text(struct printbuf *out,
struct bch_inode_unpacked *inode)
{
- prt_printf(out, "mode=%o ", inode->bi_mode);
+ printbuf_indent_add(out, 2);
+ prt_printf(out, "mode=%o", inode->bi_mode);
+ prt_newline(out);
prt_str(out, "flags=");
prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
prt_printf(out, " (%x)", inode->bi_flags);
+ prt_newline(out);
- prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu",
- inode->bi_journal_seq,
- inode->bi_size,
- inode->bi_sectors,
- inode->bi_version);
+ prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq);
+ prt_newline(out);
+
+ prt_printf(out, "bi_size=%llu", inode->bi_size);
+ prt_newline(out);
+
+ prt_printf(out, "bi_sectors=%llu", inode->bi_sectors);
+ prt_newline(out);
+
+ prt_newline(out);
+ prt_printf(out, "bi_version=%llu", inode->bi_version);
#define x(_name, _bits) \
- prt_printf(out, " "#_name "=%llu", (u64) inode->_name);
+ prt_printf(out, #_name "=%llu", (u64) inode->_name); \
+ prt_newline(out);
BCH_INODE_FIELDS_v3()
#undef x
+ printbuf_indent_sub(out, 2);
}
void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
@@ -587,7 +598,7 @@ int bch2_trigger_inode(struct btree_trans *trans,
}
}
- if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) {
+ if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
BUG_ON(!trans->journal_res.seq);
bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
@@ -597,7 +608,7 @@ int bch2_trigger_inode(struct btree_trans *trans,
struct bch_fs *c = trans->c;
percpu_down_read(&c->mark_lock);
- this_cpu_add(c->usage_gc->nr_inodes, nr);
+ this_cpu_add(c->usage_gc->b.nr_inodes, nr);
percpu_up_read(&c->mark_lock);
}
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
new file mode 100644
index 000000000000..83d107331edf
--- /dev/null
+++ b/fs/bcachefs/inode_format.h
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_INODE_FORMAT_H
+#define _BCACHEFS_INODE_FORMAT_H
+
+#define BLOCKDEV_INODE_MAX 4096
+#define BCACHEFS_ROOT_INO 4096
+
+struct bch_inode {
+ struct bch_val v;
+
+ __le64 bi_hash_seed;
+ __le32 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v2 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v3 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le64 bi_sectors;
+ __le64 bi_size;
+ __le64 bi_version;
+ __u8 fields[];
+} __packed __aligned(8);
+
+#define INODEv3_FIELDS_START_INITIAL 6
+#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
+
+struct bch_inode_generation {
+ struct bch_val v;
+
+ __le32 bi_generation;
+ __le32 pad;
+} __packed __aligned(8);
+
+/*
+ * bi_subvol and bi_parent_subvol are only set for subvolume roots:
+ */
+
+#define BCH_INODE_FIELDS_v2() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_size, 64) \
+ x(bi_sectors, 64) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32)
+
+#define BCH_INODE_FIELDS_v3() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32) \
+ x(bi_nocow, 8)
+
+/* subset of BCH_INODE_FIELDS */
+#define BCH_INODE_OPTS() \
+ x(data_checksum, 8) \
+ x(compression, 8) \
+ x(project, 32) \
+ x(background_compression, 8) \
+ x(data_replicas, 8) \
+ x(promote_target, 16) \
+ x(foreground_target, 16) \
+ x(background_target, 16) \
+ x(erasure_code, 16) \
+ x(nocow, 8)
+
+enum inode_opt_id {
+#define x(name, ...) \
+ Inode_opt_##name,
+ BCH_INODE_OPTS()
+#undef x
+ Inode_opt_nr,
+};
+
+#define BCH_INODE_FLAGS() \
+ x(sync, 0) \
+ x(immutable, 1) \
+ x(append, 2) \
+ x(nodump, 3) \
+ x(noatime, 4) \
+ x(i_size_dirty, 5) \
+ x(i_sectors_dirty, 6) \
+ x(unlinked, 7) \
+ x(backptr_untrusted, 8)
+
+/* bits 20+ reserved for packed fields below: */
+
+enum bch_inode_flags {
+#define x(t, n) BCH_INODE_##t = 1U << n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+enum __bch_inode_flags {
+#define x(t, n) __BCH_INODE_##t = n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
+
+LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
+LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
+LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_FIELDS_START,
+ struct bch_inode_v3, bi_flags, 31, 36);
+LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
+
+#endif /* _BCACHEFS_INODE_FORMAT_H */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index ca6d5f516aa2..1baf78594cca 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -442,9 +442,7 @@ case LOGGED_OP_FINSERT_shift_extents:
op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
- ret = bch2_bkey_set_needs_rebalance(c, copy,
- opts.background_target,
- opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 33c0e783d546..2c098ac017b3 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -362,9 +362,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
bkey_start_pos(&sk.k->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- ret = bch2_bkey_set_needs_rebalance(c, sk.k,
- op->opts.background_target,
- op->opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
@@ -1447,10 +1445,11 @@ err:
op->flags |= BCH_WRITE_DONE;
if (ret < 0) {
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode,
- op->pos.offset << 9,
- "%s(): error: %s", __func__, bch2_err_str(ret));
+ if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT))
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ op->pos.offset << 9,
+ "%s(): error: %s", __func__, bch2_err_str(ret));
op->error = ret;
break;
}
@@ -1565,6 +1564,7 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));
+ op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
bch2_keylist_init(&op->insert_keys, op->inline_keys);
wbio_init(bio)->put_bio = false;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 8538ef34f62b..bc890776eb57 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -27,6 +27,47 @@ static const char * const bch2_journal_errors[] = {
NULL
};
+static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
+{
+ union journal_res_state s = READ_ONCE(j->reservations);
+ unsigned i = seq & JOURNAL_BUF_MASK;
+ struct journal_buf *buf = j->buf + i;
+
+ prt_printf(out, "seq:");
+ prt_tab(out);
+ prt_printf(out, "%llu", seq);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "refcount:");
+ prt_tab(out);
+ prt_printf(out, "%u", journal_state_count(s, i));
+ prt_newline(out);
+
+ prt_printf(out, "size:");
+ prt_tab(out);
+ prt_human_readable_u64(out, vstruct_bytes(buf->data));
+ prt_newline(out);
+
+ prt_printf(out, "expires");
+ prt_tab(out);
+ prt_printf(out, "%li jiffies", buf->expires - jiffies);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
+{
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 24);
+
+ for (u64 seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++)
+ bch2_journal_buf_to_text(out, j, seq);
+}
+
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
return seq > j->seq_ondisk;
@@ -156,7 +197,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
* We don't close a journal_buf until the next journal_buf is finished writing,
* and can be opened again - this also initializes the next journal_buf:
*/
-static void __journal_entry_close(struct journal *j, unsigned closed_val)
+static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
@@ -185,7 +226,17 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
- trace_journal_entry_close(c, vstruct_bytes(buf->data));
+ if (trace_journal_entry_close_enabled() && trace) {
+ struct printbuf pbuf = PRINTBUF;
+ pbuf.atomic++;
+
+ prt_str(&pbuf, "entry size: ");
+ prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data));
+ prt_newline(&pbuf);
+ bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT);
+ trace_journal_entry_close(c, pbuf.buf);
+ printbuf_exit(&pbuf);
+ }
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
buf->u64s_reserved) << c->block_bits;
@@ -225,7 +276,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
void bch2_journal_halt(struct journal *j)
{
spin_lock(&j->lock);
- __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
if (!j->err_seq)
j->err_seq = journal_cur_seq(j);
journal_wake(j);
@@ -239,7 +290,7 @@ static bool journal_entry_want_write(struct journal *j)
/* Don't close it yet if we already have a write in flight: */
if (ret)
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
else if (nr_unwritten_journal_entries(j)) {
struct journal_buf *buf = journal_cur_buf(j);
@@ -406,7 +457,7 @@ static void journal_write_work(struct work_struct *work)
if (delta > 0)
mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
else
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
unlock:
spin_unlock(&j->lock);
}
@@ -463,13 +514,21 @@ retry:
buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
ret = journal_entry_open(j);
if (ret == JOURNAL_ERR_max_in_flight) {
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
&j->max_in_flight_start, true);
- trace_and_count(c, journal_entry_full, c);
+ if (trace_journal_entry_full_enabled()) {
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
+ bch2_journal_bufs_to_text(&buf, j);
+ trace_journal_entry_full(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+ count_event(c, journal_entry_full);
}
unlock:
can_discard = j->can_discard;
@@ -549,7 +608,7 @@ void bch2_journal_entry_res_resize(struct journal *j,
/*
* Not enough room in current journal entry, have to flush it:
*/
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
} else {
journal_cur_buf(j)->u64s_reserved += d;
}
@@ -606,7 +665,7 @@ recheck_need_open:
struct journal_res res = { 0 };
if (journal_entry_is_open(j))
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
spin_unlock(&j->lock);
@@ -786,7 +845,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
if (buf->need_flush_to_write_buffer) {
if (seq == journal_cur_seq(j))
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
union journal_res_state s;
s.v = atomic64_read_acquire(&j->reservations.counter);
@@ -1339,35 +1398,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
}
prt_newline(out);
-
- for (u64 seq = journal_cur_seq(j);
- seq >= journal_last_unwritten_seq(j);
- --seq) {
- unsigned i = seq & JOURNAL_BUF_MASK;
-
- prt_printf(out, "unwritten entry:");
- prt_tab(out);
- prt_printf(out, "%llu", seq);
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "refcount:");
- prt_tab(out);
- prt_printf(out, "%u", journal_state_count(s, i));
- prt_newline(out);
-
- prt_printf(out, "sectors:");
- prt_tab(out);
- prt_printf(out, "%u", j->buf[i].sectors);
- prt_newline(out);
-
- prt_printf(out, "expires");
- prt_tab(out);
- prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies);
- prt_newline(out);
-
- printbuf_indent_sub(out, 2);
- }
+ prt_printf(out, "unwritten entries:");
+ prt_newline(out);
+ bch2_journal_bufs_to_text(out, j);
prt_printf(out,
"replay done:\t\t%i\n",
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index b0f4dd491e12..47805193f18c 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -683,10 +683,7 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
for (i = 0; i < nr_types; i++) {
- if (i < BCH_DATA_NR)
- prt_printf(out, " %s", bch2_data_types[i]);
- else
- prt_printf(out, " (unknown data type %u)", i);
+ bch2_prt_data_type(out, i);
prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
le64_to_cpu(u->d[i].buckets),
le64_to_cpu(u->d[i].sectors),
@@ -1481,6 +1478,8 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w)
c->opts.foreground_target;
unsigned i, replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
+ unsigned replicas_need = min_t(unsigned, replicas_want,
+ READ_ONCE(c->opts.metadata_replicas_required));
rcu_read_lock();
retry:
@@ -1529,7 +1528,7 @@ done:
BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
- return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
+ return replicas >= replicas_need ? 0 : -EROFS;
}
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
@@ -1991,7 +1990,8 @@ CLOSURE_CALLBACK(bch2_journal_write)
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+ bio_reset(bio, ca->disk_sb.bdev,
+ REQ_OP_WRITE|REQ_PREFLUSH);
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 820d25e19e5f..c33dca641575 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -205,7 +205,7 @@ void bch2_journal_space_available(struct journal *j)
j->can_discard = can_discard;
- if (nr_online < c->opts.metadata_replicas_required) {
+ if (nr_online < metadata_replicas_required(c)) {
ret = JOURNAL_ERR_insufficient_devices;
goto out;
}
@@ -892,9 +892,11 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
journal_seq_pin(j, seq)->devs);
seq++;
- spin_unlock(&j->lock);
- ret = bch2_mark_replicas(c, &replicas.e);
- spin_lock(&j->lock);
+ if (replicas.e.nr_devs) {
+ spin_unlock(&j->lock);
+ ret = bch2_mark_replicas(c, &replicas.e);
+ spin_lock(&j->lock);
+ }
}
spin_unlock(&j->lock);
err:
diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h
new file mode 100644
index 000000000000..6a4bf7129dba
--- /dev/null
+++ b/fs/bcachefs/logged_ops_format.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
+#define _BCACHEFS_LOGGED_OPS_FORMAT_H
+
+struct bch_logged_op_truncate {
+ struct bch_val v;
+ __le32 subvol;
+ __le32 pad;
+ __le64 inum;
+ __le64 new_i_size;
+};
+
+enum logged_op_finsert_state {
+ LOGGED_OP_FINSERT_start,
+ LOGGED_OP_FINSERT_shift_extents,
+ LOGGED_OP_FINSERT_finish,
+};
+
+struct bch_logged_op_finsert {
+ struct bch_val v;
+ __u8 state;
+ __u8 pad[3];
+ __le32 subvol;
+ __le64 inum;
+ __le64 dst_offset;
+ __le64 src_offset;
+ __le64 pos;
+};
+
+#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
index b2be565bb8f2..64df11ab422b 100644
--- a/fs/bcachefs/mean_and_variance.h
+++ b/fs/bcachefs/mean_and_variance.h
@@ -17,7 +17,7 @@
* Rust and rustc has issues with u128.
*/
-#if defined(__SIZEOF_INT128__) && defined(__KERNEL__)
+#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC)
typedef struct {
unsigned __int128 v;
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 7a33319dcd16..bf68ea49447b 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -6,9 +6,11 @@
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
+#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
+#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
@@ -34,12 +36,46 @@ const char * const bch2_data_ops_strs[] = {
NULL
};
-static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
+static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ printbuf_tabstop_push(out, 20);
+ prt_str(out, "rewrite ptrs:");
+ prt_tab(out);
+ bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "kill ptrs: ");
+ prt_tab(out);
+ bch2_prt_u64_base2(out, data_opts->kill_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "target: ");
+ prt_tab(out);
+ bch2_target_to_text(out, c, data_opts->target);
+ prt_newline(out);
+
+ prt_str(out, "compression: ");
+ prt_tab(out);
+ bch2_compression_opt_to_text(out, background_compression(*io_opts));
+ prt_newline(out);
+
+ prt_str(out, "extra replicas: ");
+ prt_tab(out);
+ prt_u64(out, data_opts->extra_replicas);
+}
+
+static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
if (trace_move_extent_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+ bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
trace_move_extent(c, buf.buf);
printbuf_exit(&buf);
}
@@ -111,6 +147,15 @@ static void move_write(struct moving_io *io)
return;
}
+ if (trace_move_extent_write_enabled()) {
+ struct bch_fs *c = io->write.op.c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
+ trace_move_extent_write(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+
closure_get(&io->write.ctxt->cl);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
atomic_inc(&io->write.ctxt->write_ios);
@@ -241,9 +286,10 @@ int bch2_move_extent(struct moving_context *ctxt,
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
+ trace_move_extent2(c, k, &io_opts, &data_opts);
+
if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
- trace_move_extent2(c, k);
bch2_data_update_opts_normalize(k, &data_opts);
@@ -759,6 +805,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
if (!b)
goto next;
+ unsigned sectors = btree_ptr_sectors_written(&b->key);
+
ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
bch2_trans_iter_exit(trans, &iter);
@@ -768,11 +816,10 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
goto err;
if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate,
- c->opts.btree_node_size >> 9);
+ bch2_ratelimit_increment(ctxt->rate, sectors);
if (ctxt->stats) {
- atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
- atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
+ atomic64_add(sectors, &ctxt->stats->sectors_seen);
+ atomic64_add(sectors, &ctxt->stats->sectors_moved);
}
}
next:
@@ -1083,9 +1130,9 @@ int bch2_data_job(struct bch_fs *c,
void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
- prt_printf(out, "%s: data type=%s pos=",
- stats->name,
- bch2_data_types[stats->data_type]);
+ prt_printf(out, "%s: data type==", stats->name);
+ bch2_prt_data_type(out, stats->data_type);
+ prt_str(out, " pos=");
bch2_bbpos_to_text(out, stats->pos);
prt_newline(out);
printbuf_indent_add(out, 2);
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 8e6f230eac38..b1ed0b9a20d3 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -52,7 +52,7 @@ const char * const bch2_csum_opts[] = {
NULL
};
-const char * const bch2_compression_types[] = {
+const char * const __bch2_compression_types[] = {
BCH_COMPRESSION_TYPES()
NULL
};
@@ -72,7 +72,7 @@ const char * const bch2_str_hash_opts[] = {
NULL
};
-const char * const bch2_data_types[] = {
+const char * const __bch2_data_types[] = {
BCH_DATA_TYPES()
NULL
};
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 93a24fef4214..9a4b7faa3765 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -18,11 +18,11 @@ extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
extern const char * const bch2_csum_types[];
extern const char * const bch2_csum_opts[];
-extern const char * const bch2_compression_types[];
+extern const char * const __bch2_compression_types[];
extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
-extern const char * const bch2_data_types[];
+extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[];
extern const char * const bch2_jset_entry_types[];
extern const char * const bch2_fs_usage_types[];
@@ -564,6 +564,11 @@ struct bch_io_opts {
#undef x
};
+static inline unsigned background_compression(struct bch_io_opts opts)
+{
+ return opts.background_compression ?: opts.compression;
+}
+
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
bool bch2_opt_is_inode_opt(enum bch_opt_id);
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
index accf246c3233..b27d22925929 100644
--- a/fs/bcachefs/printbuf.c
+++ b/fs/bcachefs/printbuf.c
@@ -56,6 +56,7 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
va_copy(args2, args);
len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+ va_end(args2);
} while (len + 1 >= printbuf_remaining(out) &&
!bch2_printbuf_make_room(out, len + 1));
diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h
new file mode 100644
index 000000000000..dc34347ef6c7
--- /dev/null
+++ b/fs/bcachefs/quota_format.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_FORMAT_H
+#define _BCACHEFS_QUOTA_FORMAT_H
+
+/* KEY_TYPE_quota: */
+
+enum quota_types {
+ QTYP_USR = 0,
+ QTYP_GRP = 1,
+ QTYP_PRJ = 2,
+ QTYP_NR = 3,
+};
+
+enum quota_counters {
+ Q_SPC = 0,
+ Q_INO = 1,
+ Q_COUNTERS = 2,
+};
+
+struct bch_quota_counter {
+ __le64 hardlimit;
+ __le64 softlimit;
+};
+
+struct bch_quota {
+ struct bch_val v;
+ struct bch_quota_counter c[Q_COUNTERS];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_quota: */
+
+struct bch_sb_quota_counter {
+ __le32 timelimit;
+ __le32 warnlimit;
+};
+
+struct bch_sb_quota_type {
+ __le64 flags;
+ struct bch_sb_quota_counter c[Q_COUNTERS];
+};
+
+struct bch_sb_field_quota {
+ struct bch_sb_field field;
+ struct bch_sb_quota_type q[QTYP_NR];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_QUOTA_FORMAT_H */
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 95f46cb3b5bd..22d1017aa49b 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -177,8 +177,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
prt_str(&buf, "target=");
bch2_target_to_text(&buf, c, r->target);
prt_str(&buf, " compression=");
- struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
- prt_str(&buf, bch2_compression_opts[opt.type]);
+ bch2_compression_opt_to_text(&buf, r->compression);
prt_str(&buf, " ");
bch2_bkey_val_to_text(&buf, c, k);
@@ -254,13 +253,12 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
if (k.k->p.inode) {
target = io_opts->background_target;
- compression = io_opts->background_compression ?: io_opts->compression;
+ compression = background_compression(*io_opts);
} else {
const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
target = r ? r->target : io_opts->background_target;
- compression = r ? r->compression :
- (io_opts->background_compression ?: io_opts->compression);
+ compression = r ? r->compression : background_compression(*io_opts);
}
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
@@ -371,6 +369,7 @@ static int do_rebalance(struct moving_context *ctxt)
!kthread_should_stop() &&
!atomic64_read(&r->work_stats.sectors_seen) &&
!atomic64_read(&r->scan_stats.sectors_seen)) {
+ bch2_moving_ctxt_flush_all(ctxt);
bch2_trans_unlock_long(trans);
rebalance_wait(c);
}
@@ -385,7 +384,6 @@ static int bch2_rebalance_thread(void *arg)
struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
struct moving_context ctxt;
- int ret;
set_freezable();
@@ -393,8 +391,7 @@ static int bch2_rebalance_thread(void *arg)
writepoint_ptr(&c->rebalance_write_point),
true);
- while (!kthread_should_stop() &&
- !(ret = do_rebalance(&ctxt)))
+ while (!kthread_should_stop() && !do_rebalance(&ctxt))
;
bch2_moving_ctxt_exit(&ctxt);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 725214605a05..21e13bb4335b 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -280,7 +280,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(u->v);
break;
case BCH_FS_USAGE_inodes:
- c->usage_base->nr_inodes = le64_to_cpu(u->v);
+ c->usage_base->b.nr_inodes = le64_to_cpu(u->v);
break;
case BCH_FS_USAGE_key_version:
atomic64_set(&c->key_version,
@@ -577,8 +577,9 @@ u64 bch2_recovery_passes_from_stable(u64 v)
static bool check_version_upgrade(struct bch_fs *c)
{
- unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version);
unsigned latest_version = bcachefs_metadata_version_current;
+ unsigned latest_compatible = min(latest_version,
+ bch2_latest_compatible_version(c->sb.version));
unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
unsigned new_version = 0;
@@ -597,7 +598,7 @@ static bool check_version_upgrade(struct bch_fs *c)
new_version = latest_version;
break;
case BCH_VERSION_UPGRADE_none:
- new_version = old_version;
+ new_version = min(old_version, latest_version);
break;
}
}
@@ -774,7 +775,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (!(c->opts.nochanges && c->opts.norecovery)) {
+ if (!c->opts.nochanges) {
mutex_lock(&c->sb_lock);
bool write_sb = false;
@@ -804,7 +805,7 @@ int bch2_fs_recovery(struct bch_fs *c)
if (bch2_check_version_downgrade(c)) {
struct printbuf buf = PRINTBUF;
- prt_str(&buf, "Version downgrade required:\n");
+ prt_str(&buf, "Version downgrade required:");
__le64 passes = ext->recovery_passes_required[0];
bch2_sb_set_downgrade(c,
@@ -812,7 +813,7 @@ int bch2_fs_recovery(struct bch_fs *c)
BCH_VERSION_MINOR(c->sb.version));
passes = ext->recovery_passes_required[0] & ~passes;
if (passes) {
- prt_str(&buf, " running recovery passes: ");
+ prt_str(&buf, "\n running recovery passes: ");
prt_bitflags(&buf, bch2_recovery_passes,
bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
}
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index faa5d3670058..c47c66c2b394 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -292,10 +292,10 @@ static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *f
}
}
-int bch2_trans_mark_reflink_v(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+int bch2_trigger_reflink_v(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
(flags & BTREE_TRIGGER_INSERT))
@@ -324,7 +324,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out,
min(datalen, 32U), d.v->data);
}
-int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
+int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
unsigned flags)
@@ -486,6 +486,13 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+ if (dst_inum.inum < src_inum.inum) {
+ /* Avoid some lock cycle transaction restarts */
+ ret = bch2_btree_iter_traverse(&dst_iter);
+ if (ret)
+ continue;
+ }
+
dst_done = dst_iter.pos.offset - dst_start.offset;
src_want = POS(src_start.inode, src_start.offset + dst_done);
bch2_btree_iter_set_pos(&src_iter, src_want);
@@ -538,9 +545,7 @@ s64 bch2_remap_range(struct bch_fs *c,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
- ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
- opts.background_target,
- opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?:
bch2_extent_update(trans, dst_inum, &dst_iter,
new_dst.k, &disk_res,
new_i_size, i_sectors_delta,
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index 8ee778ec0022..4d8867289717 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -24,14 +24,14 @@ int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
+int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
.key_invalid = bch2_reflink_v_invalid, \
.val_to_text = bch2_reflink_v_to_text, \
.swab = bch2_ptr_swab, \
- .trigger = bch2_trans_mark_reflink_v, \
+ .trigger = bch2_trigger_reflink_v, \
.min_val_size = 8, \
})
@@ -39,7 +39,7 @@ int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
-int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
+int bch2_trigger_indirect_inline_data(struct btree_trans *,
enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
unsigned);
@@ -47,7 +47,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
.key_invalid = bch2_indirect_inline_data_invalid, \
.val_to_text = bch2_indirect_inline_data_to_text, \
- .trigger = bch2_trans_mark_indirect_inline_data, \
+ .trigger = bch2_trigger_indirect_inline_data, \
.min_val_size = 8, \
})
diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h
new file mode 100644
index 000000000000..6772eebb1fc6
--- /dev/null
+++ b/fs/bcachefs/reflink_format.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_FORMAT_H
+#define _BCACHEFS_REFLINK_FORMAT_H
+
+struct bch_reflink_p {
+ struct bch_val v;
+ __le64 idx;
+ /*
+ * A reflink pointer might point to an indirect extent which is then
+ * later split (by copygc or rebalance). If we only pointed to part of
+ * the original indirect extent, and then one of the fragments is
+ * outside the range we point to, we'd leak a refcount: so when creating
+ * reflink pointers, we need to store pad values to remember the full
+ * range we were taking a reference on.
+ */
+ __le32 front_pad;
+ __le32 back_pad;
+} __packed __aligned(8);
+
+struct bch_reflink_v {
+ struct bch_val v;
+ __le64 refcount;
+ union bch_extent_entry start[0];
+ __u64 _data[];
+} __packed __aligned(8);
+
+struct bch_indirect_inline_data {
+ struct bch_val v;
+ __le64 refcount;
+ u8 data[];
+};
+
+#endif /* _BCACHEFS_REFLINK_FORMAT_H */
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 92ba56ef1fc8..cc2672c12031 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -9,6 +9,12 @@
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
+/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
+static int bch2_memcmp(const void *l, const void *r, size_t size)
+{
+ return memcmp(l, r, size);
+}
+
/* Replicas tracking - in memory: */
static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
@@ -33,21 +39,16 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
- eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+ eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
}
static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
struct bch_replicas_entry_v0 *e)
{
- unsigned i;
-
- if (e->data_type < BCH_DATA_NR)
- prt_printf(out, "%s", bch2_data_types[e->data_type]);
- else
- prt_printf(out, "(invalid data type %u)", e->data_type);
+ bch2_prt_data_type(out, e->data_type);
prt_printf(out, ": %u [", e->nr_devs);
- for (i = 0; i < e->nr_devs; i++)
+ for (unsigned i = 0; i < e->nr_devs; i++)
prt_printf(out, i ? " %u" : "%u", e->devs[i]);
prt_printf(out, "]");
}
@@ -55,15 +56,10 @@ static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
void bch2_replicas_entry_to_text(struct printbuf *out,
struct bch_replicas_entry_v1 *e)
{
- unsigned i;
-
- if (e->data_type < BCH_DATA_NR)
- prt_printf(out, "%s", bch2_data_types[e->data_type]);
- else
- prt_printf(out, "(invalid data type %u)", e->data_type);
+ bch2_prt_data_type(out, e->data_type);
prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
- for (i = 0; i < e->nr_devs; i++)
+ for (unsigned i = 0; i < e->nr_devs; i++)
prt_printf(out, i ? " %u" : "%u", e->devs[i]);
prt_printf(out, "]");
}
@@ -831,7 +827,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
- memcmp, NULL);
+ bch2_memcmp, NULL);
for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry_v1 *e =
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index 9632f36f5f31..b6bf0ebe7e84 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -207,7 +207,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = BCH_FS_USAGE_inodes;
- u->v = cpu_to_le64(c->usage_base->nr_inodes);
+ u->v = cpu_to_le64(c->usage_base->b.nr_inodes);
}
{
diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/sb-counters.c
index 02a996e06a64..7dc898761bb3 100644
--- a/fs/bcachefs/counters.c
+++ b/fs/bcachefs/sb-counters.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "super-io.h"
-#include "counters.h"
+#include "sb-counters.h"
/* BCH_SB_FIELD_counters */
diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/sb-counters.h
index 4778aa19bf34..81f8aec9fcb1 100644
--- a/fs/bcachefs/counters.h
+++ b/fs/bcachefs/sb-counters.h
@@ -1,11 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_COUNTERS_H
-#define _BCACHEFS_COUNTERS_H
+#ifndef _BCACHEFS_SB_COUNTERS_H
+#define _BCACHEFS_SB_COUNTERS_H
#include "bcachefs.h"
#include "super-io.h"
-
int bch2_sb_counters_to_cpu(struct bch_fs *);
int bch2_sb_counters_from_cpu(struct bch_fs *);
@@ -14,4 +13,4 @@ int bch2_fs_counters_init(struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
-#endif // _BCACHEFS_COUNTERS_H
+#endif // _BCACHEFS_SB_COUNTERS_H
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
new file mode 100644
index 000000000000..62ea478215d0
--- /dev/null
+++ b/fs/bcachefs/sb-counters_format.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H
+#define _BCACHEFS_SB_COUNTERS_FORMAT_H
+
+#define BCH_PERSISTENT_COUNTERS() \
+ x(io_read, 0) \
+ x(io_write, 1) \
+ x(io_move, 2) \
+ x(bucket_invalidate, 3) \
+ x(bucket_discard, 4) \
+ x(bucket_alloc, 5) \
+ x(bucket_alloc_fail, 6) \
+ x(btree_cache_scan, 7) \
+ x(btree_cache_reap, 8) \
+ x(btree_cache_cannibalize, 9) \
+ x(btree_cache_cannibalize_lock, 10) \
+ x(btree_cache_cannibalize_lock_fail, 11) \
+ x(btree_cache_cannibalize_unlock, 12) \
+ x(btree_node_write, 13) \
+ x(btree_node_read, 14) \
+ x(btree_node_compact, 15) \
+ x(btree_node_merge, 16) \
+ x(btree_node_split, 17) \
+ x(btree_node_rewrite, 18) \
+ x(btree_node_alloc, 19) \
+ x(btree_node_free, 20) \
+ x(btree_node_set_root, 21) \
+ x(btree_path_relock_fail, 22) \
+ x(btree_path_upgrade_fail, 23) \
+ x(btree_reserve_get_fail, 24) \
+ x(journal_entry_full, 25) \
+ x(journal_full, 26) \
+ x(journal_reclaim_finish, 27) \
+ x(journal_reclaim_start, 28) \
+ x(journal_write, 29) \
+ x(read_promote, 30) \
+ x(read_bounce, 31) \
+ x(read_split, 33) \
+ x(read_retry, 32) \
+ x(read_reuse_race, 34) \
+ x(move_extent_read, 35) \
+ x(move_extent_write, 36) \
+ x(move_extent_finish, 37) \
+ x(move_extent_fail, 38) \
+ x(move_extent_start_fail, 39) \
+ x(copygc, 40) \
+ x(copygc_wait, 41) \
+ x(gc_gens_end, 42) \
+ x(gc_gens_start, 43) \
+ x(trans_blocked_journal_reclaim, 44) \
+ x(trans_restart_btree_node_reused, 45) \
+ x(trans_restart_btree_node_split, 46) \
+ x(trans_restart_fault_inject, 47) \
+ x(trans_restart_iter_upgrade, 48) \
+ x(trans_restart_journal_preres_get, 49) \
+ x(trans_restart_journal_reclaim, 50) \
+ x(trans_restart_journal_res_get, 51) \
+ x(trans_restart_key_cache_key_realloced, 52) \
+ x(trans_restart_key_cache_raced, 53) \
+ x(trans_restart_mark_replicas, 54) \
+ x(trans_restart_mem_realloced, 55) \
+ x(trans_restart_memory_allocation_failure, 56) \
+ x(trans_restart_relock, 57) \
+ x(trans_restart_relock_after_fill, 58) \
+ x(trans_restart_relock_key_cache_fill, 59) \
+ x(trans_restart_relock_next_node, 60) \
+ x(trans_restart_relock_parent_for_fill, 61) \
+ x(trans_restart_relock_path, 62) \
+ x(trans_restart_relock_path_intent, 63) \
+ x(trans_restart_too_many_iters, 64) \
+ x(trans_restart_traverse, 65) \
+ x(trans_restart_upgrade, 66) \
+ x(trans_restart_would_deadlock, 67) \
+ x(trans_restart_would_deadlock_write, 68) \
+ x(trans_restart_injected, 69) \
+ x(trans_restart_key_cache_upgrade, 70) \
+ x(trans_traverse_all, 71) \
+ x(transaction_commit, 72) \
+ x(write_super, 73) \
+ x(trans_restart_would_deadlock_recursion_limit, 74) \
+ x(trans_restart_write_buffer_flush, 75) \
+ x(trans_restart_split_race, 76) \
+ x(write_buffer_flush_slowpath, 77) \
+ x(write_buffer_flush_sync, 78)
+
+enum bch_persistent_counters {
+#define x(t, n, ...) BCH_COUNTER_##t,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ BCH_COUNTER_NR
+};
+
+struct bch_sb_field_counters {
+ struct bch_sb_field field;
+ __le64 d[];
+};
+
+#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index a44a238bf8b5..eff5ce18c69c 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -251,7 +251,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Data allowed:");
prt_tab(out);
if (BCH_MEMBER_DATA_ALLOWED(&m))
- prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
+ prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
else
prt_printf(out, "(none)");
prt_newline(out);
@@ -259,7 +259,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Has data:");
prt_tab(out);
if (data_have)
- prt_bitflags(out, bch2_data_types, data_have);
+ prt_bitflags(out, __bch2_data_types, data_have);
else
prt_printf(out, "(none)");
prt_newline(out);
@@ -421,7 +421,7 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
- m->errors_reset_time = ktime_get_real_seconds();
+ m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds());
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 56af937523ff..ac6ba04d5521 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -728,7 +728,7 @@ static int check_snapshot(struct btree_trans *trans,
return 0;
memset(&s, 0, sizeof(s));
- memcpy(&s, k.v, bkey_val_bytes(k.k));
+ memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
id = le32_to_cpu(s.parent);
if (id) {
@@ -1053,6 +1053,8 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
n->v.tree = cpu_to_le32(tree);
n->v.depth = cpu_to_le32(depth);
+ n->v.btime.lo = cpu_to_le64(bch2_current_time(c));
+ n->v.btime.hi = 0;
for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
@@ -1681,5 +1683,5 @@ int bch2_snapshots_read(struct bch_fs *c)
void bch2_fs_snapshots_exit(struct bch_fs *c)
{
- kfree(rcu_dereference_protected(c->snapshots, true));
+ kvfree(rcu_dereference_protected(c->snapshots, true));
}
diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h
new file mode 100644
index 000000000000..aabcd3a74cd9
--- /dev/null
+++ b/fs/bcachefs/snapshot_format.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H
+#define _BCACHEFS_SNAPSHOT_FORMAT_H
+
+struct bch_snapshot {
+ struct bch_val v;
+ __le32 flags;
+ __le32 parent;
+ __le32 children[2];
+ __le32 subvol;
+ /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
+ __le32 tree;
+ __le32 depth;
+ __le32 skip[3];
+ bch_le128 btime;
+};
+
+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
+
+/* True if a subvolume points to this snapshot node: */
+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+
+/*
+ * Snapshot trees:
+ *
+ * The snapshot_trees btree gives us persistent indentifier for each tree of
+ * bch_snapshot nodes, and allow us to record and easily find the root/master
+ * subvolume that other snapshots were created from:
+ */
+struct bch_snapshot_tree {
+ struct bch_val v;
+ __le32 master_subvol;
+ __le32 root_snapshot;
+};
+
+#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 89fdb7c21134..fcaa5a888744 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -160,21 +160,16 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s
}
static __always_inline int
-bch2_hash_lookup(struct btree_trans *trans,
+bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, const void *key,
- unsigned flags)
+ unsigned flags, u32 snapshot)
{
struct bkey_s_c k;
- u32 snapshot;
int ret;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return ret;
-
for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
POS(inum.inum, U64_MAX),
@@ -195,6 +190,19 @@ bch2_hash_lookup(struct btree_trans *trans,
}
static __always_inline int
+bch2_hash_lookup(struct btree_trans *trans,
+ struct btree_iter *iter,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, const void *key,
+ unsigned flags)
+{
+ u32 snapshot;
+ return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+ bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
+}
+
+static __always_inline int
bch2_hash_hole(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h
new file mode 100644
index 000000000000..af79134b07d6
--- /dev/null
+++ b/fs/bcachefs/subvolume_format.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H
+#define _BCACHEFS_SUBVOLUME_FORMAT_H
+
+#define SUBVOL_POS_MIN POS(0, 1)
+#define SUBVOL_POS_MAX POS(0, S32_MAX)
+#define BCACHEFS_ROOT_SUBVOL 1
+
+struct bch_subvolume {
+ struct bch_val v;
+ __le32 flags;
+ __le32 snapshot;
+ __le64 inode;
+ /*
+ * Snapshot subvolumes form a tree, separate from the snapshot nodes
+ * tree - if this subvolume is a snapshot, this is the ID of the
+ * subvolume it was created from:
+ *
+ * This is _not_ necessarily the subvolume of the directory containing
+ * this subvolume:
+ */
+ __le32 parent;
+ __le32 pad;
+ bch_le128 otime;
+};
+
+LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
+/*
+ * We need to know whether a subvolume is a snapshot so we can know whether we
+ * can delete it (or whether it should just be rm -rf'd)
+ */
+LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
+
+#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 6d3db5cce5f6..bd64eb68e84a 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -2,7 +2,6 @@
#include "bcachefs.h"
#include "checksum.h"
-#include "counters.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
@@ -13,6 +12,7 @@
#include "replicas.h"
#include "quota.h"
#include "sb-clean.h"
+#include "sb-counters.h"
#include "sb-downgrade.h"
#include "sb-errors.h"
#include "sb-members.h"
@@ -142,8 +142,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb,
void bch2_free_super(struct bch_sb_handle *sb)
{
kfree(sb->bio);
- if (!IS_ERR_OR_NULL(sb->bdev_handle))
- bdev_release(sb->bdev_handle);
+ if (!IS_ERR_OR_NULL(sb->s_bdev_file))
+ fput(sb->s_bdev_file);
kfree(sb->holder);
kfree(sb->sb_name);
@@ -704,22 +704,22 @@ retry:
if (!opt_get(*opts, nochanges))
sb->mode |= BLK_OPEN_WRITE;
- sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (IS_ERR(sb->bdev_handle) &&
- PTR_ERR(sb->bdev_handle) == -EACCES &&
+ sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (IS_ERR(sb->s_bdev_file) &&
+ PTR_ERR(sb->s_bdev_file) == -EACCES &&
opt_get(*opts, read_only)) {
sb->mode &= ~BLK_OPEN_WRITE;
- sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (!IS_ERR(sb->bdev_handle))
+ sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (!IS_ERR(sb->s_bdev_file))
opt_set(*opts, nochanges, true);
}
- if (IS_ERR(sb->bdev_handle)) {
- ret = PTR_ERR(sb->bdev_handle);
- goto out;
+ if (IS_ERR(sb->s_bdev_file)) {
+ ret = PTR_ERR(sb->s_bdev_file);
+ goto err;
}
- sb->bdev = sb->bdev_handle->bdev;
+ sb->bdev = file_bdev(sb->s_bdev_file);
ret = bch2_sb_realloc(sb, 0);
if (ret) {
@@ -1321,7 +1321,9 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "Superblock size:");
prt_tab(out);
- prt_printf(out, "%zu", vstruct_bytes(sb));
+ prt_units_u64(out, vstruct_bytes(sb));
+ prt_str(out, "/");
+ prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits);
prt_newline(out);
prt_printf(out, "Clean:");
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 9dbc35940197..6b23e11825e6 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -23,7 +23,6 @@
#include "checksum.h"
#include "clock.h"
#include "compress.h"
-#include "counters.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
@@ -49,6 +48,7 @@
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
+#include "sb-counters.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "snapshot.h"
@@ -883,7 +883,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
- btree_bytes(c)) ||
+ c->opts.btree_node_size) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
!(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
sizeof(u64), GFP_KERNEL))) {
@@ -1386,8 +1386,8 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
prt_bdevname(&name, ca->disk_sb.bdev);
if (c->sb.nr_devices == 1)
- strlcpy(c->name, name.buf, sizeof(c->name));
- strlcpy(ca->name, name.buf, sizeof(ca->name));
+ strscpy(c->name, name.buf, sizeof(c->name));
+ strscpy(ca->name, name.buf, sizeof(ca->name));
printbuf_exit(&name);
@@ -1428,10 +1428,10 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
? c->opts.metadata_replicas
- : c->opts.metadata_replicas_required,
+ : metadata_replicas_required(c),
!(flags & BCH_FORCE_IF_DATA_DEGRADED)
? c->opts.data_replicas
- : c->opts.data_replicas_required);
+ : data_replicas_required(c));
return nr_rw >= required;
case BCH_MEMBER_STATE_failed:
@@ -1625,7 +1625,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
if (data) {
struct printbuf data_has = PRINTBUF;
- prt_bitflags(&data_has, bch2_data_types, data);
+ prt_bitflags(&data_has, __bch2_data_types, data);
bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
printbuf_exit(&data_has);
ret = -EBUSY;
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 0e5a14fc8e7f..ec784d975f66 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -4,7 +4,7 @@
struct bch_sb_handle {
struct bch_sb *sb;
- struct bdev_handle *bdev_handle;
+ struct file *s_bdev_file;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 8ed52319ff68..cee80c47feea 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -21,6 +21,7 @@
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
+#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "inode.h"
@@ -247,7 +248,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
mutex_lock(&c->btree_cache.lock);
list_for_each_entry(b, &c->btree_cache.live, list)
- ret += btree_bytes(c);
+ ret += btree_buf_bytes(b);
mutex_unlock(&c->btree_cache.lock);
return ret;
@@ -330,7 +331,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
prt_newline(out);
for (unsigned i = 0; i < ARRAY_SIZE(s); i++) {
- prt_str(out, bch2_compression_types[i]);
+ bch2_prt_compression_type(out, i);
prt_tab(out);
prt_human_readable_u64(out, s[i].sectors_compressed << 9);
@@ -725,8 +726,10 @@ STORE(bch2_fs_opts_dir)
bch2_opt_set_sb(c, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
- if ((id == Opt_background_target ||
- id == Opt_background_compression) && v)
+ if (v &&
+ (id == Opt_background_target ||
+ id == Opt_background_compression ||
+ (id == Opt_compression && !c->opts.background_compression)))
bch2_set_rebalance_needs_scan(c, 0);
ret = size;
@@ -883,7 +886,7 @@ static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
for (i = 1; i < BCH_DATA_NR; i++)
prt_printf(out, "%-12s:%12llu\n",
- bch2_data_types[i],
+ bch2_data_type_str(i),
percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
}
}
@@ -908,7 +911,7 @@ SHOW(bch2_dev)
}
if (attr == &sysfs_has_data) {
- prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca));
+ prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca));
prt_char(out, '\n');
}
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
index b1c867aa2b58..9220d7de10db 100644
--- a/fs/bcachefs/thread_with_file.c
+++ b/fs/bcachefs/thread_with_file.c
@@ -53,9 +53,9 @@ int bch2_run_thread_with_file(struct thread_with_file *thr,
if (ret)
goto err;
- fd_install(fd, file);
get_task_struct(thr->task);
wake_up_process(thr->task);
+ fd_install(fd, file);
return fd;
err:
if (fd >= 0)
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index c94876b3bb06..293b90d704fb 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -46,7 +46,7 @@ DECLARE_EVENT_CLASS(fs_str,
__assign_str(str, str);
),
- TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
+ TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
);
DECLARE_EVENT_CLASS(trans_str,
@@ -273,28 +273,14 @@ DEFINE_EVENT(bch_fs, journal_full,
TP_ARGS(c)
);
-DEFINE_EVENT(bch_fs, journal_entry_full,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(fs_str, journal_entry_full,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-TRACE_EVENT(journal_entry_close,
- TP_PROTO(struct bch_fs *c, unsigned bytes),
- TP_ARGS(c, bytes),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u32, bytes )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->bytes = bytes;
- ),
-
- TP_printk("%d,%d entry bytes %u",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->bytes)
+DEFINE_EVENT(fs_str, journal_entry_close,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
DEFINE_EVENT(bio, journal_write,
@@ -542,7 +528,7 @@ TRACE_EVENT(btree_path_relock_fail,
__entry->level = path->level;
TRACE_BPOS_assign(pos, path->pos);
- c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level),
+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
__entry->self_read_count = c.n[SIX_LOCK_read];
__entry->self_intent_count = c.n[SIX_LOCK_intent];
@@ -827,40 +813,28 @@ TRACE_EVENT(bucket_evacuate,
);
DEFINE_EVENT(fs_str, move_extent,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_read,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_write,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_finish,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-TRACE_EVENT(move_extent_fail,
- TP_PROTO(struct bch_fs *c, const char *msg),
- TP_ARGS(c, msg),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __string(msg, msg )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __assign_str(msg, msg);
- ),
-
- TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg))
+DEFINE_EVENT(fs_str, move_extent_fail,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_start_fail,
@@ -1039,7 +1013,7 @@ TRACE_EVENT(trans_restart_split_race,
__entry->level = b->c.level;
__entry->written = b->written;
__entry->blocks = btree_blocks(trans->c);
- __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b);
+ __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b);
),
TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
@@ -1146,8 +1120,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
TP_ARGS(trans, caller_ip, path)
);
-struct get_locks_fail;
-
TRACE_EVENT(trans_restart_upgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
@@ -1195,11 +1167,9 @@ TRACE_EVENT(trans_restart_upgrade,
__entry->node_seq)
);
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
+DEFINE_EVENT(trans_str, trans_restart_relock,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
+ TP_ARGS(trans, caller_ip, str)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index c2ef7cddaa4f..3a32faa86b5c 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -241,12 +241,17 @@ bool bch2_is_zero(const void *_p, size_t n)
return true;
}
-void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits)
+void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits)
{
while (nr_bits)
prt_char(out, '0' + ((v >> --nr_bits) & 1));
}
+void bch2_prt_u64_base2(struct printbuf *out, u64 v)
+{
+ bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
+}
+
void bch2_print_string_as_lines(const char *prefix, const char *lines)
{
const char *p;
@@ -267,14 +272,14 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
console_unlock();
}
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr)
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
+ gfp_t gfp)
{
#ifdef CONFIG_STACKTRACE
unsigned nr_entries = 0;
- int ret = 0;
stack->nr = 0;
- ret = darray_make_room(stack, 32);
+ int ret = darray_make_room_gfp(stack, 32, gfp);
if (ret)
return ret;
@@ -284,7 +289,7 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigne
do {
nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1);
} while (nr_entries == stack->size &&
- !(ret = darray_make_room(stack, stack->size * 2)));
+ !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp)));
stack->nr = nr_entries;
up_read(&task->signal->exec_update_lock);
@@ -303,10 +308,10 @@ void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
}
}
-int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr)
+int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp)
{
bch_stacktrace stack = { 0 };
- int ret = bch2_save_backtrace(&stack, task, skipnr + 1);
+ int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp);
bch2_prt_backtrace(out, &stack);
darray_exit(&stack);
@@ -413,14 +418,15 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
bch2_quantiles_update(&stats->quantiles, duration);
}
- if (time_after64(end, stats->last_event)) {
+ if (stats->last_event && time_after64(end, stats->last_event)) {
freq = end - stats->last_event;
mean_and_variance_update(&stats->freq_stats, freq);
mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
stats->max_freq = max(stats->max_freq, freq);
stats->min_freq = min(stats->min_freq, freq);
- stats->last_event = end;
}
+
+ stats->last_event = end;
}
static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
@@ -1186,7 +1192,9 @@ int bch2_split_devs(const char *_dev_name, darray_str *ret)
{
darray_init(ret);
- char *dev_name = kstrdup(_dev_name, GFP_KERNEL), *s = dev_name;
+ char *dev_name, *s, *orig;
+
+ dev_name = orig = kstrdup(_dev_name, GFP_KERNEL);
if (!dev_name)
return -ENOMEM;
@@ -1201,10 +1209,10 @@ int bch2_split_devs(const char *_dev_name, darray_str *ret)
}
}
- kfree(dev_name);
+ kfree(orig);
return 0;
err:
bch2_darray_str_exit(ret);
- kfree(dev_name);
+ kfree(orig);
return -ENOMEM;
}
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index c75fc31915d3..b414736d59a5 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -342,14 +342,15 @@ bool bch2_is_zero(const void *, size_t);
u64 bch2_read_flag_list(char *, const char * const[]);
-void bch2_prt_u64_binary(struct printbuf *, u64, unsigned);
+void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
+void bch2_prt_u64_base2(struct printbuf *, u64);
void bch2_print_string_as_lines(const char *prefix, const char *lines);
typedef DARRAY(unsigned long) bch_stacktrace;
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned);
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
-int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned);
+int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t);
static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
{
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 5a1858fb9879..9c0d2316031b 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -590,8 +590,9 @@ err:
mutex_unlock(&inode->ei_update_lock);
if (value &&
- (opt_id == Opt_background_compression ||
- opt_id == Opt_background_target))
+ (opt_id == Opt_background_target ||
+ opt_id == Opt_background_compression ||
+ (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
return bch2_err_class(ret);
diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
new file mode 100644
index 000000000000..e9f810539552
--- /dev/null
+++ b/fs/bcachefs/xattr_format.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_XATTR_FORMAT_H
+#define _BCACHEFS_XATTR_FORMAT_H
+
+#define KEY_TYPE_XATTR_INDEX_USER 0
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
+#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
+#define KEY_TYPE_XATTR_INDEX_SECURITY 4
+
+struct bch_xattr {
+ struct bch_val v;
+ __u8 x_type;
+ __u8 x_name_len;
+ __le16 x_val_len;
+ __u8 x_name[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_XATTR_FORMAT_H */
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index a9be9ac99222..378d9103a207 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1455,6 +1455,7 @@ out:
*/
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
+ LIST_HEAD(retry_list);
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
@@ -1476,6 +1477,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
+ u64 used;
int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
@@ -1511,9 +1513,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
goto next;
}
+ spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (block_group->reserved || block_group->pinned ||
- block_group->used || block_group->ro ||
+ if (btrfs_is_block_group_used(block_group) || block_group->ro ||
list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
@@ -1523,10 +1525,49 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
}
+
+ /*
+ * The block group may be unused but there may be space reserved
+ * accounting with the existence of that block group, that is,
+ * space_info->bytes_may_use was incremented by a task but no
+ * space was yet allocated from the block group by the task.
+ * That space may or may not be allocated, as we are generally
+ * pessimistic about space reservation for metadata as well as
+ * for data when using compression (as we reserve space based on
+ * the worst case, when data can't be compressed, and before
+ * actually attempting compression, before starting writeback).
+ *
+ * So check if the total space of the space_info minus the size
+ * of this block group is less than the used space of the
+ * space_info - if that's the case, then it means we have tasks
+ * that might be relying on the block group in order to allocate
+ * extents, and add back the block group to the unused list when
+ * we finish, so that we retry later in case no tasks ended up
+ * needing to allocate extents from the block group.
+ */
+ used = btrfs_space_info_used(space_info, true);
+ if (space_info->total_bytes - block_group->length < used) {
+ /*
+ * Add a reference for the list, compensate for the ref
+ * drop under the "next" label for the
+ * fs_info->unused_bgs list.
+ */
+ btrfs_get_block_group(block_group);
+ list_add_tail(&block_group->bg_list, &retry_list);
+
+ trace_btrfs_skip_unused_block_group(block_group);
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
/* We don't want to force the issue, only flip if it's ok. */
ret = inc_block_group_ro(block_group, 0);
@@ -1650,12 +1691,16 @@ next:
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
+ list_splice_tail(&retry_list, &fs_info->unused_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
return;
flip_async:
btrfs_end_transaction(trans);
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_splice_tail(&retry_list, &fs_info->unused_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_put_block_group(block_group);
btrfs_discard_punt_unused_bgs_list(fs_info);
@@ -2684,6 +2729,37 @@ next:
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
list_del_init(&block_group->bg_list);
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+
+ /*
+ * If the block group is still unused, add it to the list of
+ * unused block groups. The block group may have been created in
+ * order to satisfy a space reservation, in which case the
+ * extent allocation only happens later. But often we don't
+ * actually need to allocate space that we previously reserved,
+ * so the block group may become unused for a long time. For
+ * example for metadata we generally reserve space for a worst
+ * possible scenario, but then don't end up allocating all that
+ * space or none at all (due to no need to COW, extent buffers
+ * were already COWed in the current transaction and still
+ * unwritten, tree heights lower than the maximum possible
+ * height, etc). For data we generally reserve the axact amount
+ * of space we are going to allocate later, the exception is
+ * when using compression, as we must reserve space based on the
+ * uncompressed data size, because the compression is only done
+ * when writeback triggered and we don't know how much space we
+ * are actually going to need, so we reserve the uncompressed
+ * size because the data may be uncompressible in the worst case.
+ */
+ if (ret == 0) {
+ bool used;
+
+ spin_lock(&block_group->lock);
+ used = btrfs_is_block_group_used(block_group);
+ spin_unlock(&block_group->lock);
+
+ if (!used)
+ btrfs_mark_bg_unused(block_group);
+ }
}
btrfs_trans_release_chunk_metadata(trans);
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index c4a1f01cc1c2..962b11983901 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -257,6 +257,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
return (block_group->start + block_group->length);
}
+static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
+{
+ lockdep_assert_held(&bg->lock);
+
+ return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
+}
+
static inline bool btrfs_is_block_group_data_only(
struct btrfs_block_group *block_group)
{
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index ceb5f586a2d5..1043a8142351 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -494,7 +494,7 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
- if (unlikely(block_rsv->size == 0))
+ if (unlikely(btrfs_block_rsv_size(block_rsv) == 0))
goto try_reserve;
again:
ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index b0bd12b8652f..43a9a6b5a79f 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -101,4 +101,36 @@ static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv)
return data_race(rsv->full);
}
+/*
+ * Get the reserved mount of a block reserve in a context where getting a stale
+ * value is acceptable, instead of accessing it directly and trigger data race
+ * warning from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_reserved(struct btrfs_block_rsv *rsv)
+{
+ u64 ret;
+
+ spin_lock(&rsv->lock);
+ ret = rsv->reserved;
+ spin_unlock(&rsv->lock);
+
+ return ret;
+}
+
+/*
+ * Get the size of a block reserve in a context where getting a stale value is
+ * acceptable, instead of accessing it directly and trigger data race warning
+ * from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_size(struct btrfs_block_rsv *rsv)
+{
+ u64 ret;
+
+ spin_lock(&rsv->lock);
+ ret = rsv->size;
+ spin_unlock(&rsv->lock);
+
+ return ret;
+}
+
#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 193168214eeb..68345f73d429 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -141,16 +141,16 @@ static int compression_decompress_bio(struct list_head *ws,
}
static int compression_decompress(int type, struct list_head *ws,
- const u8 *data_in, struct page *dest_page,
- unsigned long start_byte, size_t srclen, size_t destlen)
+ const u8 *data_in, struct page *dest_page,
+ unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_NONE:
default:
/*
@@ -1037,14 +1037,23 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
* start_byte tells us the offset into the compressed data we're interested in
*/
int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
- unsigned long start_byte, size_t srclen, size_t destlen)
+ unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
struct list_head *workspace;
+ const u32 sectorsize = fs_info->sectorsize;
int ret;
+ /*
+ * The full destination page range should not exceed the page size.
+ * And the @destlen should not exceed sectorsize, as this is only called for
+ * inline file extents, which should not exceed sectorsize.
+ */
+ ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);
+
workspace = get_workspace(type, 0);
ret = compression_decompress(type, workspace, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
put_workspace(type, workspace);
return ret;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 93cc92974dee..afd7e50d073d 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -148,7 +148,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *zlib_alloc_workspace(unsigned int level);
void zlib_free_workspace(struct list_head *ws);
@@ -159,7 +159,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *lzo_alloc_workspace(unsigned int level);
void lzo_free_workspace(struct list_head *ws);
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index c276b136ab63..5b0b64571418 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -1046,7 +1046,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
goto add;
/* Skip too large extent */
- if (range_len >= extent_thresh)
+ if (em->len >= extent_thresh)
goto next;
/*
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 2833e8ef4c09..acf9f4b6c044 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -245,7 +245,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 reserve_size = 0;
u64 qgroup_rsv_size = 0;
- u64 csum_leaves;
unsigned outstanding_extents;
lockdep_assert_held(&inode->lock);
@@ -260,10 +259,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
outstanding_extents);
reserve_size += btrfs_calc_metadata_size(fs_info, 1);
}
- csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
- inode->csum_bytes);
- reserve_size += btrfs_calc_insert_metadata_size(fs_info,
- csum_leaves);
+ if (!(inode->flags & BTRFS_INODE_NODATASUM)) {
+ u64 csum_leaves;
+
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
+ reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves);
+ }
/*
* For qgroup rsv, the calculation is very simple:
* account one nodesize for each outstanding extent
@@ -278,14 +279,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
spin_unlock(&block_rsv->lock);
}
-static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
+static void calc_inode_reservations(struct btrfs_inode *inode,
u64 num_bytes, u64 disk_num_bytes,
u64 *meta_reserve, u64 *qgroup_reserve)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 nr_extents = count_max_extents(fs_info, num_bytes);
- u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+ u64 csum_leaves;
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ csum_leaves = 0;
+ else
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+
*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
nr_extents + csum_leaves);
@@ -337,7 +344,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
* everything out and try again, which is bad. This way we just
* over-reserve slightly, and clean up the mess when we are done.
*/
- calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
+ calc_inode_reservations(inode, num_bytes, disk_num_bytes,
&meta_reserve, &qgroup_reserve);
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
noflush);
@@ -359,7 +366,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
nr_extents = count_max_extents(fs_info, num_bytes);
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, nr_extents);
- inode->csum_bytes += disk_num_bytes;
+ if (!(inode->flags & BTRFS_INODE_NODATASUM))
+ inode->csum_bytes += disk_num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
@@ -393,7 +401,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
spin_lock(&inode->lock);
- inode->csum_bytes -= num_bytes;
+ if (!(inode->flags & BTRFS_INODE_NODATASUM))
+ inode->csum_bytes -= num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 1502d664c892..fb33027e5a4c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -246,7 +246,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct block_device *bdev;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
@@ -257,13 +257,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return -EINVAL;
}
- bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev_handle)) {
+ if (IS_ERR(bdev_file)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
- return PTR_ERR(bdev_handle);
+ return PTR_ERR(bdev_file);
}
- bdev = bdev_handle->bdev;
+ bdev = file_bdev(bdev_file);
if (!btrfs_check_device_zone_type(fs_info, bdev)) {
btrfs_err(fs_info,
@@ -314,7 +314,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
device->bdev = bdev;
- device->bdev_handle = bdev_handle;
+ device->bdev_file = bdev_file;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->dev_stats_valid = 1;
@@ -335,7 +335,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return 0;
error:
- bdev_release(bdev_handle);
+ fput(bdev_file);
return ret;
}
@@ -725,6 +725,23 @@ leave:
return ret;
}
+static int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args)
+{
+ if (args->start.srcdevid == 0) {
+ if (memchr(args->start.srcdev_name, 0,
+ sizeof(args->start.srcdev_name)) == NULL)
+ return -ENAMETOOLONG;
+ } else {
+ args->start.srcdev_name[0] = 0;
+ }
+
+ if (memchr(args->start.tgtdev_name, 0,
+ sizeof(args->start.tgtdev_name)) == NULL)
+ return -ENAMETOOLONG;
+
+ return 0;
+}
+
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
@@ -737,10 +754,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
default:
return -EINVAL;
}
-
- if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
- args->start.tgtdev_name[0] == '\0')
- return -EINVAL;
+ ret = btrfs_check_replace_dev_names(args);
+ if (ret < 0)
+ return ret;
ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
args->start.srcdevid,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c6907d533fe8..c843563914ca 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1307,12 +1307,12 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
*
* @objectid: root id
* @anon_dev: preallocated anonymous block device number for new roots,
- * pass 0 for new allocation.
+ * pass NULL for a new allocation.
* @check_ref: whether to check root item references, If true, return -ENOENT
* for orphan roots
*/
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
- u64 objectid, dev_t anon_dev,
+ u64 objectid, dev_t *anon_dev,
bool check_ref)
{
struct btrfs_root *root;
@@ -1336,8 +1336,17 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
- /* Shouldn't get preallocated anon_dev for cached roots */
- ASSERT(!anon_dev);
+ /*
+ * Some other caller may have read out the newly inserted
+ * subvolume already (for things like backref walk etc). Not
+ * that common but still possible. In that case, we just need
+ * to free the anon_dev.
+ */
+ if (unlikely(anon_dev && *anon_dev)) {
+ free_anon_bdev(*anon_dev);
+ *anon_dev = 0;
+ }
+
if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
btrfs_put_root(root);
return ERR_PTR(-ENOENT);
@@ -1357,7 +1366,7 @@ again:
goto fail;
}
- ret = btrfs_init_fs_root(root, anon_dev);
+ ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0);
if (ret)
goto fail;
@@ -1393,7 +1402,7 @@ fail:
* root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
* and once again by our caller.
*/
- if (anon_dev)
+ if (anon_dev && *anon_dev)
root->anon_dev = 0;
btrfs_put_root(root);
return ERR_PTR(ret);
@@ -1409,7 +1418,7 @@ fail:
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref)
{
- return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
+ return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref);
}
/*
@@ -1417,11 +1426,11 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
* the anonymous block device id
*
* @objectid: tree objectid
- * @anon_dev: if zero, allocate a new anonymous block device or use the
- * parameter value
+ * @anon_dev: if NULL, allocate a new anonymous block device or use the
+ * parameter value if not NULL
*/
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
- u64 objectid, dev_t anon_dev)
+ u64 objectid, dev_t *anon_dev)
{
return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 9413726b329b..eb3473d1c1ac 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -61,7 +61,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref);
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
- u64 objectid, dev_t anon_dev);
+ u64 objectid, dev_t *anon_dev);
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
u64 objectid);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f396aba92c57..8e8cc1111277 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1260,7 +1260,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
u64 bytes_left, end;
u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
- if (WARN_ON(start != aligned_start)) {
+ /* Adjust the range to be aligned to 512B sectors if necessary. */
+ if (start != aligned_start) {
len -= aligned_start - start;
len = round_down(len, 1 << SECTOR_SHIFT);
start = aligned_start;
@@ -4298,6 +4299,42 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
return 0;
}
+static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ if (ffe_ctl->for_treelog) {
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg)
+ ffe_ctl->hint_byte = fs_info->treelog_bg;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ } else if (ffe_ctl->for_data_reloc) {
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg)
+ ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+ struct btrfs_block_group *block_group;
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
+ /*
+ * No lock is OK here because avail is monotinically
+ * decreasing, and this is just a hint.
+ */
+ u64 avail = block_group->zone_capacity - block_group->alloc_offset;
+
+ if (block_group_bits(block_group, ffe_ctl->flags) &&
+ avail >= ffe_ctl->num_bytes) {
+ ffe_ctl->hint_byte = block_group->start;
+ break;
+ }
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+
+ return 0;
+}
+
static int prepare_allocation(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
@@ -4308,19 +4345,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
return prepare_allocation_clustered(fs_info, ffe_ctl,
space_info, ins);
case BTRFS_EXTENT_ALLOC_ZONED:
- if (ffe_ctl->for_treelog) {
- spin_lock(&fs_info->treelog_bg_lock);
- if (fs_info->treelog_bg)
- ffe_ctl->hint_byte = fs_info->treelog_bg;
- spin_unlock(&fs_info->treelog_bg_lock);
- }
- if (ffe_ctl->for_data_reloc) {
- spin_lock(&fs_info->relocation_bg_lock);
- if (fs_info->data_reloc_bg)
- ffe_ctl->hint_byte = fs_info->data_reloc_bg;
- spin_unlock(&fs_info->relocation_bg_lock);
- }
- return 0;
+ return prepare_allocation_zoned(fs_info, ffe_ctl);
default:
BUG();
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cfd2967f04a2..8b4bef05e222 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2480,6 +2480,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
struct fiemap_cache *cache,
u64 offset, u64 phys, u64 len, u32 flags)
{
+ u64 cache_end;
int ret = 0;
/* Set at the end of extent_fiemap(). */
@@ -2489,15 +2490,102 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
goto assign;
/*
- * Sanity check, extent_fiemap() should have ensured that new
- * fiemap extent won't overlap with cached one.
- * Not recoverable.
+ * When iterating the extents of the inode, at extent_fiemap(), we may
+ * find an extent that starts at an offset behind the end offset of the
+ * previous extent we processed. This happens if fiemap is called
+ * without FIEMAP_FLAG_SYNC and there are ordered extents completing
+ * while we call btrfs_next_leaf() (through fiemap_next_leaf_item()).
*
- * NOTE: Physical address can overlap, due to compression
+ * For example we are in leaf X processing its last item, which is the
+ * file extent item for file range [512K, 1M[, and after
+ * btrfs_next_leaf() releases the path, there's an ordered extent that
+ * completes for the file range [768K, 2M[, and that results in trimming
+ * the file extent item so that it now corresponds to the file range
+ * [512K, 768K[ and a new file extent item is inserted for the file
+ * range [768K, 2M[, which may end up as the last item of leaf X or as
+ * the first item of the next leaf - in either case btrfs_next_leaf()
+ * will leave us with a path pointing to the new extent item, for the
+ * file range [768K, 2M[, since that's the first key that follows the
+ * last one we processed. So in order not to report overlapping extents
+ * to user space, we trim the length of the previously cached extent and
+ * emit it.
+ *
+ * Upon calling btrfs_next_leaf() we may also find an extent with an
+ * offset smaller than or equals to cache->offset, and this happens
+ * when we had a hole or prealloc extent with several delalloc ranges in
+ * it, but after btrfs_next_leaf() released the path, delalloc was
+ * flushed and the resulting ordered extents were completed, so we can
+ * now have found a file extent item for an offset that is smaller than
+ * or equals to what we have in cache->offset. We deal with this as
+ * described below.
*/
- if (cache->offset + cache->len > offset) {
- WARN_ON(1);
- return -EINVAL;
+ cache_end = cache->offset + cache->len;
+ if (cache_end > offset) {
+ if (offset == cache->offset) {
+ /*
+ * We cached a dealloc range (found in the io tree) for
+ * a hole or prealloc extent and we have now found a
+ * file extent item for the same offset. What we have
+ * now is more recent and up to date, so discard what
+ * we had in the cache and use what we have just found.
+ */
+ goto assign;
+ } else if (offset > cache->offset) {
+ /*
+ * The extent range we previously found ends after the
+ * offset of the file extent item we found and that
+ * offset falls somewhere in the middle of that previous
+ * extent range. So adjust the range we previously found
+ * to end at the offset of the file extent item we have
+ * just found, since this extent is more up to date.
+ * Emit that adjusted range and cache the file extent
+ * item we have just found. This corresponds to the case
+ * where a previously found file extent item was split
+ * due to an ordered extent completing.
+ */
+ cache->len = offset - cache->offset;
+ goto emit;
+ } else {
+ const u64 range_end = offset + len;
+
+ /*
+ * The offset of the file extent item we have just found
+ * is behind the cached offset. This means we were
+ * processing a hole or prealloc extent for which we
+ * have found delalloc ranges (in the io tree), so what
+ * we have in the cache is the last delalloc range we
+ * found while the file extent item we found can be
+ * either for a whole delalloc range we previously
+ * emmitted or only a part of that range.
+ *
+ * We have two cases here:
+ *
+ * 1) The file extent item's range ends at or behind the
+ * cached extent's end. In this case just ignore the
+ * current file extent item because we don't want to
+ * overlap with previous ranges that may have been
+ * emmitted already;
+ *
+ * 2) The file extent item starts behind the currently
+ * cached extent but its end offset goes beyond the
+ * end offset of the cached extent. We don't want to
+ * overlap with a previous range that may have been
+ * emmitted already, so we emit the currently cached
+ * extent and then partially store the current file
+ * extent item's range in the cache, for the subrange
+ * going the cached extent's end to the end of the
+ * file extent item.
+ */
+ if (range_end <= cache_end)
+ return 0;
+
+ if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
+ phys += cache_end - offset;
+
+ offset = cache_end;
+ len = range_end - cache_end;
+ goto emit;
+ }
}
/*
@@ -2517,6 +2605,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
return 0;
}
+emit:
/* Not mergeable, need to submit cached one */
ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
cache->len, cache->flags);
@@ -2689,16 +2778,34 @@ static int fiemap_process_hole(struct btrfs_inode *inode,
* it beyond i_size.
*/
while (cur_offset < end && cur_offset < i_size) {
+ struct extent_state *cached_state = NULL;
u64 delalloc_start;
u64 delalloc_end;
u64 prealloc_start;
+ u64 lockstart;
+ u64 lockend;
u64 prealloc_len = 0;
bool delalloc;
+ lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize);
+ lockend = round_up(end, inode->root->fs_info->sectorsize);
+
+ /*
+ * We are only locking for the delalloc range because that's the
+ * only thing that can change here. With fiemap we have a lock
+ * on the inode, so no buffered or direct writes can happen.
+ *
+ * However mmaps and normal page writeback will cause this to
+ * change arbitrarily. We have to lock the extent lock here to
+ * make sure that nobody messes with the tree while we're doing
+ * btrfs_find_delalloc_in_range.
+ */
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
delalloc_cached_state,
&delalloc_start,
&delalloc_end);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
if (!delalloc)
break;
@@ -2866,15 +2973,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
const u64 ino = btrfs_ino(inode);
- struct extent_state *cached_state = NULL;
struct extent_state *delalloc_cached_state = NULL;
struct btrfs_path *path;
struct fiemap_cache cache = { 0 };
struct btrfs_backref_share_check_ctx *backref_ctx;
u64 last_extent_end;
u64 prev_extent_end;
- u64 lockstart;
- u64 lockend;
+ u64 range_start;
+ u64 range_end;
+ const u64 sectorsize = inode->root->fs_info->sectorsize;
bool stopped = false;
int ret;
@@ -2885,22 +2992,19 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
goto out;
}
- lockstart = round_down(start, inode->root->fs_info->sectorsize);
- lockend = round_up(start + len, inode->root->fs_info->sectorsize);
- prev_extent_end = lockstart;
-
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ range_start = round_down(start, sectorsize);
+ range_end = round_up(start + len, sectorsize);
+ prev_extent_end = range_start;
ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
if (ret < 0)
- goto out_unlock;
+ goto out;
btrfs_release_path(path);
path->reada = READA_FORWARD;
- ret = fiemap_search_slot(inode, path, lockstart);
+ ret = fiemap_search_slot(inode, path, range_start);
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/*
* No file extent item found, but we may have delalloc between
@@ -2910,7 +3014,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
goto check_eof_delalloc;
}
- while (prev_extent_end < lockend) {
+ while (prev_extent_end < range_end) {
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_file_extent_item *ei;
struct btrfs_key key;
@@ -2933,21 +3037,21 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
* The first iteration can leave us at an extent item that ends
* before our range's start. Move to the next item.
*/
- if (extent_end <= lockstart)
+ if (extent_end <= range_start)
goto next_item;
backref_ctx->curr_leaf_bytenr = leaf->start;
/* We have in implicit hole (NO_HOLES feature enabled). */
if (prev_extent_end < key.offset) {
- const u64 range_end = min(key.offset, lockend) - 1;
+ const u64 hole_end = min(key.offset, range_end) - 1;
ret = fiemap_process_hole(inode, fieinfo, &cache,
&delalloc_cached_state,
backref_ctx, 0, 0, 0,
- prev_extent_end, range_end);
+ prev_extent_end, hole_end);
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/* fiemap_fill_next_extent() told us to stop. */
stopped = true;
@@ -2955,7 +3059,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
}
/* We've reached the end of the fiemap range, stop. */
- if (key.offset >= lockend) {
+ if (key.offset >= range_end) {
stopped = true;
break;
}
@@ -3003,7 +3107,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
extent_gen,
backref_ctx);
if (ret < 0)
- goto out_unlock;
+ goto out;
else if (ret > 0)
flags |= FIEMAP_EXTENT_SHARED;
}
@@ -3014,7 +3118,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
}
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/* fiemap_fill_next_extent() told us to stop. */
stopped = true;
@@ -3025,12 +3129,12 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
next_item:
if (fatal_signal_pending(current)) {
ret = -EINTR;
- goto out_unlock;
+ goto out;
}
ret = fiemap_next_leaf_item(inode, path);
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/* No more file extent items for this inode. */
break;
@@ -3049,29 +3153,41 @@ check_eof_delalloc:
btrfs_free_path(path);
path = NULL;
- if (!stopped && prev_extent_end < lockend) {
+ if (!stopped && prev_extent_end < range_end) {
ret = fiemap_process_hole(inode, fieinfo, &cache,
&delalloc_cached_state, backref_ctx,
- 0, 0, 0, prev_extent_end, lockend - 1);
+ 0, 0, 0, prev_extent_end, range_end - 1);
if (ret < 0)
- goto out_unlock;
- prev_extent_end = lockend;
+ goto out;
+ prev_extent_end = range_end;
}
if (cache.cached && cache.offset + cache.len >= last_extent_end) {
const u64 i_size = i_size_read(&inode->vfs_inode);
if (prev_extent_end < i_size) {
+ struct extent_state *cached_state = NULL;
u64 delalloc_start;
u64 delalloc_end;
+ u64 lockstart;
+ u64 lockend;
bool delalloc;
+ lockstart = round_down(prev_extent_end, sectorsize);
+ lockend = round_up(i_size, sectorsize);
+
+ /*
+ * See the comment in fiemap_process_hole as to why
+ * we're doing the locking here.
+ */
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
delalloc = btrfs_find_delalloc_in_range(inode,
prev_extent_end,
i_size - 1,
&delalloc_cached_state,
&delalloc_start,
&delalloc_end);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
if (!delalloc)
cache.flags |= FIEMAP_EXTENT_LAST;
} else {
@@ -3080,10 +3196,6 @@ check_eof_delalloc:
}
ret = emit_last_fiemap_cache(fieinfo, &cache);
-
-out_unlock:
- unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
out:
free_extent_state(delalloc_cached_state);
btrfs_free_backref_share_ctx(backref_ctx);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 809b11472a80..4795738d5785 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3184,8 +3184,23 @@ out:
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
- /* Drop extent maps for the part of the extent we didn't write. */
- btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
+ /*
+ * Drop extent maps for the part of the extent we didn't write.
+ *
+ * We have an exception here for the free_space_inode, this is
+ * because when we do btrfs_get_extent() on the free space inode
+ * we will search the commit root. If this is a new block group
+ * we won't find anything, and we will trip over the assert in
+ * writepage where we do ASSERT(em->block_start !=
+ * EXTENT_MAP_HOLE).
+ *
+ * Theoretically we could also skip this for any NOCOW extent as
+ * we don't mess with the extent map tree in the NOCOW case, but
+ * for now simply skip this if we are the free space inode.
+ */
+ if (!btrfs_is_free_space_inode(inode))
+ btrfs_drop_extent_map_range(inode, unwritten_start,
+ end, false);
/*
* If the ordered extent had an IOERR or something else went
@@ -4458,6 +4473,8 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
u64 root_flags;
int ret;
+ down_write(&fs_info->subvol_sem);
+
/*
* Don't allow to delete a subvolume with send in progress. This is
* inside the inode lock so the error handling that has to drop the bit
@@ -4469,25 +4486,25 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
btrfs_warn(fs_info,
"attempt to delete subvolume %llu during send",
dest->root_key.objectid);
- return -EPERM;
+ ret = -EPERM;
+ goto out_up_write;
}
if (atomic_read(&dest->nr_swapfiles)) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu with active swapfile",
root->root_key.objectid);
- return -EPERM;
+ ret = -EPERM;
+ goto out_up_write;
}
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
- down_write(&fs_info->subvol_sem);
-
ret = may_destroy_subvol(dest);
if (ret)
- goto out_up_write;
+ goto out_undead;
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
@@ -4497,7 +4514,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
*/
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
if (ret)
- goto out_up_write;
+ goto out_undead;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
@@ -4563,15 +4580,17 @@ out_end_trans:
inode->i_flags |= S_DEAD;
out_release:
btrfs_subvolume_release_metadata(root, &block_rsv);
-out_up_write:
- up_write(&fs_info->subvol_sem);
+out_undead:
if (ret) {
spin_lock(&dest->root_item_lock);
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
- } else {
+ }
+out_up_write:
+ up_write(&fs_info->subvol_sem);
+ if (!ret) {
d_invalidate(dentry);
btrfs_prune_dentries(dest);
ASSERT(dest->send_in_progress == 0);
@@ -7816,6 +7835,7 @@ struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
+ struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
int ret;
ret = fiemap_prep(inode, fieinfo, start, &len, 0);
@@ -7841,7 +7861,26 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return ret;
}
- return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
+ btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ /*
+ * We did an initial flush to avoid holding the inode's lock while
+ * triggering writeback and waiting for the completion of IO and ordered
+ * extents. Now after we locked the inode we do it again, because it's
+ * possible a new write may have happened in between those two steps.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
+ if (ret) {
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+ return ret;
+ }
+ }
+
+ ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ return ret;
}
static int btrfs_writepages(struct address_space *mapping,
@@ -10269,6 +10308,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
return -EINVAL;
+ /*
+ * Compressed extents should always have checksums, so error out if we
+ * have a NOCOW file or inode was created while mounted with NODATASUM.
+ */
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ return -EINVAL;
+
orig_count = iov_iter_count(from);
/* The extent size must be sane. */
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 41b479861b3c..9876ee27f069 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -721,7 +721,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
free_extent_buffer(leaf);
leaf = NULL;
- new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
+ new_root = btrfs_get_new_fs_root(fs_info, objectid, &anon_dev);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
@@ -790,6 +790,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
return -EOPNOTSUPP;
}
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return -ENOENT;
+
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return -EINVAL;
@@ -2608,6 +2611,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EFAULT;
goto out;
}
+ if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
/* compression requires us to start the IO */
if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
@@ -2691,7 +2698,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
- struct bdev_handle *bdev_handle = NULL;
+ struct file *bdev_file = NULL;
int ret;
bool cancel = false;
@@ -2728,7 +2735,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto err_drop;
/* Exclusive operation is now claimed */
- ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_file);
btrfs_exclop_finish(fs_info);
@@ -2742,8 +2749,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
}
err_drop:
mnt_drop_write_file(file);
- if (bdev_handle)
- bdev_release(bdev_handle);
+ if (bdev_file)
+ fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2756,7 +2763,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
- struct bdev_handle *bdev_handle = NULL;
+ struct file *bdev_file = NULL;
int ret;
bool cancel = false;
@@ -2783,15 +2790,15 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_file);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
}
mnt_drop_write_file(file);
- if (bdev_handle)
- bdev_release(bdev_handle);
+ if (bdev_file)
+ fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -3808,6 +3815,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto out;
}
+ if (sa->create && is_fstree(sa->qgroupid)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 1131d5a29d61..e43bc0fdc74e 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -425,16 +425,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
}
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
size_t in_len;
size_t out_len;
size_t max_segment_len = WORKSPACE_BUF_LENGTH;
int ret = 0;
- char *kaddr;
- unsigned long bytes;
if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
return -EUCLEAN;
@@ -451,7 +451,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
}
data_in += LZO_LEN;
- out_len = PAGE_SIZE;
+ out_len = sectorsize;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
if (ret != LZO_E_OK) {
pr_warn("BTRFS: decompress failed!\n");
@@ -459,29 +459,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
goto out;
}
- if (out_len < start_byte) {
+ ASSERT(out_len <= sectorsize);
+ memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len);
+ /* Early end, considered as an error. */
+ if (unlikely(out_len < destlen)) {
ret = -EIO;
- goto out;
+ memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len);
}
-
- /*
- * the caller is already checking against PAGE_SIZE, but lets
- * move this check closer to the memcpy/memset
- */
- destlen = min_t(unsigned long, destlen, PAGE_SIZE);
- bytes = min_t(unsigned long, destlen, out_len - start_byte);
-
- kaddr = kmap_local_page(dest_page);
- memcpy(kaddr, workspace->buf + start_byte, bytes);
-
- /*
- * btrfs_getblock is doing a zero on the tail of the page too,
- * but this will cover anything missing from the decompressed
- * data.
- */
- if (bytes < destlen)
- memset(kaddr+bytes, 0, destlen-bytes);
- kunmap_local(kaddr);
out:
return ret;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 63b426cc7798..5470e1cdf10c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1736,6 +1736,15 @@ out:
return ret;
}
+static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
+{
+ return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
+ qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
+}
+
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1755,6 +1764,11 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
goto out;
}
+ if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
/* Check if there are no children of this qgroup */
if (!list_empty(&qgroup->members)) {
ret = -EBUSY;
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 6486f0d7e993..8c4fc98ca9ce 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -889,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
out_unlock:
spin_unlock(&fs_info->ref_verify_lock);
out:
- if (ret)
+ if (ret) {
+ btrfs_free_ref_cache(fs_info);
btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ }
return ret;
}
@@ -1021,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
}
}
if (ret) {
- btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
btrfs_free_ref_cache(fs_info);
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
}
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a01807cbd4d4..0123d2728923 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1098,12 +1098,22 @@ out:
static void scrub_read_endio(struct btrfs_bio *bbio)
{
struct scrub_stripe *stripe = bbio->private;
+ struct bio_vec *bvec;
+ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+ int num_sectors;
+ u32 bio_size = 0;
+ int i;
+
+ ASSERT(sector_nr < stripe->nr_sectors);
+ bio_for_each_bvec_all(bvec, &bbio->bio, i)
+ bio_size += bvec->bv_len;
+ num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
if (bbio->bio.bi_status) {
- bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
- bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
+ bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
+ bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
} else {
- bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+ bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
}
bio_put(&bbio->bio);
if (atomic_dec_and_test(&stripe->pending_io)) {
@@ -1636,6 +1646,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
+ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+ stripe->bg->length - stripe->logical) >>
+ fs_info->sectorsize_bits;
u64 stripe_len = BTRFS_STRIPE_LEN;
int mirror = stripe->mirror_num;
int i;
@@ -1646,6 +1659,10 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
struct page *page = scrub_stripe_get_page(stripe, i);
unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
+ /* We're beyond the chunk boundary, no need to read anymore. */
+ if (i >= nr_sectors)
+ break;
+
/* The current sector cannot be merged, submit the bio. */
if (bbio &&
((i > 0 &&
@@ -1701,6 +1718,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_bio *bbio;
+ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+ stripe->bg->length - stripe->logical) >>
+ fs_info->sectorsize_bits;
int mirror = stripe->mirror_num;
ASSERT(stripe->bg);
@@ -1715,14 +1735,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
scrub_read_endio, stripe);
- /* Read the whole stripe. */
bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
- for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
+ /* Read the whole range inside the chunk boundary. */
+ for (unsigned int cur = 0; cur < nr_sectors; cur++) {
+ struct page *page = scrub_stripe_get_page(stripe, cur);
+ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
int ret;
- ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
+ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
/* We should have allocated enough bio vectors. */
- ASSERT(ret == PAGE_SIZE);
+ ASSERT(ret == fs_info->sectorsize);
}
atomic_inc(&stripe->pending_io);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 4e36550618e5..e48a063ef085 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6705,11 +6705,20 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
if (ret)
goto out;
}
- if (sctx->cur_inode_last_extent <
- sctx->cur_inode_size) {
- ret = send_hole(sctx, sctx->cur_inode_size);
- if (ret)
+ if (sctx->cur_inode_last_extent < sctx->cur_inode_size) {
+ ret = range_is_hole_in_parent(sctx,
+ sctx->cur_inode_last_extent,
+ sctx->cur_inode_size);
+ if (ret < 0) {
goto out;
+ } else if (ret == 0) {
+ ret = send_hole(sctx, sctx->cur_inode_size);
+ if (ret < 0)
+ goto out;
+ } else {
+ /* Range is already a hole, skip. */
+ ret = 0;
+ }
}
}
if (need_truncate) {
@@ -8111,7 +8120,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
}
if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
- ret = -EINVAL;
+ ret = -EOPNOTSUPP;
goto out;
}
@@ -8205,8 +8214,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
goto out;
}
- sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
- arg->clone_sources_count + 1,
+ sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1,
+ sizeof(*sctx->clone_roots),
GFP_KERNEL);
if (!sctx->clone_roots) {
ret = -ENOMEM;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 571bb13587d5..3b54eb583474 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -856,7 +856,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
- u64 global_rsv_size = fs_info->global_block_rsv.reserved;
+ const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
u64 ordered, delalloc;
u64 thresh;
u64 used;
@@ -956,8 +956,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
if (ordered >= delalloc)
- used += fs_info->delayed_refs_rsv.reserved +
- fs_info->delayed_block_rsv.reserved;
+ used += btrfs_block_rsv_reserved(&fs_info->delayed_refs_rsv) +
+ btrfs_block_rsv_reserved(&fs_info->delayed_block_rsv);
else
used += space_info->bytes_may_use - global_rsv_size;
@@ -1173,7 +1173,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
enum btrfs_flush_state flush;
u64 delalloc_size = 0;
u64 to_reclaim, block_rsv_size;
- u64 global_rsv_size = global_rsv->reserved;
+ const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv);
loops++;
@@ -1185,9 +1185,9 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* assume it's tied up in delalloc reservations.
*/
block_rsv_size = global_rsv_size +
- delayed_block_rsv->reserved +
- delayed_refs_rsv->reserved +
- trans_rsv->reserved;
+ btrfs_block_rsv_reserved(delayed_block_rsv) +
+ btrfs_block_rsv_reserved(delayed_refs_rsv) +
+ btrfs_block_rsv_reserved(trans_rsv);
if (block_rsv_size < space_info->bytes_may_use)
delalloc_size = space_info->bytes_may_use - block_rsv_size;
@@ -1207,16 +1207,16 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
to_reclaim = delalloc_size;
flush = FLUSH_DELALLOC;
} else if (space_info->bytes_pinned >
- (delayed_block_rsv->reserved +
- delayed_refs_rsv->reserved)) {
+ (btrfs_block_rsv_reserved(delayed_block_rsv) +
+ btrfs_block_rsv_reserved(delayed_refs_rsv))) {
to_reclaim = space_info->bytes_pinned;
flush = COMMIT_TRANS;
- } else if (delayed_block_rsv->reserved >
- delayed_refs_rsv->reserved) {
- to_reclaim = delayed_block_rsv->reserved;
+ } else if (btrfs_block_rsv_reserved(delayed_block_rsv) >
+ btrfs_block_rsv_reserved(delayed_refs_rsv)) {
+ to_reclaim = btrfs_block_rsv_reserved(delayed_block_rsv);
flush = FLUSH_DELAYED_ITEMS_NR;
} else {
- to_reclaim = delayed_refs_rsv->reserved;
+ to_reclaim = btrfs_block_rsv_reserved(delayed_refs_rsv);
flush = FLUSH_DELAYED_REFS_NR;
}
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 93511d54abf8..0e49dab8dad2 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -475,7 +475,8 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- folio_start_writeback(folio);
+ if (!folio_test_writeback(folio))
+ folio_start_writeback(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 896acfda1789..101f786963d4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1457,6 +1457,14 @@ static int btrfs_reconfigure(struct fs_context *fc)
btrfs_info_to_ctx(fs_info, &old_ctx);
+ /*
+ * This is our "bind mount" trick, we don't want to allow the user to do
+ * anything other than mount a different ro/rw and a different subvol,
+ * all of the mount options should be maintained.
+ */
+ if (mount_reconfigure)
+ ctx->mount_opt = old_ctx.mount_opt;
+
sync_filesystem(sb);
set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5b3333ceef04..bf8e64c766b6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -564,56 +564,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
u64 num_bytes,
u64 *delayed_refs_bytes)
{
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
- u64 extra_delayed_refs_bytes = 0;
- u64 bytes;
+ u64 bytes = num_bytes + *delayed_refs_bytes;
int ret;
/*
- * If there's a gap between the size of the delayed refs reserve and
- * its reserved space, than some tasks have added delayed refs or bumped
- * its size otherwise (due to block group creation or removal, or block
- * group item update). Also try to allocate that gap in order to prevent
- * using (and possibly abusing) the global reserve when committing the
- * transaction.
- */
- if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- !btrfs_block_rsv_full(delayed_refs_rsv)) {
- spin_lock(&delayed_refs_rsv->lock);
- if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
- extra_delayed_refs_bytes = delayed_refs_rsv->size -
- delayed_refs_rsv->reserved;
- spin_unlock(&delayed_refs_rsv->lock);
- }
-
- bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
-
- /*
* We want to reserve all the bytes we may need all at once, so we only
* do 1 enospc flushing cycle per transaction start.
*/
ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
- if (ret == 0) {
- if (extra_delayed_refs_bytes > 0)
- btrfs_migrate_to_delayed_refs_rsv(fs_info,
- extra_delayed_refs_bytes);
- return 0;
- }
-
- if (extra_delayed_refs_bytes > 0) {
- bytes -= extra_delayed_refs_bytes;
- ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
- if (ret == 0)
- return 0;
- }
/*
* If we are an emergency flush, which can steal from the global block
* reserve, then attempt to not reserve space for the delayed refs, as
* we will consume space for them from the global block reserve.
*/
- if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
bytes -= *delayed_refs_bytes;
*delayed_refs_bytes = 0;
ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
@@ -1868,7 +1834,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
key.offset = (u64)-1;
- pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
+ pending->snap = btrfs_get_new_fs_root(fs_info, objectid, &pending->anon_dev);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
pending->snap = NULL;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 50fdc69fdddf..6eccf8496486 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1436,7 +1436,7 @@ static int check_extent_item(struct extent_buffer *leaf,
if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
- ptr, inline_type, end);
+ ptr, btrfs_extent_inline_ref_size(inline_type), end);
return -EUCLEAN;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4c32497311d2..e180da4cc227 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -468,39 +468,39 @@ static noinline struct btrfs_fs_devices *find_fsid(
static int
btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
- int flush, struct bdev_handle **bdev_handle,
+ int flush, struct file **bdev_file,
struct btrfs_super_block **disk_super)
{
struct block_device *bdev;
int ret;
- *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL);
+ *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL);
- if (IS_ERR(*bdev_handle)) {
- ret = PTR_ERR(*bdev_handle);
+ if (IS_ERR(*bdev_file)) {
+ ret = PTR_ERR(*bdev_file);
goto error;
}
- bdev = (*bdev_handle)->bdev;
+ bdev = file_bdev(*bdev_file);
if (flush)
sync_blockdev(bdev);
ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
- bdev_release(*bdev_handle);
+ fput(*bdev_file);
goto error;
}
invalidate_bdev(bdev);
*disk_super = btrfs_read_dev_super(bdev);
if (IS_ERR(*disk_super)) {
ret = PTR_ERR(*disk_super);
- bdev_release(*bdev_handle);
+ fput(*bdev_file);
goto error;
}
return 0;
error:
- *bdev_handle = NULL;
+ *bdev_file = NULL;
return ret;
}
@@ -643,7 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, blk_mode_t flags,
void *holder)
{
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct btrfs_super_block *disk_super;
u64 devid;
int ret;
@@ -654,7 +654,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
return -EINVAL;
ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev_handle, &disk_super);
+ &bdev_file, &disk_super);
if (ret)
return ret;
@@ -678,20 +678,20 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
fs_devices->seeding = true;
} else {
- if (bdev_read_only(bdev_handle->bdev))
+ if (bdev_read_only(file_bdev(bdev_file)))
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
else
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- if (!bdev_nonrot(bdev_handle->bdev))
+ if (!bdev_nonrot(file_bdev(bdev_file)))
fs_devices->rotating = true;
- if (bdev_max_discard_sectors(bdev_handle->bdev))
+ if (bdev_max_discard_sectors(file_bdev(bdev_file)))
fs_devices->discardable = true;
- device->bdev_handle = bdev_handle;
- device->bdev = bdev_handle->bdev;
+ device->bdev_file = bdev_file;
+ device->bdev = file_bdev(bdev_file);
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
fs_devices->open_devices++;
@@ -706,7 +706,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
error_free_page:
btrfs_release_disk_super(disk_super);
- bdev_release(bdev_handle);
+ fput(bdev_file);
return -EINVAL;
}
@@ -1015,10 +1015,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
continue;
- if (device->bdev_handle) {
- bdev_release(device->bdev_handle);
+ if (device->bdev_file) {
+ fput(device->bdev_file);
device->bdev = NULL;
- device->bdev_handle = NULL;
+ device->bdev_file = NULL;
fs_devices->open_devices--;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -1063,7 +1063,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
invalidate_bdev(device->bdev);
}
- bdev_release(device->bdev_handle);
+ fput(device->bdev_file);
}
static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1316,7 +1316,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
struct btrfs_super_block *disk_super;
bool new_device_added = false;
struct btrfs_device *device = NULL;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
u64 bytenr, bytenr_orig;
int ret;
@@ -1339,18 +1339,18 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
* values temporarily, as the device paths of the fsid are the only
* required information for assembling the volume.
*/
- bdev_handle = bdev_open_by_path(path, flags, NULL, NULL);
- if (IS_ERR(bdev_handle))
- return ERR_CAST(bdev_handle);
+ bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL);
+ if (IS_ERR(bdev_file))
+ return ERR_CAST(bdev_file);
bytenr_orig = btrfs_sb_offset(0);
- ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr);
+ ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr);
if (ret) {
device = ERR_PTR(ret);
goto error_bdev_put;
}
- disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr,
+ disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr,
bytenr_orig);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
@@ -1381,7 +1381,7 @@ free_disk_super:
btrfs_release_disk_super(disk_super);
error_bdev_put:
- bdev_release(bdev_handle);
+ fput(bdev_file);
return device;
}
@@ -2057,7 +2057,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct bdev_handle **bdev_handle)
+ struct file **bdev_file)
{
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
@@ -2166,7 +2166,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
btrfs_assign_next_active_device(device, NULL);
- if (device->bdev_handle) {
+ if (device->bdev_file) {
cur_devices->open_devices--;
/* remove sysfs entry */
btrfs_sysfs_remove_device(device);
@@ -2182,9 +2182,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
* free the device.
*
* We cannot call btrfs_close_bdev() here because we're holding the sb
- * write lock, and bdev_release() will pull in the ->open_mutex on
- * the block device and it's dependencies. Instead just flush the
- * device and let the caller do the final bdev_release.
+ * write lock, and fput() on the block device will pull in the
+ * ->open_mutex on the block device and it's dependencies. Instead
+ * just flush the device and let the caller do the final bdev_release.
*/
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
btrfs_scratch_superblocks(fs_info, device->bdev,
@@ -2195,7 +2195,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
}
}
- *bdev_handle = device->bdev_handle;
+ *bdev_file = device->bdev_file;
synchronize_rcu();
btrfs_free_device(device);
@@ -2332,7 +2332,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
const char *path)
{
struct btrfs_super_block *disk_super;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
int ret;
if (!path || !path[0])
@@ -2350,7 +2350,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
}
ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
- &bdev_handle, &disk_super);
+ &bdev_file, &disk_super);
if (ret) {
btrfs_put_dev_args_from_path(args);
return ret;
@@ -2363,7 +2363,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
else
memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- bdev_release(bdev_handle);
+ fput(bdev_file);
return 0;
}
@@ -2583,7 +2583,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct super_block *sb = fs_info->sb;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *seed_devices = NULL;
@@ -2596,12 +2596,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
- bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev_handle))
- return PTR_ERR(bdev_handle);
+ if (IS_ERR(bdev_file))
+ return PTR_ERR(bdev_file);
- if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) {
+ if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) {
ret = -EINVAL;
goto error;
}
@@ -2613,11 +2613,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
locked = true;
}
- sync_blockdev(bdev_handle->bdev);
+ sync_blockdev(file_bdev(bdev_file));
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
- if (device->bdev == bdev_handle->bdev) {
+ if (device->bdev == file_bdev(bdev_file)) {
ret = -EEXIST;
rcu_read_unlock();
goto error;
@@ -2633,8 +2633,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
device->fs_info = fs_info;
- device->bdev_handle = bdev_handle;
- device->bdev = bdev_handle->bdev;
+ device->bdev_file = bdev_file;
+ device->bdev = file_bdev(bdev_file);
ret = lookup_bdev(device_path, &device->devt);
if (ret)
goto error_free_device;
@@ -2817,7 +2817,7 @@ error_free_zone:
error_free_device:
btrfs_free_device(device);
error:
- bdev_release(bdev_handle);
+ fput(bdev_file);
if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
@@ -3087,7 +3087,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
map = btrfs_find_chunk_map(fs_info, logical, length);
if (unlikely(!map)) {
- read_unlock(&fs_info->mapping_tree_lock);
btrfs_crit(fs_info,
"unable to find chunk map for logical %llu length %llu",
logical, length);
@@ -3095,7 +3094,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
}
if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) {
- read_unlock(&fs_info->mapping_tree_lock);
btrfs_crit(fs_info,
"found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
logical, logical + length, map->start,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 53f87f398da7..a11854912d53 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -90,7 +90,7 @@ struct btrfs_device {
u64 generation;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct block_device *bdev;
struct btrfs_zoned_device_info *zone_info;
@@ -661,7 +661,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct bdev_handle **bdev_handle);
+ struct file **bdev_file);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 36cf1f0e338e..8da66ea699e8 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -354,18 +354,13 @@ done:
}
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
int wbits = MAX_WBITS;
- unsigned long bytes_left;
- unsigned long total_out = 0;
- unsigned long pg_offset = 0;
-
- destlen = min_t(unsigned long, destlen, PAGE_SIZE);
- bytes_left = destlen;
+ unsigned long to_copy;
workspace->strm.next_in = data_in;
workspace->strm.avail_in = srclen;
@@ -390,60 +385,30 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
return -EIO;
}
- while (bytes_left > 0) {
- unsigned long buf_start;
- unsigned long buf_offset;
- unsigned long bytes;
-
- ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
- if (ret != Z_OK && ret != Z_STREAM_END)
- break;
-
- buf_start = total_out;
- total_out = workspace->strm.total_out;
-
- if (total_out == buf_start) {
- ret = -EIO;
- break;
- }
-
- if (total_out <= start_byte)
- goto next;
-
- if (total_out > start_byte && buf_start < start_byte)
- buf_offset = start_byte - buf_start;
- else
- buf_offset = 0;
-
- bytes = min(PAGE_SIZE - pg_offset,
- PAGE_SIZE - (buf_offset % PAGE_SIZE));
- bytes = min(bytes, bytes_left);
+ /*
+ * Everything (in/out buf) should be at most one sector, there should
+ * be no need to switch any input/output buffer.
+ */
+ ret = zlib_inflate(&workspace->strm, Z_FINISH);
+ to_copy = min(workspace->strm.total_out, destlen);
+ if (ret != Z_STREAM_END)
+ goto out;
- memcpy_to_page(dest_page, pg_offset,
- workspace->buf + buf_offset, bytes);
+ memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy);
- pg_offset += bytes;
- bytes_left -= bytes;
-next:
- workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = workspace->buf_size;
- }
-
- if (ret != Z_STREAM_END && bytes_left != 0)
+out:
+ if (unlikely(to_copy != destlen)) {
+ pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n",
+ to_copy, destlen);
ret = -EIO;
- else
+ } else {
ret = 0;
+ }
zlib_inflateEnd(&workspace->strm);
- /*
- * this should only happen if zlib returned fewer bytes than we
- * expected. btrfs_get_block is responsible for zeroing from the
- * end of the inline extent (destlen) to the end of the page
- */
- if (pg_offset < destlen) {
- memzero_page(dest_page, pg_offset, destlen - pg_offset);
- }
+ if (unlikely(to_copy < destlen))
+ memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy);
return ret;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 5bd76813b23f..aea51fd850cd 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -824,11 +824,14 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset = &zones[1];
if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
+ unsigned int nofs_flags;
+
ASSERT(sb_zone_is_full(reset));
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- reset->start, reset->len,
- GFP_NOFS);
+ reset->start, reset->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -974,11 +977,14 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
* explicit ZONE_FINISH is not necessary.
*/
if (zone->wp != zone->start + zone->capacity) {
+ unsigned int nofs_flags;
int ret;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev,
REQ_OP_ZONE_FINISH, zone->start,
- zone->len, GFP_NOFS);
+ zone->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
}
@@ -996,11 +1002,13 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
+ unsigned int nofs_flags;
sector_t zone_sectors;
sector_t nr_sectors;
u8 zone_sectors_shift;
u32 sb_zone;
u32 nr_zones;
+ int ret;
zone_sectors = bdev_zone_sectors(bdev);
zone_sectors_shift = ilog2(zone_sectors);
@@ -1011,9 +1019,12 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
if (sb_zone + 1 >= nr_zones)
return -ENOENT;
- return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- zone_start_sector(sb_zone, bdev),
- zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ zone_start_sector(sb_zone, bdev),
+ zone_sectors * BTRFS_NR_SB_LOG_ZONES);
+ memalloc_nofs_restore(nofs_flags);
+ return ret;
}
/*
@@ -1124,12 +1135,14 @@ static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
u64 length, u64 *bytes)
{
+ unsigned int nofs_flags;
int ret;
*bytes = 0;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
- physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
- GFP_NOFS);
+ physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -1639,6 +1652,15 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
out:
+ /* Reject non SINGLE data profiles without RST */
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
+ (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+ !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
if (cache->alloc_offset > cache->zone_capacity) {
btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
@@ -1670,6 +1692,7 @@ out:
}
bitmap_free(active);
kfree(zone_info);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2055,6 +2078,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
map = block_group->physical_map;
+ spin_lock(&fs_info->zone_active_bgs_lock);
spin_lock(&block_group->lock);
if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
ret = true;
@@ -2067,7 +2091,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
- spin_lock(&fs_info->zone_active_bgs_lock);
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_zoned_device_info *zinfo;
int reserved = 0;
@@ -2087,20 +2110,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
*/
if (atomic_read(&zinfo->active_zones_left) <= reserved) {
ret = false;
- spin_unlock(&fs_info->zone_active_bgs_lock);
goto out_unlock;
}
if (!btrfs_dev_set_active_zone(device, physical)) {
/* Cannot activate the zone */
ret = false;
- spin_unlock(&fs_info->zone_active_bgs_lock);
goto out_unlock;
}
if (!is_data)
zinfo->reserved_active_zones--;
}
- spin_unlock(&fs_info->zone_active_bgs_lock);
/* Successfully activated all the zones */
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
@@ -2108,8 +2128,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
/* For the active block group list */
btrfs_get_block_group(block_group);
-
- spin_lock(&fs_info->zone_active_bgs_lock);
list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
spin_unlock(&fs_info->zone_active_bgs_lock);
@@ -2117,6 +2135,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
out_unlock:
spin_unlock(&block_group->lock);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
return ret;
}
@@ -2238,14 +2257,16 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
struct btrfs_device *device = map->stripes[i].dev;
const u64 physical = map->stripes[i].physical;
struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ unsigned int nofs_flags;
if (zinfo->max_active_zones == 0)
continue;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
physical >> SECTOR_SHIFT,
- zinfo->zone_size >> SECTOR_SHIFT,
- GFP_NOFS);
+ zinfo->zone_size >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
diff --git a/fs/buffer.c b/fs/buffer.c
index d3bcf601d3e5..4f73d23c2c46 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -55,7 +55,7 @@
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
- struct writeback_control *wbc);
+ enum rw_hint hint, struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -464,7 +464,7 @@ EXPORT_SYMBOL(mark_buffer_async_write);
* a successful fsync(). For example, ext2 indirect blocks need to be
* written back and waited upon before fsync() returns.
*
- * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
+ * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
* inode_has_buffers() and invalidate_inode_buffers() are provided for the
* management of a list of dependent buffers at ->i_mapping->i_private_list.
*
@@ -1889,7 +1889,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
+ submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
+ inode->i_write_hint, wbc);
nr_underway++;
}
bh = next;
@@ -1944,7 +1945,8 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
+ submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
+ inode->i_write_hint, wbc);
nr_underway++;
}
bh = next;
@@ -2756,6 +2758,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
}
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+ enum rw_hint write_hint,
struct writeback_control *wbc)
{
const enum req_op op = opf & REQ_OP_MASK;
@@ -2783,6 +2786,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_write_hint = write_hint;
__bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
@@ -2802,7 +2806,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
void submit_bh(blk_opf_t opf, struct buffer_head *bh)
{
- submit_bh_wbc(opf, bh, NULL);
+ submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
}
EXPORT_SYMBOL(submit_bh);
@@ -3121,12 +3125,8 @@ void __init buffer_init(void)
unsigned long nrpages;
int ret;
- bh_cachep = kmem_cache_create("buffer_head",
- sizeof(struct buffer_head), 0,
- (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
- SLAB_MEM_SPREAD),
- NULL);
-
+ bh_cachep = KMEM_CACHE(buffer_head,
+ SLAB_RECLAIM_ACCOUNT|SLAB_PANIC);
/*
* Limit the bh occupancy to 10% of ZONE_NORMAL
*/
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
index 8df715640a48..c5a070550ee3 100644
--- a/fs/cachefiles/Kconfig
+++ b/fs/cachefiles/Kconfig
@@ -2,7 +2,7 @@
config CACHEFILES
tristate "Filesystem caching on files"
- depends on FSCACHE && BLOCK
+ depends on NETFS_SUPPORT && FSCACHE && BLOCK
help
This permits use of a mounted filesystem as a cache for other
filesystems - primarily networking filesystems - thus allowing fast
diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c
index 7077f72e6f47..f449f7340aad 100644
--- a/fs/cachefiles/cache.c
+++ b/fs/cachefiles/cache.c
@@ -168,6 +168,8 @@ error_unsupported:
dput(root);
error_open_root:
cachefiles_end_secure(cache, saved_cred);
+ put_cred(cache->cache_cred);
+ cache->cache_cred = NULL;
error_getsec:
fscache_relinquish_cache(cache_cookie);
cache->cache = NULL;
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 3f24905f4066..6465e2574230 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -816,6 +816,7 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
cachefiles_put_directory(cache->graveyard);
cachefiles_put_directory(cache->store);
mntput(cache->mnt);
+ put_cred(cache->cache_cred);
kfree(cache->rootdirname);
kfree(cache->secctx);
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 4a87c9d714a9..d33169f0018b 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -246,7 +246,7 @@ extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres,
enum fscache_want_state want_state);
extern int __cachefiles_prepare_write(struct cachefiles_object *object,
struct file *file,
- loff_t *_start, size_t *_len,
+ loff_t *_start, size_t *_len, size_t upper_len,
bool no_space_allocated_yet);
extern int __cachefiles_write(struct cachefiles_object *object,
struct file *file,
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 5857241c5918..1d685357e67f 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -517,18 +517,26 @@ cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres,
*/
int __cachefiles_prepare_write(struct cachefiles_object *object,
struct file *file,
- loff_t *_start, size_t *_len,
+ loff_t *_start, size_t *_len, size_t upper_len,
bool no_space_allocated_yet)
{
struct cachefiles_cache *cache = object->volume->cache;
loff_t start = *_start, pos;
- size_t len = *_len, down;
+ size_t len = *_len;
int ret;
/* Round to DIO size */
- down = start - round_down(start, PAGE_SIZE);
- *_start = start - down;
- *_len = round_up(down + len, PAGE_SIZE);
+ start = round_down(*_start, PAGE_SIZE);
+ if (start != *_start || *_len > upper_len) {
+ /* Probably asked to cache a streaming write written into the
+ * pagecache when the cookie was temporarily out of service to
+ * culling.
+ */
+ fscache_count_dio_misfit();
+ return -ENOBUFS;
+ }
+
+ *_len = round_up(len, PAGE_SIZE);
/* We need to work out whether there's sufficient disk space to perform
* the write - but we can skip that check if we have space already
@@ -539,7 +547,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
pos = cachefiles_inject_read_error();
if (pos == 0)
- pos = vfs_llseek(file, *_start, SEEK_DATA);
+ pos = vfs_llseek(file, start, SEEK_DATA);
if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
if (pos == -ENXIO)
goto check_space; /* Unallocated tail */
@@ -547,7 +555,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
cachefiles_trace_seek_error);
return pos;
}
- if ((u64)pos >= (u64)*_start + *_len)
+ if ((u64)pos >= (u64)start + *_len)
goto check_space; /* Unallocated region */
/* We have a block that's at least partially filled - if we're low on
@@ -560,13 +568,13 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
pos = cachefiles_inject_read_error();
if (pos == 0)
- pos = vfs_llseek(file, *_start, SEEK_HOLE);
+ pos = vfs_llseek(file, start, SEEK_HOLE);
if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
trace_cachefiles_io_error(object, file_inode(file), pos,
cachefiles_trace_seek_error);
return pos;
}
- if ((u64)pos >= (u64)*_start + *_len)
+ if ((u64)pos >= (u64)start + *_len)
return 0; /* Fully allocated */
/* Partially allocated, but insufficient space: cull. */
@@ -574,7 +582,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
ret = cachefiles_inject_remove_error();
if (ret == 0)
ret = vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
- *_start, *_len);
+ start, *_len);
if (ret < 0) {
trace_cachefiles_io_error(object, file_inode(file), ret,
cachefiles_trace_fallocate_error);
@@ -591,8 +599,8 @@ check_space:
}
static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
- loff_t *_start, size_t *_len, loff_t i_size,
- bool no_space_allocated_yet)
+ loff_t *_start, size_t *_len, size_t upper_len,
+ loff_t i_size, bool no_space_allocated_yet)
{
struct cachefiles_object *object = cachefiles_cres_object(cres);
struct cachefiles_cache *cache = object->volume->cache;
@@ -608,7 +616,7 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
cachefiles_begin_secure(cache, &saved_cred);
ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres),
- _start, _len,
+ _start, _len, upper_len,
no_space_allocated_yet);
cachefiles_end_secure(cache, saved_cred);
return ret;
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index b8fbbb1961bb..4ba42f1fa3b4 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -50,7 +50,7 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb,
return -ENOBUFS;
cachefiles_begin_secure(cache, &saved_cred);
- ret = __cachefiles_prepare_write(object, file, &pos, &len, true);
+ ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true);
cachefiles_end_secure(cache, saved_cred);
if (ret < 0)
return ret;
@@ -539,6 +539,9 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object)
struct fscache_volume *volume = object->volume->vcookie;
size_t volume_key_size, cookie_key_size, data_len;
+ if (!object->ondemand)
+ return 0;
+
/*
* CacheFiles will firstly check the cache file under the root cache
* directory. If the coherency check failed, it will fallback to
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 94df854147d3..7249d70e1a43 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -7,6 +7,7 @@ config CEPH_FS
select CRYPTO_AES
select CRYPTO
select NETFS_SUPPORT
+ select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
default n
help
Choose Y or M here to include support for mounting the
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 13af429ab030..1340d77124ae 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -159,27 +159,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset,
ceph_put_snap_context(snapc);
}
- folio_wait_fscache(folio);
-}
-
-static bool ceph_release_folio(struct folio *folio, gfp_t gfp)
-{
- struct inode *inode = folio->mapping->host;
- struct ceph_client *cl = ceph_inode_to_client(inode);
-
- doutc(cl, "%llx.%llx idx %lu (%sdirty)\n", ceph_vinop(inode),
- folio->index, folio_test_dirty(folio) ? "" : "not ");
-
- if (folio_test_private(folio))
- return false;
-
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
- ceph_fscache_note_page_release(inode);
- return true;
+ netfs_invalidate_folio(folio, offset, length);
}
static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
@@ -357,6 +337,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
u64 len = subreq->len;
bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
u64 off = subreq->start;
+ int extent_cnt;
if (ceph_inode_is_shutdown(inode)) {
err = -EIO;
@@ -370,8 +351,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
- CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
- NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
+ CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
req = NULL;
@@ -379,7 +360,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
}
if (sparse) {
- err = ceph_alloc_sparse_ext_map(&req->r_ops[0]);
+ extent_cnt = __ceph_sparse_read_ext_count(inode, len);
+ err = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt);
if (err)
goto out;
}
@@ -509,7 +491,6 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq)
const struct netfs_request_ops ceph_netfs_ops = {
.init_request = ceph_init_request,
.free_request = ceph_netfs_free_request,
- .begin_cache_operation = ceph_begin_cache_operation,
.issue_read = ceph_netfs_issue_read,
.expand_readahead = ceph_netfs_expand_readahead,
.clamp_length = ceph_netfs_clamp_length,
@@ -1586,7 +1567,7 @@ const struct address_space_operations ceph_aops = {
.write_end = ceph_write_end,
.dirty_folio = ceph_dirty_folio,
.invalidate_folio = ceph_invalidate_folio,
- .release_folio = ceph_release_folio,
+ .release_folio = netfs_release_folio,
.direct_IO = noop_direct_IO,
};
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index dc502daac49a..20efac020394 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -43,38 +43,19 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
}
}
-static inline void ceph_fscache_unpin_writeback(struct inode *inode,
+static inline int ceph_fscache_unpin_writeback(struct inode *inode,
struct writeback_control *wbc)
{
- fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode)));
+ return netfs_unpin_writeback(inode, wbc);
}
-static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
- struct folio *folio)
-{
- struct ceph_inode_info *ci = ceph_inode(mapping->host);
-
- return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci));
-}
-
-static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
-{
- struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode));
-
- return fscache_begin_read_operation(&rreq->cache_resources, cookie);
-}
+#define ceph_fscache_dirty_folio netfs_dirty_folio
static inline bool ceph_is_cache_enabled(struct inode *inode)
{
return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode)));
}
-static inline void ceph_fscache_note_page_release(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- fscache_note_page_release(ceph_fscache_cookie(ci));
-}
#else /* CONFIG_CEPH_FSCACHE */
static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc,
struct fs_context *fc)
@@ -119,30 +100,18 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
{
}
-static inline void ceph_fscache_unpin_writeback(struct inode *inode,
- struct writeback_control *wbc)
+static inline int ceph_fscache_unpin_writeback(struct inode *inode,
+ struct writeback_control *wbc)
{
+ return 0;
}
-static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
- struct folio *folio)
-{
- return filemap_dirty_folio(mapping, folio);
-}
+#define ceph_fscache_dirty_folio filemap_dirty_folio
static inline bool ceph_is_cache_enabled(struct inode *inode)
{
return false;
}
-
-static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
-{
- return -ENOBUFS;
-}
-
-static inline void ceph_fscache_note_page_release(struct inode *inode)
-{
-}
#endif /* CONFIG_CEPH_FSCACHE */
#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2c0b8dc3dd0d..7fb4aae97412 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1452,7 +1452,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
if (flushing & CEPH_CAP_XATTR_EXCL) {
arg->old_xattr_buf = __ceph_build_xattrs_blob(ci);
arg->xattr_version = ci->i_xattrs.version;
- arg->xattr_buf = ci->i_xattrs.blob;
+ arg->xattr_buf = ceph_buffer_get(ci->i_xattrs.blob);
} else {
arg->xattr_buf = NULL;
arg->old_xattr_buf = NULL;
@@ -1553,6 +1553,7 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
encode_cap_msg(msg, arg);
ceph_con_send(&arg->session->s_con, msg);
ceph_buffer_put(arg->old_xattr_buf);
+ ceph_buffer_put(arg->xattr_buf);
if (arg->wake)
wake_up_all(&ci->i_cap_wq);
}
@@ -2155,6 +2156,30 @@ retry:
ceph_cap_string(cap->implemented),
ceph_cap_string(revoking));
+ /* completed revocation? going down and there are no caps? */
+ if (revoking) {
+ if ((revoking & cap_used) == 0) {
+ doutc(cl, "completed revocation of %s\n",
+ ceph_cap_string(cap->implemented & ~cap->issued));
+ goto ack;
+ }
+
+ /*
+ * If the "i_wrbuffer_ref" was increased by mmap or generic
+ * cache write just before the ceph_check_caps() is called,
+ * the Fb capability revoking will fail this time. Then we
+ * must wait for the BDI's delayed work to flush the dirty
+ * pages and to release the "i_wrbuffer_ref", which will cost
+ * at most 5 seconds. That means the MDS needs to wait at
+ * most 5 seconds to finished the Fb capability's revocation.
+ *
+ * Let's queue a writeback for it.
+ */
+ if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
+ (revoking & CEPH_CAP_FILE_BUFFER))
+ queue_writeback = true;
+ }
+
if (cap == ci->i_auth_cap &&
(cap->issued & CEPH_CAP_FILE_WR)) {
/* request larger max_size from MDS? */
@@ -2182,30 +2207,6 @@ retry:
}
}
- /* completed revocation? going down and there are no caps? */
- if (revoking) {
- if ((revoking & cap_used) == 0) {
- doutc(cl, "completed revocation of %s\n",
- ceph_cap_string(cap->implemented & ~cap->issued));
- goto ack;
- }
-
- /*
- * If the "i_wrbuffer_ref" was increased by mmap or generic
- * cache write just before the ceph_check_caps() is called,
- * the Fb capability revoking will fail this time. Then we
- * must wait for the BDI's delayed work to flush the dirty
- * pages and to release the "i_wrbuffer_ref", which will cost
- * at most 5 seconds. That means the MDS needs to wait at
- * most 5 seconds to finished the Fb capability's revocation.
- *
- * Let's queue a writeback for it.
- */
- if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
- (revoking & CEPH_CAP_FILE_BUFFER))
- queue_writeback = true;
- }
-
/* want more caps from mds? */
if (want & ~cap->mds_wanted) {
if (want & ~(cap->mds_wanted | cap->issued))
@@ -3215,7 +3216,6 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
enum put_cap_refs_mode {
PUT_CAP_REFS_SYNC = 0,
- PUT_CAP_REFS_NO_CHECK,
PUT_CAP_REFS_ASYNC,
};
@@ -3331,11 +3331,6 @@ void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had)
__ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC);
}
-void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had)
-{
- __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK);
-}
-
/*
* Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
* context. Adjust per-snap dirty page accounting as appropriate.
@@ -4777,7 +4772,22 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
if (__ceph_caps_dirty(ci)) {
struct ceph_mds_client *mdsc =
ceph_inode_to_fs_client(inode)->mdsc;
- __cap_delay_requeue_front(mdsc, ci);
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
+ ceph_vinop(inode));
+ spin_lock(&mdsc->cap_unlink_delay_lock);
+ ci->i_ceph_flags |= CEPH_I_FLUSH;
+ if (!list_empty(&ci->i_cap_delay_list))
+ list_del_init(&ci->i_cap_delay_list);
+ list_add_tail(&ci->i_cap_delay_list,
+ &mdsc->cap_unlink_delay_list);
+ spin_unlock(&mdsc->cap_unlink_delay_lock);
+
+ /*
+ * Fire the work immediately, because the MDS maybe
+ * waiting for caps release.
+ */
+ ceph_queue_cap_unlink_work(mdsc);
}
}
spin_unlock(&ci->i_ceph_lock);
@@ -4887,13 +4897,15 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
struct inode *dir,
int mds, int drop, int unless)
{
- struct dentry *parent = NULL;
struct ceph_mds_request_release *rel = *p;
struct ceph_dentry_info *di = ceph_dentry(dentry);
struct ceph_client *cl;
int force = 0;
int ret;
+ /* This shouldn't happen */
+ BUG_ON(!dir);
+
/*
* force an record for the directory caps if we have a dentry lease.
* this is racy (can't take i_ceph_lock and d_lock together), but it
@@ -4903,14 +4915,9 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
spin_lock(&dentry->d_lock);
if (di->lease_session && di->lease_session->s_mds == mds)
force = 1;
- if (!dir) {
- parent = dget(dentry->d_parent);
- dir = d_inode(parent);
- }
spin_unlock(&dentry->d_lock);
ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
- dput(parent);
cl = ceph_inode_to_client(dir);
spin_lock(&dentry->d_lock);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 678596684596..0e9f56eaba1e 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1593,10 +1593,12 @@ struct ceph_lease_walk_control {
unsigned long dir_lease_ttl;
};
+static int __dir_lease_check(const struct dentry *, struct ceph_lease_walk_control *);
+static int __dentry_lease_check(const struct dentry *);
+
static unsigned long
__dentry_leases_walk(struct ceph_mds_client *mdsc,
- struct ceph_lease_walk_control *lwc,
- int (*check)(struct dentry*, void*))
+ struct ceph_lease_walk_control *lwc)
{
struct ceph_dentry_info *di, *tmp;
struct dentry *dentry, *last = NULL;
@@ -1624,7 +1626,10 @@ __dentry_leases_walk(struct ceph_mds_client *mdsc,
goto next;
}
- ret = check(dentry, lwc);
+ if (lwc->dir_lease)
+ ret = __dir_lease_check(dentry, lwc);
+ else
+ ret = __dentry_lease_check(dentry);
if (ret & TOUCH) {
/* move it into tail of dir lease list */
__dentry_dir_lease_touch(mdsc, di);
@@ -1681,7 +1686,7 @@ next:
return freed;
}
-static int __dentry_lease_check(struct dentry *dentry, void *arg)
+static int __dentry_lease_check(const struct dentry *dentry)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
int ret;
@@ -1696,9 +1701,9 @@ static int __dentry_lease_check(struct dentry *dentry, void *arg)
return DELETE;
}
-static int __dir_lease_check(struct dentry *dentry, void *arg)
+static int __dir_lease_check(const struct dentry *dentry,
+ struct ceph_lease_walk_control *lwc)
{
- struct ceph_lease_walk_control *lwc = arg;
struct ceph_dentry_info *di = ceph_dentry(dentry);
int ret = __dir_lease_try_check(dentry);
@@ -1737,7 +1742,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc)
lwc.dir_lease = false;
lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2;
- freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check);
+ freed = __dentry_leases_walk(mdsc, &lwc);
if (!lwc.nr_to_scan) /* more invalid leases */
return -EAGAIN;
@@ -1747,7 +1752,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc)
lwc.dir_lease = true;
lwc.expire_dir_lease = freed < count;
lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ;
- freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check);
+ freed +=__dentry_leases_walk(mdsc, &lwc);
if (!lwc.nr_to_scan) /* more to check */
return -EAGAIN;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 726af69d4d62..a79f163ae4ed 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -286,8 +286,6 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
doutc(cl, "%llx.%llx parent %llx hash %x err=%d", vino.ino,
vino.snap, sfh->parent_ino, sfh->hash, err);
}
- if (IS_ERR(inode))
- return ERR_CAST(inode);
/* see comments in ceph_get_parent() */
return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode);
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d380d9dad0e0..abe8028d95bf 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1029,6 +1029,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
struct ceph_osd_req_op *op;
u64 read_off = off;
u64 read_len = len;
+ int extent_cnt;
/* determine new offset/length if encrypted */
ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len);
@@ -1068,7 +1069,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
op = &req->r_ops[0];
if (sparse) {
- ret = ceph_alloc_sparse_ext_map(op);
+ extent_cnt = __ceph_sparse_read_ext_count(inode, read_len);
+ ret = ceph_alloc_sparse_ext_map(op, extent_cnt);
if (ret) {
ceph_osdc_put_request(req);
break;
@@ -1465,6 +1467,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
ssize_t len;
struct ceph_osd_req_op *op;
int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ;
+ int extent_cnt;
if (write)
size = min_t(u64, size, fsc->mount_options->wsize);
@@ -1528,7 +1531,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
op = &req->r_ops[0];
if (sparse) {
- ret = ceph_alloc_sparse_ext_map(op);
+ extent_cnt = __ceph_sparse_read_ext_count(inode, size);
+ ret = ceph_alloc_sparse_ext_map(op, extent_cnt);
if (ret) {
ceph_osdc_put_request(req);
break;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 0679240f06db..7b2e77517f23 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -78,6 +78,8 @@ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
if (!inode)
return ERR_PTR(-ENOMEM);
+ inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
+
if (!S_ISLNK(*mode)) {
err = ceph_pre_init_acls(dir, mode, as_ctx);
if (err < 0)
@@ -574,7 +576,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
doutc(fsc->client, "%p\n", &ci->netfs.inode);
/* Set parameters for the netfs library */
- netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
+ netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false);
spin_lock_init(&ci->i_ceph_lock);
@@ -694,7 +696,7 @@ void ceph_evict_inode(struct inode *inode)
percpu_counter_dec(&mdsc->metric.total_inodes);
truncate_inode_pages_final(&inode->i_data);
- if (inode->i_state & I_PINNING_FSCACHE_WB)
+ if (inode->i_state & I_PINNING_NETFS_WB)
ceph_fscache_unuse_cookie(inode, true);
clear_inode(inode);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index e07ad29ff8b9..ebf4ac0055dd 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -33,7 +33,7 @@ void __init ceph_flock_init(void)
static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
{
- struct inode *inode = file_inode(dst->fl_file);
+ struct inode *inode = file_inode(dst->c.flc_file);
atomic_inc(&ceph_inode(inode)->i_filelock_ref);
dst->fl_u.ceph.inode = igrab(inode);
}
@@ -110,17 +110,18 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
else
length = fl->fl_end - fl->fl_start + 1;
- owner = secure_addr(fl->fl_owner);
+ owner = secure_addr(fl->c.flc_owner);
doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, "
"start: %llu, length: %llu, wait: %d, type: %d\n",
- (int)lock_type, (int)operation, owner, (u64)fl->fl_pid,
- fl->fl_start, length, wait, fl->fl_type);
+ (int)lock_type, (int)operation, owner,
+ (u64) fl->c.flc_pid,
+ fl->fl_start, length, wait, fl->c.flc_type);
req->r_args.filelock_change.rule = lock_type;
req->r_args.filelock_change.type = cmd;
req->r_args.filelock_change.owner = cpu_to_le64(owner);
- req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
+ req->r_args.filelock_change.pid = cpu_to_le64((u64) fl->c.flc_pid);
req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
@@ -130,13 +131,13 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
err = ceph_mdsc_wait_request(mdsc, req, wait ?
ceph_lock_wait_for_completion : NULL);
if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
- fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
+ fl->c.flc_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
- fl->fl_type = F_RDLCK;
+ fl->c.flc_type = F_RDLCK;
else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
- fl->fl_type = F_WRLCK;
+ fl->c.flc_type = F_WRLCK;
else
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
@@ -150,8 +151,8 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
ceph_mdsc_put_request(req);
doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, "
"length: %llu, wait: %d, type: %d, err code %d\n",
- (int)lock_type, (int)operation, (u64)fl->fl_pid,
- fl->fl_start, length, wait, fl->fl_type, err);
+ (int)lock_type, (int)operation, (u64) fl->c.flc_pid,
+ fl->fl_start, length, wait, fl->c.flc_type, err);
return err;
}
@@ -227,10 +228,10 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
static int try_unlock_file(struct file *file, struct file_lock *fl)
{
int err;
- unsigned int orig_flags = fl->fl_flags;
- fl->fl_flags |= FL_EXISTS;
+ unsigned int orig_flags = fl->c.flc_flags;
+ fl->c.flc_flags |= FL_EXISTS;
err = locks_lock_file_wait(file, fl);
- fl->fl_flags = orig_flags;
+ fl->c.flc_flags = orig_flags;
if (err == -ENOENT) {
if (!(orig_flags & FL_EXISTS))
err = 0;
@@ -253,13 +254,13 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
u8 wait = 0;
u8 lock_cmd;
- if (!(fl->fl_flags & FL_POSIX))
+ if (!(fl->c.flc_flags & FL_POSIX))
return -ENOLCK;
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
- doutc(cl, "fl_owner: %p\n", fl->fl_owner);
+ doutc(cl, "fl_owner: %p\n", fl->c.flc_owner);
/* set wait bit as appropriate, then make command as Ceph expects it*/
if (IS_GETLK(cmd))
@@ -273,19 +274,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
}
spin_unlock(&ci->i_ceph_lock);
if (err < 0) {
- if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type)
+ if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl))
posix_lock_file(file, fl, NULL);
return err;
}
- if (F_RDLCK == fl->fl_type)
+ if (lock_is_read(fl))
lock_cmd = CEPH_LOCK_SHARED;
- else if (F_WRLCK == fl->fl_type)
+ else if (lock_is_write(fl))
lock_cmd = CEPH_LOCK_EXCL;
else
lock_cmd = CEPH_LOCK_UNLOCK;
- if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) {
+ if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl)) {
err = try_unlock_file(file, fl);
if (err <= 0)
return err;
@@ -293,7 +294,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
if (!err) {
- if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) {
+ if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->c.flc_type) {
doutc(cl, "locking locally\n");
err = posix_lock_file(file, fl, NULL);
if (err) {
@@ -319,13 +320,13 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
u8 wait = 0;
u8 lock_cmd;
- if (!(fl->fl_flags & FL_FLOCK))
+ if (!(fl->c.flc_flags & FL_FLOCK))
return -ENOLCK;
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
- doutc(cl, "fl_file: %p\n", fl->fl_file);
+ doutc(cl, "fl_file: %p\n", fl->c.flc_file);
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
@@ -333,7 +334,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
}
spin_unlock(&ci->i_ceph_lock);
if (err < 0) {
- if (F_UNLCK == fl->fl_type)
+ if (lock_is_unlock(fl))
locks_lock_file_wait(file, fl);
return err;
}
@@ -341,14 +342,14 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
if (IS_SETLKW(cmd))
wait = 1;
- if (F_RDLCK == fl->fl_type)
+ if (lock_is_read(fl))
lock_cmd = CEPH_LOCK_SHARED;
- else if (F_WRLCK == fl->fl_type)
+ else if (lock_is_write(fl))
lock_cmd = CEPH_LOCK_EXCL;
else
lock_cmd = CEPH_LOCK_UNLOCK;
- if (F_UNLCK == fl->fl_type) {
+ if (lock_is_unlock(fl)) {
err = try_unlock_file(file, fl);
if (err <= 0)
return err;
@@ -356,7 +357,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
inode, lock_cmd, wait, fl);
- if (!err && F_UNLCK != fl->fl_type) {
+ if (!err && F_UNLCK != fl->c.flc_type) {
err = locks_lock_file_wait(file, fl);
if (err) {
ceph_lock_message(CEPH_LOCK_FLOCK,
@@ -385,9 +386,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
ctx = locks_inode_context(inode);
if (ctx) {
spin_lock(&ctx->flc_lock);
- list_for_each_entry(lock, &ctx->flc_posix, fl_list)
+ for_each_file_lock(lock, &ctx->flc_posix)
++(*fcntl_count);
- list_for_each_entry(lock, &ctx->flc_flock, fl_list)
+ for_each_file_lock(lock, &ctx->flc_flock)
++(*flock_count);
spin_unlock(&ctx->flc_lock);
}
@@ -408,10 +409,10 @@ static int lock_to_ceph_filelock(struct inode *inode,
cephlock->start = cpu_to_le64(lock->fl_start);
cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
cephlock->client = cpu_to_le64(0);
- cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
- cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
+ cephlock->pid = cpu_to_le64((u64) lock->c.flc_pid);
+ cephlock->owner = cpu_to_le64(secure_addr(lock->c.flc_owner));
- switch (lock->fl_type) {
+ switch (lock->c.flc_type) {
case F_RDLCK:
cephlock->type = CEPH_LOCK_SHARED;
break;
@@ -422,7 +423,8 @@ static int lock_to_ceph_filelock(struct inode *inode,
cephlock->type = CEPH_LOCK_UNLOCK;
break;
default:
- doutc(cl, "Have unknown lock type %d\n", lock->fl_type);
+ doutc(cl, "Have unknown lock type %d\n",
+ lock->c.flc_type);
err = -EINVAL;
}
@@ -453,7 +455,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
return 0;
spin_lock(&ctx->flc_lock);
- list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
+ for_each_file_lock(lock, &ctx->flc_posix) {
++seen_fcntl;
if (seen_fcntl > num_fcntl_locks) {
err = -ENOSPC;
@@ -464,7 +466,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
goto fail;
++l;
}
- list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
+ for_each_file_lock(lock, &ctx->flc_flock) {
++seen_flock;
if (seen_flock > num_flock_locks) {
err = -ENOSPC;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 02ebfabfc8ee..3ab9c268a8bb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1089,7 +1089,7 @@ void ceph_mdsc_release_request(struct kref *kref)
struct ceph_mds_request *req = container_of(kref,
struct ceph_mds_request,
r_kref);
- ceph_mdsc_release_dir_caps_no_check(req);
+ ceph_mdsc_release_dir_caps_async(req);
destroy_reply_info(&req->r_reply_info);
if (req->r_request)
ceph_msg_put(req->r_request);
@@ -1534,7 +1534,8 @@ static int encode_metric_spec(void **p, void *end)
* session message, specialization for CEPH_SESSION_REQUEST_OPEN
* to include additional client metadata fields.
*/
-static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq)
+static struct ceph_msg *
+create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq)
{
struct ceph_msg *msg;
struct ceph_mds_session_head *h;
@@ -1578,6 +1579,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
size = METRIC_BYTES(count);
extra_bytes += 2 + 4 + 4 + size;
+ /* flags, mds auth caps and oldest_client_tid */
+ extra_bytes += 4 + 4 + 8;
+
/* Allocate the message */
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
GFP_NOFS, false);
@@ -1589,16 +1593,16 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
end = p + msg->front.iov_len;
h = p;
- h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
+ h->op = cpu_to_le32(op);
h->seq = cpu_to_le64(seq);
/*
* Serialize client metadata into waiting buffer space, using
* the format that userspace expects for map<string, string>
*
- * ClientSession messages with metadata are v4
+ * ClientSession messages with metadata are v7
*/
- msg->hdr.version = cpu_to_le16(4);
+ msg->hdr.version = cpu_to_le16(7);
msg->hdr.compat_version = cpu_to_le16(1);
/* The write pointer, following the session_head structure */
@@ -1634,6 +1638,15 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
return ERR_PTR(ret);
}
+ /* version == 5, flags */
+ ceph_encode_32(&p, 0);
+
+ /* version == 6, mds auth caps */
+ ceph_encode_32(&p, 0);
+
+ /* version == 7, oldest_client_tid */
+ ceph_encode_64(&p, mdsc->oldest_tid);
+
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -1663,7 +1676,8 @@ static int __open_session(struct ceph_mds_client *mdsc,
session->s_renew_requested = jiffies;
/* send connect message */
- msg = create_session_open_msg(mdsc, session->s_seq);
+ msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_OPEN,
+ session->s_seq);
if (IS_ERR(msg))
return PTR_ERR(msg);
ceph_con_send(&session->s_con, msg);
@@ -2028,10 +2042,10 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
doutc(cl, "to mds%d (%s)\n", session->s_mds,
ceph_mds_state_name(state));
- msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+ msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_RENEWCAPS,
++session->s_renew_seq);
- if (!msg)
- return -ENOMEM;
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
ceph_con_send(&session->s_con, msg);
return 0;
}
@@ -2470,6 +2484,50 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
}
}
+void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ if (mdsc->stopping)
+ return;
+
+ if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_unlink_work)) {
+ doutc(cl, "caps unlink work queued\n");
+ } else {
+ doutc(cl, "failed to queue caps unlink work\n");
+ }
+}
+
+static void ceph_cap_unlink_work(struct work_struct *work)
+{
+ struct ceph_mds_client *mdsc =
+ container_of(work, struct ceph_mds_client, cap_unlink_work);
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "begin\n");
+ spin_lock(&mdsc->cap_unlink_delay_lock);
+ while (!list_empty(&mdsc->cap_unlink_delay_list)) {
+ struct ceph_inode_info *ci;
+ struct inode *inode;
+
+ ci = list_first_entry(&mdsc->cap_unlink_delay_list,
+ struct ceph_inode_info,
+ i_cap_delay_list);
+ list_del_init(&ci->i_cap_delay_list);
+
+ inode = igrab(&ci->netfs.inode);
+ if (inode) {
+ spin_unlock(&mdsc->cap_unlink_delay_lock);
+ doutc(cl, "on %p %llx.%llx\n", inode,
+ ceph_vinop(inode));
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
+ iput(inode);
+ spin_lock(&mdsc->cap_unlink_delay_lock);
+ }
+ }
+ spin_unlock(&mdsc->cap_unlink_delay_lock);
+ doutc(cl, "done\n");
+}
+
/*
* requests
*/
@@ -4128,12 +4186,12 @@ static void handle_session(struct ceph_mds_session *session,
pr_info_client(cl, "mds%d reconnect success\n",
session->s_mds);
+ session->s_features = features;
if (session->s_state == CEPH_MDS_SESSION_OPEN) {
pr_notice_client(cl, "mds%d is already opened\n",
session->s_mds);
} else {
session->s_state = CEPH_MDS_SESSION_OPEN;
- session->s_features = features;
renewed_caps(mdsc, session, 0);
if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
&session->s_features))
@@ -4247,7 +4305,7 @@ void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
}
}
-void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req)
+void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req)
{
struct ceph_client *cl = req->r_mdsc->fsc->client;
int dcaps;
@@ -4255,8 +4313,7 @@ void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req)
dcaps = xchg(&req->r_dir_caps, 0);
if (dcaps) {
doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
- ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent),
- dcaps);
+ ceph_put_cap_refs_async(ceph_inode(req->r_parent), dcaps);
}
}
@@ -4292,7 +4349,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
if (req->r_session->s_mds != session->s_mds)
continue;
- ceph_mdsc_release_dir_caps_no_check(req);
+ ceph_mdsc_release_dir_caps_async(req);
__send_request(session, req, true);
}
@@ -5346,6 +5403,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
INIT_LIST_HEAD(&mdsc->cap_delay_list);
INIT_LIST_HEAD(&mdsc->cap_wait_list);
spin_lock_init(&mdsc->cap_delay_lock);
+ INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list);
+ spin_lock_init(&mdsc->cap_unlink_delay_lock);
INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock);
mdsc->last_cap_flush_tid = 1;
@@ -5354,6 +5413,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
spin_lock_init(&mdsc->cap_dirty_lock);
init_waitqueue_head(&mdsc->cap_flushing_wq);
INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
+ INIT_WORK(&mdsc->cap_unlink_work, ceph_cap_unlink_work);
err = ceph_metric_init(&mdsc->metric);
if (err)
goto err_mdsmap;
@@ -5627,6 +5687,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
ceph_cleanup_global_and_empty_realms(mdsc);
cancel_work_sync(&mdsc->cap_reclaim_work);
+ cancel_work_sync(&mdsc->cap_unlink_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
doutc(cl, "done\n");
@@ -5870,7 +5931,8 @@ static void mds_peer_reset(struct ceph_connection *con)
pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n",
s->s_mds);
- if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO)
+ if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO &&
+ ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) >= CEPH_MDS_STATE_RECONNECT)
send_mds_reconnect(mdsc, s);
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 2e6ddaa13d72..03f8ff00874f 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -462,6 +462,8 @@ struct ceph_mds_client {
unsigned long last_renew_caps; /* last time we renewed our caps */
struct list_head cap_delay_list; /* caps with delayed release */
spinlock_t cap_delay_lock; /* protects cap_delay_list */
+ struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */
+ spinlock_t cap_unlink_delay_lock; /* protects cap_unlink_delay_list */
struct list_head snap_flush_list; /* cap_snaps ready to flush */
spinlock_t snap_flush_lock;
@@ -475,6 +477,8 @@ struct ceph_mds_client {
struct work_struct cap_reclaim_work;
atomic_t cap_reclaim_pending;
+ struct work_struct cap_unlink_work;
+
/*
* Cap reservations
*
@@ -552,7 +556,7 @@ extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req);
-extern void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req);
+extern void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req);
static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
{
kref_get(&req->r_kref);
@@ -574,6 +578,7 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
+extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc);
extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
int (*cb)(struct inode *, int mds, void *),
void *arg);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index fae97c25ce58..8109aba66e02 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -380,10 +380,11 @@ struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
ceph_decode_skip_8(p, end, bad_ext);
/* required_client_features */
ceph_decode_skip_set(p, end, 64, bad_ext);
+ /* bal_rank_mask */
+ ceph_decode_skip_string(p, end, bad_ext);
+ }
+ if (mdsmap_ev >= 18) {
ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext);
- } else {
- /* This forces the usage of the (sync) SETXATTR Op */
- m->m_max_xattr_size = 0;
}
bad_ext:
doutc(cl, "m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
index 89f1931f1ba6..1f2171dd01bf 100644
--- a/fs/ceph/mdsmap.h
+++ b/fs/ceph/mdsmap.h
@@ -27,7 +27,11 @@ struct ceph_mdsmap {
u32 m_session_timeout; /* seconds */
u32 m_session_autoclose; /* seconds */
u64 m_max_file_size;
- u64 m_max_xattr_size; /* maximum size for xattrs blob */
+ /*
+ * maximum size for xattrs blob.
+ * Zeroed by default to force the usage of the (sync) SETXATTR Op.
+ */
+ u64 m_max_xattr_size;
u32 m_max_mds; /* expected up:active mds number */
u32 m_num_active_mds; /* actual up:active mds number */
u32 possible_max_rank; /* possible max rank index */
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 9d36c3532de1..06ee397e0c3a 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -197,10 +197,10 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
}
/*
- * This function walks through the snaprealm for an inode and returns the
- * ceph_snap_realm for the first snaprealm that has quotas set (max_files,
+ * This function walks through the snaprealm for an inode and set the
+ * realmp with the first snaprealm that has quotas set (max_files,
* max_bytes, or any, depending on the 'which_quota' argument). If the root is
- * reached, return the root ceph_snap_realm instead.
+ * reached, set the realmp with the root ceph_snap_realm instead.
*
* Note that the caller is responsible for calling ceph_put_snap_realm() on the
* returned realm.
@@ -211,10 +211,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
* this function will return -EAGAIN; otherwise, the snaprealms walk-through
* will be restarted.
*/
-static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
- struct inode *inode,
- enum quota_get_realm which_quota,
- bool retry)
+static int get_quota_realm(struct ceph_mds_client *mdsc, struct inode *inode,
+ enum quota_get_realm which_quota,
+ struct ceph_snap_realm **realmp, bool retry)
{
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci = NULL;
@@ -222,8 +221,10 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
struct inode *in;
bool has_quota;
+ if (realmp)
+ *realmp = NULL;
if (ceph_snap(inode) != CEPH_NOSNAP)
- return NULL;
+ return 0;
restart:
realm = ceph_inode(inode)->i_snap_realm;
@@ -250,7 +251,7 @@ restart:
break;
ceph_put_snap_realm(mdsc, realm);
if (!retry)
- return ERR_PTR(-EAGAIN);
+ return -EAGAIN;
goto restart;
}
@@ -259,8 +260,11 @@ restart:
iput(in);
next = realm->parent;
- if (has_quota || !next)
- return realm;
+ if (has_quota || !next) {
+ if (realmp)
+ *realmp = realm;
+ return 0;
+ }
ceph_get_snap_realm(mdsc, next);
ceph_put_snap_realm(mdsc, realm);
@@ -269,7 +273,7 @@ restart:
if (realm)
ceph_put_snap_realm(mdsc, realm);
- return NULL;
+ return 0;
}
bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
@@ -277,6 +281,7 @@ bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb);
struct ceph_snap_realm *old_realm, *new_realm;
bool is_same;
+ int ret;
restart:
/*
@@ -286,9 +291,9 @@ restart:
* dropped and we can then restart the whole operation.
*/
down_read(&mdsc->snap_rwsem);
- old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true);
- new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false);
- if (PTR_ERR(new_realm) == -EAGAIN) {
+ get_quota_realm(mdsc, old, QUOTA_GET_ANY, &old_realm, true);
+ ret = get_quota_realm(mdsc, new, QUOTA_GET_ANY, &new_realm, false);
+ if (ret == -EAGAIN) {
up_read(&mdsc->snap_rwsem);
if (old_realm)
ceph_put_snap_realm(mdsc, old_realm);
@@ -492,8 +497,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
bool is_updated = false;
down_read(&mdsc->snap_rwsem);
- realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root),
- QUOTA_GET_MAX_BYTES, true);
+ get_quota_realm(mdsc, d_inode(fsc->sb->s_root), QUOTA_GET_MAX_BYTES,
+ &realm, true);
up_read(&mdsc->snap_rwsem);
if (!realm)
return false;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index fe0f64a0acb2..b63b4cd9b5b6 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -3,6 +3,7 @@
#define _FS_CEPH_SUPER_H
#include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/osd_client.h>
#include <asm/unaligned.h>
#include <linux/backing-dev.h>
@@ -1254,8 +1255,6 @@ extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps,
extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had);
-extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci,
- int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc);
extern void __ceph_remove_capsnap(struct inode *inode,
@@ -1407,6 +1406,19 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci,
ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota);
}
+static inline int __ceph_sparse_read_ext_count(struct inode *inode, u64 len)
+{
+ int cnt = 0;
+
+ if (IS_ENCRYPTED(inode)) {
+ cnt = len >> CEPH_FSCRYPT_BLOCK_SHIFT;
+ if (cnt > CEPH_SPARSE_EXT_ARRAY_INITIAL)
+ cnt = 0;
+ }
+
+ return cnt;
+}
+
extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 0c7c2528791e..a50356c541f6 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -24,6 +24,8 @@
#include <linux/pid_namespace.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/vmalloc.h>
#include <linux/coda.h>
@@ -87,10 +89,10 @@ void coda_destroy_inodecache(void)
kmem_cache_destroy(coda_inode_cachep);
}
-static int coda_remount(struct super_block *sb, int *flags, char *data)
+static int coda_reconfigure(struct fs_context *fc)
{
- sync_filesystem(sb);
- *flags |= SB_NOATIME;
+ sync_filesystem(fc->root->d_sb);
+ fc->sb_flags |= SB_NOATIME;
return 0;
}
@@ -102,78 +104,102 @@ static const struct super_operations coda_super_operations =
.evict_inode = coda_evict_inode,
.put_super = coda_put_super,
.statfs = coda_statfs,
- .remount_fs = coda_remount,
};
-static int get_device_index(struct coda_mount_data *data)
+struct coda_fs_context {
+ int idx;
+};
+
+enum {
+ Opt_fd,
+};
+
+static const struct fs_parameter_spec coda_param_specs[] = {
+ fsparam_fd ("fd", Opt_fd),
+ {}
+};
+
+static int coda_parse_fd(struct fs_context *fc, int fd)
{
+ struct coda_fs_context *ctx = fc->fs_private;
struct fd f;
struct inode *inode;
int idx;
- if (data == NULL) {
- pr_warn("%s: Bad mount data\n", __func__);
- return -1;
- }
-
- if (data->version != CODA_MOUNT_VERSION) {
- pr_warn("%s: Bad mount version\n", __func__);
- return -1;
- }
-
- f = fdget(data->fd);
+ f = fdget(fd);
if (!f.file)
- goto Ebadf;
+ return -EBADF;
inode = file_inode(f.file);
if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) {
fdput(f);
- goto Ebadf;
+ return invalf(fc, "code: Not coda psdev");
}
idx = iminor(inode);
fdput(f);
- if (idx < 0 || idx >= MAX_CODADEVS) {
- pr_warn("%s: Bad minor number\n", __func__);
- return -1;
+ if (idx < 0 || idx >= MAX_CODADEVS)
+ return invalf(fc, "coda: Bad minor number");
+ ctx->idx = idx;
+ return 0;
+}
+
+static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, coda_param_specs, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_fd:
+ return coda_parse_fd(fc, result.uint_32);
}
- return idx;
-Ebadf:
- pr_warn("%s: Bad file\n", __func__);
- return -1;
+ return 0;
+}
+
+/*
+ * Parse coda's binary mount data form. We ignore any errors and go with index
+ * 0 if we get one for backward compatibility.
+ */
+static int coda_parse_monolithic(struct fs_context *fc, void *_data)
+{
+ struct coda_mount_data *data = _data;
+
+ if (!data)
+ return invalf(fc, "coda: Bad mount data");
+
+ if (data->version != CODA_MOUNT_VERSION)
+ return invalf(fc, "coda: Bad mount version");
+
+ coda_parse_fd(fc, data->fd);
+ return 0;
}
-static int coda_fill_super(struct super_block *sb, void *data, int silent)
+static int coda_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct coda_fs_context *ctx = fc->fs_private;
struct inode *root = NULL;
struct venus_comm *vc;
struct CodaFid fid;
int error;
- int idx;
-
- if (task_active_pid_ns(current) != &init_pid_ns)
- return -EINVAL;
-
- idx = get_device_index((struct coda_mount_data *) data);
- /* Ignore errors in data, for backward compatibility */
- if(idx == -1)
- idx = 0;
-
- pr_info("%s: device index: %i\n", __func__, idx);
+ infof(fc, "coda: device index: %i\n", ctx->idx);
- vc = &coda_comms[idx];
+ vc = &coda_comms[ctx->idx];
mutex_lock(&vc->vc_mutex);
if (!vc->vc_inuse) {
- pr_warn("%s: No pseudo device\n", __func__);
+ errorf(fc, "coda: No pseudo device");
error = -EINVAL;
goto unlock_out;
}
if (vc->vc_sb) {
- pr_warn("%s: Device already mounted\n", __func__);
+ errorf(fc, "coda: Device already mounted");
error = -EBUSY;
goto unlock_out;
}
@@ -313,18 +339,45 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-/* init_coda: used by filesystems.c to register coda */
+static int coda_get_tree(struct fs_context *fc)
+{
+ if (task_active_pid_ns(current) != &init_pid_ns)
+ return -EINVAL;
-static struct dentry *coda_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+ return get_tree_nodev(fc, coda_fill_super);
+}
+
+static void coda_free_fc(struct fs_context *fc)
{
- return mount_nodev(fs_type, flags, data, coda_fill_super);
+ kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations coda_context_ops = {
+ .free = coda_free_fc,
+ .parse_param = coda_parse_param,
+ .parse_monolithic = coda_parse_monolithic,
+ .get_tree = coda_get_tree,
+ .reconfigure = coda_reconfigure,
+};
+
+static int coda_init_fs_context(struct fs_context *fc)
+{
+ struct coda_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct coda_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ fc->fs_private = ctx;
+ fc->ops = &coda_context_ops;
+ return 0;
}
struct file_system_type coda_fs_type = {
.owner = THIS_MODULE,
.name = "coda",
- .mount = coda_mount,
+ .init_fs_context = coda_init_fs_context,
+ .parameters = coda_param_specs,
.kill_sb = kill_anon_super,
.fs_flags = FS_BINARY_MOUNTDATA,
};
diff --git a/fs/coredump.c b/fs/coredump.c
index f258c17c1841..be6403b4b14b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -872,6 +872,9 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
loff_t pos;
ssize_t n;
+ if (!page)
+ return 0;
+
if (cprm->to_skip) {
if (!__dump_skip(cprm, cprm->to_skip))
return 0;
@@ -884,7 +887,6 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
pos = file->f_pos;
bvec_set_page(&bvec, page, PAGE_SIZE, 0);
iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
- iov_iter_set_copy_mc(&iter);
n = __kernel_write_iter(cprm->file, &iter, &pos);
if (n != PAGE_SIZE)
return 0;
@@ -895,10 +897,44 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
return 1;
}
+/*
+ * If we might get machine checks from kernel accesses during the
+ * core dump, let's get those errors early rather than during the
+ * IO. This is not performance-critical enough to warrant having
+ * all the machine check logic in the iovec paths.
+ */
+#ifdef copy_mc_to_kernel
+
+#define dump_page_alloc() alloc_page(GFP_KERNEL)
+#define dump_page_free(x) __free_page(x)
+static struct page *dump_page_copy(struct page *src, struct page *dst)
+{
+ void *buf = kmap_local_page(src);
+ size_t left = copy_mc_to_kernel(page_address(dst), buf, PAGE_SIZE);
+ kunmap_local(buf);
+ return left ? NULL : dst;
+}
+
+#else
+
+/* We just want to return non-NULL; it's never used. */
+#define dump_page_alloc() ERR_PTR(-EINVAL)
+#define dump_page_free(x) ((void)(x))
+static inline struct page *dump_page_copy(struct page *src, struct page *dst)
+{
+ return src;
+}
+#endif
+
int dump_user_range(struct coredump_params *cprm, unsigned long start,
unsigned long len)
{
unsigned long addr;
+ struct page *dump_page;
+
+ dump_page = dump_page_alloc();
+ if (!dump_page)
+ return 0;
for (addr = start; addr < start + len; addr += PAGE_SIZE) {
struct page *page;
@@ -912,14 +948,17 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
*/
page = get_dump_page(addr);
if (page) {
- int stop = !dump_emit_page(cprm, page);
+ int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page));
put_page(page);
- if (stop)
+ if (stop) {
+ dump_page_free(dump_page);
return 0;
+ }
} else {
dump_skip(cprm, PAGE_SIZE);
}
}
+ dump_page_free(dump_page);
return 1;
}
#endif
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 60dbfa0f8805..39e75131fd5a 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb)
sb->s_mtd = NULL;
} else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) {
sync_blockdev(sb->s_bdev);
- bdev_release(sb->s_bdev_handle);
+ fput(sb->s_bdev_file);
}
kfree(sbi);
}
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 7b3fc189593a..0ad52fbe51c9 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -74,13 +74,7 @@ struct fscrypt_nokey_name {
static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
{
- if (str->len == 1 && str->name[0] == '.')
- return true;
-
- if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
- return true;
-
- return false;
+ return is_dot_dotdot(str->name, str->len);
}
/**
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 52504dd478d3..104771c3d3f6 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -102,11 +102,8 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
if (err && err != -ENOENT)
return err;
- if (fname->is_nokey_name) {
- spin_lock(&dentry->d_lock);
- dentry->d_flags |= DCACHE_NOKEY_NAME;
- spin_unlock(&dentry->d_lock);
- }
+ fscrypt_prepare_dentry(dentry, fname->is_nokey_name);
+
return err;
}
EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);
@@ -131,12 +128,10 @@ EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);
int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry)
{
int err = fscrypt_get_encryption_info(dir, true);
+ bool is_nokey_name = (!err && !fscrypt_has_encryption_key(dir));
+
+ fscrypt_prepare_dentry(dentry, is_nokey_name);
- if (!err && !fscrypt_has_encryption_key(dir)) {
- spin_lock(&dentry->d_lock);
- dentry->d_flags |= DCACHE_NOKEY_NAME;
- spin_unlock(&dentry->d_lock);
- }
return err;
}
EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial);
diff --git a/fs/dcache.c b/fs/dcache.c
index b813528fb147..71a8e943a0fa 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3061,7 +3061,10 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
if (d_unhashed(dentry) || !dentry->d_inode)
return D_WALK_SKIP;
- dentry->d_lockref.count--;
+ if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
+ dentry->d_flags |= DCACHE_GENOCIDE;
+ dentry->d_lockref.count--;
+ }
}
return D_WALK_CONTINUE;
}
@@ -3136,7 +3139,7 @@ static void __init dcache_init(void)
* of the dcache.
*/
dentry_cache = KMEM_CACHE_USERCOPY(dentry,
- SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
+ SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
d_iname);
/* Hash may have been set up in dcache_init_early */
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 60456263a338..62c97ff9e852 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -410,6 +410,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
bio->bi_end_io = dio_bio_end_io;
if (dio->is_pinned)
bio_set_flag(bio, BIO_PAGE_PINNED);
+ bio->bi_write_hint = file_inode(dio->iocb->ki_filp)->i_write_hint;
+
sdio->bio = bio;
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index d814c5121367..9ca83ef70ed1 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -138,14 +138,14 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
}
op->info.optype = DLM_PLOCK_OP_LOCK;
- op->info.pid = fl->fl_pid;
- op->info.ex = (fl->fl_type == F_WRLCK);
- op->info.wait = !!(fl->fl_flags & FL_SLEEP);
+ op->info.pid = fl->c.flc_pid;
+ op->info.ex = lock_is_write(fl);
+ op->info.wait = !!(fl->c.flc_flags & FL_SLEEP);
op->info.fsid = ls->ls_global_id;
op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
- op->info.owner = (__u64)(long)fl->fl_owner;
+ op->info.owner = (__u64)(long) fl->c.flc_owner;
/* async handling */
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
@@ -258,7 +258,7 @@ static int dlm_plock_callback(struct plock_op *op)
}
/* got fs lock; bookkeep locally as well: */
- flc->fl_flags &= ~FL_SLEEP;
+ flc->c.flc_flags &= ~FL_SLEEP;
if (posix_lock_file(file, flc, NULL)) {
/*
* This can only happen in the case of kmalloc() failure.
@@ -291,7 +291,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
struct dlm_ls *ls;
struct plock_op *op;
int rv;
- unsigned char fl_flags = fl->fl_flags;
+ unsigned char saved_flags = fl->c.flc_flags;
ls = dlm_find_lockspace_local(lockspace);
if (!ls)
@@ -304,7 +304,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
}
/* cause the vfs unlock to return ENOENT if lock is not found */
- fl->fl_flags |= FL_EXISTS;
+ fl->c.flc_flags |= FL_EXISTS;
rv = locks_lock_file_wait(file, fl);
if (rv == -ENOENT) {
@@ -317,14 +317,14 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
}
op->info.optype = DLM_PLOCK_OP_UNLOCK;
- op->info.pid = fl->fl_pid;
+ op->info.pid = fl->c.flc_pid;
op->info.fsid = ls->ls_global_id;
op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
- op->info.owner = (__u64)(long)fl->fl_owner;
+ op->info.owner = (__u64)(long) fl->c.flc_owner;
- if (fl->fl_flags & FL_CLOSE) {
+ if (fl->c.flc_flags & FL_CLOSE) {
op->info.flags |= DLM_PLOCK_FL_CLOSE;
send_op(op);
rv = 0;
@@ -345,7 +345,7 @@ out_free:
dlm_release_plock_op(op);
out:
dlm_put_lockspace(ls);
- fl->fl_flags = fl_flags;
+ fl->c.flc_flags = saved_flags;
return rv;
}
EXPORT_SYMBOL_GPL(dlm_posix_unlock);
@@ -375,14 +375,14 @@ int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file,
return -EINVAL;
memset(&info, 0, sizeof(info));
- info.pid = fl->fl_pid;
- info.ex = (fl->fl_type == F_WRLCK);
+ info.pid = fl->c.flc_pid;
+ info.ex = lock_is_write(fl);
info.fsid = ls->ls_global_id;
dlm_put_lockspace(ls);
info.number = number;
info.start = fl->fl_start;
info.end = fl->fl_end;
- info.owner = (__u64)(long)fl->fl_owner;
+ info.owner = (__u64)(long) fl->c.flc_owner;
rv = do_lock_cancel(&info);
switch (rv) {
@@ -437,13 +437,13 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
}
op->info.optype = DLM_PLOCK_OP_GET;
- op->info.pid = fl->fl_pid;
- op->info.ex = (fl->fl_type == F_WRLCK);
+ op->info.pid = fl->c.flc_pid;
+ op->info.ex = lock_is_write(fl);
op->info.fsid = ls->ls_global_id;
op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
- op->info.owner = (__u64)(long)fl->fl_owner;
+ op->info.owner = (__u64)(long) fl->c.flc_owner;
send_op(op);
wait_event(recv_wq, (op->done != 0));
@@ -455,16 +455,16 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
rv = op->info.rv;
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
if (rv == -ENOENT)
rv = 0;
else if (rv > 0) {
locks_init_lock(fl);
- fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
- fl->fl_flags = FL_POSIX;
- fl->fl_pid = op->info.pid;
+ fl->c.flc_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
+ fl->c.flc_flags = FL_POSIX;
+ fl->c.flc_pid = op->info.pid;
if (op->info.nodeid != dlm_our_nodeid())
- fl->fl_pid = -fl->fl_pid;
+ fl->c.flc_pid = -fl->c.flc_pid;
fl->fl_start = op->info.start;
fl->fl_end = op->info.end;
rv = 0;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 03bd55069d86..2fe0f3af1a08 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1949,16 +1949,6 @@ out:
return rc;
}
-static bool is_dot_dotdot(const char *name, size_t name_size)
-{
- if (name_size == 1 && name[0] == '.')
- return true;
- else if (name_size == 2 && name[0] == '.' && name[1] == '.')
- return true;
-
- return false;
-}
-
/**
* ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
* @plaintext_name: The plaintext name
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index 169252e6dc46..f7206158ee81 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -38,7 +38,7 @@ struct efivar_entry {
int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *,
struct list_head *),
- void *data, bool duplicates, struct list_head *head);
+ void *data, struct list_head *head);
int efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 6038dd39367a..bb14462f6d99 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -343,12 +343,7 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
- err = efivar_init(efivarfs_callback, (void *)sb, true,
- &sfi->efivarfs_list);
- if (err)
- efivar_entry_iter(efivarfs_destroy, &sfi->efivarfs_list, NULL);
-
- return err;
+ return efivar_init(efivarfs_callback, sb, &sfi->efivarfs_list);
}
static int efivarfs_get_tree(struct fs_context *fc)
diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c
index 114ff0fd4e55..4d722af1014f 100644
--- a/fs/efivarfs/vars.c
+++ b/fs/efivarfs/vars.c
@@ -361,7 +361,6 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid,
* efivar_init - build the initial list of EFI variables
* @func: callback function to invoke for every variable
* @data: function-specific data to pass to @func
- * @duplicates: error if we encounter duplicates on @head?
* @head: initialised head of variable list
*
* Get every EFI variable from the firmware and invoke @func. @func
@@ -371,9 +370,9 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid,
*/
int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *,
struct list_head *),
- void *data, bool duplicates, struct list_head *head)
+ void *data, struct list_head *head)
{
- unsigned long variable_name_size = 1024;
+ unsigned long variable_name_size = 512;
efi_char16_t *variable_name;
efi_status_t status;
efi_guid_t vendor_guid;
@@ -390,12 +389,13 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *,
goto free;
/*
- * Per EFI spec, the maximum storage allocated for both
- * the variable name and variable data is 1024 bytes.
+ * A small set of old UEFI implementations reject sizes
+ * above a certain threshold, the lowest seen in the wild
+ * is 512.
*/
do {
- variable_name_size = 1024;
+ variable_name_size = 512;
status = efivar_get_next_variable(&variable_name_size,
variable_name,
@@ -413,8 +413,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *,
* we'll ever see a different variable name,
* and may end up looping here forever.
*/
- if (duplicates &&
- variable_is_present(variable_name, &vendor_guid,
+ if (variable_is_present(variable_name, &vendor_guid,
head)) {
dup_variable_bug(variable_name, &vendor_guid,
variable_name_size);
@@ -432,9 +431,13 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *,
break;
case EFI_NOT_FOUND:
break;
+ case EFI_BUFFER_TOO_SMALL:
+ pr_warn("efivars: Variable name size exceeds maximum (%lu > 512)\n",
+ variable_name_size);
+ status = EFI_NOT_FOUND;
+ break;
default:
- printk(KERN_WARNING "efivars: get_next_variable: status=%lx\n",
- status);
+ pr_warn("efivars: get_next_variable: status=%lx\n", status);
status = EFI_NOT_FOUND;
break;
}
diff --git a/fs/efs/super.c b/fs/efs/super.c
index f17fdac76b2e..e4421c10caeb 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -14,19 +14,14 @@
#include <linux/buffer_head.h>
#include <linux/vfs.h>
#include <linux/blkdev.h>
-
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include "efs.h"
#include <linux/efs_vh.h>
#include <linux/efs_fs_sb.h>
static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
-static int efs_fill_super(struct super_block *s, void *d, int silent);
-
-static struct dentry *efs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
-}
+static int efs_init_fs_context(struct fs_context *fc);
static void efs_kill_sb(struct super_block *s)
{
@@ -35,15 +30,6 @@ static void efs_kill_sb(struct super_block *s)
kfree(sbi);
}
-static struct file_system_type efs_fs_type = {
- .owner = THIS_MODULE,
- .name = "efs",
- .mount = efs_mount,
- .kill_sb = efs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("efs");
-
static struct pt_types sgi_pt_types[] = {
{0x00, "SGI vh"},
{0x01, "SGI trkrepl"},
@@ -63,6 +49,27 @@ static struct pt_types sgi_pt_types[] = {
{0, NULL}
};
+enum {
+ Opt_explicit_open,
+};
+
+static const struct fs_parameter_spec efs_param_spec[] = {
+ fsparam_flag ("explicit-open", Opt_explicit_open),
+ {}
+};
+
+/*
+ * File system definition and registration.
+ */
+static struct file_system_type efs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "efs",
+ .kill_sb = efs_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = efs_init_fs_context,
+ .parameters = efs_param_spec,
+};
+MODULE_ALIAS_FS("efs");
static struct kmem_cache * efs_inode_cachep;
@@ -91,8 +98,8 @@ static int __init init_inodecache(void)
{
efs_inode_cachep = kmem_cache_create("efs_inode_cache",
sizeof(struct efs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
- SLAB_ACCOUNT, init_once);
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+ init_once);
if (efs_inode_cachep == NULL)
return -ENOMEM;
return 0;
@@ -108,18 +115,10 @@ static void destroy_inodecache(void)
kmem_cache_destroy(efs_inode_cachep);
}
-static int efs_remount(struct super_block *sb, int *flags, char *data)
-{
- sync_filesystem(sb);
- *flags |= SB_RDONLY;
- return 0;
-}
-
static const struct super_operations efs_superblock_operations = {
.alloc_inode = efs_alloc_inode,
.free_inode = efs_free_inode,
.statfs = efs_statfs,
- .remount_fs = efs_remount,
};
static const struct export_operations efs_export_ops = {
@@ -249,26 +248,26 @@ static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) {
return 0;
}
-static int efs_fill_super(struct super_block *s, void *d, int silent)
+static int efs_fill_super(struct super_block *s, struct fs_context *fc)
{
struct efs_sb_info *sb;
struct buffer_head *bh;
struct inode *root;
- sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL);
+ sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL);
if (!sb)
return -ENOMEM;
s->s_fs_info = sb;
s->s_time_min = 0;
s->s_time_max = U32_MAX;
-
+
s->s_magic = EFS_SUPER_MAGIC;
if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) {
pr_err("device does not support %d byte blocks\n",
EFS_BLOCKSIZE);
return -EINVAL;
}
-
+
/* read the vh (volume header) block */
bh = sb_bread(s, 0);
@@ -294,7 +293,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
pr_err("cannot read superblock\n");
return -EIO;
}
-
+
if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) {
#ifdef DEBUG
pr_warn("invalid superblock at block %u\n",
@@ -328,6 +327,61 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
return 0;
}
+static void efs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->fs_private);
+}
+
+static int efs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, efs_fill_super);
+}
+
+static int efs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ int token;
+ struct fs_parse_result result;
+
+ token = fs_parse(fc, efs_param_spec, param, &result);
+ if (token < 0)
+ return token;
+ return 0;
+}
+
+static int efs_reconfigure(struct fs_context *fc)
+{
+ sync_filesystem(fc->root->d_sb);
+
+ return 0;
+}
+
+struct efs_context {
+ unsigned long s_mount_opts;
+};
+
+static const struct fs_context_operations efs_context_opts = {
+ .parse_param = efs_parse_param,
+ .get_tree = efs_get_tree,
+ .reconfigure = efs_reconfigure,
+ .free = efs_free_fc,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int efs_init_fs_context(struct fs_context *fc)
+{
+ struct efs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct efs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ fc->fs_private = ctx;
+ fc->ops = &efs_context_opts;
+
+ return 0;
+}
+
static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
struct super_block *sb = dentry->d_sb;
struct efs_sb_info *sbi = SUPER_INFO(sb);
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 1d318f85232d..fffd3919343e 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -114,8 +114,11 @@ config EROFS_FS_ZIP_DEFLATE
config EROFS_FS_ONDEMAND
bool "EROFS fscache-based on-demand read support"
- depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y)
- default n
+ depends on EROFS_FS
+ select NETFS_SUPPORT
+ select FSCACHE
+ select CACHEFILES
+ select CACHEFILES_ONDEMAND
help
This permits EROFS to use fscache-backed data blobs with on-demand
read support.
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 279933e007d2..7cc5841577b2 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -11,13 +11,12 @@
struct z_erofs_decompress_req {
struct super_block *sb;
struct page **in, **out;
-
unsigned short pageofs_in, pageofs_out;
unsigned int inputsize, outputsize;
- /* indicate the algorithm will be used for decompression */
- unsigned int alg;
+ unsigned int alg; /* the algorithm for decompression */
bool inplace_io, partial_decoding, fillgaps;
+ gfp_t gfp; /* allocation flags for extra temporary buffers */
};
struct z_erofs_decompressor {
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index c98aeda8abb2..52524bd9698b 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -220,7 +220,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
up_read(&devs->rwsem);
return 0;
}
- map->m_bdev = dif->bdev_handle ? dif->bdev_handle->bdev : NULL;
+ map->m_bdev = dif->bdev_file ? file_bdev(dif->bdev_file) : NULL;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
map->m_fscache = dif->fscache;
@@ -238,8 +238,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
if (map->m_pa >= startoff &&
map->m_pa < startoff + length) {
map->m_pa -= startoff;
- map->m_bdev = dif->bdev_handle ?
- dif->bdev_handle->bdev : NULL;
+ map->m_bdev = dif->bdev_file ?
+ file_bdev(dif->bdev_file) : NULL;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
map->m_fscache = dif->fscache;
@@ -447,5 +447,6 @@ const struct file_operations erofs_file_fops = {
.llseek = generic_file_llseek,
.read_iter = erofs_file_read_iter,
.mmap = erofs_file_mmap,
+ .get_unmapped_area = thp_get_unmapped_area,
.splice_read = filemap_splice_read,
};
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 1d65b9f60a39..2ec9b2bb628d 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -111,8 +111,9 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
victim = availables[--top];
get_page(victim);
} else {
- victim = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ victim = erofs_allocpage(pagepool, rq->gfp);
+ if (!victim)
+ return -ENOMEM;
set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
}
rq->out[i] = victim;
@@ -322,7 +323,8 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt;
u8 *kin;
- DBG_BUGON(rq->outputsize > rq->inputsize);
+ if (rq->outputsize > rq->inputsize)
+ return -EOPNOTSUPP;
if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) {
cur = bs - (rq->pageofs_out & (bs - 1));
pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK;
@@ -408,7 +410,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
int size, ret = 0;
if (!erofs_sb_has_compr_cfgs(sbi)) {
- sbi->available_compr_algs = Z_EROFS_COMPRESSION_LZ4;
+ sbi->available_compr_algs = 1 << Z_EROFS_COMPRESSION_LZ4;
return z_erofs_load_lz4_config(sb, dsb, NULL, 0);
}
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index 4a64a9c91dd3..b98872058abe 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -95,7 +95,7 @@ int z_erofs_load_deflate_config(struct super_block *sb,
}
int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+ struct page **pgpl)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -158,8 +158,12 @@ again:
strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs);
outsz -= strm->z.avail_out;
if (!rq->out[no]) {
- rq->out[no] = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ rq->out[no] = erofs_allocpage(pgpl, rq->gfp);
+ if (!rq->out[no]) {
+ kout = NULL;
+ err = -ENOMEM;
+ break;
+ }
set_page_private(rq->out[no],
Z_EROFS_SHORTLIVED_PAGE);
}
@@ -211,8 +215,11 @@ again:
DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb),
rq->in[j]));
- tmppage = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ tmppage = erofs_allocpage(pgpl, rq->gfp);
+ if (!tmppage) {
+ err = -ENOMEM;
+ goto failed;
+ }
set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
copy_highpage(tmppage, rq->in[j]);
rq->in[j] = tmppage;
@@ -230,7 +237,7 @@ again:
break;
}
}
-
+failed:
if (zlib_inflateEnd(&strm->z) != Z_OK && !err)
err = -EIO;
if (kout)
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 2dd14f99c1dc..6ca357d83cfa 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -148,7 +148,7 @@ again:
}
int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+ struct page **pgpl)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -215,8 +215,11 @@ again:
PAGE_SIZE - pageofs);
outlen -= strm->buf.out_size;
if (!rq->out[no] && rq->fillgaps) { /* deduped */
- rq->out[no] = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ rq->out[no] = erofs_allocpage(pgpl, rq->gfp);
+ if (!rq->out[no]) {
+ err = -ENOMEM;
+ break;
+ }
set_page_private(rq->out[no],
Z_EROFS_SHORTLIVED_PAGE);
}
@@ -258,8 +261,11 @@ again:
DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
rq->in[j]));
- tmppage = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ tmppage = erofs_allocpage(pgpl, rq->gfp);
+ if (!tmppage) {
+ err = -ENOMEM;
+ goto failed;
+ }
set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
copy_highpage(tmppage, rq->in[j]);
rq->in[j] = tmppage;
@@ -277,6 +283,7 @@ again:
break;
}
}
+failed:
if (no < nrpages_out && strm->buf.out)
kunmap(rq->out[no]);
if (ni < nrpages_in)
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 87ff35bff8d5..89a7c2453aae 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -165,10 +165,10 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
{
int ret;
- struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private;
+ struct erofs_fscache *ctx = folio->mapping->host->i_private;
struct erofs_fscache_request *req;
- req = erofs_fscache_req_alloc(folio_mapping(folio),
+ req = erofs_fscache_req_alloc(folio->mapping,
folio_pos(folio), folio_size(folio));
if (IS_ERR(req)) {
folio_unlock(folio);
@@ -276,7 +276,7 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
struct erofs_fscache_request *req;
int ret;
- req = erofs_fscache_req_alloc(folio_mapping(folio),
+ req = erofs_fscache_req_alloc(folio->mapping,
folio_pos(folio), folio_size(folio));
if (IS_ERR(req)) {
folio_unlock(folio);
@@ -381,11 +381,12 @@ static int erofs_fscache_init_domain(struct super_block *sb)
goto out;
if (!erofs_pseudo_mnt) {
- erofs_pseudo_mnt = kern_mount(&erofs_fs_type);
- if (IS_ERR(erofs_pseudo_mnt)) {
- err = PTR_ERR(erofs_pseudo_mnt);
+ struct vfsmount *mnt = kern_mount(&erofs_fs_type);
+ if (IS_ERR(mnt)) {
+ err = PTR_ERR(mnt);
goto out;
}
+ erofs_pseudo_mnt = mnt;
}
domain->volume = sbi->volume;
@@ -459,7 +460,7 @@ static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &erofs_fscache_meta_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
inode->i_blkbits = EROFS_SB(sb)->blkszbits;
inode->i_private = ctx;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 3d616dea55dc..36e638e8b53a 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -60,7 +60,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
} else {
const unsigned int gotten = sb->s_blocksize - *ofs;
- copied = kmalloc(vi->inode_isize, GFP_NOFS);
+ copied = kmalloc(vi->inode_isize, GFP_KERNEL);
if (!copied) {
err = -ENOMEM;
goto err_out;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index b0409badb017..0f0706325b7b 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -49,7 +49,7 @@ typedef u32 erofs_blk_t;
struct erofs_device_info {
char *path;
struct erofs_fscache *fscache;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct dax_device *dax_dev;
u64 dax_part_off;
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index d4f631d39f0f..f0110a78acb2 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -130,24 +130,24 @@ static void *erofs_find_target_block(struct erofs_buf *target,
/* string comparison without already matched prefix */
diff = erofs_dirnamecmp(name, &dname, &matched);
- if (!diff) {
- *_ndirents = 0;
- goto out;
- } else if (diff > 0) {
- head = mid + 1;
- startprfx = matched;
-
- if (!IS_ERR(candidate))
- erofs_put_metabuf(target);
- *target = buf;
- candidate = de;
- *_ndirents = ndirents;
- } else {
+ if (diff < 0) {
erofs_put_metabuf(&buf);
-
back = mid - 1;
endprfx = matched;
+ continue;
+ }
+
+ if (!IS_ERR(candidate))
+ erofs_put_metabuf(target);
+ *target = buf;
+ if (!diff) {
+ *_ndirents = 0;
+ return de;
}
+ head = mid + 1;
+ startprfx = matched;
+ candidate = de;
+ *_ndirents = ndirents;
continue;
}
out: /* free if the candidate is valid */
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 5f60f163bd56..9b4b66dcdd4f 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -177,7 +177,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_fscache *fscache;
struct erofs_deviceslot *dis;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
void *ptr;
ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP);
@@ -201,12 +201,12 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
return PTR_ERR(fscache);
dif->fscache = fscache;
} else if (!sbi->devs->flatdev) {
- bdev_handle = bdev_open_by_path(dif->path, BLK_OPEN_READ,
+ bdev_file = bdev_file_open_by_path(dif->path, BLK_OPEN_READ,
sb->s_type, NULL);
- if (IS_ERR(bdev_handle))
- return PTR_ERR(bdev_handle);
- dif->bdev_handle = bdev_handle;
- dif->dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev,
+ if (IS_ERR(bdev_file))
+ return PTR_ERR(bdev_file);
+ dif->bdev_file = bdev_file;
+ dif->dax_dev = fs_dax_get_by_bdev(file_bdev(bdev_file),
&dif->dax_part_off, NULL, NULL);
}
@@ -754,8 +754,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
struct erofs_device_info *dif = ptr;
fs_put_dax(dif->dax_dev, NULL);
- if (dif->bdev_handle)
- bdev_release(dif->bdev_handle);
+ if (dif->bdev_file)
+ fput(dif->bdev_file);
erofs_fscache_unregister_cookie(dif->fscache);
dif->fscache = NULL;
kfree(dif->path);
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 5dea308764b4..e146d09151af 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -81,7 +81,7 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
repeat:
xa_lock(&sbi->managed_pslots);
pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
- NULL, grp, GFP_NOFS);
+ NULL, grp, GFP_KERNEL);
if (pre) {
if (xa_is_err(pre)) {
pre = ERR_PTR(xa_err(pre));
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 692c0c39be63..ff0aa72b0db3 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -82,6 +82,9 @@ struct z_erofs_pcluster {
/* L: indicate several pageofs_outs or not */
bool multibases;
+ /* L: whether extra buffer allocations are best-effort */
+ bool besteffort;
+
/* A: compressed bvecs (can be cached or inplaced pages) */
struct z_erofs_bvec compressed_bvecs[];
};
@@ -230,7 +233,7 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
struct page *nextpage = *candidate_bvpage;
if (!nextpage) {
- nextpage = erofs_allocpage(pagepool, GFP_NOFS);
+ nextpage = erofs_allocpage(pagepool, GFP_KERNEL);
if (!nextpage)
return -ENOMEM;
set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
@@ -302,7 +305,7 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size)
if (nrpages > pcs->maxpages)
continue;
- pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
+ pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL);
if (!pcl)
return ERR_PTR(-ENOMEM);
pcl->pclustersize = size;
@@ -563,21 +566,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
unsigned int i;
- if (i_blocksize(fe->inode) != PAGE_SIZE)
- return;
- if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
+ if (i_blocksize(fe->inode) != PAGE_SIZE ||
+ fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
return;
for (i = 0; i < pclusterpages; ++i) {
struct page *page, *newpage;
void *t; /* mark pages just found for debugging */
- /* the compressed page was loaded before */
+ /* Inaccurate check w/o locking to avoid unneeded lookups */
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
page = find_get_page(mc, pcl->obj.index + i);
-
if (page) {
t = (void *)((unsigned long)page | 1);
newpage = NULL;
@@ -597,9 +598,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
t = (void *)((unsigned long)newpage | 1);
}
-
- if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
+ spin_lock(&pcl->obj.lockref.lock);
+ if (!pcl->compressed_bvecs[i].page) {
+ pcl->compressed_bvecs[i].page = t;
+ spin_unlock(&pcl->obj.lockref.lock);
continue;
+ }
+ spin_unlock(&pcl->obj.lockref.lock);
if (page)
put_page(page);
@@ -694,7 +699,7 @@ static void z_erofs_cache_invalidate_folio(struct folio *folio,
DBG_BUGON(stop > folio_size(folio) || stop < length);
if (offset == 0 && stop == folio_size(folio))
- while (!z_erofs_cache_release_folio(folio, GFP_NOFS))
+ while (!z_erofs_cache_release_folio(folio, 0))
cond_resched();
}
@@ -713,36 +718,30 @@ int erofs_init_managed_cache(struct super_block *sb)
set_nlink(inode, 1);
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &z_erofs_cache_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
EROFS_SB(sb)->managed_cache = inode;
return 0;
}
-static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
- struct z_erofs_bvec *bvec)
-{
- struct z_erofs_pcluster *const pcl = fe->pcl;
-
- while (fe->icur > 0) {
- if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
- NULL, bvec->page)) {
- pcl->compressed_bvecs[fe->icur] = *bvec;
- return true;
- }
- }
- return false;
-}
-
/* callers must be with pcluster lock held */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
struct z_erofs_bvec *bvec, bool exclusive)
{
+ struct z_erofs_pcluster *pcl = fe->pcl;
int ret;
if (exclusive) {
/* give priority for inplaceio to use file pages first */
- if (z_erofs_try_inplace_io(fe, bvec))
+ spin_lock(&pcl->obj.lockref.lock);
+ while (fe->icur > 0) {
+ if (pcl->compressed_bvecs[--fe->icur].page)
+ continue;
+ pcl->compressed_bvecs[fe->icur] = *bvec;
+ spin_unlock(&pcl->obj.lockref.lock);
return 0;
+ }
+ spin_unlock(&pcl->obj.lockref.lock);
+
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
!fe->candidate_bvpage)
@@ -964,7 +963,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page)
+ struct page *page, bool ra)
{
struct inode *const inode = fe->inode;
struct erofs_map_blocks *const map = &fe->map;
@@ -1014,6 +1013,7 @@ repeat:
err = z_erofs_pcluster_begin(fe);
if (err)
goto out;
+ fe->pcl->besteffort |= !ra;
}
/*
@@ -1280,6 +1280,9 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
.inplace_io = overlapped,
.partial_decoding = pcl->partial,
.fillgaps = pcl->multibases,
+ .gfp = pcl->besteffort ?
+ GFP_KERNEL | __GFP_NOFAIL :
+ GFP_NOWAIT | __GFP_NORETRY
}, be->pagepool);
/* must handle all compressed pages before actual file pages */
@@ -1322,6 +1325,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
pcl->length = 0;
pcl->partial = true;
pcl->multibases = false;
+ pcl->besteffort = false;
pcl->bvset.nextpage = NULL;
pcl->vcnt = 0;
@@ -1423,23 +1427,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
{
gfp_t gfp = mapping_gfp_mask(mc);
bool tocache = false;
- struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
+ struct z_erofs_bvec zbv;
struct address_space *mapping;
- struct page *page, *oldpage;
+ struct page *page;
int justfound, bs = i_blocksize(f->inode);
/* Except for inplace pages, the entire page can be used for I/Os */
bvec->bv_offset = 0;
bvec->bv_len = PAGE_SIZE;
repeat:
- oldpage = READ_ONCE(zbv->page);
- if (!oldpage)
+ spin_lock(&pcl->obj.lockref.lock);
+ zbv = pcl->compressed_bvecs[nr];
+ page = zbv.page;
+ justfound = (unsigned long)page & 1UL;
+ page = (struct page *)((unsigned long)page & ~1UL);
+ pcl->compressed_bvecs[nr].page = page;
+ spin_unlock(&pcl->obj.lockref.lock);
+ if (!page)
goto out_allocpage;
- justfound = (unsigned long)oldpage & 1UL;
- page = (struct page *)((unsigned long)oldpage & ~1UL);
bvec->bv_page = page;
-
DBG_BUGON(z_erofs_is_shortlived_page(page));
/*
* Handle preallocated cached pages. We tried to allocate such pages
@@ -1448,7 +1455,6 @@ repeat:
*/
if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
set_page_private(page, 0);
- WRITE_ONCE(zbv->page, page);
tocache = true;
goto out_tocache;
}
@@ -1459,9 +1465,9 @@ repeat:
* therefore it is impossible for `mapping` to be NULL.
*/
if (mapping && mapping != mc) {
- if (zbv->offset < 0)
- bvec->bv_offset = round_up(-zbv->offset, bs);
- bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
+ if (zbv.offset < 0)
+ bvec->bv_offset = round_up(-zbv.offset, bs);
+ bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
return;
}
@@ -1471,7 +1477,6 @@ repeat:
/* the cached page is still in managed cache */
if (page->mapping == mc) {
- WRITE_ONCE(zbv->page, page);
/*
* The cached page is still available but without a valid
* `->private` pcluster hint. Let's reconnect them.
@@ -1503,11 +1508,15 @@ repeat:
put_page(page);
out_allocpage:
page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
- if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
+ spin_lock(&pcl->obj.lockref.lock);
+ if (pcl->compressed_bvecs[nr].page) {
erofs_pagepool_add(&f->pagepool, page);
+ spin_unlock(&pcl->obj.lockref.lock);
cond_resched();
goto repeat;
}
+ pcl->compressed_bvecs[nr].page = page;
+ spin_unlock(&pcl->obj.lockref.lock);
bvec->bv_page = page;
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
@@ -1685,6 +1694,7 @@ submit_bio_retry:
if (cur + bvec.bv_len > end)
bvec.bv_len = end - cur;
+ DBG_BUGON(bvec.bv_len < sb->s_blocksize);
if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
bvec.bv_offset))
goto submit_bio_retry;
@@ -1785,7 +1795,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
if (PageUptodate(page))
unlock_page(page);
else
- (void)z_erofs_do_read_page(f, page);
+ (void)z_erofs_do_read_page(f, page, !!rac);
put_page(page);
}
@@ -1806,7 +1816,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
z_erofs_pcluster_readmore(&f, NULL, true);
- err = z_erofs_do_read_page(&f, &folio->page);
+ err = z_erofs_do_read_page(&f, &folio->page, false);
z_erofs_pcluster_readmore(&f, NULL, false);
z_erofs_pcluster_end(&f);
@@ -1847,7 +1857,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
folio = head;
head = folio_get_private(folio);
- err = z_erofs_do_read_page(&f, &folio->page);
+ err = z_erofs_do_read_page(&f, &folio->page, true);
if (err && err != -EINTR)
erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
folio->index, EROFS_I(inode)->nid);
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 9753875e41cb..e313c936351d 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -454,7 +454,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
.map = map,
};
int err = 0;
- unsigned int lclusterbits, endoff;
+ unsigned int lclusterbits, endoff, afmt;
unsigned long initial_lcn;
unsigned long long ofs, end;
@@ -543,17 +543,20 @@ static int z_erofs_do_map_blocks(struct inode *inode,
err = -EFSCORRUPTED;
goto unmap_out;
}
- if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
- map->m_algorithmformat =
- Z_EROFS_COMPRESSION_INTERLACED;
- else
- map->m_algorithmformat =
- Z_EROFS_COMPRESSION_SHIFTED;
- } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
- map->m_algorithmformat = vi->z_algorithmtype[1];
+ afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ?
+ Z_EROFS_COMPRESSION_INTERLACED :
+ Z_EROFS_COMPRESSION_SHIFTED;
} else {
- map->m_algorithmformat = vi->z_algorithmtype[0];
+ afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
+ vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
+ if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
+ erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ afmt, vi->nid);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
}
+ map->m_algorithmformat = afmt;
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
diff --git a/fs/eventfd.c b/fs/eventfd.c
index ad8186d47ba7..9afdb722fa92 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -251,7 +251,7 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
ssize_t res;
__u64 ucnt;
- if (count < sizeof(ucnt))
+ if (count != sizeof(ucnt))
return -EINVAL;
if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
return -EFAULT;
@@ -283,13 +283,18 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
struct eventfd_ctx *ctx = f->private_data;
+ __u64 cnt;
spin_lock_irq(&ctx->wqh.lock);
- seq_printf(m, "eventfd-count: %16llx\n",
- (unsigned long long)ctx->count);
+ cnt = ctx->count;
spin_unlock_irq(&ctx->wqh.lock);
- seq_printf(m, "eventfd-id: %d\n", ctx->id);
- seq_printf(m, "eventfd-semaphore: %d\n",
+
+ seq_printf(m,
+ "eventfd-count: %16llx\n"
+ "eventfd-id: %d\n"
+ "eventfd-semaphore: %d\n",
+ cnt,
+ ctx->id,
!!(ctx->flags & EFD_SEMAPHORE));
}
#endif
@@ -383,6 +388,7 @@ static int do_eventfd(unsigned int count, int flags)
/* Check the EFD_* constants for consistency. */
BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
+ BUILD_BUG_ON(EFD_SEMAPHORE != (1 << 0));
if (flags & ~EFD_FLAGS_SET)
return -EINVAL;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3534d36a1474..39ac6fdf8bca 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -206,7 +206,7 @@ struct eventpoll {
*/
struct epitem *ovflist;
- /* wakeup_source used when ep_scan_ready_list is running */
+ /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */
struct wakeup_source *ws;
/* The user that created the eventpoll descriptor */
@@ -678,12 +678,6 @@ static void ep_done_scan(struct eventpoll *ep,
write_unlock_irq(&ep->lock);
}
-static void epi_rcu_free(struct rcu_head *head)
-{
- struct epitem *epi = container_of(head, struct epitem, rcu);
- kmem_cache_free(epi_cache, epi);
-}
-
static void ep_get(struct eventpoll *ep)
{
refcount_inc(&ep->refcount);
@@ -767,7 +761,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
* ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
* use of the rbn field.
*/
- call_rcu(&epi->rcu, epi_rcu_free);
+ kfree_rcu(epi, rcu);
percpu_counter_dec(&ep->user->epoll_watches);
return ep_refcount_dec_and_test(ep);
@@ -1153,7 +1147,7 @@ static inline bool chain_epi_lockless(struct epitem *epi)
* This callback takes a read lock in order not to contend with concurrent
* events from another file descriptor, thus all modifications to ->rdllist
* or ->ovflist are lockless. Read lock is paired with the write lock from
- * ep_scan_ready_list(), which stops all list modifications and guarantees
+ * ep_start/done_scan(), which stops all list modifications and guarantees
* that lists state is seen correctly.
*
* Another thing worth to mention is that ep_poll_callback() can be called
@@ -1751,7 +1745,7 @@ static int ep_send_events(struct eventpoll *ep,
* availability. At this point, no one can insert
* into ep->rdllist besides us. The epoll_ctl()
* callers are locked out by
- * ep_scan_ready_list() holding "mtx" and the
+ * ep_send_events() holding "mtx" and the
* poll callback will queue them in ep->ovflist.
*/
list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1904,7 +1898,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
__set_current_state(TASK_INTERRUPTIBLE);
/*
- * Do the final check under the lock. ep_scan_ready_list()
+ * Do the final check under the lock. ep_start/done_scan()
* plays with two lists (->rdllist and ->ovflist) and there
* is always a race when both lists are empty for short
* period of time although events are pending, so lock is
diff --git a/fs/exec.c b/fs/exec.c
index 73e4045df271..ece3ab0998e1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -128,7 +128,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
struct filename *tmp = getname(library);
int error = PTR_ERR(tmp);
static const struct open_flags uselib_flags = {
- .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+ .open_flag = O_LARGEFILE | O_RDONLY,
.acc_mode = MAY_READ | MAY_EXEC,
.intent = LOOKUP_OPEN,
.lookup_flags = LOOKUP_FOLLOW,
@@ -904,6 +904,10 @@ EXPORT_SYMBOL(transfer_args_to_stack);
#endif /* CONFIG_MMU */
+/*
+ * On success, caller must call do_close_execat() on the returned
+ * struct file to close it.
+ */
static struct file *do_open_execat(int fd, struct filename *name, int flags)
{
struct file *file;
@@ -948,6 +952,17 @@ exit:
return ERR_PTR(err);
}
+/**
+ * open_exec - Open a path name for execution
+ *
+ * @name: path name to open with the intent of executing it.
+ *
+ * Returns ERR_PTR on failure or allocated struct file on success.
+ *
+ * As this is a wrapper for the internal do_open_execat(), callers
+ * must call allow_write_access() before fput() on release. Also see
+ * do_close_execat().
+ */
struct file *open_exec(const char *name)
{
struct filename *filename = getname_kernel(name);
@@ -1143,7 +1158,6 @@ static int de_thread(struct task_struct *tsk)
BUG_ON(leader->exit_state != EXIT_ZOMBIE);
leader->exit_state = EXIT_DEAD;
-
/*
* We are going to release_task()->ptrace_unlink() silently,
* the tracer can sleep in do_wait(). EXIT_DEAD guarantees
@@ -1409,6 +1423,9 @@ int begin_new_exec(struct linux_binprm * bprm)
out_unlock:
up_write(&me->signal->exec_update_lock);
+ if (!bprm->cred)
+ mutex_unlock(&me->signal->cred_guard_mutex);
+
out:
return retval;
}
@@ -1484,6 +1501,15 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
return -ENOMEM;
}
+/* Matches do_open_execat() */
+static void do_close_execat(struct file *file)
+{
+ if (!file)
+ return;
+ allow_write_access(file);
+ fput(file);
+}
+
static void free_bprm(struct linux_binprm *bprm)
{
if (bprm->mm) {
@@ -1495,10 +1521,7 @@ static void free_bprm(struct linux_binprm *bprm)
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
- if (bprm->file) {
- allow_write_access(bprm->file);
- fput(bprm->file);
- }
+ do_close_execat(bprm->file);
if (bprm->executable)
fput(bprm->executable);
/* If a binfmt changed the interp, free it. */
@@ -1508,12 +1531,23 @@ static void free_bprm(struct linux_binprm *bprm)
kfree(bprm);
}
-static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
+static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags)
{
- struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ struct linux_binprm *bprm;
+ struct file *file;
int retval = -ENOMEM;
- if (!bprm)
- goto out;
+
+ file = do_open_execat(fd, filename, flags);
+ if (IS_ERR(file))
+ return ERR_CAST(file);
+
+ bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ if (!bprm) {
+ do_close_execat(file);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ bprm->file = file;
if (fd == AT_FDCWD || filename->name[0] == '/') {
bprm->filename = filename->name;
@@ -1526,18 +1560,28 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
if (!bprm->fdpath)
goto out_free;
+ /*
+ * Record that a name derived from an O_CLOEXEC fd will be
+ * inaccessible after exec. This allows the code in exec to
+ * choose to fail when the executable is not mmaped into the
+ * interpreter and an open file descriptor is not passed to
+ * the interpreter. This makes for a better user experience
+ * than having the interpreter start and then immediately fail
+ * when it finds the executable is inaccessible.
+ */
+ if (get_close_on_exec(fd))
+ bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
+
bprm->filename = bprm->fdpath;
}
bprm->interp = bprm->filename;
retval = bprm_mm_init(bprm);
- if (retval)
- goto out_free;
- return bprm;
+ if (!retval)
+ return bprm;
out_free:
free_bprm(bprm);
-out:
return ERR_PTR(retval);
}
@@ -1588,6 +1632,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
}
rcu_read_unlock();
+ /* "users" and "in_exec" locked for copy_fs() */
if (p->fs->users > n_fs)
bprm->unsafe |= LSM_UNSAFE_SHARE;
else
@@ -1804,13 +1849,8 @@ static int exec_binprm(struct linux_binprm *bprm)
return 0;
}
-/*
- * sys_execve() executes a new program.
- */
-static int bprm_execve(struct linux_binprm *bprm,
- int fd, struct filename *filename, int flags)
+static int bprm_execve(struct linux_binprm *bprm)
{
- struct file *file;
int retval;
retval = prepare_bprm_creds(bprm);
@@ -1826,26 +1866,8 @@ static int bprm_execve(struct linux_binprm *bprm,
current->in_execve = 1;
sched_mm_cid_before_execve(current);
- file = do_open_execat(fd, filename, flags);
- retval = PTR_ERR(file);
- if (IS_ERR(file))
- goto out_unmark;
-
sched_exec();
- bprm->file = file;
- /*
- * Record that a name derived from an O_CLOEXEC fd will be
- * inaccessible after exec. This allows the code in exec to
- * choose to fail when the executable is not mmaped into the
- * interpreter and an open file descriptor is not passed to
- * the interpreter. This makes for a better user experience
- * than having the interpreter start and then immediately fail
- * when it finds the executable is inaccessible.
- */
- if (bprm->fdpath && get_close_on_exec(fd))
- bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
-
/* Set the unchanging part of bprm->cred */
retval = security_bprm_creds_for_exec(bprm);
if (retval)
@@ -1875,7 +1897,6 @@ out:
if (bprm->point_of_no_return && !fatal_signal_pending(current))
force_fatal_sig(SIGSEGV);
-out_unmark:
sched_mm_cid_after_execve(current);
current->fs->in_exec = 0;
current->in_execve = 0;
@@ -1910,7 +1931,7 @@ static int do_execveat_common(int fd, struct filename *filename,
* further execve() calls fail. */
current->flags &= ~PF_NPROC_EXCEEDED;
- bprm = alloc_bprm(fd, filename);
+ bprm = alloc_bprm(fd, filename, flags);
if (IS_ERR(bprm)) {
retval = PTR_ERR(bprm);
goto out_ret;
@@ -1959,7 +1980,7 @@ static int do_execveat_common(int fd, struct filename *filename,
bprm->argc = 1;
}
- retval = bprm_execve(bprm, fd, filename, flags);
+ retval = bprm_execve(bprm);
out_free:
free_bprm(bprm);
@@ -1984,7 +2005,7 @@ int kernel_execve(const char *kernel_filename,
if (IS_ERR(filename))
return PTR_ERR(filename);
- bprm = alloc_bprm(fd, filename);
+ bprm = alloc_bprm(fd, filename, 0);
if (IS_ERR(bprm)) {
retval = PTR_ERR(bprm);
goto out_ret;
@@ -2019,7 +2040,7 @@ int kernel_execve(const char *kernel_filename,
if (retval < 0)
goto out_free;
- retval = bprm_execve(bprm, fd, filename, 0);
+ retval = bprm_execve(bprm);
out_free:
free_bprm(bprm);
out_ret:
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 9474cd50da6d..361595433480 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -275,6 +275,7 @@ struct exfat_sb_info {
spinlock_t inode_hash_lock;
struct hlist_head inode_hashtable[EXFAT_HASH_SIZE];
+ struct rcu_head rcu;
};
#define EXFAT_CACHE_VALID 0
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index d25a96a148af..cc00f1a7a1e1 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -35,13 +35,18 @@ static int exfat_cont_expand(struct inode *inode, loff_t size)
if (new_num_clusters == num_clusters)
goto out;
- exfat_chain_set(&clu, ei->start_clu, num_clusters, ei->flags);
- ret = exfat_find_last_cluster(sb, &clu, &last_clu);
- if (ret)
- return ret;
+ if (num_clusters) {
+ exfat_chain_set(&clu, ei->start_clu, num_clusters, ei->flags);
+ ret = exfat_find_last_cluster(sb, &clu, &last_clu);
+ if (ret)
+ return ret;
+
+ clu.dir = last_clu + 1;
+ } else {
+ last_clu = EXFAT_EOF_CLUSTER;
+ clu.dir = EXFAT_EOF_CLUSTER;
+ }
- clu.dir = (last_clu == EXFAT_EOF_CLUSTER) ?
- EXFAT_EOF_CLUSTER : last_clu + 1;
clu.size = 0;
clu.flags = ei->flags;
@@ -51,17 +56,19 @@ static int exfat_cont_expand(struct inode *inode, loff_t size)
return ret;
/* Append new clusters to chain */
- if (clu.flags != ei->flags) {
- exfat_chain_cont_cluster(sb, ei->start_clu, num_clusters);
- ei->flags = ALLOC_FAT_CHAIN;
- }
- if (clu.flags == ALLOC_FAT_CHAIN)
- if (exfat_ent_set(sb, last_clu, clu.dir))
- goto free_clu;
-
- if (num_clusters == 0)
+ if (num_clusters) {
+ if (clu.flags != ei->flags)
+ if (exfat_chain_cont_cluster(sb, ei->start_clu, num_clusters))
+ goto free_clu;
+
+ if (clu.flags == ALLOC_FAT_CHAIN)
+ if (exfat_ent_set(sb, last_clu, clu.dir))
+ goto free_clu;
+ } else
ei->start_clu = clu.dir;
+ ei->flags = clu.flags;
+
out:
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
/* Expanded range not zeroed, do not update valid_size */
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 522edcbb2ce4..0687f952956c 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -501,7 +501,7 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
struct inode *inode = mapping->host;
struct exfat_inode_info *ei = EXFAT_I(inode);
loff_t pos = iocb->ki_pos;
- loff_t size = iocb->ki_pos + iov_iter_count(iter);
+ loff_t size = pos + iov_iter_count(iter);
int rw = iov_iter_rw(iter);
ssize_t ret;
@@ -525,11 +525,10 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
*/
ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block);
if (ret < 0) {
- if (rw == WRITE)
+ if (rw == WRITE && ret != -EIOCBQUEUED)
exfat_write_failed(mapping, size);
- if (ret != -EIOCBQUEUED)
- return ret;
+ return ret;
} else
size = pos + ret;
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index 705710f93e2d..afdf13c34ff5 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -655,7 +655,6 @@ static int exfat_load_upcase_table(struct super_block *sb,
unsigned int sect_size = sb->s_blocksize;
unsigned int i, index = 0;
u32 chksum = 0;
- int ret;
unsigned char skip = false;
unsigned short *upcase_table;
@@ -673,8 +672,7 @@ static int exfat_load_upcase_table(struct super_block *sb,
if (!bh) {
exfat_err(sb, "failed to read sector(0x%llx)",
(unsigned long long)sector);
- ret = -EIO;
- goto free_table;
+ return -EIO;
}
sector++;
for (i = 0; i < sect_size && index <= 0xFFFF; i += 2) {
@@ -701,15 +699,12 @@ static int exfat_load_upcase_table(struct super_block *sb,
exfat_err(sb, "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)",
index, chksum, utbl_checksum);
- ret = -EINVAL;
-free_table:
- exfat_free_upcase_table(sbi);
- return ret;
+ return -EINVAL;
}
static int exfat_load_default_upcase_table(struct super_block *sb)
{
- int i, ret = -EIO;
+ int i;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
unsigned char skip = false;
unsigned short uni = 0, *upcase_table;
@@ -740,8 +735,7 @@ static int exfat_load_default_upcase_table(struct super_block *sb)
return 0;
/* FATAL error: default upcase table has error */
- exfat_free_upcase_table(sbi);
- return ret;
+ return -EIO;
}
int exfat_create_upcase_table(struct super_block *sb)
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index d9d4fa91010b..fcb658267765 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -39,9 +39,6 @@ static void exfat_put_super(struct super_block *sb)
exfat_free_bitmap(sbi);
brelse(sbi->boot_bh);
mutex_unlock(&sbi->s_lock);
-
- unload_nls(sbi->nls_io);
- exfat_free_upcase_table(sbi);
}
static int exfat_sync_fs(struct super_block *sb, int wait)
@@ -600,7 +597,7 @@ static int __exfat_fill_super(struct super_block *sb)
ret = exfat_load_bitmap(sb);
if (ret) {
exfat_err(sb, "failed to load alloc-bitmap");
- goto free_upcase_table;
+ goto free_bh;
}
ret = exfat_count_used_clusters(sb, &sbi->used_clusters);
@@ -613,8 +610,6 @@ static int __exfat_fill_super(struct super_block *sb)
free_alloc_bitmap:
exfat_free_bitmap(sbi);
-free_upcase_table:
- exfat_free_upcase_table(sbi);
free_bh:
brelse(sbi->boot_bh);
return ret;
@@ -701,12 +696,10 @@ put_inode:
sb->s_root = NULL;
free_table:
- exfat_free_upcase_table(sbi);
exfat_free_bitmap(sbi);
brelse(sbi->boot_bh);
check_nls_io:
- unload_nls(sbi->nls_io);
return err;
}
@@ -771,13 +764,22 @@ static int exfat_init_fs_context(struct fs_context *fc)
return 0;
}
+static void delayed_free(struct rcu_head *p)
+{
+ struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu);
+
+ unload_nls(sbi->nls_io);
+ exfat_free_upcase_table(sbi);
+ exfat_free_sbi(sbi);
+}
+
static void exfat_kill_sb(struct super_block *sb)
{
struct exfat_sb_info *sbi = sb->s_fs_info;
kill_block_super(sb);
if (sbi)
- exfat_free_sbi(sbi);
+ call_rcu(&sbi->rcu, delayed_free);
}
static struct file_system_type exfat_fs_type = {
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 3ae0154c5680..07ea3d62b298 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -255,7 +255,7 @@ static bool filldir_one(struct dir_context *ctx, const char *name, int len,
container_of(ctx, struct getdents_callback, ctx);
buf->sequence++;
- if (buf->ino == ino && len <= NAME_MAX) {
+ if (buf->ino == ino && len <= NAME_MAX && !is_dot_dotdot(name, len)) {
memcpy(buf->name, name, len);
buf->name[len] = '\0';
buf->found = 1;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a5d784872303..3c0d7d143036 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -252,8 +252,10 @@ struct ext4_allocation_request {
#define EXT4_MAP_MAPPED BIT(BH_Mapped)
#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY BIT(BH_Boundary)
+#define EXT4_MAP_DELAYED BIT(BH_Delay)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
- EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
+ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+ EXT4_MAP_DELAYED)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
@@ -1548,7 +1550,7 @@ struct ext4_sb_info {
unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
- struct bdev_handle *s_journal_bdev_handle;
+ struct file *s_journal_bdev_file;
#ifdef CONFIG_QUOTA
/* Names of quota files with journalled quota */
char __rcu *s_qf_names[EXT4_MAXQUOTAS];
@@ -2912,10 +2914,10 @@ extern const struct seq_operations ext4_mb_seq_groups_ops;
extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
extern int ext4_mb_init(struct super_block *);
-extern int ext4_mb_release(struct super_block *);
+extern void ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
struct ext4_allocation_request *, int *);
-extern void ext4_discard_preallocations(struct inode *, unsigned int);
+extern void ext4_discard_preallocations(struct inode *);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 01299b55a567..7669d154c05e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
* i_rwsem. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
*dropped = 1;
return 0;
@@ -2229,7 +2229,7 @@ static int ext4_fill_es_cache_info(struct inode *inode,
/*
- * ext4_ext_determine_hole - determine hole around given block
+ * ext4_ext_find_hole - find hole around given block according to the given path
* @inode: inode we lookup in
* @path: path in extent tree to @lblk
* @lblk: pointer to logical block around which we want to determine hole
@@ -2241,9 +2241,9 @@ static int ext4_fill_es_cache_info(struct inode *inode,
* The function returns the length of a hole starting at @lblk. We update @lblk
* to the beginning of the hole if we managed to find it.
*/
-static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
- struct ext4_ext_path *path,
- ext4_lblk_t *lblk)
+static ext4_lblk_t ext4_ext_find_hole(struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t *lblk)
{
int depth = ext_depth(inode);
struct ext4_extent *ex;
@@ -2271,30 +2271,6 @@ static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
}
/*
- * ext4_ext_put_gap_in_cache:
- * calculate boundaries of the gap that the requested block fits into
- * and cache this gap
- */
-static void
-ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
- ext4_lblk_t hole_len)
-{
- struct extent_status es;
-
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
- hole_start + hole_len - 1, &es);
- if (es.es_len) {
- /* There's delayed extent containing lblock? */
- if (es.es_lblk <= hole_start)
- return;
- hole_len = min(es.es_lblk - hole_start, hole_len);
- }
- ext_debug(inode, " -> %u:%u\n", hole_start, hole_len);
- ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
- EXTENT_STATUS_HOLE);
-}
-
-/*
* ext4_ext_rm_idx:
* removes index from the index block.
*/
@@ -4062,6 +4038,72 @@ static int get_implied_cluster_alloc(struct super_block *sb,
return 0;
}
+/*
+ * Determine hole length around the given logical block, first try to
+ * locate and expand the hole from the given @path, and then adjust it
+ * if it's partially or completely converted to delayed extents, insert
+ * it into the extent cache tree if it's indeed a hole, finally return
+ * the length of the determined extent.
+ */
+static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t lblk)
+{
+ ext4_lblk_t hole_start, len;
+ struct extent_status es;
+
+ hole_start = lblk;
+ len = ext4_ext_find_hole(inode, path, &hole_start);
+again:
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
+ hole_start + len - 1, &es);
+ if (!es.es_len)
+ goto insert_hole;
+
+ /*
+ * There's a delalloc extent in the hole, handle it if the delalloc
+ * extent is in front of, behind and straddle the queried range.
+ */
+ if (lblk >= es.es_lblk + es.es_len) {
+ /*
+ * The delalloc extent is in front of the queried range,
+ * find again from the queried start block.
+ */
+ len -= lblk - hole_start;
+ hole_start = lblk;
+ goto again;
+ } else if (in_range(lblk, es.es_lblk, es.es_len)) {
+ /*
+ * The delalloc extent containing lblk, it must have been
+ * added after ext4_map_blocks() checked the extent status
+ * tree so we are not holding i_rwsem and delalloc info is
+ * only stabilized by i_data_sem we are going to release
+ * soon. Don't modify the extent status tree and report
+ * extent as a hole, just adjust the length to the delalloc
+ * extent's after lblk.
+ */
+ len = es.es_lblk + es.es_len - lblk;
+ return len;
+ } else {
+ /*
+ * The delalloc extent is partially or completely behind
+ * the queried range, update hole length until the
+ * beginning of the delalloc extent.
+ */
+ len = min(es.es_lblk - hole_start, len);
+ }
+
+insert_hole:
+ /* Put just found gap into cache to speed up subsequent requests */
+ ext_debug(inode, " -> %u:%u\n", hole_start, len);
+ ext4_es_insert_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE);
+
+ /* Update hole_len to reflect hole size after lblk */
+ if (hole_start != lblk)
+ len -= lblk - hole_start;
+
+ return len;
+}
/*
* Block allocation/map/preallocation routine for extents based files
@@ -4179,22 +4221,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* we couldn't try to create block if create flag is zero
*/
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
- ext4_lblk_t hole_start, hole_len;
+ ext4_lblk_t len;
- hole_start = map->m_lblk;
- hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
- /*
- * put just found gap into cache to speed up
- * subsequent requests
- */
- ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
+ len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk);
- /* Update hole_len to reflect hole size after map->m_lblk */
- if (hole_start != map->m_lblk)
- hole_len -= map->m_lblk - hole_start;
map->m_pblk = 0;
- map->m_len = min_t(unsigned int, map->m_len, hole_len);
-
+ map->m_len = min_t(unsigned int, map->m_len, len);
goto out;
}
@@ -4313,7 +4345,7 @@ got_allocated_blocks:
* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free().
*/
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
ext4_free_blocks(handle, inode, NULL, newblock,
@@ -5357,7 +5389,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start);
ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
@@ -5365,7 +5397,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
ret = ext4_ext_shift_extents(inode, handle, punch_stop,
punch_stop - punch_start, SHIFT_LEFT);
@@ -5497,7 +5529,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
path = ext4_find_extent(inode, offset_lblk, NULL, 0);
if (IS_ERR(path)) {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6aa15dafc677..54d6ff22585c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -174,7 +174,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
(atomic_read(&inode->i_writecount) == 1) &&
!EXT4_I(inode)->i_reserved_data_blocks) {
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
}
if (is_dx(inode) && filp->private_data)
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 11e6f33677a2..df853c4d3a8c 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -576,9 +576,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb,
if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
return true;
- if (EXT4_SB(sb)->s_journal_bdev_handle &&
+ if (EXT4_SB(sb)->s_journal_bdev_file &&
fm->fmr_device ==
- new_encode_dev(EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev))
+ new_encode_dev(file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev))
return true;
return false;
}
@@ -648,9 +648,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
memset(handlers, 0, sizeof(handlers));
handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
handlers[0].gfd_fn = ext4_getfsmap_datadev;
- if (EXT4_SB(sb)->s_journal_bdev_handle) {
+ if (EXT4_SB(sb)->s_journal_bdev_file) {
handlers[1].gfd_dev = new_encode_dev(
- EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev);
+ file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev);
handlers[1].gfd_fn = ext4_getfsmap_logdev;
}
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index a9f3716119d3..d8ca7f64f952 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -714,7 +714,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
* i_rwsem. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
*dropped = 1;
return 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5af1b0b8680e..2ccf3b5e3a7c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -371,7 +371,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
*/
if ((ei->i_reserved_data_blocks == 0) &&
!inode_is_open_for_write(inode))
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
}
static int __check_block_validity(struct inode *inode, const char *func,
@@ -515,6 +515,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
map->m_len = retval;
} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
map->m_pblk = 0;
+ map->m_flags |= ext4_es_is_delayed(&es) ?
+ EXT4_MAP_DELAYED : 0;
retval = es.es_len - (map->m_lblk - es.es_lblk);
if (retval > map->m_len)
retval = map->m_len;
@@ -1703,11 +1705,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
- if (ext4_es_is_hole(&es)) {
- retval = 0;
- down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_es_is_hole(&es))
goto add_delayed;
- }
/*
* Delayed extent could be allocated by fallocate.
@@ -1749,26 +1748,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
retval = ext4_ext_map_blocks(NULL, inode, map, 0);
else
retval = ext4_ind_map_blocks(NULL, inode, map, 0);
-
-add_delayed:
- if (retval == 0) {
- int ret;
-
- /*
- * XXX: __block_prepare_write() unmaps passed block,
- * is it OK?
- */
-
- ret = ext4_insert_delayed_block(inode, map->m_lblk);
- if (ret != 0) {
- retval = ret;
- goto out_unlock;
- }
-
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
- } else if (retval > 0) {
+ if (retval < 0) {
+ up_read(&EXT4_I(inode)->i_data_sem);
+ return retval;
+ }
+ if (retval > 0) {
unsigned int status;
if (unlikely(retval != map->m_len)) {
@@ -1783,11 +1767,21 @@ add_delayed:
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
+ up_read(&EXT4_I(inode)->i_data_sem);
+ return retval;
}
+ up_read(&EXT4_I(inode)->i_data_sem);
-out_unlock:
- up_read((&EXT4_I(inode)->i_data_sem));
+add_delayed:
+ down_write(&EXT4_I(inode)->i_data_sem);
+ retval = ext4_insert_delayed_block(inode, map->m_lblk);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (retval)
+ return retval;
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
return retval;
}
@@ -3268,6 +3262,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
iomap->addr = (u64) map->m_pblk << blkbits;
if (flags & IOMAP_DAX)
iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
+ } else if (map->m_flags & EXT4_MAP_DELAYED) {
+ iomap->type = IOMAP_DELALLOC;
+ iomap->addr = IOMAP_NULL_ADDR;
} else {
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
@@ -3430,35 +3427,11 @@ const struct iomap_ops ext4_iomap_overwrite_ops = {
.iomap_end = ext4_iomap_end,
};
-static bool ext4_iomap_is_delalloc(struct inode *inode,
- struct ext4_map_blocks *map)
-{
- struct extent_status es;
- ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
-
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
- map->m_lblk, end, &es);
-
- if (!es.es_len || es.es_lblk > end)
- return false;
-
- if (es.es_lblk > map->m_lblk) {
- map->m_len = es.es_lblk - map->m_lblk;
- return false;
- }
-
- offset = map->m_lblk - es.es_lblk;
- map->m_len = es.es_len - offset;
-
- return true;
-}
-
static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
loff_t length, unsigned int flags,
struct iomap *iomap, struct iomap *srcmap)
{
int ret;
- bool delalloc = false;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
@@ -3499,13 +3472,8 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret < 0)
return ret;
- if (ret == 0)
- delalloc = ext4_iomap_is_delalloc(inode, &map);
-
set_iomap:
ext4_set_iomap(inode, iomap, &map, offset, length, flags);
- if (delalloc && iomap->type == IOMAP_HOLE)
- iomap->type = IOMAP_DELALLOC;
return 0;
}
@@ -4015,12 +3983,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
/* If there are blocks to remove, do it */
if (stop_block > first_block) {
+ ext4_lblk_t hole_len = stop_block - first_block;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
- ext4_es_remove_extent(inode, first_block,
- stop_block - first_block);
+ ext4_es_remove_extent(inode, first_block, hole_len);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_remove_space(inode, first_block,
@@ -4029,6 +3997,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
ret = ext4_ind_remove_space(handle, inode, first_block,
stop_block);
+ ext4_es_insert_extent(inode, first_block, hole_len, ~0,
+ EXTENT_STATUS_HOLE);
up_write(&EXT4_I(inode)->i_data_sem);
}
ext4_fc_track_range(handle, inode, first_block, stop_block);
@@ -4170,7 +4140,7 @@ int ext4_truncate(struct inode *inode)
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
err = ext4_ext_truncate(handle, inode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index aa6be510eb8f..7160a71044c8 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -467,7 +467,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
ext4_reset_inode_seed(inode);
ext4_reset_inode_seed(inode_bl);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
err = ext4_mark_inode_dirty(handle, inode);
if (err < 0) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f44f668e407f..e4f7cf9d89c4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -564,14 +564,14 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0,
blocknr,
"freeing block already freed "
"(bit %u)",
first + i);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
}
mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
}
@@ -677,7 +677,7 @@ do { \
} \
} while (0)
-static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
+static void __mb_check_buddy(struct ext4_buddy *e4b, char *file,
const char *function, int line)
{
struct super_block *sb = e4b->bd_sb;
@@ -696,7 +696,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
void *buddy2;
if (e4b->bd_info->bb_check_counter++ % 10)
- return 0;
+ return;
while (order > 1) {
buddy = mb_find_buddy(e4b, order, &max);
@@ -758,7 +758,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
grp = ext4_get_group_info(sb, e4b->bd_group);
if (!grp)
- return NULL;
+ return;
list_for_each(cur, &grp->bb_prealloc_list) {
ext4_group_t groupnr;
struct ext4_prealloc_space *pa;
@@ -768,7 +768,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
for (i = 0; i < pa->pa_len; i++)
MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
}
- return 0;
}
#undef MB_CHECK_ASSERT
#define mb_check_buddy(e4b) __mb_check_buddy(e4b, \
@@ -842,7 +841,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
struct ext4_sb_info *sbi = EXT4_SB(sb);
int new_order;
- if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
+ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0)
return;
new_order = mb_avg_fragment_size_order(sb,
@@ -871,7 +870,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
* cr level needs an update.
*/
static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+ enum criteria *new_cr, ext4_group_t *group)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_group_info *iter;
@@ -945,7 +944,7 @@ ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int o
* order. Updates *new_cr if cr level needs an update.
*/
static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+ enum criteria *new_cr, ext4_group_t *group)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_group_info *grp = NULL;
@@ -990,7 +989,7 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *
* much and fall to CR_GOAL_LEN_SLOW in that case.
*/
static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+ enum criteria *new_cr, ext4_group_t *group)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_group_info *grp = NULL;
@@ -1125,11 +1124,11 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
}
if (*new_cr == CR_POWER2_ALIGNED) {
- ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups);
+ ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group);
} else if (*new_cr == CR_GOAL_LEN_FAST) {
- ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups);
+ ext4_mb_choose_next_group_goal_fast(ac, new_cr, group);
} else if (*new_cr == CR_BEST_AVAIL_LEN) {
- ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups);
+ ext4_mb_choose_next_group_best_avail(ac, new_cr, group);
} else {
/*
* TODO: For CR=2, we can arrange groups in an rb tree sorted by
@@ -1233,6 +1232,24 @@ void ext4_mb_generate_buddy(struct super_block *sb,
atomic64_add(period, &sbi->s_mb_generation_time);
}
+static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+{
+ int count;
+ int order = 1;
+ void *buddy;
+
+ while ((buddy = mb_find_buddy(e4b, order++, &count)))
+ mb_set_bits(buddy, 0, count);
+
+ e4b->bd_info->bb_fragments = 0;
+ memset(e4b->bd_info->bb_counters, 0,
+ sizeof(*e4b->bd_info->bb_counters) *
+ (e4b->bd_sb->s_blocksize_bits + 2));
+
+ ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
+ e4b->bd_bitmap, e4b->bd_group, e4b->bd_info);
+}
+
/* The buddy information is attached the buddy cache inode
* for convenience. The information regarding each group
* is loaded via ext4_mb_load_buddy. The information involve
@@ -1891,11 +1908,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
- this_cpu_inc(discard_pa_seq);
- e4b->bd_info->bb_free += count;
- if (first < e4b->bd_info->bb_first_free)
- e4b->bd_info->bb_first_free = first;
-
/* access memory sequentially: check left neighbour,
* clear range and then check right neighbour
*/
@@ -1909,21 +1921,31 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t blocknr;
+ /*
+ * Fastcommit replay can free already freed blocks which
+ * corrupts allocation info. Regenerate it.
+ */
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
+ mb_regenerate_buddy(e4b);
+ goto check;
+ }
+
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += EXT4_C2B(sbi, block);
- if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
- ext4_grp_locked_error(sb, e4b->bd_group,
- inode ? inode->i_ino : 0,
- blocknr,
- "freeing already freed block (bit %u); block bitmap corrupt.",
- block);
- ext4_mark_group_bitmap_corrupted(
- sb, e4b->bd_group,
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
- }
- goto done;
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ inode ? inode->i_ino : 0, blocknr,
+ "freeing already freed block (bit %u); block bitmap corrupt.",
+ block);
+ return;
}
+ this_cpu_inc(discard_pa_seq);
+ e4b->bd_info->bb_free += count;
+ if (first < e4b->bd_info->bb_first_free)
+ e4b->bd_info->bb_first_free = first;
+
/* let's maintain fragments counter */
if (left_is_free && right_is_free)
e4b->bd_info->bb_fragments--;
@@ -1948,9 +1970,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
if (first <= last)
mb_buddy_mark_free(e4b, first >> 1, last >> 1);
-done:
mb_set_largest_free_order(sb, e4b->bd_info);
mb_update_avg_fragment_size(sb, e4b->bd_info);
+check:
mb_check_buddy(e4b);
}
@@ -2276,6 +2298,9 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
return;
ext4_lock_group(ac->ac_sb, group);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ goto out;
+
max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
if (max > 0) {
@@ -2283,6 +2308,7 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
ext4_mb_use_best_found(ac, e4b);
}
+out:
ext4_unlock_group(ac->ac_sb, group);
ext4_mb_unload_buddy(e4b);
}
@@ -2309,12 +2335,10 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
if (err)
return err;
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
- ext4_mb_unload_buddy(e4b);
- return 0;
- }
-
ext4_lock_group(ac->ac_sb, group);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ goto out;
+
max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
ac->ac_g_ex.fe_len, &ex);
ex.fe_logical = 0xDEADFA11; /* debug value */
@@ -2347,6 +2371,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ac->ac_b_ex = ex;
ext4_mb_use_best_found(ac, e4b);
}
+out:
ext4_unlock_group(ac->ac_sb, group);
ext4_mb_unload_buddy(e4b);
@@ -2380,12 +2405,12 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
k = mb_find_next_zero_bit(buddy, max, 0);
if (k >= max) {
+ ext4_mark_group_bitmap_corrupted(ac->ac_sb,
+ e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
"%d free clusters of order %d. But found 0",
grp->bb_counters[i], i);
- ext4_mark_group_bitmap_corrupted(ac->ac_sb,
- e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
break;
}
ac->ac_found++;
@@ -2436,12 +2461,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
* free blocks even though group info says we
* have free blocks
*/
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
"%d free clusters as per "
"group info. But bitmap says 0",
free);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
break;
}
@@ -2467,12 +2492,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
if (WARN_ON(ex.fe_len <= 0))
break;
if (free < ex.fe_len) {
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
"%d free clusters as per "
"group info. But got %d blocks",
free, ex.fe_len);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
/*
* The number of free blocks differs. This mostly
* indicate that the bitmap is corrupt. So exit
@@ -3725,7 +3750,7 @@ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
return count;
}
-int ext4_mb_release(struct super_block *sb)
+void ext4_mb_release(struct super_block *sb)
{
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
@@ -3801,8 +3826,6 @@ int ext4_mb_release(struct super_block *sb)
}
free_percpu(sbi->s_locality_groups);
-
- return 0;
}
static inline int ext4_issue_discard(struct super_block *sb,
@@ -5284,7 +5307,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
* the caller MUST hold group/inode locks.
* TODO: optimize the case when there are no in-core structures yet
*/
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
struct ext4_prealloc_space *pa)
{
@@ -5334,11 +5357,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
*/
}
atomic_add(free, &sbi->s_mb_discarded);
-
- return 0;
}
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_release_group_pa(struct ext4_buddy *e4b,
struct ext4_prealloc_space *pa)
{
@@ -5352,13 +5373,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
e4b->bd_group, group, pa->pa_pstart);
- return 0;
+ return;
}
mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
-
- return 0;
}
/*
@@ -5479,7 +5498,7 @@ out_dbg:
*
* FIXME!! Make sure it is valid at all the call sites
*/
-void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
+void ext4_discard_preallocations(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct super_block *sb = inode->i_sb;
@@ -5491,9 +5510,8 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
struct rb_node *iter;
int err;
- if (!S_ISREG(inode->i_mode)) {
+ if (!S_ISREG(inode->i_mode))
return;
- }
if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
return;
@@ -5501,15 +5519,12 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
mb_debug(sb, "discard preallocation for inode %lu\n",
inode->i_ino);
trace_ext4_discard_preallocations(inode,
- atomic_read(&ei->i_prealloc_active), needed);
-
- if (needed == 0)
- needed = UINT_MAX;
+ atomic_read(&ei->i_prealloc_active));
repeat:
/* first, collect all pa's in the inode */
write_lock(&ei->i_prealloc_lock);
- for (iter = rb_first(&ei->i_prealloc_node); iter && needed;
+ for (iter = rb_first(&ei->i_prealloc_node); iter;
iter = rb_next(iter)) {
pa = rb_entry(iter, struct ext4_prealloc_space,
pa_node.inode_node);
@@ -5533,7 +5548,6 @@ repeat:
spin_unlock(&pa->pa_lock);
rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
list_add(&pa->u.pa_tmp_list, &list);
- needed--;
continue;
}
@@ -5943,7 +5957,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
/*
* release all resource we used in allocation
*/
-static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+static void ext4_mb_release_context(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
@@ -5980,7 +5994,6 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
mutex_unlock(&ac->ac_lg->lg_mutex);
ext4_mb_collect_stats(ac);
- return 0;
}
static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
@@ -6761,6 +6774,9 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
bool set_trimmed = false;
void *bitmap;
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ return 0;
+
last = ext4_last_grp_cluster(sb, e4b->bd_group);
bitmap = e4b->bd_bitmap;
if (start == 0 && max >= last)
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index d7aeb5da7d86..56938532b4ce 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -192,7 +192,6 @@ struct ext4_allocation_context {
*/
ext4_grpblk_t ac_orig_goal_len;
- __u32 ac_groups_considered;
__u32 ac_flags; /* allocation hints */
__u16 ac_groups_scanned;
__u16 ac_groups_linear_remaining;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3aa57376d9c2..7cd4afa4de1d 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -618,6 +618,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
goto out;
o_end = o_start + len;
+ *moved_len = 0;
while (o_start < o_end) {
struct ext4_extent *ex;
ext4_lblk_t cur_blk, next_blk;
@@ -672,7 +673,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
*/
ext4_double_up_write_data_sem(orig_inode, donor_inode);
/* Swap original branches with new branches */
- move_extent_per_page(o_filp, donor_inode,
+ *moved_len += move_extent_per_page(o_filp, donor_inode,
orig_page_index, donor_page_index,
offset_in_page, cur_len,
unwritten, &ret);
@@ -682,14 +683,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
o_start += cur_len;
d_start += cur_len;
}
- *moved_len = o_start - orig_blk;
- if (*moved_len > len)
- *moved_len = len;
out:
if (*moved_len) {
- ext4_discard_preallocations(orig_inode, 0);
- ext4_discard_preallocations(donor_inode, 0);
+ ext4_discard_preallocations(orig_inode);
+ ext4_discard_preallocations(donor_inode);
}
ext4_free_ext_path(path);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 05b647e6bc19..5e4f65c14dfb 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1762,7 +1762,6 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
struct buffer_head *bh;
err = ext4_fname_prepare_lookup(dir, dentry, &fname);
- generic_set_encrypted_ci_d_ops(dentry);
if (err == -ENOENT)
return NULL;
if (err)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dcba0f85dfe2..a8ba84eabab2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1359,14 +1359,14 @@ static void ext4_put_super(struct super_block *sb)
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
- if (sbi->s_journal_bdev_handle) {
+ if (sbi->s_journal_bdev_file) {
/*
* Invalidate the journal device's buffers. We don't want them
* floating about in memory - the physical journal device may
* hotswapped, and it breaks the `ro-after' testing code.
*/
- sync_blockdev(sbi->s_journal_bdev_handle->bdev);
- invalidate_bdev(sbi->s_journal_bdev_handle->bdev);
+ sync_blockdev(file_bdev(sbi->s_journal_bdev_file));
+ invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
}
ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
@@ -1525,7 +1525,7 @@ void ext4_clear_inode(struct inode *inode)
ext4_fc_del(inode);
invalidate_inode_buffers(inode);
clear_inode(inode);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
dquot_drop(inode);
if (EXT4_I(inode)->jinode) {
@@ -4233,7 +4233,7 @@ int ext4_calculate_overhead(struct super_block *sb)
* Add the internal journal blocks whether the journal has been
* loaded or not
*/
- if (sbi->s_journal && !sbi->s_journal_bdev_handle)
+ if (sbi->s_journal && !sbi->s_journal_bdev_file)
overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
/* j_inum for internal journal is non-zero */
@@ -5346,7 +5346,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sb->s_qcop = &ext4_qctl_operations;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
- memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+ super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid));
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
@@ -5484,6 +5484,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount4;
}
+ generic_set_sb_d_ops(sb);
sb->s_root = d_make_root(root);
if (!sb->s_root) {
ext4_msg(sb, KERN_ERR, "get root dentry failed");
@@ -5670,9 +5671,9 @@ failed_mount:
#endif
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
brelse(sbi->s_sbh);
- if (sbi->s_journal_bdev_handle) {
- invalidate_bdev(sbi->s_journal_bdev_handle->bdev);
- bdev_release(sbi->s_journal_bdev_handle);
+ if (sbi->s_journal_bdev_file) {
+ invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
+ fput(sbi->s_journal_bdev_file);
}
out_fail:
invalidate_bdev(sb->s_bdev);
@@ -5842,30 +5843,30 @@ static journal_t *ext4_open_inode_journal(struct super_block *sb,
return journal;
}
-static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb,
+static struct file *ext4_get_journal_blkdev(struct super_block *sb,
dev_t j_dev, ext4_fsblk_t *j_start,
ext4_fsblk_t *j_len)
{
struct buffer_head *bh;
struct block_device *bdev;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
int hblock, blocksize;
ext4_fsblk_t sb_block;
unsigned long offset;
struct ext4_super_block *es;
int errno;
- bdev_handle = bdev_open_by_dev(j_dev,
+ bdev_file = bdev_file_open_by_dev(j_dev,
BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
sb, &fs_holder_ops);
- if (IS_ERR(bdev_handle)) {
+ if (IS_ERR(bdev_file)) {
ext4_msg(sb, KERN_ERR,
"failed to open journal device unknown-block(%u,%u) %ld",
- MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_handle));
- return bdev_handle;
+ MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file));
+ return bdev_file;
}
- bdev = bdev_handle->bdev;
+ bdev = file_bdev(bdev_file);
blocksize = sb->s_blocksize;
hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
@@ -5912,12 +5913,12 @@ static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb,
*j_start = sb_block + 1;
*j_len = ext4_blocks_count(es);
brelse(bh);
- return bdev_handle;
+ return bdev_file;
out_bh:
brelse(bh);
out_bdev:
- bdev_release(bdev_handle);
+ fput(bdev_file);
return ERR_PTR(errno);
}
@@ -5927,14 +5928,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
journal_t *journal;
ext4_fsblk_t j_start;
ext4_fsblk_t j_len;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
int errno = 0;
- bdev_handle = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
- if (IS_ERR(bdev_handle))
- return ERR_CAST(bdev_handle);
+ bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
+ if (IS_ERR(bdev_file))
+ return ERR_CAST(bdev_file);
- journal = jbd2_journal_init_dev(bdev_handle->bdev, sb->s_bdev, j_start,
+ journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start,
j_len, sb->s_blocksize);
if (IS_ERR(journal)) {
ext4_msg(sb, KERN_ERR, "failed to create device journal");
@@ -5949,14 +5950,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
goto out_journal;
}
journal->j_private = sb;
- EXT4_SB(sb)->s_journal_bdev_handle = bdev_handle;
+ EXT4_SB(sb)->s_journal_bdev_file = bdev_file;
ext4_init_journal_params(sb, journal);
return journal;
out_journal:
jbd2_journal_destroy(journal);
out_bdev:
- bdev_release(bdev_handle);
+ fput(bdev_file);
return ERR_PTR(errno);
}
@@ -7314,12 +7315,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
static void ext4_kill_sb(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct bdev_handle *handle = sbi ? sbi->s_journal_bdev_handle : NULL;
+ struct file *bdev_file = sbi ? sbi->s_journal_bdev_file : NULL;
kill_block_super(sb);
- if (handle)
- bdev_release(handle);
+ if (bdev_file)
+ fput(bdev_file);
}
static struct file_system_type ext4_fs_type = {
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 75bf1f88843c..645240cc0229 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -92,10 +92,12 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode,
if (!dentry) {
bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT);
- if (IS_ERR(bh))
- return ERR_CAST(bh);
- if (!bh || !ext4_buffer_uptodate(bh))
+ if (IS_ERR(bh) || !bh)
return ERR_PTR(-ECHILD);
+ if (!ext4_buffer_uptodate(bh)) {
+ brelse(bh);
+ return ERR_PTR(-ECHILD);
+ }
} else {
bh = ext4_bread(NULL, inode, 0, 0);
if (IS_ERR(bh))
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 65294e3b0bef..4c77e8ce5c75 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -24,6 +24,7 @@
#include <linux/blkdev.h>
#include <linux/quotaops.h>
#include <linux/part_stat.h>
+#include <linux/rw_hint.h>
#include <crypto/hash.h>
#include <linux/fscrypt.h>
@@ -1239,7 +1240,7 @@ struct f2fs_bio_info {
#define FDEV(i) (sbi->devs[i])
#define RDEV(i) (raw_super->devs[i])
struct f2fs_dev_info {
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct block_device *bdev;
char path[MAX_PATH_LEN];
unsigned int total_segments;
@@ -3364,17 +3365,6 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
return is_set_ckpt_flags(sbi, CP_ERROR_FLAG);
}
-static inline bool is_dot_dotdot(const u8 *name, size_t len)
-{
- if (len == 1 && name[0] == '.')
- return true;
-
- if (len == 2 && name[0] == '.' && name[1] == '.')
- return true;
-
- return false;
-}
-
static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
size_t size, gfp_t flags)
{
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index b3bb815fc6aa..f7f63a567d86 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -531,7 +531,6 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
}
err = f2fs_prepare_lookup(dir, dentry, &fname);
- generic_set_encrypted_ci_d_ops(dentry);
if (err == -ENOENT)
goto out_splice;
if (err)
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 4c8836ded90f..e1065ba70207 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1971,9 +1971,15 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
}
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) {
+ unsigned int nofs_flags;
+ int ret;
+
trace_f2fs_issue_reset_zone(bdev, blkstart);
- return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- sector, nr_sects, GFP_NOFS);
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ sector, nr_sects);
+ memalloc_nofs_restore(nofs_flags);
+ return ret;
}
__queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen);
@@ -4865,6 +4871,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
block_t zone_block, valid_block_cnt;
unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
int ret;
+ unsigned int nofs_flags;
if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
return 0;
@@ -4912,8 +4919,10 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
"pointer: valid block[0x%x,0x%x] cond[0x%x]",
zone_segno, valid_block_cnt, zone->cond);
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
- zone->start, zone->len, GFP_NOFS);
+ zone->start, zone->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret == -EOPNOTSUPP) {
ret = blkdev_issue_zeroout(fdev->bdev, zone->wp,
zone->len - (zone->wp - zone->start),
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index d00d21a8b53a..b880b746f226 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1605,7 +1605,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
for (i = 0; i < sbi->s_ndevs; i++) {
if (i > 0)
- bdev_release(FDEV(i).bdev_handle);
+ fput(FDEV(i).bdev_file);
#ifdef CONFIG_BLK_DEV_ZONED
kvfree(FDEV(i).blkz_seq);
#endif
@@ -4247,7 +4247,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
for (i = 0; i < max_devices; i++) {
if (i == 0)
- FDEV(0).bdev_handle = sbi->sb->s_bdev_handle;
+ FDEV(0).bdev_file = sbi->sb->s_bdev_file;
else if (!RDEV(i).path[0])
break;
@@ -4267,14 +4267,14 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
FDEV(i).end_blk = FDEV(i).start_blk +
(FDEV(i).total_segments <<
sbi->log_blocks_per_seg) - 1;
- FDEV(i).bdev_handle = bdev_open_by_path(
+ FDEV(i).bdev_file = bdev_file_open_by_path(
FDEV(i).path, mode, sbi->sb, NULL);
}
}
- if (IS_ERR(FDEV(i).bdev_handle))
- return PTR_ERR(FDEV(i).bdev_handle);
+ if (IS_ERR(FDEV(i).bdev_file))
+ return PTR_ERR(FDEV(i).bdev_file);
- FDEV(i).bdev = FDEV(i).bdev_handle->bdev;
+ FDEV(i).bdev = file_bdev(FDEV(i).bdev_file);
/* to release errored devices */
sbi->s_ndevs = i + 1;
@@ -4496,7 +4496,7 @@ try_onemore:
sb->s_time_gran = 1;
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0);
- memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
+ super_set_uuid(sb, (void *) raw_super->uuid, sizeof(raw_super->uuid));
sb->s_iflags |= SB_I_CGROUPWB;
/* init f2fs-specific super block info */
@@ -4660,6 +4660,7 @@ try_onemore:
goto free_node_inode;
}
+ generic_set_sb_d_ops(sb);
sb->s_root = d_make_root(root); /* allocate root dentry */
if (!sb->s_root) {
err = -ENOMEM;
@@ -4880,6 +4881,7 @@ free_sbi:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi);
+ sb->s_fs_info = NULL;
/* give only one another chance */
if (retry_cnt > 0 && skip_recovery) {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 1fac3dabf130..5c813696d1ff 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1762,6 +1762,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
else /* fat 16 or 12 */
sbi->vol_id = bpb.fat16_vol_id;
+ __le32 vol_id_le = cpu_to_le32(sbi->vol_id);
+ super_set_uuid(sb, (void *) &vol_id_le, sizeof(vol_id_le));
+
sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index c80a6acad742..54cc85d3338e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -27,6 +27,7 @@
#include <linux/memfd.h>
#include <linux/compat.h>
#include <linux/mount.h>
+#include <linux/rw_hint.h>
#include <linux/poll.h>
#include <asm/siginfo.h>
@@ -268,8 +269,15 @@ static int f_getowner_uids(struct file *filp, unsigned long arg)
}
#endif
-static bool rw_hint_valid(enum rw_hint hint)
+static bool rw_hint_valid(u64 hint)
{
+ BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET);
+ BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE);
+ BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT);
+ BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM);
+ BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG);
+ BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME);
+
switch (hint) {
case RWH_WRITE_LIFE_NOT_SET:
case RWH_WRITE_LIFE_NONE:
@@ -283,34 +291,40 @@ static bool rw_hint_valid(enum rw_hint hint)
}
}
-static long fcntl_rw_hint(struct file *file, unsigned int cmd,
- unsigned long arg)
+static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
+ unsigned long arg)
{
struct inode *inode = file_inode(file);
u64 __user *argp = (u64 __user *)arg;
- enum rw_hint hint;
- u64 h;
+ u64 hint = READ_ONCE(inode->i_write_hint);
- switch (cmd) {
- case F_GET_RW_HINT:
- h = inode->i_write_hint;
- if (copy_to_user(argp, &h, sizeof(*argp)))
- return -EFAULT;
- return 0;
- case F_SET_RW_HINT:
- if (copy_from_user(&h, argp, sizeof(h)))
- return -EFAULT;
- hint = (enum rw_hint) h;
- if (!rw_hint_valid(hint))
- return -EINVAL;
+ if (copy_to_user(argp, &hint, sizeof(*argp)))
+ return -EFAULT;
+ return 0;
+}
- inode_lock(inode);
- inode->i_write_hint = hint;
- inode_unlock(inode);
- return 0;
- default:
+static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct inode *inode = file_inode(file);
+ u64 __user *argp = (u64 __user *)arg;
+ u64 hint;
+
+ if (copy_from_user(&hint, argp, sizeof(hint)))
+ return -EFAULT;
+ if (!rw_hint_valid(hint))
return -EINVAL;
- }
+
+ WRITE_ONCE(inode->i_write_hint, hint);
+
+ /*
+ * file->f_mapping->host may differ from inode. As an example,
+ * blkdev_open() modifies file->f_mapping.
+ */
+ if (file->f_mapping->host != inode)
+ WRITE_ONCE(file->f_mapping->host->i_write_hint, hint);
+
+ return 0;
}
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
@@ -416,8 +430,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
err = memfd_fcntl(filp, cmd, argi);
break;
case F_GET_RW_HINT:
+ err = fcntl_get_rw_hint(filp, cmd, arg);
+ break;
case F_SET_RW_HINT:
- err = fcntl_rw_hint(filp, cmd, arg);
+ err = fcntl_set_rw_hint(filp, cmd, arg);
break;
default:
break;
@@ -846,12 +862,6 @@ int send_sigurg(struct fown_struct *fown)
static DEFINE_SPINLOCK(fasync_lock);
static struct kmem_cache *fasync_cache __ro_after_init;
-static void fasync_free_rcu(struct rcu_head *head)
-{
- kmem_cache_free(fasync_cache,
- container_of(head, struct fasync_struct, fa_rcu));
-}
-
/*
* Remove a fasync entry. If successfully removed, return
* positive and clear the FASYNC flag. If no entry exists,
@@ -877,7 +887,7 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
write_unlock_irq(&fa->fa_lock);
*fp = fa->fa_next;
- call_rcu(&fa->fa_rcu, fasync_free_rcu);
+ kfree_rcu(fa, fa_rcu);
filp->f_flags &= ~FASYNC;
result = 1;
break;
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 18b3ba8dc8ea..57a12614addf 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -36,7 +36,7 @@ static long do_sys_name_to_handle(const struct path *path,
if (f_handle.handle_bytes > MAX_HANDLE_SZ)
return -EINVAL;
- handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+ handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
GFP_KERNEL);
if (!handle)
return -ENOMEM;
diff --git a/fs/file_table.c b/fs/file_table.c
index b991f90571b4..6925522faa0a 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -276,21 +276,15 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
}
/**
- * alloc_file - allocate and initialize a 'struct file'
+ * file_init_path - initialize a 'struct file' based on path
*
+ * @file: the file to set up
* @path: the (dentry, vfsmount) pair for the new file
- * @flags: O_... flags with which the new file will be opened
* @fop: the 'struct file_operations' for the new file
*/
-static struct file *alloc_file(const struct path *path, int flags,
- const struct file_operations *fop)
+static void file_init_path(struct file *file, const struct path *path,
+ const struct file_operations *fop)
{
- struct file *file;
-
- file = alloc_empty_file(flags, current_cred());
- if (IS_ERR(file))
- return file;
-
file->f_path = *path;
file->f_inode = path->dentry->d_inode;
file->f_mapping = path->dentry->d_inode->i_mapping;
@@ -309,22 +303,51 @@ static struct file *alloc_file(const struct path *path, int flags,
file->f_op = fop;
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(path->dentry->d_inode);
+}
+
+/**
+ * alloc_file - allocate and initialize a 'struct file'
+ *
+ * @path: the (dentry, vfsmount) pair for the new file
+ * @flags: O_... flags with which the new file will be opened
+ * @fop: the 'struct file_operations' for the new file
+ */
+static struct file *alloc_file(const struct path *path, int flags,
+ const struct file_operations *fop)
+{
+ struct file *file;
+
+ file = alloc_empty_file(flags, current_cred());
+ if (!IS_ERR(file))
+ file_init_path(file, path, fop);
return file;
}
-struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
- const char *name, int flags,
- const struct file_operations *fops)
+static inline int alloc_path_pseudo(const char *name, struct inode *inode,
+ struct vfsmount *mnt, struct path *path)
{
struct qstr this = QSTR_INIT(name, strlen(name));
+
+ path->dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
+ if (!path->dentry)
+ return -ENOMEM;
+ path->mnt = mntget(mnt);
+ d_instantiate(path->dentry, inode);
+ return 0;
+}
+
+struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
+ const char *name, int flags,
+ const struct file_operations *fops)
+{
+ int ret;
struct path path;
struct file *file;
- path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
- if (!path.dentry)
- return ERR_PTR(-ENOMEM);
- path.mnt = mntget(mnt);
- d_instantiate(path.dentry, inode);
+ ret = alloc_path_pseudo(name, inode, mnt, &path);
+ if (ret)
+ return ERR_PTR(ret);
+
file = alloc_file(&path, flags, fops);
if (IS_ERR(file)) {
ihold(inode);
@@ -334,6 +357,30 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
}
EXPORT_SYMBOL(alloc_file_pseudo);
+struct file *alloc_file_pseudo_noaccount(struct inode *inode,
+ struct vfsmount *mnt, const char *name,
+ int flags,
+ const struct file_operations *fops)
+{
+ int ret;
+ struct path path;
+ struct file *file;
+
+ ret = alloc_path_pseudo(name, inode, mnt, &path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ file = alloc_empty_file_noaccount(flags, current_cred());
+ if (IS_ERR(file)) {
+ ihold(inode);
+ path_put(&path);
+ return file;
+ }
+ file_init_path(file, &path, fops);
+ return file;
+}
+EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);
+
struct file *alloc_file_clone(struct file *base, int flags,
const struct file_operations *fops)
{
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1767493dffda..e4f17c53ddfc 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -141,6 +141,31 @@ static void wb_wakeup(struct bdi_writeback *wb)
spin_unlock_irq(&wb->work_lock);
}
+/*
+ * This function is used when the first inode for this wb is marked dirty. It
+ * wakes-up the corresponding bdi thread which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
+ * by delaying the wake-up.
+ *
+ * We have to be careful not to postpone flush work if it is scheduled for
+ * earlier. Thus we use queue_delayed_work().
+ */
+static void wb_wakeup_delayed(struct bdi_writeback *wb)
+{
+ unsigned long timeout;
+
+ timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+ spin_lock_irq(&wb->work_lock);
+ if (test_bit(WB_registered, &wb->state))
+ queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+ spin_unlock_irq(&wb->work_lock);
+}
+
static void finish_writeback_work(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
@@ -1675,11 +1700,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
inode->i_state |= I_DIRTY_PAGES;
- else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) {
+ else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) {
if (!(inode->i_state & I_DIRTY_PAGES)) {
- inode->i_state &= ~I_PINNING_FSCACHE_WB;
- wbc->unpinned_fscache_wb = true;
- dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */
+ inode->i_state &= ~I_PINNING_NETFS_WB;
+ wbc->unpinned_netfs_wb = true;
+ dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */
}
}
@@ -1691,7 +1716,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (ret == 0)
ret = err;
}
- wbc->unpinned_fscache_wb = false;
+ wbc->unpinned_netfs_wb = false;
trace_writeback_single_inode(inode, wbc, nr_to_write);
return ret;
}
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index edb3712dcfa5..a4d6ca0b8971 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -83,8 +83,8 @@ static const struct fs_parameter_spec *fs_lookup_key(
}
/*
- * fs_parse - Parse a filesystem configuration parameter
- * @fc: The filesystem context to log errors through.
+ * __fs_parse - Parse a filesystem configuration parameter
+ * @log: The filesystem context to log errors through.
* @desc: The parameter description to use.
* @param: The parameter.
* @result: Where to place the result of the parse
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
deleted file mode 100644
index b313a978ae0a..000000000000
--- a/fs/fscache/Kconfig
+++ /dev/null
@@ -1,40 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config FSCACHE
- tristate "General filesystem local caching manager"
- select NETFS_SUPPORT
- help
- This option enables a generic filesystem caching manager that can be
- used by various network and other filesystems to cache data locally.
- Different sorts of caches can be plugged in, depending on the
- resources available.
-
- See Documentation/filesystems/caching/fscache.rst for more information.
-
-config FSCACHE_STATS
- bool "Gather statistical information on local caching"
- depends on FSCACHE && PROC_FS
- select NETFS_STATS
- help
- This option causes statistical information to be gathered on local
- caching and exported through file:
-
- /proc/fs/fscache/stats
-
- The gathering of statistics adds a certain amount of overhead to
- execution as there are a quite a few stats gathered, and on a
- multi-CPU system these may be on cachelines that keep bouncing
- between CPUs. On the other hand, the stats are very useful for
- debugging purposes. Saying 'Y' here is recommended.
-
- See Documentation/filesystems/caching/fscache.rst for more information.
-
-config FSCACHE_DEBUG
- bool "Debug FS-Cache"
- depends on FSCACHE
- help
- This permits debugging to be dynamically enabled in the local caching
- management module. If this is set, the debugging output may be
- enabled by setting bits in /sys/modules/fscache/parameter/debug.
-
- See Documentation/filesystems/caching/fscache.rst for more information.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
deleted file mode 100644
index afb090ea16c4..000000000000
--- a/fs/fscache/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for general filesystem caching code
-#
-
-fscache-y := \
- cache.o \
- cookie.o \
- io.o \
- main.o \
- volume.o
-
-fscache-$(CONFIG_PROC_FS) += proc.o
-fscache-$(CONFIG_FSCACHE_STATS) += stats.o
-
-obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
deleted file mode 100644
index 1336f517e9b1..000000000000
--- a/fs/fscache/internal.h
+++ /dev/null
@@ -1,277 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* Internal definitions for FS-Cache
- *
- * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#ifdef pr_fmt
-#undef pr_fmt
-#endif
-
-#define pr_fmt(fmt) "FS-Cache: " fmt
-
-#include <linux/slab.h>
-#include <linux/fscache-cache.h>
-#include <trace/events/fscache.h>
-#include <linux/sched.h>
-#include <linux/seq_file.h>
-
-/*
- * cache.c
- */
-#ifdef CONFIG_PROC_FS
-extern const struct seq_operations fscache_caches_seq_ops;
-#endif
-bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
-void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
-struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache);
-void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where);
-
-static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache)
-{
- return smp_load_acquire(&cache->state);
-}
-
-static inline bool fscache_cache_is_live(const struct fscache_cache *cache)
-{
- return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE;
-}
-
-static inline void fscache_set_cache_state(struct fscache_cache *cache,
- enum fscache_cache_state new_state)
-{
- smp_store_release(&cache->state, new_state);
-
-}
-
-static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache,
- enum fscache_cache_state old_state,
- enum fscache_cache_state new_state)
-{
- return try_cmpxchg_release(&cache->state, &old_state, new_state);
-}
-
-/*
- * cookie.c
- */
-extern struct kmem_cache *fscache_cookie_jar;
-#ifdef CONFIG_PROC_FS
-extern const struct seq_operations fscache_cookies_seq_ops;
-#endif
-extern struct timer_list fscache_cookie_lru_timer;
-
-extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
-extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
- enum fscache_access_trace why);
-
-static inline void fscache_see_cookie(struct fscache_cookie *cookie,
- enum fscache_cookie_trace where)
-{
- trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref),
- where);
-}
-
-/*
- * main.c
- */
-extern unsigned fscache_debug;
-
-extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len);
-
-/*
- * proc.c
- */
-#ifdef CONFIG_PROC_FS
-extern int __init fscache_proc_init(void);
-extern void fscache_proc_cleanup(void);
-#else
-#define fscache_proc_init() (0)
-#define fscache_proc_cleanup() do {} while (0)
-#endif
-
-/*
- * stats.c
- */
-#ifdef CONFIG_FSCACHE_STATS
-extern atomic_t fscache_n_volumes;
-extern atomic_t fscache_n_volumes_collision;
-extern atomic_t fscache_n_volumes_nomem;
-extern atomic_t fscache_n_cookies;
-extern atomic_t fscache_n_cookies_lru;
-extern atomic_t fscache_n_cookies_lru_expired;
-extern atomic_t fscache_n_cookies_lru_removed;
-extern atomic_t fscache_n_cookies_lru_dropped;
-
-extern atomic_t fscache_n_acquires;
-extern atomic_t fscache_n_acquires_ok;
-extern atomic_t fscache_n_acquires_oom;
-
-extern atomic_t fscache_n_invalidates;
-
-extern atomic_t fscache_n_relinquishes;
-extern atomic_t fscache_n_relinquishes_retire;
-extern atomic_t fscache_n_relinquishes_dropped;
-
-extern atomic_t fscache_n_resizes;
-extern atomic_t fscache_n_resizes_null;
-
-static inline void fscache_stat(atomic_t *stat)
-{
- atomic_inc(stat);
-}
-
-static inline void fscache_stat_d(atomic_t *stat)
-{
- atomic_dec(stat);
-}
-
-#define __fscache_stat(stat) (stat)
-
-int fscache_stats_show(struct seq_file *m, void *v);
-#else
-
-#define __fscache_stat(stat) (NULL)
-#define fscache_stat(stat) do {} while (0)
-#define fscache_stat_d(stat) do {} while (0)
-#endif
-
-/*
- * volume.c
- */
-#ifdef CONFIG_PROC_FS
-extern const struct seq_operations fscache_volumes_seq_ops;
-#endif
-
-struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
- enum fscache_volume_trace where);
-void fscache_put_volume(struct fscache_volume *volume,
- enum fscache_volume_trace where);
-bool fscache_begin_volume_access(struct fscache_volume *volume,
- struct fscache_cookie *cookie,
- enum fscache_access_trace why);
-void fscache_create_volume(struct fscache_volume *volume, bool wait);
-
-
-/*****************************************************************************/
-/*
- * debug tracing
- */
-#define dbgprintk(FMT, ...) \
- printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
-
-#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
-
-#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
-
-#ifdef __KDEBUG
-#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
-#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
-#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
-
-#elif defined(CONFIG_FSCACHE_DEBUG)
-#define _enter(FMT, ...) \
-do { \
- if (__do_kdebug(ENTER)) \
- kenter(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#define _leave(FMT, ...) \
-do { \
- if (__do_kdebug(LEAVE)) \
- kleave(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#define _debug(FMT, ...) \
-do { \
- if (__do_kdebug(DEBUG)) \
- kdebug(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#else
-#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
-#endif
-
-/*
- * determine whether a particular optional debugging point should be logged
- * - we need to go through three steps to persuade cpp to correctly join the
- * shorthand in FSCACHE_DEBUG_LEVEL with its prefix
- */
-#define ____do_kdebug(LEVEL, POINT) \
- unlikely((fscache_debug & \
- (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3))))
-#define ___do_kdebug(LEVEL, POINT) \
- ____do_kdebug(LEVEL, POINT)
-#define __do_kdebug(POINT) \
- ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)
-
-#define FSCACHE_DEBUG_CACHE 0
-#define FSCACHE_DEBUG_COOKIE 1
-#define FSCACHE_DEBUG_OBJECT 2
-#define FSCACHE_DEBUG_OPERATION 3
-
-#define FSCACHE_POINT_ENTER 1
-#define FSCACHE_POINT_LEAVE 2
-#define FSCACHE_POINT_DEBUG 4
-
-#ifndef FSCACHE_DEBUG_LEVEL
-#define FSCACHE_DEBUG_LEVEL CACHE
-#endif
-
-/*
- * assertions
- */
-#if 1 /* defined(__KDEBUGALL) */
-
-#define ASSERT(X) \
-do { \
- if (unlikely(!(X))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- BUG(); \
- } \
-} while (0)
-
-#define ASSERTCMP(X, OP, Y) \
-do { \
- if (unlikely(!((X) OP (Y)))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- pr_err("%lx " #OP " %lx is false\n", \
- (unsigned long)(X), (unsigned long)(Y)); \
- BUG(); \
- } \
-} while (0)
-
-#define ASSERTIF(C, X) \
-do { \
- if (unlikely((C) && !(X))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- BUG(); \
- } \
-} while (0)
-
-#define ASSERTIFCMP(C, X, OP, Y) \
-do { \
- if (unlikely((C) && !((X) OP (Y)))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- pr_err("%lx " #OP " %lx is false\n", \
- (unsigned long)(X), (unsigned long)(Y)); \
- BUG(); \
- } \
-} while (0)
-
-#else
-
-#define ASSERT(X) do {} while (0)
-#define ASSERTCMP(X, OP, Y) do {} while (0)
-#define ASSERTIF(C, X) do {} while (0)
-#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
-
-#endif /* assert or not */
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 91e89e68177e..b6cad106c37e 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -474,8 +474,7 @@ err:
static void cuse_fc_release(struct fuse_conn *fc)
{
- struct cuse_conn *cc = fc_to_cc(fc);
- kfree_rcu(cc, fc.rcu);
+ kfree(fc_to_cc(fc));
}
/**
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 148a71b8b4d0..c007b0f0c3a7 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2509,14 +2509,14 @@ static int convert_fuse_file_lock(struct fuse_conn *fc,
* translate it into the caller's pid namespace.
*/
rcu_read_lock();
- fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
+ fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
rcu_read_unlock();
break;
default:
return -EIO;
}
- fl->fl_type = ffl->type;
+ fl->c.flc_type = ffl->type;
return 0;
}
@@ -2530,10 +2530,10 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file,
memset(inarg, 0, sizeof(*inarg));
inarg->fh = ff->fh;
- inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
+ inarg->owner = fuse_lock_owner_id(fc, fl->c.flc_owner);
inarg->lk.start = fl->fl_start;
inarg->lk.end = fl->fl_end;
- inarg->lk.type = fl->fl_type;
+ inarg->lk.type = fl->c.flc_type;
inarg->lk.pid = pid;
if (flock)
inarg->lk_flags |= FUSE_LK_FLOCK;
@@ -2570,8 +2570,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_lk_in inarg;
- int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
- struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
+ int opcode = (fl->c.flc_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
+ struct pid *pid = fl->c.flc_type != F_UNLCK ? task_tgid(current) : NULL;
pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
int err;
@@ -2581,7 +2581,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
}
/* Unlock on close is handled by the flush method */
- if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
+ if ((fl->c.flc_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
return 0;
fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1df83eebda92..bcbe34488862 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -888,6 +888,7 @@ struct fuse_mount {
/* Entry on fc->mounts */
struct list_head fc_entry;
+ struct rcu_head rcu;
};
static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2a6d44f91729..516ea2979a90 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -930,6 +930,14 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
+static void delayed_release(struct rcu_head *p)
+{
+ struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu);
+
+ put_user_ns(fc->user_ns);
+ fc->release(fc);
+}
+
void fuse_conn_put(struct fuse_conn *fc)
{
if (refcount_dec_and_test(&fc->count)) {
@@ -941,13 +949,12 @@ void fuse_conn_put(struct fuse_conn *fc)
if (fiq->ops->release)
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
- put_user_ns(fc->user_ns);
bucket = rcu_dereference_protected(fc->curr_bucket, 1);
if (bucket) {
WARN_ON(atomic_read(&bucket->count) != 1);
kfree(bucket);
}
- fc->release(fc);
+ call_rcu(&fc->rcu, delayed_release);
}
}
EXPORT_SYMBOL_GPL(fuse_conn_put);
@@ -1366,7 +1373,7 @@ EXPORT_SYMBOL_GPL(fuse_send_init);
void fuse_free_conn(struct fuse_conn *fc)
{
WARN_ON(!list_empty(&fc->devices));
- kfree_rcu(fc, rcu);
+ kfree(fc);
}
EXPORT_SYMBOL_GPL(fuse_free_conn);
@@ -1902,7 +1909,7 @@ static void fuse_sb_destroy(struct super_block *sb)
void fuse_mount_destroy(struct fuse_mount *fm)
{
fuse_conn_put(fm->fc);
- kfree(fm);
+ kfree_rcu(fm, rcu);
}
EXPORT_SYMBOL(fuse_mount_destroy);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index d9ccfd27e4f1..789af5c8fade 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -2465,7 +2465,7 @@ out:
}
static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
- loff_t offset)
+ loff_t offset, unsigned int len)
{
int ret;
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 177f1f41f225..2e215e8c3c88 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -32,25 +32,21 @@
static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
{
- struct dentry *parent = NULL;
+ struct dentry *parent;
struct gfs2_sbd *sdp;
struct gfs2_inode *dip;
- struct inode *dinode, *inode;
+ struct inode *inode;
struct gfs2_holder d_gh;
struct gfs2_inode *ip = NULL;
int error, valid = 0;
int had_lock = 0;
- if (flags & LOOKUP_RCU) {
- dinode = d_inode_rcu(READ_ONCE(dentry->d_parent));
- if (!dinode)
- return -ECHILD;
- } else {
- parent = dget_parent(dentry);
- dinode = d_inode(parent);
- }
- sdp = GFS2_SB(dinode);
- dip = GFS2_I(dinode);
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ parent = dget_parent(dentry);
+ sdp = GFS2_SB(d_inode(parent));
+ dip = GFS2_I(d_inode(parent));
inode = d_inode(dentry);
if (inode) {
@@ -66,8 +62,7 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
if (!had_lock) {
- error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
- flags & LOOKUP_RCU ? GL_NOBLOCK : 0, &d_gh);
+ error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
if (error)
goto out;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 992ca4effb50..4c42ada60ae7 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1440,10 +1440,10 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- if (!(fl->fl_flags & FL_POSIX))
+ if (!(fl->c.flc_flags & FL_POSIX))
return -ENOLCK;
if (gfs2_withdrawing_or_withdrawn(sdp)) {
- if (fl->fl_type == F_UNLCK)
+ if (lock_is_unlock(fl))
locks_lock_file_wait(file, fl);
return -EIO;
}
@@ -1451,7 +1451,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl);
else if (IS_GETLK(cmd))
return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
- else if (fl->fl_type == F_UNLCK)
+ else if (lock_is_unlock(fl))
return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
else
return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
@@ -1483,7 +1483,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
int error = 0;
int sleeptime;
- state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
+ state = lock_is_write(fl) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
flags = GL_EXACT | GL_NOPID;
if (!IS_SETLKW(cmd))
flags |= LM_FLAG_TRY_1CB;
@@ -1495,8 +1495,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
if (fl_gh->gh_state == state)
goto out;
locks_init_lock(&request);
- request.fl_type = F_UNLCK;
- request.fl_flags = FL_FLOCK;
+ request.c.flc_type = F_UNLCK;
+ request.c.flc_flags = FL_FLOCK;
locks_lock_file_wait(file, &request);
gfs2_glock_dq(fl_gh);
gfs2_holder_reinit(state, flags, fl_gh);
@@ -1557,10 +1557,10 @@ static void do_unflock(struct file *file, struct file_lock *fl)
static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
{
- if (!(fl->fl_flags & FL_FLOCK))
+ if (!(fl->c.flc_flags & FL_FLOCK))
return -ENOLCK;
- if (fl->fl_type == F_UNLCK) {
+ if (lock_is_unlock(fl)) {
do_unflock(file, fl);
return 0;
} else {
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6bfc9383b7b8..1b95db2c3aac 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1882,10 +1882,10 @@ int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode,
WARN_ON_ONCE(!may_not_block);
return -ECHILD;
}
- if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
- int noblock = may_not_block ? GL_NOBLOCK : 0;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
- LM_FLAG_ANY | noblock, &i_gh);
+ if (gfs2_glock_is_locked_by_me(gl) == NULL) {
+ if (may_not_block)
+ return -ECHILD;
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
if (error)
return error;
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1281e60be639..572d58e86296 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -214,7 +214,7 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf)
memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
- memcpy(&s->s_uuid, str->sb_uuid, 16);
+ super_set_uuid(s, str->sb_uuid, 16);
}
/**
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 7ededcb720c1..012a3d003fbe 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -190,6 +190,7 @@ struct hfsplus_sb_info {
int work_queued; /* non-zero delayed work is queued */
struct delayed_work sync_work; /* FS sync delayed work */
spinlock_t work_lock; /* protects sync_work and work_queued */
+ struct rcu_head rcu;
};
#define HFSPLUS_SB_WRITEBACKUP 0
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 1986b4f18a90..97920202790f 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -277,6 +277,14 @@ void hfsplus_mark_mdb_dirty(struct super_block *sb)
spin_unlock(&sbi->work_lock);
}
+static void delayed_free(struct rcu_head *p)
+{
+ struct hfsplus_sb_info *sbi = container_of(p, struct hfsplus_sb_info, rcu);
+
+ unload_nls(sbi->nls);
+ kfree(sbi);
+}
+
static void hfsplus_put_super(struct super_block *sb)
{
struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
@@ -302,9 +310,7 @@ static void hfsplus_put_super(struct super_block *sb)
hfs_btree_close(sbi->ext_tree);
kfree(sbi->s_vhdr_buf);
kfree(sbi->s_backup_vhdr_buf);
- unload_nls(sbi->nls);
- kfree(sb->s_fs_info);
- sb->s_fs_info = NULL;
+ call_rcu(&sbi->rcu, delayed_free);
}
static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index b0cb70400996..ce9346099c72 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -30,7 +30,7 @@ struct hfsplus_wd {
* @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes
* @buf: buffer for I/O
* @data: output pointer for location of requested data
- * @opf: request op flags
+ * @opf: I/O operation type and flags
*
* The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than
* HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ea5b8e57d904..6502c7e776d1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -100,6 +100,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
loff_t len, vma_len;
int ret;
struct hstate *h = hstate_file(file);
+ vm_flags_t vm_flags;
/*
* vma address alignment (but not the pgoff alignment) has
@@ -141,10 +142,20 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
ret = -ENOMEM;
+
+ vm_flags = vma->vm_flags;
+ /*
+ * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
+ * reserving here. Note: only for SHM hugetlbfs file, the inode
+ * flag S_PRIVATE is set.
+ */
+ if (inode->i_flags & S_PRIVATE)
+ vm_flags |= VM_NORESERVE;
+
if (!hugetlb_reserve_pages(inode,
vma->vm_pgoff >> huge_page_order(h),
len >> huge_page_shift(h), vma,
- vma->vm_flags))
+ vm_flags))
goto out;
ret = 0;
@@ -340,7 +351,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
folio_unlock(folio);
- if (!folio_test_has_hwpoisoned(folio))
+ if (!folio_test_hwpoison(folio))
want = nr;
else {
/*
@@ -922,7 +933,7 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap,
unsigned int ia_valid = attr->ia_valid;
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
- error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
+ error = setattr_prepare(idmap, dentry, attr);
if (error)
return error;
@@ -939,7 +950,7 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap,
hugetlb_vmtruncate(inode, newsize);
}
- setattr_copy(&nop_mnt_idmap, inode, attr);
+ setattr_copy(idmap, inode, attr);
mark_inode_dirty(inode);
return 0;
}
@@ -974,6 +985,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
static struct inode *hugetlbfs_get_inode(struct super_block *sb,
+ struct mnt_idmap *idmap,
struct inode *dir,
umode_t mode, dev_t dev)
{
@@ -995,7 +1007,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
inode->i_ino = get_next_ino();
- inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
+ inode_init_owner(idmap, inode, dir, mode);
lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
&hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
@@ -1039,7 +1051,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
{
struct inode *inode;
- inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
+ inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, dev);
if (!inode)
return -ENOSPC;
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
@@ -1051,7 +1063,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
- int retval = hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry,
+ int retval = hugetlbfs_mknod(idmap, dir, dentry,
mode | S_IFDIR, 0);
if (!retval)
inc_nlink(dir);
@@ -1062,7 +1074,7 @@ static int hugetlbfs_create(struct mnt_idmap *idmap,
struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl)
{
- return hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0);
+ return hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
}
static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
@@ -1071,7 +1083,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
{
struct inode *inode;
- inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0);
+ inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode | S_IFREG, 0);
if (!inode)
return -ENOSPC;
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
@@ -1083,10 +1095,11 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap,
struct inode *dir, struct dentry *dentry,
const char *symname)
{
+ const umode_t mode = S_IFLNK|S_IRWXUGO;
struct inode *inode;
int error = -ENOSPC;
- inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
+ inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0);
if (inode) {
int l = strlen(symname)+1;
error = page_symlink(inode, symname, l);
@@ -1354,6 +1367,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
{
struct hugetlbfs_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
+ struct hstate *h;
char *rest;
unsigned long ps;
int opt;
@@ -1398,11 +1412,12 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_pagesize:
ps = memparse(param->string, &rest);
- ctx->hstate = size_to_hstate(ps);
- if (!ctx->hstate) {
+ h = size_to_hstate(ps);
+ if (!h) {
pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
return -EINVAL;
}
+ ctx->hstate = h;
return 0;
case Opt_min_size:
@@ -1553,6 +1568,7 @@ static struct file_system_type hugetlbfs_fs_type = {
.init_fs_context = hugetlbfs_init_fs_context,
.parameters = hugetlb_fs_parameters,
.kill_sb = kill_litter_super,
+ .fs_flags = FS_ALLOW_IDMAP,
};
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
@@ -1606,7 +1622,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
}
file = ERR_PTR(-ENOSPC);
- inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0);
+ /* hugetlbfs_vfsmount[] mounts do not use idmapped mounts. */
+ inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL,
+ S_IFREG | S_IRWXUGO, 0);
if (!inode)
goto out;
if (creat_flags == HUGETLB_SHMFS_INODE)
diff --git a/fs/inode.c b/fs/inode.c
index 91048c4c9c9e..d290f007b3d1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -20,6 +20,7 @@
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/iversion.h>
+#include <linux/rw_hint.h>
#include <trace/events/writeback.h>
#include "internal.h"
@@ -588,7 +589,8 @@ void dump_mapping(const struct address_space *mapping)
}
dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
- if (get_kernel_nofault(dentry, dentry_ptr)) {
+ if (get_kernel_nofault(dentry, dentry_ptr) ||
+ !dentry.d_parent || !dentry.d_name.name) {
pr_warn("aops:%ps ino:%lx invalid dentry:%px\n",
a_ops, ino, dentry_ptr);
return;
@@ -2285,7 +2287,7 @@ void __init inode_init(void)
sizeof(struct inode),
0,
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
init_once);
/* Hash may have been set up in inode_init_early */
@@ -2509,7 +2511,7 @@ struct timespec64 inode_set_ctime_current(struct inode *inode)
{
struct timespec64 now = current_time(inode);
- inode_set_ctime(inode, now.tv_sec, now.tv_nsec);
+ inode_set_ctime_to_ts(inode, now);
return now;
}
EXPORT_SYMBOL(inode_set_ctime_current);
diff --git a/fs/internal.h b/fs/internal.h
index b67406435fc0..49c1fcfee4b3 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -183,6 +183,7 @@ extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);
+long do_ftruncate(struct file *file, loff_t length, int small);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
@@ -310,3 +311,10 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po
struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
void mnt_idmap_put(struct mnt_idmap *idmap);
+struct stashed_operations {
+ void (*put_data)(void *data);
+ void (*init_inode)(struct inode *inode, void *data);
+};
+int path_from_stashed(struct dentry **stashed, unsigned long ino,
+ struct vfsmount *mnt, void *data, struct path *path);
+void stashed_dentry_prune(struct dentry *dentry);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 76cf22ac97d7..1d5abfdf0f22 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -763,6 +763,33 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp)
return err;
}
+static int ioctl_getfsuuid(struct file *file, void __user *argp)
+{
+ struct super_block *sb = file_inode(file)->i_sb;
+ struct fsuuid2 u = { .len = sb->s_uuid_len, };
+
+ if (!sb->s_uuid_len)
+ return -ENOIOCTLCMD;
+
+ memcpy(&u.uuid[0], &sb->s_uuid, sb->s_uuid_len);
+
+ return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0;
+}
+
+static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp)
+{
+ struct super_block *sb = file_inode(file)->i_sb;
+
+ if (!strlen(sb->s_sysfs_name))
+ return -ENOIOCTLCMD;
+
+ struct fs_sysfs_path u = {};
+
+ u.len = scnprintf(u.name, sizeof(u.name), "%s/%s", sb->s_type->name, sb->s_sysfs_name);
+
+ return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0;
+}
+
/*
* do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d.
* It's just a simple helper for sys_ioctl and compat_sys_ioctl.
@@ -845,6 +872,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
case FS_IOC_FSSETXATTR:
return ioctl_fssetxattr(filp, argp);
+ case FS_IOC_GETFSUUID:
+ return ioctl_getfsuuid(filp, argp);
+
+ case FS_IOC_GETFSSYSFSPATH:
+ return ioctl_get_fs_sysfs_path(filp, argp);
+
default:
if (S_ISREG(inode->i_mode))
return file_ioctl(filp, cmd, argp);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 093c4515b22a..4e8e41c8b3c0 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
- * Copyright (C) 2016-2019 Christoph Hellwig.
+ * Copyright (C) 2016-2023 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
@@ -95,6 +95,44 @@ static inline bool ifs_block_is_dirty(struct folio *folio,
return test_bit(block + blks_per_folio, ifs->state);
}
+static unsigned ifs_find_dirty_range(struct folio *folio,
+ struct iomap_folio_state *ifs, u64 *range_start, u64 range_end)
+{
+ struct inode *inode = folio->mapping->host;
+ unsigned start_blk =
+ offset_in_folio(folio, *range_start) >> inode->i_blkbits;
+ unsigned end_blk = min_not_zero(
+ offset_in_folio(folio, range_end) >> inode->i_blkbits,
+ i_blocks_per_folio(inode, folio));
+ unsigned nblks = 1;
+
+ while (!ifs_block_is_dirty(folio, ifs, start_blk))
+ if (++start_blk == end_blk)
+ return 0;
+
+ while (start_blk + nblks < end_blk) {
+ if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks))
+ break;
+ nblks++;
+ }
+
+ *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits);
+ return nblks << inode->i_blkbits;
+}
+
+static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start,
+ u64 range_end)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ if (*range_start >= range_end)
+ return 0;
+
+ if (ifs)
+ return ifs_find_dirty_range(folio, ifs, range_start, range_end);
+ return range_end - *range_start;
+}
+
static void ifs_clear_range_dirty(struct folio *folio,
struct iomap_folio_state *ifs, size_t off, size_t len)
{
@@ -1454,15 +1492,10 @@ out_unlock:
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
- size_t len, int error)
+ size_t len)
{
struct iomap_folio_state *ifs = folio->private;
- if (error) {
- folio_set_error(folio);
- mapping_set_error(inode->i_mapping, error);
- }
-
WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
@@ -1479,40 +1512,29 @@ static u32
iomap_finish_ioend(struct iomap_ioend *ioend, int error)
{
struct inode *inode = ioend->io_inode;
- struct bio *bio = &ioend->io_inline_bio;
- struct bio *last = ioend->io_bio, *next;
- u64 start = bio->bi_iter.bi_sector;
- loff_t offset = ioend->io_offset;
- bool quiet = bio_flagged(bio, BIO_QUIET);
+ struct bio *bio = &ioend->io_bio;
+ struct folio_iter fi;
u32 folio_count = 0;
- for (bio = &ioend->io_inline_bio; bio; bio = next) {
- struct folio_iter fi;
-
- /*
- * For the last bio, bi_private points to the ioend, so we
- * need to explicitly end the iteration here.
- */
- if (bio == last)
- next = NULL;
- else
- next = bio->bi_private;
-
- /* walk all folios in bio, ending page IO on them */
- bio_for_each_folio_all(fi, bio) {
- iomap_finish_folio_write(inode, fi.folio, fi.length,
- error);
- folio_count++;
+ if (error) {
+ mapping_set_error(inode->i_mapping, error);
+ if (!bio_flagged(bio, BIO_QUIET)) {
+ pr_err_ratelimited(
+"%s: writeback error on inode %lu, offset %lld, sector %llu",
+ inode->i_sb->s_id, inode->i_ino,
+ ioend->io_offset, ioend->io_sector);
}
- bio_put(bio);
}
- /* The ioend has been freed by bio_put() */
- if (unlikely(error && !quiet)) {
- printk_ratelimited(KERN_ERR
-"%s: writeback error on inode %lu, offset %lld, sector %llu",
- inode->i_sb->s_id, inode->i_ino, offset, start);
+ /* walk all folios in bio, ending page IO on them */
+ bio_for_each_folio_all(fi, bio) {
+ if (error)
+ folio_set_error(fi.folio);
+ iomap_finish_folio_write(inode, fi.folio, fi.length);
+ folio_count++;
}
+
+ bio_put(bio); /* frees the ioend */
return folio_count;
}
@@ -1553,7 +1575,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_ioends);
static bool
iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
{
- if (ioend->io_bio->bi_status != next->io_bio->bi_status)
+ if (ioend->io_bio.bi_status != next->io_bio.bi_status)
return false;
if ((ioend->io_flags & IOMAP_F_SHARED) ^
(next->io_flags & IOMAP_F_SHARED))
@@ -1618,47 +1640,46 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends);
static void iomap_writepage_end_bio(struct bio *bio)
{
- struct iomap_ioend *ioend = bio->bi_private;
-
- iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
+ iomap_finish_ioend(iomap_ioend_from_bio(bio),
+ blk_status_to_errno(bio->bi_status));
}
/*
* Submit the final bio for an ioend.
*
* If @error is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we've marked pages for writeback
- * and unlocked them. In this situation, we need to fail the bio instead of
- * submitting it. This typically only happens on a filesystem shutdown.
+ * the submission process has failed after we've marked pages for writeback.
+ * We cannot cancel ioend directly in that case, so call the bio end I/O handler
+ * with the error status here to run the normal I/O completion handler to clear
+ * the writeback bit and let the file system proess the errors.
*/
-static int
-iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
- int error)
+static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
{
- ioend->io_bio->bi_private = ioend;
- ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
+ if (!wpc->ioend)
+ return error;
+ /*
+ * Let the file systems prepare the I/O submission and hook in an I/O
+ * comletion handler. This also needs to happen in case after a
+ * failure happened so that the file system end I/O handler gets called
+ * to clean up.
+ */
if (wpc->ops->prepare_ioend)
- error = wpc->ops->prepare_ioend(ioend, error);
+ error = wpc->ops->prepare_ioend(wpc->ioend, error);
+
if (error) {
- /*
- * If we're failing the IO now, just mark the ioend with an
- * error and finish it. This will run IO completion immediately
- * as there is only one reference to the ioend at this point in
- * time.
- */
- ioend->io_bio->bi_status = errno_to_blk_status(error);
- bio_endio(ioend->io_bio);
- return error;
+ wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
+ bio_endio(&wpc->ioend->io_bio);
+ } else {
+ submit_bio(&wpc->ioend->io_bio);
}
- submit_bio(ioend->io_bio);
- return 0;
+ wpc->ioend = NULL;
+ return error;
}
-static struct iomap_ioend *
-iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
- loff_t offset, sector_t sector, struct writeback_control *wbc)
+static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct inode *inode, loff_t pos)
{
struct iomap_ioend *ioend;
struct bio *bio;
@@ -1666,63 +1687,42 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
REQ_OP_WRITE | wbc_to_write_flags(wbc),
GFP_NOFS, &iomap_ioend_bioset);
- bio->bi_iter.bi_sector = sector;
+ bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
+ bio->bi_end_io = iomap_writepage_end_bio;
wbc_init_bio(wbc, bio);
+ bio->bi_write_hint = inode->i_write_hint;
- ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
+ ioend = iomap_ioend_from_bio(bio);
INIT_LIST_HEAD(&ioend->io_list);
ioend->io_type = wpc->iomap.type;
ioend->io_flags = wpc->iomap.flags;
ioend->io_inode = inode;
ioend->io_size = 0;
- ioend->io_folios = 0;
- ioend->io_offset = offset;
- ioend->io_bio = bio;
- ioend->io_sector = sector;
- return ioend;
-}
-
-/*
- * Allocate a new bio, and chain the old bio to the new one.
- *
- * Note that we have to perform the chaining in this unintuitive order
- * so that the bi_private linkage is set up in the right direction for the
- * traversal in iomap_finish_ioend().
- */
-static struct bio *
-iomap_chain_bio(struct bio *prev)
-{
- struct bio *new;
-
- new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
- bio_clone_blkg_association(new, prev);
- new->bi_iter.bi_sector = bio_end_sector(prev);
+ ioend->io_offset = pos;
+ ioend->io_sector = bio->bi_iter.bi_sector;
- bio_chain(prev, new);
- bio_get(prev); /* for iomap_finish_ioend */
- submit_bio(prev);
- return new;
+ wpc->nr_folios = 0;
+ return ioend;
}
-static bool
-iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
- sector_t sector)
+static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
{
if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
(wpc->ioend->io_flags & IOMAP_F_SHARED))
return false;
if (wpc->iomap.type != wpc->ioend->io_type)
return false;
- if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
+ if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
return false;
- if (sector != bio_end_sector(wpc->ioend->io_bio))
+ if (iomap_sector(&wpc->iomap, pos) !=
+ bio_end_sector(&wpc->ioend->io_bio))
return false;
/*
* Limit ioend bio chain lengths to minimise IO completion latency. This
* also prevents long tight loops ending page writeback on all the
* folios in the ioend.
*/
- if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE)
+ if (wpc->nr_folios >= IOEND_BATCH_SIZE)
return false;
return true;
}
@@ -1730,255 +1730,238 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
/*
* Test to see if we have an existing ioend structure that we could append to
* first; otherwise finish off the current ioend and start another.
+ *
+ * If a new ioend is created and cached, the old ioend is submitted to the block
+ * layer instantly. Batching optimisations are provided by higher level block
+ * plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
*/
-static void
-iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
- struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct list_head *iolist)
+static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct folio *folio,
+ struct inode *inode, loff_t pos, unsigned len)
{
- sector_t sector = iomap_sector(&wpc->iomap, pos);
- unsigned len = i_blocksize(inode);
+ struct iomap_folio_state *ifs = folio->private;
size_t poff = offset_in_folio(folio, pos);
+ int error;
- if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) {
- if (wpc->ioend)
- list_add(&wpc->ioend->io_list, iolist);
- wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc);
+ if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
+new_ioend:
+ error = iomap_submit_ioend(wpc, 0);
+ if (error)
+ return error;
+ wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
}
- if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) {
- wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio);
- bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff);
- }
+ if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
+ goto new_ioend;
if (ifs)
atomic_add(len, &ifs->write_bytes_pending);
wpc->ioend->io_size += len;
wbc_account_cgroup_owner(wbc, &folio->page, len);
+ return 0;
}
-/*
- * We implement an immediate ioend submission policy here to avoid needing to
- * chain multiple ioends and hence nest mempool allocations which can violate
- * the forward progress guarantees we need to provide. The current ioend we're
- * adding blocks to is cached in the writepage context, and if the new block
- * doesn't append to the cached ioend, it will create a new ioend and cache that
- * instead.
- *
- * If a new ioend is created and cached, the old ioend is returned and queued
- * locally for submission once the entire page is processed or an error has been
- * detected. While ioends are submitted immediately after they are completed,
- * batching optimisations are provided by higher level block plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
-static int
-iomap_writepage_map(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct inode *inode,
- struct folio *folio, u64 end_pos)
+static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct folio *folio,
+ struct inode *inode, u64 pos, unsigned dirty_len,
+ unsigned *count)
{
- struct iomap_folio_state *ifs = folio->private;
- struct iomap_ioend *ioend, *next;
- unsigned len = i_blocksize(inode);
- unsigned nblocks = i_blocks_per_folio(inode, folio);
- u64 pos = folio_pos(folio);
- int error = 0, count = 0, i;
- LIST_HEAD(submit_list);
-
- WARN_ON_ONCE(end_pos <= pos);
-
- if (!ifs && nblocks > 1) {
- ifs = ifs_alloc(inode, folio, 0);
- iomap_set_range_dirty(folio, 0, end_pos - pos);
- }
+ int error;
- WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
-
- /*
- * Walk through the folio to find areas to write back. If we
- * run off the end of the current map or find the current map
- * invalid, grab a new one.
- */
- for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
- if (ifs && !ifs_block_is_dirty(folio, ifs, i))
- continue;
+ do {
+ unsigned map_len;
- error = wpc->ops->map_blocks(wpc, inode, pos);
+ error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len);
if (error)
break;
- trace_iomap_writepage_map(inode, &wpc->iomap);
- if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
- continue;
- if (wpc->iomap.type == IOMAP_HOLE)
- continue;
- iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc,
- &submit_list);
- count++;
- }
- if (count)
- wpc->ioend->io_folios++;
+ trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap);
- WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
- WARN_ON_ONCE(!folio_test_locked(folio));
- WARN_ON_ONCE(folio_test_writeback(folio));
- WARN_ON_ONCE(folio_test_dirty(folio));
+ map_len = min_t(u64, dirty_len,
+ wpc->iomap.offset + wpc->iomap.length - pos);
+ WARN_ON_ONCE(!folio->private && map_len < dirty_len);
+
+ switch (wpc->iomap.type) {
+ case IOMAP_INLINE:
+ WARN_ON_ONCE(1);
+ error = -EIO;
+ break;
+ case IOMAP_HOLE:
+ break;
+ default:
+ error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
+ map_len);
+ if (!error)
+ (*count)++;
+ break;
+ }
+ dirty_len -= map_len;
+ pos += map_len;
+ } while (dirty_len && !error);
/*
* We cannot cancel the ioend directly here on error. We may have
* already set other pages under writeback and hence we have to run I/O
* completion to mark the error state of the pages under writeback
* appropriately.
+ *
+ * Just let the file system know what portion of the folio failed to
+ * map.
*/
- if (unlikely(error)) {
- /*
- * Let the filesystem know what portion of the current page
- * failed to map. If the page hasn't been added to ioend, it
- * won't be affected by I/O completion and we must unlock it
- * now.
- */
- if (wpc->ops->discard_folio)
- wpc->ops->discard_folio(folio, pos);
- if (!count) {
- folio_unlock(folio);
- goto done;
- }
- }
-
- /*
- * We can have dirty bits set past end of file in page_mkwrite path
- * while mapping the last partial folio. Hence it's better to clear
- * all the dirty bits in the folio here.
- */
- iomap_clear_range_dirty(folio, 0, folio_size(folio));
- folio_start_writeback(folio);
- folio_unlock(folio);
-
- /*
- * Preserve the original error if there was one; catch
- * submission errors here and propagate into subsequent ioend
- * submissions.
- */
- list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
- int error2;
-
- list_del_init(&ioend->io_list);
- error2 = iomap_submit_ioend(wpc, ioend, error);
- if (error2 && !error)
- error = error2;
- }
-
- /*
- * We can end up here with no error and nothing to write only if we race
- * with a partial page truncate on a sub-page block sized filesystem.
- */
- if (!count)
- folio_end_writeback(folio);
-done:
- mapping_set_error(inode->i_mapping, error);
+ if (error && wpc->ops->discard_folio)
+ wpc->ops->discard_folio(folio, pos);
return error;
}
/*
- * Write out a dirty page.
+ * Check interaction of the folio with the file end.
*
- * For delalloc space on the page, we need to allocate space and flush it.
- * For unwritten space on the page, we need to start the conversion to
- * regular allocated space.
+ * If the folio is entirely beyond i_size, return false. If it straddles
+ * i_size, adjust end_pos and zero all data beyond i_size.
*/
-static int iomap_do_writepage(struct folio *folio,
- struct writeback_control *wbc, void *data)
+static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
+ u64 *end_pos)
{
- struct iomap_writepage_ctx *wpc = data;
- struct inode *inode = folio->mapping->host;
- u64 end_pos, isize;
-
- trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio));
+ u64 isize = i_size_read(inode);
- /*
- * Refuse to write the folio out if we're called from reclaim context.
- *
- * This avoids stack overflows when called from deeply used stacks in
- * random callers for direct reclaim or memcg reclaim. We explicitly
- * allow reclaim from kswapd as the stack usage there is relatively low.
- *
- * This should never happen except in the case of a VM regression so
- * warn about it.
- */
- if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
- PF_MEMALLOC))
- goto redirty;
-
- /*
- * Is this folio beyond the end of the file?
- *
- * The folio index is less than the end_index, adjust the end_pos
- * to the highest offset that this folio should represent.
- * -----------------------------------------------------
- * | file mapping | <EOF> |
- * -----------------------------------------------------
- * | Page ... | Page N-2 | Page N-1 | Page N | |
- * ^--------------------------------^----------|--------
- * | desired writeback range | see else |
- * ---------------------------------^------------------|
- */
- isize = i_size_read(inode);
- end_pos = folio_pos(folio) + folio_size(folio);
- if (end_pos > isize) {
- /*
- * Check whether the page to write out is beyond or straddles
- * i_size or not.
- * -------------------------------------------------------
- * | file mapping | <EOF> |
- * -------------------------------------------------------
- * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
- * ^--------------------------------^-----------|---------
- * | | Straddles |
- * ---------------------------------^-----------|--------|
- */
+ if (*end_pos > isize) {
size_t poff = offset_in_folio(folio, isize);
pgoff_t end_index = isize >> PAGE_SHIFT;
/*
- * Skip the page if it's fully outside i_size, e.g.
- * due to a truncate operation that's in progress. We've
- * cleaned this page and truncate will finish things off for
- * us.
+ * If the folio is entirely ouside of i_size, skip it.
+ *
+ * This can happen due to a truncate operation that is in
+ * progress and in that case truncate will finish it off once
+ * we've dropped the folio lock.
*
- * Note that the end_index is unsigned long. If the given
- * offset is greater than 16TB on a 32-bit system then if we
- * checked if the page is fully outside i_size with
- * "if (page->index >= end_index + 1)", "end_index + 1" would
- * overflow and evaluate to 0. Hence this page would be
+ * Note that the pgoff_t used for end_index is an unsigned long.
+ * If the given offset is greater than 16TB on a 32-bit system,
+ * then if we checked if the folio is fully outside i_size with
+ * "if (folio->index >= end_index + 1)", "end_index + 1" would
+ * overflow and evaluate to 0. Hence this folio would be
* redirtied and written out repeatedly, which would result in
* an infinite loop; the user program performing this operation
* would hang. Instead, we can detect this situation by
- * checking if the page is totally beyond i_size or if its
+ * checking if the folio is totally beyond i_size or if its
* offset is just equal to the EOF.
*/
if (folio->index > end_index ||
(folio->index == end_index && poff == 0))
- goto unlock;
+ return false;
/*
- * The page straddles i_size. It must be zeroed out on each
- * and every writepage invocation because it may be mmapped.
- * "A file is mapped in multiples of the page size. For a file
- * that is not a multiple of the page size, the remaining
- * memory is zeroed when mapped, and writes to that region are
- * not written out to the file."
+ * The folio straddles i_size.
+ *
+ * It must be zeroed out on each and every writepage invocation
+ * because it may be mmapped:
+ *
+ * A file is mapped in multiples of the page size. For a
+ * file that is not a multiple of the page size, the
+ * remaining memory is zeroed when mapped, and writes to that
+ * region are not written out to the file.
+ *
+ * Also adjust the writeback range to skip all blocks entirely
+ * beyond i_size.
*/
folio_zero_segment(folio, poff, folio_size(folio));
- end_pos = isize;
+ *end_pos = round_up(isize, i_blocksize(inode));
+ }
+
+ return true;
+}
+
+static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct folio *folio)
+{
+ struct iomap_folio_state *ifs = folio->private;
+ struct inode *inode = folio->mapping->host;
+ u64 pos = folio_pos(folio);
+ u64 end_pos = pos + folio_size(folio);
+ unsigned count = 0;
+ int error = 0;
+ u32 rlen;
+
+ WARN_ON_ONCE(!folio_test_locked(folio));
+ WARN_ON_ONCE(folio_test_dirty(folio));
+ WARN_ON_ONCE(folio_test_writeback(folio));
+
+ trace_iomap_writepage(inode, pos, folio_size(folio));
+
+ if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) {
+ folio_unlock(folio);
+ return 0;
+ }
+ WARN_ON_ONCE(end_pos <= pos);
+
+ if (i_blocks_per_folio(inode, folio) > 1) {
+ if (!ifs) {
+ ifs = ifs_alloc(inode, folio, 0);
+ iomap_set_range_dirty(folio, 0, end_pos - pos);
+ }
+
+ /*
+ * Keep the I/O completion handler from clearing the writeback
+ * bit until we have submitted all blocks by adding a bias to
+ * ifs->write_bytes_pending, which is dropped after submitting
+ * all blocks.
+ */
+ WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
+ atomic_inc(&ifs->write_bytes_pending);
}
- return iomap_writepage_map(wpc, wbc, inode, folio, end_pos);
+ /*
+ * Set the writeback bit ASAP, as the I/O completion for the single
+ * block per folio case happen hit as soon as we're submitting the bio.
+ */
+ folio_start_writeback(folio);
-redirty:
- folio_redirty_for_writepage(wbc, folio);
-unlock:
+ /*
+ * Walk through the folio to find dirty areas to write back.
+ */
+ while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) {
+ error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
+ pos, rlen, &count);
+ if (error)
+ break;
+ pos += rlen;
+ }
+
+ if (count)
+ wpc->nr_folios++;
+
+ /*
+ * We can have dirty bits set past end of file in page_mkwrite path
+ * while mapping the last partial folio. Hence it's better to clear
+ * all the dirty bits in the folio here.
+ */
+ iomap_clear_range_dirty(folio, 0, folio_size(folio));
+
+ /*
+ * Usually the writeback bit is cleared by the I/O completion handler.
+ * But we may end up either not actually writing any blocks, or (when
+ * there are multiple blocks in a folio) all I/O might have finished
+ * already at this point. In that case we need to clear the writeback
+ * bit ourselves right after unlocking the page.
+ */
folio_unlock(folio);
- return 0;
+ if (ifs) {
+ if (atomic_dec_and_test(&ifs->write_bytes_pending))
+ folio_end_writeback(folio);
+ } else {
+ if (!count)
+ folio_end_writeback(folio);
+ }
+ mapping_set_error(inode->i_mapping, error);
+ return error;
+}
+
+static int iomap_do_writepage(struct folio *folio,
+ struct writeback_control *wbc, void *data)
+{
+ return iomap_writepage_map(data, wbc, folio);
}
int
@@ -1988,18 +1971,24 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
{
int ret;
+ /*
+ * Writeback from reclaim context should never happen except in the case
+ * of a VM regression so warn about it and refuse to write the data.
+ */
+ if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) ==
+ PF_MEMALLOC))
+ return -EIO;
+
wpc->ops = ops;
ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
- if (!wpc->ioend)
- return ret;
- return iomap_submit_ioend(wpc, wpc->ioend, ret);
+ return iomap_submit_ioend(wpc, ret);
}
EXPORT_SYMBOL_GPL(iomap_writepages);
static int __init iomap_init(void)
{
return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
- offsetof(struct iomap_ioend, io_inline_bio),
+ offsetof(struct iomap_ioend, io_bio),
BIOSET_NEED_BVECS);
}
fs_initcall(iomap_init);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index bcd3f8cf5ea4..f3b43d223a46 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -380,6 +380,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
+ bio->bi_write_hint = inode->i_write_hint;
bio->bi_ioprio = dio->iocb->ki_ioprio;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index c16fd55f5595..0a991c4ce87d 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -154,7 +154,48 @@ DEFINE_EVENT(iomap_class, name, \
TP_ARGS(inode, iomap))
DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
DEFINE_IOMAP_EVENT(iomap_iter_srcmap);
-DEFINE_IOMAP_EVENT(iomap_writepage_map);
+
+TRACE_EVENT(iomap_writepage_map,
+ TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len,
+ struct iomap *iomap),
+ TP_ARGS(inode, pos, dirty_len, iomap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, ino)
+ __field(u64, pos)
+ __field(u64, dirty_len)
+ __field(u64, addr)
+ __field(loff_t, offset)
+ __field(u64, length)
+ __field(u16, type)
+ __field(u16, flags)
+ __field(dev_t, bdev)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->pos = pos;
+ __entry->dirty_len = dirty_len;
+ __entry->addr = iomap->addr;
+ __entry->offset = iomap->offset;
+ __entry->length = iomap->length;
+ __entry->type = iomap->type;
+ __entry->flags = iomap->flags;
+ __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx "
+ "addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ MAJOR(__entry->bdev), MINOR(__entry->bdev),
+ __entry->pos,
+ __entry->dirty_len,
+ __entry->addr,
+ __entry->offset,
+ __entry->length,
+ __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS),
+ __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS))
+);
TRACE_EVENT(iomap_iter,
TP_PROTO(struct iomap_iter *iter, const void *ops,
@@ -165,6 +206,7 @@ TRACE_EVENT(iomap_iter,
__field(u64, ino)
__field(loff_t, pos)
__field(u64, length)
+ __field(s64, processed)
__field(unsigned int, flags)
__field(const void *, ops)
__field(unsigned long, caller)
@@ -174,15 +216,17 @@ TRACE_EVENT(iomap_iter,
__entry->ino = iter->inode->i_ino;
__entry->pos = iter->pos;
__entry->length = iomap_length(iter);
+ __entry->processed = iter->processed;
__entry->flags = iter->flags;
__entry->ops = ops;
__entry->caller = caller;
),
- TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx flags %s (0x%x) ops %ps caller %pS",
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pos,
__entry->length,
+ __entry->processed,
__print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
__entry->flags,
__entry->ops,
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 8eec84c651bf..cb3cda1390ad 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -2763,9 +2763,7 @@ static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl)
* leafno - the number of the leaf to be updated.
* newval - the new value for the leaf.
*
- * RETURN VALUES:
- * 0 - success
- * -EIO - i/o error
+ * RETURN VALUES: none
*/
static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl)
{
@@ -2792,10 +2790,6 @@ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl)
* get the buddy size (number of words covered) of
* the new value.
*/
-
- if ((newval - tp->dmt_budmin) > BUDMIN)
- return -EIO;
-
budsz = BUDSIZE(newval, tp->dmt_budmin);
/* try to join.
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index cb6d1fda66a7..73389c68e251 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1058,7 +1058,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync)
int lmLogOpen(struct super_block *sb)
{
int rc;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct jfs_log *log;
struct jfs_sb_info *sbi = JFS_SBI(sb);
@@ -1070,7 +1070,7 @@ int lmLogOpen(struct super_block *sb)
mutex_lock(&jfs_log_mutex);
list_for_each_entry(log, &jfs_external_logs, journal_list) {
- if (log->bdev_handle->bdev->bd_dev == sbi->logdev) {
+ if (file_bdev(log->bdev_file)->bd_dev == sbi->logdev) {
if (!uuid_equal(&log->uuid, &sbi->loguuid)) {
jfs_warn("wrong uuid on JFS journal");
mutex_unlock(&jfs_log_mutex);
@@ -1100,14 +1100,14 @@ int lmLogOpen(struct super_block *sb)
* file systems to log may have n-to-1 relationship;
*/
- bdev_handle = bdev_open_by_dev(sbi->logdev,
+ bdev_file = bdev_file_open_by_dev(sbi->logdev,
BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL);
- if (IS_ERR(bdev_handle)) {
- rc = PTR_ERR(bdev_handle);
+ if (IS_ERR(bdev_file)) {
+ rc = PTR_ERR(bdev_file);
goto free;
}
- log->bdev_handle = bdev_handle;
+ log->bdev_file = bdev_file;
uuid_copy(&log->uuid, &sbi->loguuid);
/*
@@ -1141,7 +1141,7 @@ journal_found:
lbmLogShutdown(log);
close: /* close external log device */
- bdev_release(bdev_handle);
+ fput(bdev_file);
free: /* free log descriptor */
mutex_unlock(&jfs_log_mutex);
@@ -1162,7 +1162,7 @@ static int open_inline_log(struct super_block *sb)
init_waitqueue_head(&log->syncwait);
set_bit(log_INLINELOG, &log->flag);
- log->bdev_handle = sb->s_bdev_handle;
+ log->bdev_file = sb->s_bdev_file;
log->base = addressPXD(&JFS_SBI(sb)->logpxd);
log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >>
(L2LOGPSIZE - sb->s_blocksize_bits);
@@ -1436,7 +1436,7 @@ int lmLogClose(struct super_block *sb)
{
struct jfs_sb_info *sbi = JFS_SBI(sb);
struct jfs_log *log = sbi->log;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
int rc = 0;
jfs_info("lmLogClose: log:0x%p", log);
@@ -1482,10 +1482,10 @@ int lmLogClose(struct super_block *sb)
* external log as separate logical volume
*/
list_del(&log->journal_list);
- bdev_handle = log->bdev_handle;
+ bdev_file = log->bdev_file;
rc = lmLogShutdown(log);
- bdev_release(bdev_handle);
+ fput(bdev_file);
kfree(log);
@@ -1972,7 +1972,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bp->l_flag |= lbmREAD;
- bio = bio_alloc(log->bdev_handle->bdev, 1, REQ_OP_READ, GFP_NOFS);
+ bio = bio_alloc(file_bdev(log->bdev_file), 1, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
__bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
@@ -2115,7 +2115,7 @@ static void lbmStartIO(struct lbuf * bp)
jfs_info("lbmStartIO");
if (!log->no_integrity)
- bdev = log->bdev_handle->bdev;
+ bdev = file_bdev(log->bdev_file);
bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC,
GFP_NOFS);
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
index 84aa2d253907..8b8994e48cd0 100644
--- a/fs/jfs/jfs_logmgr.h
+++ b/fs/jfs/jfs_logmgr.h
@@ -356,7 +356,7 @@ struct jfs_log {
* before writing syncpt.
*/
struct list_head journal_list; /* Global list */
- struct bdev_handle *bdev_handle; /* 4: log lv pointer */
+ struct file *bdev_file; /* 4: log lv pointer */
int serial; /* 4: log mount serial number */
s64 base; /* @8: log extent address (inline log ) */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 9b5c6a20b30c..98f9a432c336 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -431,7 +431,7 @@ int updateSuper(struct super_block *sb, uint state)
if (state == FM_MOUNT) {
/* record log's dev_t and mount serial number */
j_sb->s_logdev = cpu_to_le32(
- new_encode_dev(sbi->log->bdev_handle->bdev->bd_dev));
+ new_encode_dev(file_bdev(sbi->log->bdev_file)->bd_dev));
j_sb->s_logserial = cpu_to_le32(sbi->log->serial);
} else if (state == FM_CLEAN) {
/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 8d8e556bd610..73f09a762b79 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -932,7 +932,7 @@ static int __init init_jfs_fs(void)
jfs_inode_cachep =
kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info),
- 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
+ 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
offsetof(struct jfs_inode_info, i_inline_all),
sizeof_field(struct jfs_inode_info, i_inline_all),
init_once);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 8b2bd65d70e7..bce1d7ac95ca 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -54,9 +54,9 @@ static bool kernfs_lockdep(struct kernfs_node *kn)
static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
{
if (!kn)
- return strlcpy(buf, "(null)", buflen);
+ return strscpy(buf, "(null)", buflen);
- return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
+ return strscpy(buf, kn->parent ? kn->name : "/", buflen);
}
/* kernfs_node_depth - compute depth from @from to @to */
@@ -127,7 +127,7 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
*
* [3] when @kn_to is %NULL result will be "(null)"
*
- * Return: the length of the full path. If the full length is equal to or
+ * Return: the length of the constructed path. If the path would have been
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
@@ -138,16 +138,17 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
struct kernfs_node *kn, *common;
const char parent_str[] = "/..";
size_t depth_from, depth_to, len = 0;
+ ssize_t copied;
int i, j;
if (!kn_to)
- return strlcpy(buf, "(null)", buflen);
+ return strscpy(buf, "(null)", buflen);
if (!kn_from)
kn_from = kernfs_root(kn_to)->kn;
if (kn_from == kn_to)
- return strlcpy(buf, "/", buflen);
+ return strscpy(buf, "/", buflen);
common = kernfs_common_ancestor(kn_from, kn_to);
if (WARN_ON(!common))
@@ -158,18 +159,19 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
buf[0] = '\0';
- for (i = 0; i < depth_from; i++)
- len += strlcpy(buf + len, parent_str,
- len < buflen ? buflen - len : 0);
+ for (i = 0; i < depth_from; i++) {
+ copied = strscpy(buf + len, parent_str, buflen - len);
+ if (copied < 0)
+ return copied;
+ len += copied;
+ }
/* Calculate how many bytes we need for the rest */
for (i = depth_to - 1; i >= 0; i--) {
for (kn = kn_to, j = 0; j < i; j++)
kn = kn->parent;
- len += strlcpy(buf + len, "/",
- len < buflen ? buflen - len : 0);
- len += strlcpy(buf + len, kn->name,
- len < buflen ? buflen - len : 0);
+
+ len += scnprintf(buf + len, buflen - len, "/%s", kn->name);
}
return len;
@@ -182,12 +184,12 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
* @buflen: size of @buf
*
* Copies the name of @kn into @buf of @buflen bytes. The behavior is
- * similar to strlcpy().
+ * similar to strscpy().
*
* Fills buffer with "(null)" if @kn is %NULL.
*
- * Return: the length of @kn's name and if @buf isn't long enough,
- * it's filled up to @buflen-1 and nul terminated.
+ * Return: the resulting length of @buf. If @buf isn't long enough,
+ * it's filled up to @buflen-1 and nul terminated, and returns -E2BIG.
*
* This function can be called from any context.
*/
@@ -214,7 +216,7 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
* path (which includes '..'s) as needed to reach from @from to @to is
* returned.
*
- * Return: the length of the full path. If the full length is equal to or
+ * Return: the length of the constructed path. If the path would have been
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
@@ -265,12 +267,10 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf,
sizeof(kernfs_pr_cont_buf));
if (sz < 0) {
- pr_cont("(error)");
- goto out;
- }
-
- if (sz >= sizeof(kernfs_pr_cont_buf)) {
- pr_cont("(name too long)");
+ if (sz == -E2BIG)
+ pr_cont("(name too long)");
+ else
+ pr_cont("(error)");
goto out;
}
@@ -676,6 +676,18 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
{
struct kernfs_node *kn;
+ if (parent->mode & S_ISGID) {
+ /* this code block imitates inode_init_owner() for
+ * kernfs
+ */
+
+ if (parent->iattr)
+ gid = parent->iattr->ia_gid;
+
+ if (flags & KERNFS_DIR)
+ mode |= S_ISGID;
+ }
+
kn = __kernfs_new_node(kernfs_root(parent), parent,
name, mode, uid, gid, flags);
if (kn) {
@@ -850,16 +862,16 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
const unsigned char *path,
const void *ns)
{
- size_t len;
+ ssize_t len;
char *p, *name;
lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);
spin_lock_irq(&kernfs_pr_cont_lock);
- len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
+ len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
- if (len >= sizeof(kernfs_pr_cont_buf)) {
+ if (len < 0) {
spin_unlock_irq(&kernfs_pr_cont_lock);
return NULL;
}
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index f0cb729e9a97..ffa4565c275a 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -447,7 +447,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
* warnings and we don't want to add spurious locking dependency
* between the two. Check whether mmap is actually implemented
* without grabbing @of->mutex by testing HAS_MMAP flag. See the
- * comment in kernfs_file_open() for more details.
+ * comment in kernfs_fop_open() for more details.
*/
if (!(of->kn->flags & KERNFS_HAS_MMAP))
return -ENODEV;
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 4628edde2e7e..e29f4edf9572 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -125,9 +125,6 @@ static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb,
inode = kernfs_get_inode(sb, kn);
kernfs_put(kn);
- if (!inode)
- return ERR_PTR(-ESTALE);
-
return d_obtain_alias(inode);
}
@@ -361,7 +358,9 @@ int kernfs_get_tree(struct fs_context *fc)
}
sb->s_flags |= SB_ACTIVE;
- uuid_gen(&sb->s_uuid);
+ uuid_t uuid;
+ uuid_gen(&uuid);
+ super_set_uuid(sb, uuid.b, sizeof(uuid));
down_write(&root->kernfs_supers_rwsem);
list_add(&info->node, &info->root->supers);
diff --git a/fs/libfs.c b/fs/libfs.c
index eec6031b0155..0d14ae808fcf 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -23,6 +23,7 @@
#include <linux/fsnotify.h>
#include <linux/unicode.h>
#include <linux/fscrypt.h>
+#include <linux/pidfs.h>
#include <linux/uaccess.h>
@@ -240,17 +241,22 @@ const struct inode_operations simple_dir_inode_operations = {
};
EXPORT_SYMBOL(simple_dir_inode_operations);
-static void offset_set(struct dentry *dentry, u32 offset)
+/* 0 is '.', 1 is '..', so always start with offset 2 or more */
+enum {
+ DIR_OFFSET_MIN = 2,
+};
+
+static void offset_set(struct dentry *dentry, long offset)
{
- dentry->d_fsdata = (void *)((uintptr_t)(offset));
+ dentry->d_fsdata = (void *)offset;
}
-static u32 dentry2offset(struct dentry *dentry)
+static long dentry2offset(struct dentry *dentry)
{
- return (u32)((uintptr_t)(dentry->d_fsdata));
+ return (long)dentry->d_fsdata;
}
-static struct lock_class_key simple_offset_xa_lock;
+static struct lock_class_key simple_offset_lock_class;
/**
* simple_offset_init - initialize an offset_ctx
@@ -259,11 +265,9 @@ static struct lock_class_key simple_offset_xa_lock;
*/
void simple_offset_init(struct offset_ctx *octx)
{
- xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1);
- lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock);
-
- /* 0 is '.', 1 is '..', so always start with offset 2 */
- octx->next_offset = 2;
+ mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE);
+ lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class);
+ octx->next_offset = DIR_OFFSET_MIN;
}
/**
@@ -271,20 +275,19 @@ void simple_offset_init(struct offset_ctx *octx)
* @octx: directory offset ctx to be updated
* @dentry: new dentry being added
*
- * Returns zero on success. @so_ctx and the dentry offset are updated.
+ * Returns zero on success. @octx and the dentry's offset are updated.
* Otherwise, a negative errno value is returned.
*/
int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
{
- static const struct xa_limit limit = XA_LIMIT(2, U32_MAX);
- u32 offset;
+ unsigned long offset;
int ret;
if (dentry2offset(dentry) != 0)
return -EBUSY;
- ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit,
- &octx->next_offset, GFP_KERNEL);
+ ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN,
+ LONG_MAX, &octx->next_offset, GFP_KERNEL);
if (ret < 0)
return ret;
@@ -300,17 +303,49 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
*/
void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
{
- u32 offset;
+ long offset;
offset = dentry2offset(dentry);
if (offset == 0)
return;
- xa_erase(&octx->xa, offset);
+ mtree_erase(&octx->mt, offset);
offset_set(dentry, 0);
}
/**
+ * simple_offset_empty - Check if a dentry can be unlinked
+ * @dentry: dentry to be tested
+ *
+ * Returns 0 if @dentry is a non-empty directory; otherwise returns 1.
+ */
+int simple_offset_empty(struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ struct offset_ctx *octx;
+ struct dentry *child;
+ unsigned long index;
+ int ret = 1;
+
+ if (!inode || !S_ISDIR(inode->i_mode))
+ return ret;
+
+ index = DIR_OFFSET_MIN;
+ octx = inode->i_op->get_offset_ctx(inode);
+ mt_for_each(&octx->mt, child, index, LONG_MAX) {
+ spin_lock(&child->d_lock);
+ if (simple_positive(child)) {
+ spin_unlock(&child->d_lock);
+ ret = 0;
+ break;
+ }
+ spin_unlock(&child->d_lock);
+ }
+
+ return ret;
+}
+
+/**
* simple_offset_rename_exchange - exchange rename with directory offsets
* @old_dir: parent of dentry being moved
* @old_dentry: dentry being moved
@@ -327,8 +362,8 @@ int simple_offset_rename_exchange(struct inode *old_dir,
{
struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
- u32 old_index = dentry2offset(old_dentry);
- u32 new_index = dentry2offset(new_dentry);
+ long old_index = dentry2offset(old_dentry);
+ long new_index = dentry2offset(new_dentry);
int ret;
simple_offset_remove(old_ctx, old_dentry);
@@ -354,9 +389,9 @@ int simple_offset_rename_exchange(struct inode *old_dir,
out_restore:
offset_set(old_dentry, old_index);
- xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL);
+ mtree_store(&old_ctx->mt, old_index, old_dentry, GFP_KERNEL);
offset_set(new_dentry, new_index);
- xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL);
+ mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL);
return ret;
}
@@ -369,7 +404,7 @@ out_restore:
*/
void simple_offset_destroy(struct offset_ctx *octx)
{
- xa_destroy(&octx->xa);
+ mtree_destroy(&octx->mt);
}
/**
@@ -399,15 +434,16 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
/* In this case, ->private_data is protected by f_pos_lock */
file->private_data = NULL;
- return vfs_setpos(file, offset, U32_MAX);
+ return vfs_setpos(file, offset, LONG_MAX);
}
-static struct dentry *offset_find_next(struct xa_state *xas)
+static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset)
{
+ MA_STATE(mas, &octx->mt, offset, offset);
struct dentry *child, *found = NULL;
rcu_read_lock();
- child = xas_next_entry(xas, U32_MAX);
+ child = mas_find(&mas, LONG_MAX);
if (!child)
goto out;
spin_lock(&child->d_lock);
@@ -421,8 +457,8 @@ out:
static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
{
- u32 offset = dentry2offset(dentry);
struct inode *inode = d_inode(dentry);
+ long offset = dentry2offset(dentry);
return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,
inode->i_ino, fs_umode_to_dtype(inode->i_mode));
@@ -430,12 +466,11 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
{
- struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
- XA_STATE(xas, &so_ctx->xa, ctx->pos);
+ struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
struct dentry *dentry;
while (true) {
- dentry = offset_find_next(&xas);
+ dentry = offset_find_next(octx, ctx->pos);
if (!dentry)
return ERR_PTR(-ENOENT);
@@ -444,8 +479,8 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
break;
}
+ ctx->pos = dentry2offset(dentry) + 1;
dput(dentry);
- ctx->pos = xas.xa_index + 1;
}
return NULL;
}
@@ -481,7 +516,7 @@ static int offset_readdir(struct file *file, struct dir_context *ctx)
return 0;
/* In this case, ->private_data is protected by f_pos_lock */
- if (ctx->pos == 2)
+ if (ctx->pos == DIR_OFFSET_MIN)
file->private_data = NULL;
else if (file->private_data == ERR_PTR(-ENOENT))
return 0;
@@ -1580,7 +1615,7 @@ EXPORT_SYMBOL(alloc_anon_inode);
* All arguments are ignored and it just returns -EINVAL.
*/
int
-simple_nosetlease(struct file *filp, int arg, struct file_lock **flp,
+simple_nosetlease(struct file *filp, int arg, struct file_lease **flp,
void **priv)
{
return -EINVAL;
@@ -1704,16 +1739,28 @@ bool is_empty_dir_inode(struct inode *inode)
static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name)
{
- const struct dentry *parent = READ_ONCE(dentry->d_parent);
- const struct inode *dir = READ_ONCE(parent->d_inode);
- const struct super_block *sb = dentry->d_sb;
- const struct unicode_map *um = sb->s_encoding;
- struct qstr qstr = QSTR_INIT(str, len);
+ const struct dentry *parent;
+ const struct inode *dir;
char strbuf[DNAME_INLINE_LEN];
- int ret;
+ struct qstr qstr;
+
+ /*
+ * Attempt a case-sensitive match first. It is cheaper and
+ * should cover most lookups, including all the sane
+ * applications that expect a case-sensitive filesystem.
+ *
+ * This comparison is safe under RCU because the caller
+ * guarantees the consistency between str and len. See
+ * __d_lookup_rcu_op_compare() for details.
+ */
+ if (len == name->len && !memcmp(str, name->name, len))
+ return 0;
+ parent = READ_ONCE(dentry->d_parent);
+ dir = READ_ONCE(parent->d_inode);
if (!dir || !IS_CASEFOLDED(dir))
- goto fallback;
+ return 1;
+
/*
* If the dentry name is stored in-line, then it may be concurrently
* modified by a rename. If this happens, the VFS will eventually retry
@@ -1724,20 +1771,14 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
if (len <= DNAME_INLINE_LEN - 1) {
memcpy(strbuf, str, len);
strbuf[len] = 0;
- qstr.name = strbuf;
+ str = strbuf;
/* prevent compiler from optimizing out the temporary buffer */
barrier();
}
- ret = utf8_strncasecmp(um, name, &qstr);
- if (ret >= 0)
- return ret;
+ qstr.len = len;
+ qstr.name = str;
- if (sb_has_strict_encoding(sb))
- return -EINVAL;
-fallback:
- if (len != name->len)
- return 1;
- return !!memcmp(str, name->name, len);
+ return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr);
}
/**
@@ -1752,7 +1793,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
const struct inode *dir = READ_ONCE(dentry->d_inode);
struct super_block *sb = dentry->d_sb;
const struct unicode_map *um = sb->s_encoding;
- int ret = 0;
+ int ret;
if (!dir || !IS_CASEFOLDED(dir))
return 0;
@@ -1766,73 +1807,45 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
static const struct dentry_operations generic_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
.d_compare = generic_ci_d_compare,
-};
-#endif
-
#ifdef CONFIG_FS_ENCRYPTION
-static const struct dentry_operations generic_encrypted_dentry_ops = {
.d_revalidate = fscrypt_d_revalidate,
+#endif
};
#endif
-#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
-static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
- .d_hash = generic_ci_d_hash,
- .d_compare = generic_ci_d_compare,
+#ifdef CONFIG_FS_ENCRYPTION
+static const struct dentry_operations generic_encrypted_dentry_ops = {
.d_revalidate = fscrypt_d_revalidate,
};
#endif
/**
- * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry
- * @dentry: dentry to set ops on
- *
- * Casefolded directories need d_hash and d_compare set, so that the dentries
- * contained in them are handled case-insensitively. Note that these operations
- * are needed on the parent directory rather than on the dentries in it, and
- * while the casefolding flag can be toggled on and off on an empty directory,
- * dentry_operations can't be changed later. As a result, if the filesystem has
- * casefolding support enabled at all, we have to give all dentries the
- * casefolding operations even if their inode doesn't have the casefolding flag
- * currently (and thus the casefolding ops would be no-ops for now).
- *
- * Encryption works differently in that the only dentry operation it needs is
- * d_revalidate, which it only needs on dentries that have the no-key name flag.
- * The no-key flag can't be set "later", so we don't have to worry about that.
+ * generic_set_sb_d_ops - helper for choosing the set of
+ * filesystem-wide dentry operations for the enabled features
+ * @sb: superblock to be configured
*
- * Finally, to maximize compatibility with overlayfs (which isn't compatible
- * with certain dentry operations) and to avoid taking an unnecessary
- * performance hit, we use custom dentry_operations for each possible
- * combination rather than always installing all operations.
+ * Filesystems supporting casefolding and/or fscrypt can call this
+ * helper at mount-time to configure sb->s_d_op to best set of dentry
+ * operations required for the enabled features. The helper must be
+ * called after these have been configured, but before the root dentry
+ * is created.
*/
-void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
+void generic_set_sb_d_ops(struct super_block *sb)
{
-#ifdef CONFIG_FS_ENCRYPTION
- bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME;
-#endif
#if IS_ENABLED(CONFIG_UNICODE)
- bool needs_ci_ops = dentry->d_sb->s_encoding;
-#endif
-#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
- if (needs_encrypt_ops && needs_ci_ops) {
- d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
+ if (sb->s_encoding) {
+ sb->s_d_op = &generic_ci_dentry_ops;
return;
}
#endif
#ifdef CONFIG_FS_ENCRYPTION
- if (needs_encrypt_ops) {
- d_set_d_op(dentry, &generic_encrypted_dentry_ops);
- return;
- }
-#endif
-#if IS_ENABLED(CONFIG_UNICODE)
- if (needs_ci_ops) {
- d_set_d_op(dentry, &generic_ci_dentry_ops);
+ if (sb->s_cop) {
+ sb->s_d_op = &generic_encrypted_dentry_ops;
return;
}
#endif
}
-EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops);
+EXPORT_SYMBOL(generic_set_sb_d_ops);
/**
* inode_maybe_inc_iversion - increments i_version
@@ -1973,3 +1986,144 @@ struct timespec64 simple_inode_init_ts(struct inode *inode)
return ts;
}
EXPORT_SYMBOL(simple_inode_init_ts);
+
+static inline struct dentry *get_stashed_dentry(struct dentry *stashed)
+{
+ struct dentry *dentry;
+
+ guard(rcu)();
+ dentry = READ_ONCE(stashed);
+ if (!dentry)
+ return NULL;
+ if (!lockref_get_not_dead(&dentry->d_lockref))
+ return NULL;
+ return dentry;
+}
+
+static struct dentry *prepare_anon_dentry(struct dentry **stashed,
+ unsigned long ino,
+ struct super_block *sb,
+ void *data)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+ const struct stashed_operations *sops = sb->s_fs_info;
+
+ dentry = d_alloc_anon(sb);
+ if (!dentry)
+ return ERR_PTR(-ENOMEM);
+
+ inode = new_inode_pseudo(sb);
+ if (!inode) {
+ dput(dentry);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ inode->i_ino = ino;
+ inode->i_flags |= S_IMMUTABLE;
+ inode->i_mode = S_IFREG;
+ simple_inode_init_ts(inode);
+ sops->init_inode(inode, data);
+
+ /* Notice when this is changed. */
+ WARN_ON_ONCE(!S_ISREG(inode->i_mode));
+ WARN_ON_ONCE(!IS_IMMUTABLE(inode));
+
+ /* Store address of location where dentry's supposed to be stashed. */
+ dentry->d_fsdata = stashed;
+
+ /* @data is now owned by the fs */
+ d_instantiate(dentry, inode);
+ return dentry;
+}
+
+static struct dentry *stash_dentry(struct dentry **stashed,
+ struct dentry *dentry)
+{
+ guard(rcu)();
+ for (;;) {
+ struct dentry *old;
+
+ /* Assume any old dentry was cleared out. */
+ old = cmpxchg(stashed, NULL, dentry);
+ if (likely(!old))
+ return dentry;
+
+ /* Check if somebody else installed a reusable dentry. */
+ if (lockref_get_not_dead(&old->d_lockref))
+ return old;
+
+ /* There's an old dead dentry there, try to take it over. */
+ if (likely(try_cmpxchg(stashed, &old, dentry)))
+ return dentry;
+ }
+}
+
+/**
+ * path_from_stashed - create path from stashed or new dentry
+ * @stashed: where to retrieve or stash dentry
+ * @ino: inode number to use
+ * @mnt: mnt of the filesystems to use
+ * @data: data to store in inode->i_private
+ * @path: path to create
+ *
+ * The function tries to retrieve a stashed dentry from @stashed. If the dentry
+ * is still valid then it will be reused. If the dentry isn't able the function
+ * will allocate a new dentry and inode. It will then check again whether it
+ * can reuse an existing dentry in case one has been added in the meantime or
+ * update @stashed with the newly added dentry.
+ *
+ * Special-purpose helper for nsfs and pidfs.
+ *
+ * Return: On success zero and on failure a negative error is returned.
+ */
+int path_from_stashed(struct dentry **stashed, unsigned long ino,
+ struct vfsmount *mnt, void *data, struct path *path)
+{
+ struct dentry *dentry;
+ const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
+
+ /* See if dentry can be reused. */
+ path->dentry = get_stashed_dentry(*stashed);
+ if (path->dentry) {
+ sops->put_data(data);
+ goto out_path;
+ }
+
+ /* Allocate a new dentry. */
+ dentry = prepare_anon_dentry(stashed, ino, mnt->mnt_sb, data);
+ if (IS_ERR(dentry)) {
+ sops->put_data(data);
+ return PTR_ERR(dentry);
+ }
+
+ /* Added a new dentry. @data is now owned by the filesystem. */
+ path->dentry = stash_dentry(stashed, dentry);
+ if (path->dentry != dentry)
+ dput(dentry);
+
+out_path:
+ WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
+ WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
+ path->mnt = mntget(mnt);
+ return 0;
+}
+
+void stashed_dentry_prune(struct dentry *dentry)
+{
+ struct dentry **stashed = dentry->d_fsdata;
+ struct inode *inode = d_inode(dentry);
+
+ if (WARN_ON_ONCE(!stashed))
+ return;
+
+ if (!inode)
+ return;
+
+ /*
+ * Only replace our own @dentry as someone else might've
+ * already cleared out @dentry and stashed their own
+ * dentry in there.
+ */
+ cmpxchg(stashed, dentry, NULL);
+}
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index 8161667c976f..527458db4525 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -243,7 +243,7 @@ static void encode_nlm4_holder(struct xdr_stream *xdr,
u64 l_offset, l_len;
__be32 *p;
- encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_RDLCK);
encode_int32(xdr, lock->svid);
encode_netobj(xdr, lock->oh.data, lock->oh.len);
@@ -270,7 +270,7 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
goto out_overflow;
exclusive = be32_to_cpup(p++);
lock->svid = be32_to_cpup(p);
- fl->fl_pid = (pid_t)lock->svid;
+ fl->c.flc_pid = (pid_t)lock->svid;
error = decode_netobj(xdr, &lock->oh);
if (unlikely(error))
@@ -280,8 +280,8 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
if (unlikely(p == NULL))
goto out_overflow;
- fl->fl_flags = FL_POSIX;
- fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
+ fl->c.flc_flags = FL_POSIX;
+ fl->c.flc_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
p = xdr_decode_hyper(p, &l_offset);
xdr_decode_hyper(p, &l_len);
nlm4svc_set_file_lock_range(fl, l_offset, l_len);
@@ -357,7 +357,7 @@ static void nlm4_xdr_enc_testargs(struct rpc_rqst *req,
const struct nlm_lock *lock = &args->lock;
encode_cookie(xdr, &args->cookie);
- encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
encode_nlm4_lock(xdr, lock);
}
@@ -380,7 +380,7 @@ static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req,
encode_cookie(xdr, &args->cookie);
encode_bool(xdr, args->block);
- encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
encode_nlm4_lock(xdr, lock);
encode_bool(xdr, args->reclaim);
encode_int32(xdr, args->state);
@@ -403,7 +403,7 @@ static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req,
encode_cookie(xdr, &args->cookie);
encode_bool(xdr, args->block);
- encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
encode_nlm4_lock(xdr, lock);
}
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 5d85715be763..a7e0519ec024 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -185,7 +185,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
continue;
if (!rpc_cmp_addr(nlm_addr(block->b_host), addr))
continue;
- if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)), fh) != 0)
+ if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->c.flc_file)), fh) != 0)
continue;
/* Alright, we found a lock. Set the return status
* and wake up the caller
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index fba6c7fa7474..cebcc283b7ce 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -133,7 +133,8 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
char *nodename = req->a_host->h_rpcclnt->cl_nodename;
nlmclnt_next_cookie(&argp->cookie);
- memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh));
+ memcpy(&lock->fh, NFS_FH(file_inode(fl->c.flc_file)),
+ sizeof(struct nfs_fh));
lock->caller = nodename;
lock->oh.data = req->a_owner;
lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s",
@@ -142,7 +143,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
lock->svid = fl->fl_u.nfs_fl.owner->pid;
lock->fl.fl_start = fl->fl_start;
lock->fl.fl_end = fl->fl_end;
- lock->fl.fl_type = fl->fl_type;
+ lock->fl.c.flc_type = fl->c.flc_type;
}
static void nlmclnt_release_lockargs(struct nlm_rqst *req)
@@ -182,7 +183,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *dat
call->a_callback_data = data;
if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
- if (fl->fl_type != F_UNLCK) {
+ if (fl->c.flc_type != F_UNLCK) {
call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
status = nlmclnt_lock(call, fl);
} else
@@ -432,13 +433,14 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
{
int status;
- status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_TEST);
+ status = nlmclnt_call(nfs_file_cred(fl->c.flc_file), req,
+ NLMPROC_TEST);
if (status < 0)
goto out;
switch (req->a_res.status) {
case nlm_granted:
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
break;
case nlm_lck_denied:
/*
@@ -446,8 +448,8 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
*/
fl->fl_start = req->a_res.lock.fl.fl_start;
fl->fl_end = req->a_res.lock.fl.fl_end;
- fl->fl_type = req->a_res.lock.fl.fl_type;
- fl->fl_pid = -req->a_res.lock.fl.fl_pid;
+ fl->c.flc_type = req->a_res.lock.fl.c.flc_type;
+ fl->c.flc_pid = -req->a_res.lock.fl.c.flc_pid;
break;
default:
status = nlm_stat_to_errno(req->a_res.status);
@@ -485,14 +487,15 @@ static const struct file_lock_operations nlmclnt_lock_ops = {
static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host)
{
fl->fl_u.nfs_fl.state = 0;
- fl->fl_u.nfs_fl.owner = nlmclnt_find_lockowner(host, fl->fl_owner);
+ fl->fl_u.nfs_fl.owner = nlmclnt_find_lockowner(host,
+ fl->c.flc_owner);
INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list);
fl->fl_ops = &nlmclnt_lock_ops;
}
static int do_vfs_lock(struct file_lock *fl)
{
- return locks_lock_file_wait(fl->fl_file, fl);
+ return locks_lock_file_wait(fl->c.flc_file, fl);
}
/*
@@ -518,12 +521,12 @@ static int do_vfs_lock(struct file_lock *fl)
static int
nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
{
- const struct cred *cred = nfs_file_cred(fl->fl_file);
+ const struct cred *cred = nfs_file_cred(fl->c.flc_file);
struct nlm_host *host = req->a_host;
struct nlm_res *resp = &req->a_res;
struct nlm_wait block;
- unsigned char fl_flags = fl->fl_flags;
- unsigned char fl_type;
+ unsigned char flags = fl->c.flc_flags;
+ unsigned char type;
__be32 b_status;
int status = -ENOLCK;
@@ -531,9 +534,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
goto out;
req->a_args.state = nsm_local_state;
- fl->fl_flags |= FL_ACCESS;
+ fl->c.flc_flags |= FL_ACCESS;
status = do_vfs_lock(fl);
- fl->fl_flags = fl_flags;
+ fl->c.flc_flags = flags;
if (status < 0)
goto out;
@@ -591,11 +594,11 @@ again:
goto again;
}
/* Ensure the resulting lock will get added to granted list */
- fl->fl_flags |= FL_SLEEP;
+ fl->c.flc_flags |= FL_SLEEP;
if (do_vfs_lock(fl) < 0)
printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__);
up_read(&host->h_rwsem);
- fl->fl_flags = fl_flags;
+ fl->c.flc_flags = flags;
status = 0;
}
if (status < 0)
@@ -605,7 +608,7 @@ again:
* cases NLM_LCK_DENIED is returned for a permanent error. So
* turn it into an ENOLCK.
*/
- if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP))
+ if (resp->status == nlm_lck_denied && (flags & FL_SLEEP))
status = -ENOLCK;
else
status = nlm_stat_to_errno(resp->status);
@@ -622,13 +625,13 @@ out_unlock:
req->a_host->h_addrlen, req->a_res.status);
dprintk("lockd: lock attempt ended in fatal error.\n"
" Attempting to unlock.\n");
- fl_type = fl->fl_type;
- fl->fl_type = F_UNLCK;
+ type = fl->c.flc_type;
+ fl->c.flc_type = F_UNLCK;
down_read(&host->h_rwsem);
do_vfs_lock(fl);
up_read(&host->h_rwsem);
- fl->fl_type = fl_type;
- fl->fl_flags = fl_flags;
+ fl->c.flc_type = type;
+ fl->c.flc_flags = flags;
nlmclnt_async_call(cred, req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
return status;
}
@@ -651,12 +654,14 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl,
nlmclnt_setlockargs(req, fl);
req->a_args.reclaim = 1;
- status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_LOCK);
+ status = nlmclnt_call(nfs_file_cred(fl->c.flc_file), req,
+ NLMPROC_LOCK);
if (status >= 0 && req->a_res.status == nlm_granted)
return 0;
printk(KERN_WARNING "lockd: failed to reclaim lock for pid %d "
- "(errno %d, status %d)\n", fl->fl_pid,
+ "(errno %d, status %d)\n",
+ fl->c.flc_pid,
status, ntohl(req->a_res.status));
/*
@@ -683,26 +688,26 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
struct nlm_host *host = req->a_host;
struct nlm_res *resp = &req->a_res;
int status;
- unsigned char fl_flags = fl->fl_flags;
+ unsigned char flags = fl->c.flc_flags;
/*
* Note: the server is supposed to either grant us the unlock
* request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
* case, we want to unlock.
*/
- fl->fl_flags |= FL_EXISTS;
+ fl->c.flc_flags |= FL_EXISTS;
down_read(&host->h_rwsem);
status = do_vfs_lock(fl);
up_read(&host->h_rwsem);
- fl->fl_flags = fl_flags;
+ fl->c.flc_flags = flags;
if (status == -ENOENT) {
status = 0;
goto out;
}
refcount_inc(&req->a_count);
- status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
- NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
+ status = nlmclnt_async_call(nfs_file_cred(fl->c.flc_file), req,
+ NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
if (status < 0)
goto out;
@@ -795,8 +800,8 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
req->a_args.block = block;
refcount_inc(&req->a_count);
- status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
- NLMPROC_CANCEL, &nlmclnt_cancel_ops);
+ status = nlmclnt_async_call(nfs_file_cred(fl->c.flc_file), req,
+ NLMPROC_CANCEL, &nlmclnt_cancel_ops);
if (status == 0 && req->a_res.status == nlm_lck_denied)
status = -ENOLCK;
nlmclnt_release_call(req);
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 4df62f635529..a3e97278b997 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -238,7 +238,7 @@ static void encode_nlm_holder(struct xdr_stream *xdr,
u32 l_offset, l_len;
__be32 *p;
- encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_RDLCK);
encode_int32(xdr, lock->svid);
encode_netobj(xdr, lock->oh.data, lock->oh.len);
@@ -265,7 +265,7 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
goto out_overflow;
exclusive = be32_to_cpup(p++);
lock->svid = be32_to_cpup(p);
- fl->fl_pid = (pid_t)lock->svid;
+ fl->c.flc_pid = (pid_t)lock->svid;
error = decode_netobj(xdr, &lock->oh);
if (unlikely(error))
@@ -275,8 +275,8 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
if (unlikely(p == NULL))
goto out_overflow;
- fl->fl_flags = FL_POSIX;
- fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
+ fl->c.flc_flags = FL_POSIX;
+ fl->c.flc_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
l_offset = be32_to_cpup(p++);
l_len = be32_to_cpup(p);
end = l_offset + l_len - 1;
@@ -357,7 +357,7 @@ static void nlm_xdr_enc_testargs(struct rpc_rqst *req,
const struct nlm_lock *lock = &args->lock;
encode_cookie(xdr, &args->cookie);
- encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
encode_nlm_lock(xdr, lock);
}
@@ -380,7 +380,7 @@ static void nlm_xdr_enc_lockargs(struct rpc_rqst *req,
encode_cookie(xdr, &args->cookie);
encode_bool(xdr, args->block);
- encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
encode_nlm_lock(xdr, lock);
encode_bool(xdr, args->reclaim);
encode_int32(xdr, args->state);
@@ -403,7 +403,7 @@ static void nlm_xdr_enc_cancargs(struct rpc_rqst *req,
encode_cookie(xdr, &args->cookie);
encode_bool(xdr, args->block);
- encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
encode_nlm_lock(xdr, lock);
}
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index b72023a6b4c1..8a72c418cdcc 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -52,16 +52,16 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
*filp = file;
/* Set up the missing parts of the file_lock structure */
- lock->fl.fl_flags = FL_POSIX;
- lock->fl.fl_file = file->f_file[mode];
- lock->fl.fl_pid = current->tgid;
+ lock->fl.c.flc_flags = FL_POSIX;
+ lock->fl.c.flc_file = file->f_file[mode];
+ lock->fl.c.flc_pid = current->tgid;
lock->fl.fl_start = (loff_t)lock->lock_start;
lock->fl.fl_end = lock->lock_len ?
(loff_t)(lock->lock_start + lock->lock_len - 1) :
OFFSET_MAX;
lock->fl.fl_lmops = &nlmsvc_lock_operations;
nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
- if (!lock->fl.fl_owner) {
+ if (!lock->fl.c.flc_owner) {
/* lockowner allocation has failed */
nlmsvc_release_host(host);
return nlm_lck_denied_nolocks;
@@ -106,7 +106,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
- test_owner = argp->lock.fl.fl_owner;
+ test_owner = argp->lock.fl.c.flc_owner;
/* Now check for conflicting locks */
resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie);
if (resp->status == nlm_drop_reply)
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 2dc10900ad1c..1f2149db10f2 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -150,16 +150,17 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock)
struct file_lock *fl;
dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n",
- file, lock->fl.fl_pid,
+ file, lock->fl.c.flc_pid,
(long long)lock->fl.fl_start,
- (long long)lock->fl.fl_end, lock->fl.fl_type);
+ (long long)lock->fl.fl_end,
+ lock->fl.c.flc_type);
spin_lock(&nlm_blocked_lock);
list_for_each_entry(block, &nlm_blocked, b_list) {
fl = &block->b_call->a_args.lock.fl;
dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n",
- block->b_file, fl->fl_pid,
+ block->b_file, fl->c.flc_pid,
(long long)fl->fl_start,
- (long long)fl->fl_end, fl->fl_type,
+ (long long)fl->fl_end, fl->c.flc_type,
nlmdbg_cookie2a(&block->b_call->a_args.cookie));
if (block->b_file == file && nlm_compare_locks(fl, &lock->fl)) {
kref_get(&block->b_count);
@@ -244,7 +245,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
goto failed_free;
/* Set notifier function for VFS, and init args */
- call->a_args.lock.fl.fl_flags |= FL_SLEEP;
+ call->a_args.lock.fl.c.flc_flags |= FL_SLEEP;
call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations;
nlmclnt_next_cookie(&call->a_args.cookie);
@@ -402,14 +403,14 @@ static struct nlm_lockowner *nlmsvc_find_lockowner(struct nlm_host *host, pid_t
void
nlmsvc_release_lockowner(struct nlm_lock *lock)
{
- if (lock->fl.fl_owner)
- nlmsvc_put_lockowner(lock->fl.fl_owner);
+ if (lock->fl.c.flc_owner)
+ nlmsvc_put_lockowner(lock->fl.c.flc_owner);
}
void nlmsvc_locks_init_private(struct file_lock *fl, struct nlm_host *host,
pid_t pid)
{
- fl->fl_owner = nlmsvc_find_lockowner(host, pid);
+ fl->c.flc_owner = nlmsvc_find_lockowner(host, pid);
}
/*
@@ -425,7 +426,7 @@ static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock)
/* set default data area */
call->a_args.lock.oh.data = call->a_owner;
- call->a_args.lock.svid = ((struct nlm_lockowner *)lock->fl.fl_owner)->pid;
+ call->a_args.lock.svid = ((struct nlm_lockowner *) lock->fl.c.flc_owner)->pid;
if (lock->oh.len > NLMCLNT_OHSIZE) {
void *data = kmalloc(lock->oh.len, GFP_KERNEL);
@@ -489,7 +490,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n",
inode->i_sb->s_id, inode->i_ino,
- lock->fl.fl_type, lock->fl.fl_pid,
+ lock->fl.c.flc_type,
+ lock->fl.c.flc_pid,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end,
wait);
@@ -512,7 +514,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
lock = &block->b_call->a_args.lock;
} else
- lock->fl.fl_flags &= ~FL_SLEEP;
+ lock->fl.c.flc_flags &= ~FL_SLEEP;
if (block->b_flags & B_QUEUED) {
dprintk("lockd: nlmsvc_lock deferred block %p flags %d\n",
@@ -560,10 +562,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
spin_unlock(&nlm_blocked_lock);
if (!wait)
- lock->fl.fl_flags &= ~FL_SLEEP;
+ lock->fl.c.flc_flags &= ~FL_SLEEP;
mode = lock_to_openmode(&lock->fl);
error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL);
- lock->fl.fl_flags &= ~FL_SLEEP;
+ lock->fl.c.flc_flags &= ~FL_SLEEP;
dprintk("lockd: vfs_lock_file returned %d\n", error);
switch (error) {
@@ -616,7 +618,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
nlmsvc_file_inode(file)->i_sb->s_id,
nlmsvc_file_inode(file)->i_ino,
- lock->fl.fl_type,
+ lock->fl.c.flc_type,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
@@ -636,19 +638,19 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
- if (lock->fl.fl_type == F_UNLCK) {
+ if (lock->fl.c.flc_type == F_UNLCK) {
ret = nlm_granted;
goto out;
}
dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n",
- lock->fl.fl_type, (long long)lock->fl.fl_start,
+ lock->fl.c.flc_type, (long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
conflock->caller = "somehost"; /* FIXME */
conflock->len = strlen(conflock->caller);
conflock->oh.len = 0; /* don't return OH info */
- conflock->svid = lock->fl.fl_pid;
- conflock->fl.fl_type = lock->fl.fl_type;
+ conflock->svid = lock->fl.c.flc_pid;
+ conflock->fl.c.flc_type = lock->fl.c.flc_type;
conflock->fl.fl_start = lock->fl.fl_start;
conflock->fl.fl_end = lock->fl.fl_end;
locks_release_private(&lock->fl);
@@ -673,21 +675,21 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock)
dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n",
nlmsvc_file_inode(file)->i_sb->s_id,
nlmsvc_file_inode(file)->i_ino,
- lock->fl.fl_pid,
+ lock->fl.c.flc_pid,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
/* First, cancel any lock that might be there */
nlmsvc_cancel_blocked(net, file, lock);
- lock->fl.fl_type = F_UNLCK;
- lock->fl.fl_file = file->f_file[O_RDONLY];
- if (lock->fl.fl_file)
- error = vfs_lock_file(lock->fl.fl_file, F_SETLK,
+ lock->fl.c.flc_type = F_UNLCK;
+ lock->fl.c.flc_file = file->f_file[O_RDONLY];
+ if (lock->fl.c.flc_file)
+ error = vfs_lock_file(lock->fl.c.flc_file, F_SETLK,
&lock->fl, NULL);
- lock->fl.fl_file = file->f_file[O_WRONLY];
- if (lock->fl.fl_file)
- error |= vfs_lock_file(lock->fl.fl_file, F_SETLK,
+ lock->fl.c.flc_file = file->f_file[O_WRONLY];
+ if (lock->fl.c.flc_file)
+ error |= vfs_lock_file(lock->fl.c.flc_file, F_SETLK,
&lock->fl, NULL);
return (error < 0)? nlm_lck_denied_nolocks : nlm_granted;
@@ -710,7 +712,7 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l
dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n",
nlmsvc_file_inode(file)->i_sb->s_id,
nlmsvc_file_inode(file)->i_ino,
- lock->fl.fl_pid,
+ lock->fl.c.flc_pid,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
@@ -863,12 +865,12 @@ nlmsvc_grant_blocked(struct nlm_block *block)
/* vfs_lock_file() can mangle fl_start and fl_end, but we need
* them unchanged for the GRANT_MSG
*/
- lock->fl.fl_flags |= FL_SLEEP;
+ lock->fl.c.flc_flags |= FL_SLEEP;
fl_start = lock->fl.fl_start;
fl_end = lock->fl.fl_end;
mode = lock_to_openmode(&lock->fl);
error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL);
- lock->fl.fl_flags &= ~FL_SLEEP;
+ lock->fl.c.flc_flags &= ~FL_SLEEP;
lock->fl.fl_start = fl_start;
lock->fl.fl_end = fl_end;
@@ -993,8 +995,8 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status)
/* Client doesn't want it, just unlock it */
nlmsvc_unlink_block(block);
fl = &block->b_call->a_args.lock.fl;
- fl->fl_type = F_UNLCK;
- error = vfs_lock_file(fl->fl_file, F_SETLK, fl, NULL);
+ fl->c.flc_type = F_UNLCK;
+ error = vfs_lock_file(fl->c.flc_file, F_SETLK, fl, NULL);
if (error)
pr_warn("lockd: unable to unlock lock rejected by client!\n");
break;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 32784f508c81..a03220e66ce0 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -77,12 +77,12 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Set up the missing parts of the file_lock structure */
mode = lock_to_openmode(&lock->fl);
- lock->fl.fl_flags = FL_POSIX;
- lock->fl.fl_file = file->f_file[mode];
- lock->fl.fl_pid = current->tgid;
+ lock->fl.c.flc_flags = FL_POSIX;
+ lock->fl.c.flc_file = file->f_file[mode];
+ lock->fl.c.flc_pid = current->tgid;
lock->fl.fl_lmops = &nlmsvc_lock_operations;
nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
- if (!lock->fl.fl_owner) {
+ if (!lock->fl.c.flc_owner) {
/* lockowner allocation has failed */
nlmsvc_release_host(host);
return nlm_lck_denied_nolocks;
@@ -127,7 +127,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
- test_owner = argp->lock.fl.fl_owner;
+ test_owner = argp->lock.fl.c.flc_owner;
/* Now check for conflicting locks */
resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie));
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index e3b6229e7ae5..9103896164f6 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -73,7 +73,7 @@ static inline unsigned int file_hash(struct nfs_fh *f)
int lock_to_openmode(struct file_lock *lock)
{
- return (lock->fl_type == F_WRLCK) ? O_WRONLY : O_RDONLY;
+ return lock_is_write(lock) ? O_WRONLY : O_RDONLY;
}
/*
@@ -181,18 +181,18 @@ static int nlm_unlock_files(struct nlm_file *file, const struct file_lock *fl)
struct file_lock lock;
locks_init_lock(&lock);
- lock.fl_type = F_UNLCK;
+ lock.c.flc_type = F_UNLCK;
lock.fl_start = 0;
lock.fl_end = OFFSET_MAX;
- lock.fl_owner = fl->fl_owner;
- lock.fl_pid = fl->fl_pid;
- lock.fl_flags = FL_POSIX;
+ lock.c.flc_owner = fl->c.flc_owner;
+ lock.c.flc_pid = fl->c.flc_pid;
+ lock.c.flc_flags = FL_POSIX;
- lock.fl_file = file->f_file[O_RDONLY];
- if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL))
+ lock.c.flc_file = file->f_file[O_RDONLY];
+ if (lock.c.flc_file && vfs_lock_file(lock.c.flc_file, F_SETLK, &lock, NULL))
goto out_err;
- lock.fl_file = file->f_file[O_WRONLY];
- if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL))
+ lock.c.flc_file = file->f_file[O_WRONLY];
+ if (lock.c.flc_file && vfs_lock_file(lock.c.flc_file, F_SETLK, &lock, NULL))
goto out_err;
return 0;
out_err:
@@ -218,14 +218,14 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
again:
file->f_locks = 0;
spin_lock(&flctx->flc_lock);
- list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
+ for_each_file_lock(fl, &flctx->flc_posix) {
if (fl->fl_lmops != &nlmsvc_lock_operations)
continue;
/* update current lock count */
file->f_locks++;
- lockhost = ((struct nlm_lockowner *)fl->fl_owner)->host;
+ lockhost = ((struct nlm_lockowner *) fl->c.flc_owner)->host;
if (match(lockhost, host)) {
spin_unlock(&flctx->flc_lock);
@@ -272,7 +272,7 @@ nlm_file_inuse(struct nlm_file *file)
if (flctx && !list_empty_careful(&flctx->flc_posix)) {
spin_lock(&flctx->flc_lock);
- list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
+ for_each_file_lock(fl, &flctx->flc_posix) {
if (fl->fl_lmops == &nlmsvc_lock_operations) {
spin_unlock(&flctx->flc_lock);
return 1;
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 2fb5748dae0c..adfcce2bf11b 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -88,8 +88,8 @@ svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
return false;
locks_init_lock(fl);
- fl->fl_flags = FL_POSIX;
- fl->fl_type = F_RDLCK;
+ fl->c.flc_flags = FL_POSIX;
+ fl->c.flc_type = F_RDLCK;
end = start + len - 1;
fl->fl_start = s32_to_loff_t(start);
if (len == 0 || end < 0)
@@ -107,7 +107,7 @@ svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock)
s32 start, len;
/* exclusive */
- if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0)
+ if (xdr_stream_encode_bool(xdr, fl->c.flc_type != F_RDLCK) < 0)
return false;
if (xdr_stream_encode_u32(xdr, lock->svid) < 0)
return false;
@@ -164,7 +164,7 @@ nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
if (exclusive)
- argp->lock.fl.fl_type = F_WRLCK;
+ argp->lock.fl.c.flc_type = F_WRLCK;
return true;
}
@@ -184,7 +184,7 @@ nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
if (exclusive)
- argp->lock.fl.fl_type = F_WRLCK;
+ argp->lock.fl.c.flc_type = F_WRLCK;
if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0)
return false;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
@@ -209,7 +209,7 @@ nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
if (exclusive)
- argp->lock.fl.fl_type = F_WRLCK;
+ argp->lock.fl.c.flc_type = F_WRLCK;
return true;
}
@@ -223,7 +223,7 @@ nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
- argp->lock.fl.fl_type = F_UNLCK;
+ argp->lock.fl.c.flc_type = F_UNLCK;
return true;
}
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 5fcbf30cd275..3d28b9c3ed15 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -89,8 +89,8 @@ svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
return false;
locks_init_lock(fl);
- fl->fl_flags = FL_POSIX;
- fl->fl_type = F_RDLCK;
+ fl->c.flc_flags = FL_POSIX;
+ fl->c.flc_type = F_RDLCK;
nlm4svc_set_file_lock_range(fl, lock->lock_start, lock->lock_len);
return true;
}
@@ -102,7 +102,7 @@ svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock)
s64 start, len;
/* exclusive */
- if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0)
+ if (xdr_stream_encode_bool(xdr, fl->c.flc_type != F_RDLCK) < 0)
return false;
if (xdr_stream_encode_u32(xdr, lock->svid) < 0)
return false;
@@ -159,7 +159,7 @@ nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
if (exclusive)
- argp->lock.fl.fl_type = F_WRLCK;
+ argp->lock.fl.c.flc_type = F_WRLCK;
return true;
}
@@ -179,7 +179,7 @@ nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
if (exclusive)
- argp->lock.fl.fl_type = F_WRLCK;
+ argp->lock.fl.c.flc_type = F_WRLCK;
if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0)
return false;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
@@ -204,7 +204,7 @@ nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
if (exclusive)
- argp->lock.fl.fl_type = F_WRLCK;
+ argp->lock.fl.c.flc_type = F_WRLCK;
return true;
}
@@ -218,7 +218,7 @@ nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
- argp->lock.fl.fl_type = F_UNLCK;
+ argp->lock.fl.c.flc_type = F_UNLCK;
return true;
}
diff --git a/fs/locks.c b/fs/locks.c
index cc7c117ee192..90c8746874de 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -48,7 +48,6 @@
* children.
*
*/
-
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
@@ -70,24 +69,28 @@
#include <linux/uaccess.h>
-#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
-#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
-#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
-#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
-#define IS_REMOTELCK(fl) (fl->fl_pid <= 0)
+static struct file_lock *file_lock(struct file_lock_core *flc)
+{
+ return container_of(flc, struct file_lock, c);
+}
+
+static struct file_lease *file_lease(struct file_lock_core *flc)
+{
+ return container_of(flc, struct file_lease, c);
+}
-static bool lease_breaking(struct file_lock *fl)
+static bool lease_breaking(struct file_lease *fl)
{
- return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
+ return fl->c.flc_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
}
-static int target_leasetype(struct file_lock *fl)
+static int target_leasetype(struct file_lease *fl)
{
- if (fl->fl_flags & FL_UNLOCK_PENDING)
+ if (fl->c.flc_flags & FL_UNLOCK_PENDING)
return F_UNLCK;
- if (fl->fl_flags & FL_DOWNGRADE_PENDING)
+ if (fl->c.flc_flags & FL_DOWNGRADE_PENDING)
return F_RDLCK;
- return fl->fl_type;
+ return fl->c.flc_type;
}
static int leases_enable = 1;
@@ -168,6 +171,7 @@ static DEFINE_SPINLOCK(blocked_lock_lock);
static struct kmem_cache *flctx_cache __ro_after_init;
static struct kmem_cache *filelock_cache __ro_after_init;
+static struct kmem_cache *filelease_cache __ro_after_init;
static struct file_lock_context *
locks_get_lock_context(struct inode *inode, int type)
@@ -204,11 +208,12 @@ out:
static void
locks_dump_ctx_list(struct list_head *list, char *list_type)
{
- struct file_lock *fl;
+ struct file_lock_core *flc;
- list_for_each_entry(fl, list, fl_list) {
- pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
- }
+ list_for_each_entry(flc, list, flc_list)
+ pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
+ list_type, flc->flc_owner, flc->flc_flags,
+ flc->flc_type, flc->flc_pid);
}
static void
@@ -229,19 +234,19 @@ locks_check_ctx_lists(struct inode *inode)
}
static void
-locks_check_ctx_file_list(struct file *filp, struct list_head *list,
- char *list_type)
+locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_type)
{
- struct file_lock *fl;
+ struct file_lock_core *flc;
struct inode *inode = file_inode(filp);
- list_for_each_entry(fl, list, fl_list)
- if (fl->fl_file == filp)
+ list_for_each_entry(flc, list, flc_list)
+ if (flc->flc_file == filp)
pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx "
" fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
list_type, MAJOR(inode->i_sb->s_dev),
MINOR(inode->i_sb->s_dev), inode->i_ino,
- fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
+ flc->flc_owner, flc->flc_flags,
+ flc->flc_type, flc->flc_pid);
}
void
@@ -255,13 +260,13 @@ locks_free_lock_context(struct inode *inode)
}
}
-static void locks_init_lock_heads(struct file_lock *fl)
+static void locks_init_lock_heads(struct file_lock_core *flc)
{
- INIT_HLIST_NODE(&fl->fl_link);
- INIT_LIST_HEAD(&fl->fl_list);
- INIT_LIST_HEAD(&fl->fl_blocked_requests);
- INIT_LIST_HEAD(&fl->fl_blocked_member);
- init_waitqueue_head(&fl->fl_wait);
+ INIT_HLIST_NODE(&flc->flc_link);
+ INIT_LIST_HEAD(&flc->flc_list);
+ INIT_LIST_HEAD(&flc->flc_blocked_requests);
+ INIT_LIST_HEAD(&flc->flc_blocked_member);
+ init_waitqueue_head(&flc->flc_wait);
}
/* Allocate an empty lock structure. */
@@ -270,19 +275,33 @@ struct file_lock *locks_alloc_lock(void)
struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL);
if (fl)
- locks_init_lock_heads(fl);
+ locks_init_lock_heads(&fl->c);
return fl;
}
EXPORT_SYMBOL_GPL(locks_alloc_lock);
+/* Allocate an empty lock structure. */
+struct file_lease *locks_alloc_lease(void)
+{
+ struct file_lease *fl = kmem_cache_zalloc(filelease_cache, GFP_KERNEL);
+
+ if (fl)
+ locks_init_lock_heads(&fl->c);
+
+ return fl;
+}
+EXPORT_SYMBOL_GPL(locks_alloc_lease);
+
void locks_release_private(struct file_lock *fl)
{
- BUG_ON(waitqueue_active(&fl->fl_wait));
- BUG_ON(!list_empty(&fl->fl_list));
- BUG_ON(!list_empty(&fl->fl_blocked_requests));
- BUG_ON(!list_empty(&fl->fl_blocked_member));
- BUG_ON(!hlist_unhashed(&fl->fl_link));
+ struct file_lock_core *flc = &fl->c;
+
+ BUG_ON(waitqueue_active(&flc->flc_wait));
+ BUG_ON(!list_empty(&flc->flc_list));
+ BUG_ON(!list_empty(&flc->flc_blocked_requests));
+ BUG_ON(!list_empty(&flc->flc_blocked_member));
+ BUG_ON(!hlist_unhashed(&flc->flc_link));
if (fl->fl_ops) {
if (fl->fl_ops->fl_release_private)
@@ -292,8 +311,8 @@ void locks_release_private(struct file_lock *fl)
if (fl->fl_lmops) {
if (fl->fl_lmops->lm_put_owner) {
- fl->fl_lmops->lm_put_owner(fl->fl_owner);
- fl->fl_owner = NULL;
+ fl->fl_lmops->lm_put_owner(flc->flc_owner);
+ flc->flc_owner = NULL;
}
fl->fl_lmops = NULL;
}
@@ -309,16 +328,15 @@ EXPORT_SYMBOL_GPL(locks_release_private);
* %true: @owner has at least one blocker
* %false: @owner has no blockers
*/
-bool locks_owner_has_blockers(struct file_lock_context *flctx,
- fl_owner_t owner)
+bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner)
{
- struct file_lock *fl;
+ struct file_lock_core *flc;
spin_lock(&flctx->flc_lock);
- list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
- if (fl->fl_owner != owner)
+ list_for_each_entry(flc, &flctx->flc_posix, flc_list) {
+ if (flc->flc_owner != owner)
continue;
- if (!list_empty(&fl->fl_blocked_requests)) {
+ if (!list_empty(&flc->flc_blocked_requests)) {
spin_unlock(&flctx->flc_lock);
return true;
}
@@ -336,35 +354,52 @@ void locks_free_lock(struct file_lock *fl)
}
EXPORT_SYMBOL(locks_free_lock);
+/* Free a lease which is not in use. */
+void locks_free_lease(struct file_lease *fl)
+{
+ kmem_cache_free(filelease_cache, fl);
+}
+EXPORT_SYMBOL(locks_free_lease);
+
static void
locks_dispose_list(struct list_head *dispose)
{
- struct file_lock *fl;
+ struct file_lock_core *flc;
while (!list_empty(dispose)) {
- fl = list_first_entry(dispose, struct file_lock, fl_list);
- list_del_init(&fl->fl_list);
- locks_free_lock(fl);
+ flc = list_first_entry(dispose, struct file_lock_core, flc_list);
+ list_del_init(&flc->flc_list);
+ if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
+ locks_free_lease(file_lease(flc));
+ else
+ locks_free_lock(file_lock(flc));
}
}
void locks_init_lock(struct file_lock *fl)
{
memset(fl, 0, sizeof(struct file_lock));
- locks_init_lock_heads(fl);
+ locks_init_lock_heads(&fl->c);
}
EXPORT_SYMBOL(locks_init_lock);
+void locks_init_lease(struct file_lease *fl)
+{
+ memset(fl, 0, sizeof(*fl));
+ locks_init_lock_heads(&fl->c);
+}
+EXPORT_SYMBOL(locks_init_lease);
+
/*
* Initialize a new lock from an existing file_lock structure.
*/
void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
{
- new->fl_owner = fl->fl_owner;
- new->fl_pid = fl->fl_pid;
- new->fl_file = NULL;
- new->fl_flags = fl->fl_flags;
- new->fl_type = fl->fl_type;
+ new->c.flc_owner = fl->c.flc_owner;
+ new->c.flc_pid = fl->c.flc_pid;
+ new->c.flc_file = NULL;
+ new->c.flc_flags = fl->c.flc_flags;
+ new->c.flc_type = fl->c.flc_type;
new->fl_start = fl->fl_start;
new->fl_end = fl->fl_end;
new->fl_lmops = fl->fl_lmops;
@@ -372,7 +407,7 @@ void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
if (fl->fl_lmops) {
if (fl->fl_lmops->lm_get_owner)
- fl->fl_lmops->lm_get_owner(fl->fl_owner);
+ fl->fl_lmops->lm_get_owner(fl->c.flc_owner);
}
}
EXPORT_SYMBOL(locks_copy_conflock);
@@ -384,7 +419,7 @@ void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
locks_copy_conflock(new, fl);
- new->fl_file = fl->fl_file;
+ new->c.flc_file = fl->c.flc_file;
new->fl_ops = fl->fl_ops;
if (fl->fl_ops) {
@@ -400,15 +435,17 @@ static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
/*
* As ctx->flc_lock is held, new requests cannot be added to
- * ->fl_blocked_requests, so we don't need a lock to check if it
+ * ->flc_blocked_requests, so we don't need a lock to check if it
* is empty.
*/
- if (list_empty(&fl->fl_blocked_requests))
+ if (list_empty(&fl->c.flc_blocked_requests))
return;
spin_lock(&blocked_lock_lock);
- list_splice_init(&fl->fl_blocked_requests, &new->fl_blocked_requests);
- list_for_each_entry(f, &new->fl_blocked_requests, fl_blocked_member)
- f->fl_blocker = new;
+ list_splice_init(&fl->c.flc_blocked_requests,
+ &new->c.flc_blocked_requests);
+ list_for_each_entry(f, &new->c.flc_blocked_requests,
+ c.flc_blocked_member)
+ f->c.flc_blocker = &new->c;
spin_unlock(&blocked_lock_lock);
}
@@ -429,21 +466,21 @@ static void flock_make_lock(struct file *filp, struct file_lock *fl, int type)
{
locks_init_lock(fl);
- fl->fl_file = filp;
- fl->fl_owner = filp;
- fl->fl_pid = current->tgid;
- fl->fl_flags = FL_FLOCK;
- fl->fl_type = type;
+ fl->c.flc_file = filp;
+ fl->c.flc_owner = filp;
+ fl->c.flc_pid = current->tgid;
+ fl->c.flc_flags = FL_FLOCK;
+ fl->c.flc_type = type;
fl->fl_end = OFFSET_MAX;
}
-static int assign_type(struct file_lock *fl, int type)
+static int assign_type(struct file_lock_core *flc, int type)
{
switch (type) {
case F_RDLCK:
case F_WRLCK:
case F_UNLCK:
- fl->fl_type = type;
+ flc->flc_type = type;
break;
default:
return -EINVAL;
@@ -488,14 +525,14 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
} else
fl->fl_end = OFFSET_MAX;
- fl->fl_owner = current->files;
- fl->fl_pid = current->tgid;
- fl->fl_file = filp;
- fl->fl_flags = FL_POSIX;
+ fl->c.flc_owner = current->files;
+ fl->c.flc_pid = current->tgid;
+ fl->c.flc_file = filp;
+ fl->c.flc_flags = FL_POSIX;
fl->fl_ops = NULL;
fl->fl_lmops = NULL;
- return assign_type(fl, l->l_type);
+ return assign_type(&fl->c, l->l_type);
}
/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
@@ -516,16 +553,16 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
/* default lease lock manager operations */
static bool
-lease_break_callback(struct file_lock *fl)
+lease_break_callback(struct file_lease *fl)
{
kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG);
return false;
}
static void
-lease_setup(struct file_lock *fl, void **priv)
+lease_setup(struct file_lease *fl, void **priv)
{
- struct file *filp = fl->fl_file;
+ struct file *filp = fl->c.flc_file;
struct fasync_struct *fa = *priv;
/*
@@ -539,7 +576,7 @@ lease_setup(struct file_lock *fl, void **priv)
__f_setown(filp, task_pid(current), PIDTYPE_TGID, 0);
}
-static const struct lock_manager_operations lease_manager_ops = {
+static const struct lease_manager_operations lease_manager_ops = {
.lm_break = lease_break_callback,
.lm_change = lease_modify,
.lm_setup = lease_setup,
@@ -548,27 +585,24 @@ static const struct lock_manager_operations lease_manager_ops = {
/*
* Initialize a lease, use the default lock manager operations
*/
-static int lease_init(struct file *filp, int type, struct file_lock *fl)
+static int lease_init(struct file *filp, int type, struct file_lease *fl)
{
- if (assign_type(fl, type) != 0)
+ if (assign_type(&fl->c, type) != 0)
return -EINVAL;
- fl->fl_owner = filp;
- fl->fl_pid = current->tgid;
+ fl->c.flc_owner = filp;
+ fl->c.flc_pid = current->tgid;
- fl->fl_file = filp;
- fl->fl_flags = FL_LEASE;
- fl->fl_start = 0;
- fl->fl_end = OFFSET_MAX;
- fl->fl_ops = NULL;
+ fl->c.flc_file = filp;
+ fl->c.flc_flags = FL_LEASE;
fl->fl_lmops = &lease_manager_ops;
return 0;
}
/* Allocate a file_lock initialised to this type of lease */
-static struct file_lock *lease_alloc(struct file *filp, int type)
+static struct file_lease *lease_alloc(struct file *filp, int type)
{
- struct file_lock *fl = locks_alloc_lock();
+ struct file_lease *fl = locks_alloc_lease();
int error = -ENOMEM;
if (fl == NULL)
@@ -576,7 +610,7 @@ static struct file_lock *lease_alloc(struct file *filp, int type)
error = lease_init(filp, type, fl);
if (error) {
- locks_free_lock(fl);
+ locks_free_lease(fl);
return ERR_PTR(error);
}
return fl;
@@ -593,26 +627,26 @@ static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2)
/*
* Check whether two locks have the same owner.
*/
-static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
+static int posix_same_owner(struct file_lock_core *fl1, struct file_lock_core *fl2)
{
- return fl1->fl_owner == fl2->fl_owner;
+ return fl1->flc_owner == fl2->flc_owner;
}
/* Must be called with the flc_lock held! */
-static void locks_insert_global_locks(struct file_lock *fl)
+static void locks_insert_global_locks(struct file_lock_core *flc)
{
struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list);
percpu_rwsem_assert_held(&file_rwsem);
spin_lock(&fll->lock);
- fl->fl_link_cpu = smp_processor_id();
- hlist_add_head(&fl->fl_link, &fll->hlist);
+ flc->flc_link_cpu = smp_processor_id();
+ hlist_add_head(&flc->flc_link, &fll->hlist);
spin_unlock(&fll->lock);
}
/* Must be called with the flc_lock held! */
-static void locks_delete_global_locks(struct file_lock *fl)
+static void locks_delete_global_locks(struct file_lock_core *flc)
{
struct file_lock_list_struct *fll;
@@ -623,33 +657,33 @@ static void locks_delete_global_locks(struct file_lock *fl)
* is done while holding the flc_lock, and new insertions into the list
* also require that it be held.
*/
- if (hlist_unhashed(&fl->fl_link))
+ if (hlist_unhashed(&flc->flc_link))
return;
- fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu);
+ fll = per_cpu_ptr(&file_lock_list, flc->flc_link_cpu);
spin_lock(&fll->lock);
- hlist_del_init(&fl->fl_link);
+ hlist_del_init(&flc->flc_link);
spin_unlock(&fll->lock);
}
static unsigned long
-posix_owner_key(struct file_lock *fl)
+posix_owner_key(struct file_lock_core *flc)
{
- return (unsigned long)fl->fl_owner;
+ return (unsigned long) flc->flc_owner;
}
-static void locks_insert_global_blocked(struct file_lock *waiter)
+static void locks_insert_global_blocked(struct file_lock_core *waiter)
{
lockdep_assert_held(&blocked_lock_lock);
- hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
+ hash_add(blocked_hash, &waiter->flc_link, posix_owner_key(waiter));
}
-static void locks_delete_global_blocked(struct file_lock *waiter)
+static void locks_delete_global_blocked(struct file_lock_core *waiter)
{
lockdep_assert_held(&blocked_lock_lock);
- hash_del(&waiter->fl_link);
+ hash_del(&waiter->flc_link);
}
/* Remove waiter from blocker's block list.
@@ -657,41 +691,39 @@ static void locks_delete_global_blocked(struct file_lock *waiter)
*
* Must be called with blocked_lock_lock held.
*/
-static void __locks_delete_block(struct file_lock *waiter)
+static void __locks_unlink_block(struct file_lock_core *waiter)
{
locks_delete_global_blocked(waiter);
- list_del_init(&waiter->fl_blocked_member);
+ list_del_init(&waiter->flc_blocked_member);
}
-static void __locks_wake_up_blocks(struct file_lock *blocker)
+static void __locks_wake_up_blocks(struct file_lock_core *blocker)
{
- while (!list_empty(&blocker->fl_blocked_requests)) {
- struct file_lock *waiter;
+ while (!list_empty(&blocker->flc_blocked_requests)) {
+ struct file_lock_core *waiter;
+ struct file_lock *fl;
- waiter = list_first_entry(&blocker->fl_blocked_requests,
- struct file_lock, fl_blocked_member);
- __locks_delete_block(waiter);
- if (waiter->fl_lmops && waiter->fl_lmops->lm_notify)
- waiter->fl_lmops->lm_notify(waiter);
+ waiter = list_first_entry(&blocker->flc_blocked_requests,
+ struct file_lock_core, flc_blocked_member);
+
+ fl = file_lock(waiter);
+ __locks_unlink_block(waiter);
+ if ((waiter->flc_flags & (FL_POSIX | FL_FLOCK)) &&
+ fl->fl_lmops && fl->fl_lmops->lm_notify)
+ fl->fl_lmops->lm_notify(fl);
else
- wake_up(&waiter->fl_wait);
+ locks_wake_up(fl);
/*
- * The setting of fl_blocker to NULL marks the "done"
+ * The setting of flc_blocker to NULL marks the "done"
* point in deleting a block. Paired with acquire at the top
* of locks_delete_block().
*/
- smp_store_release(&waiter->fl_blocker, NULL);
+ smp_store_release(&waiter->flc_blocker, NULL);
}
}
-/**
- * locks_delete_block - stop waiting for a file lock
- * @waiter: the lock which was waiting
- *
- * lockd/nfsd need to disconnect the lock while working on it.
- */
-int locks_delete_block(struct file_lock *waiter)
+static int __locks_delete_block(struct file_lock_core *waiter)
{
int status = -ENOENT;
@@ -716,24 +748,35 @@ int locks_delete_block(struct file_lock *waiter)
* no new locks can be inserted into its fl_blocked_requests list, and
* can avoid doing anything further if the list is empty.
*/
- if (!smp_load_acquire(&waiter->fl_blocker) &&
- list_empty(&waiter->fl_blocked_requests))
+ if (!smp_load_acquire(&waiter->flc_blocker) &&
+ list_empty(&waiter->flc_blocked_requests))
return status;
spin_lock(&blocked_lock_lock);
- if (waiter->fl_blocker)
+ if (waiter->flc_blocker)
status = 0;
__locks_wake_up_blocks(waiter);
- __locks_delete_block(waiter);
+ __locks_unlink_block(waiter);
/*
* The setting of fl_blocker to NULL marks the "done" point in deleting
* a block. Paired with acquire at the top of this function.
*/
- smp_store_release(&waiter->fl_blocker, NULL);
+ smp_store_release(&waiter->flc_blocker, NULL);
spin_unlock(&blocked_lock_lock);
return status;
}
+
+/**
+ * locks_delete_block - stop waiting for a file lock
+ * @waiter: the lock which was waiting
+ *
+ * lockd/nfsd need to disconnect the lock while working on it.
+ */
+int locks_delete_block(struct file_lock *waiter)
+{
+ return __locks_delete_block(&waiter->c);
+}
EXPORT_SYMBOL(locks_delete_block);
/* Insert waiter into blocker's block list.
@@ -751,26 +794,28 @@ EXPORT_SYMBOL(locks_delete_block);
* waiters, and add beneath any waiter that blocks the new waiter.
* Thus wakeups don't happen until needed.
*/
-static void __locks_insert_block(struct file_lock *blocker,
- struct file_lock *waiter,
- bool conflict(struct file_lock *,
- struct file_lock *))
+static void __locks_insert_block(struct file_lock_core *blocker,
+ struct file_lock_core *waiter,
+ bool conflict(struct file_lock_core *,
+ struct file_lock_core *))
{
- struct file_lock *fl;
- BUG_ON(!list_empty(&waiter->fl_blocked_member));
+ struct file_lock_core *flc;
+ BUG_ON(!list_empty(&waiter->flc_blocked_member));
new_blocker:
- list_for_each_entry(fl, &blocker->fl_blocked_requests, fl_blocked_member)
- if (conflict(fl, waiter)) {
- blocker = fl;
+ list_for_each_entry(flc, &blocker->flc_blocked_requests, flc_blocked_member)
+ if (conflict(flc, waiter)) {
+ blocker = flc;
goto new_blocker;
}
- waiter->fl_blocker = blocker;
- list_add_tail(&waiter->fl_blocked_member, &blocker->fl_blocked_requests);
- if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
+ waiter->flc_blocker = blocker;
+ list_add_tail(&waiter->flc_blocked_member,
+ &blocker->flc_blocked_requests);
+
+ if ((blocker->flc_flags & (FL_POSIX|FL_OFDLCK)) == FL_POSIX)
locks_insert_global_blocked(waiter);
- /* The requests in waiter->fl_blocked are known to conflict with
+ /* The requests in waiter->flc_blocked are known to conflict with
* waiter, but might not conflict with blocker, or the requests
* and lock which block it. So they all need to be woken.
*/
@@ -778,10 +823,10 @@ new_blocker:
}
/* Must be called with flc_lock held. */
-static void locks_insert_block(struct file_lock *blocker,
- struct file_lock *waiter,
- bool conflict(struct file_lock *,
- struct file_lock *))
+static void locks_insert_block(struct file_lock_core *blocker,
+ struct file_lock_core *waiter,
+ bool conflict(struct file_lock_core *,
+ struct file_lock_core *))
{
spin_lock(&blocked_lock_lock);
__locks_insert_block(blocker, waiter, conflict);
@@ -793,7 +838,7 @@ static void locks_insert_block(struct file_lock *blocker,
*
* Must be called with the inode->flc_lock held!
*/
-static void locks_wake_up_blocks(struct file_lock *blocker)
+static void locks_wake_up_blocks(struct file_lock_core *blocker)
{
/*
* Avoid taking global lock if list is empty. This is safe since new
@@ -802,7 +847,7 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
* fl_blocked_requests list does not require the flc_lock, so we must
* recheck list_empty() after acquiring the blocked_lock_lock.
*/
- if (list_empty(&blocker->fl_blocked_requests))
+ if (list_empty(&blocker->flc_blocked_requests))
return;
spin_lock(&blocked_lock_lock);
@@ -811,39 +856,39 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
}
static void
-locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before)
+locks_insert_lock_ctx(struct file_lock_core *fl, struct list_head *before)
{
- list_add_tail(&fl->fl_list, before);
+ list_add_tail(&fl->flc_list, before);
locks_insert_global_locks(fl);
}
static void
-locks_unlink_lock_ctx(struct file_lock *fl)
+locks_unlink_lock_ctx(struct file_lock_core *fl)
{
locks_delete_global_locks(fl);
- list_del_init(&fl->fl_list);
+ list_del_init(&fl->flc_list);
locks_wake_up_blocks(fl);
}
static void
-locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose)
+locks_delete_lock_ctx(struct file_lock_core *fl, struct list_head *dispose)
{
locks_unlink_lock_ctx(fl);
if (dispose)
- list_add(&fl->fl_list, dispose);
+ list_add(&fl->flc_list, dispose);
else
- locks_free_lock(fl);
+ locks_free_lock(file_lock(fl));
}
/* Determine if lock sys_fl blocks lock caller_fl. Common functionality
* checks for shared/exclusive status of overlapping locks.
*/
-static bool locks_conflict(struct file_lock *caller_fl,
- struct file_lock *sys_fl)
+static bool locks_conflict(struct file_lock_core *caller_flc,
+ struct file_lock_core *sys_flc)
{
- if (sys_fl->fl_type == F_WRLCK)
+ if (sys_flc->flc_type == F_WRLCK)
return true;
- if (caller_fl->fl_type == F_WRLCK)
+ if (caller_flc->flc_type == F_WRLCK)
return true;
return false;
}
@@ -851,20 +896,23 @@ static bool locks_conflict(struct file_lock *caller_fl,
/* Determine if lock sys_fl blocks lock caller_fl. POSIX specific
* checking before calling the locks_conflict().
*/
-static bool posix_locks_conflict(struct file_lock *caller_fl,
- struct file_lock *sys_fl)
+static bool posix_locks_conflict(struct file_lock_core *caller_flc,
+ struct file_lock_core *sys_flc)
{
+ struct file_lock *caller_fl = file_lock(caller_flc);
+ struct file_lock *sys_fl = file_lock(sys_flc);
+
/* POSIX locks owned by the same process do not conflict with
* each other.
*/
- if (posix_same_owner(caller_fl, sys_fl))
+ if (posix_same_owner(caller_flc, sys_flc))
return false;
/* Check whether they overlap */
if (!locks_overlap(caller_fl, sys_fl))
return false;
- return locks_conflict(caller_fl, sys_fl);
+ return locks_conflict(caller_flc, sys_flc);
}
/* Determine if lock sys_fl blocks lock caller_fl. Used on xx_GETLK
@@ -873,28 +921,31 @@ static bool posix_locks_conflict(struct file_lock *caller_fl,
static bool posix_test_locks_conflict(struct file_lock *caller_fl,
struct file_lock *sys_fl)
{
+ struct file_lock_core *caller = &caller_fl->c;
+ struct file_lock_core *sys = &sys_fl->c;
+
/* F_UNLCK checks any locks on the same fd. */
- if (caller_fl->fl_type == F_UNLCK) {
- if (!posix_same_owner(caller_fl, sys_fl))
+ if (lock_is_unlock(caller_fl)) {
+ if (!posix_same_owner(caller, sys))
return false;
return locks_overlap(caller_fl, sys_fl);
}
- return posix_locks_conflict(caller_fl, sys_fl);
+ return posix_locks_conflict(caller, sys);
}
/* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific
* checking before calling the locks_conflict().
*/
-static bool flock_locks_conflict(struct file_lock *caller_fl,
- struct file_lock *sys_fl)
+static bool flock_locks_conflict(struct file_lock_core *caller_flc,
+ struct file_lock_core *sys_flc)
{
/* FLOCK locks referring to the same filp do not conflict with
* each other.
*/
- if (caller_fl->fl_file == sys_fl->fl_file)
+ if (caller_flc->flc_file == sys_flc->flc_file)
return false;
- return locks_conflict(caller_fl, sys_fl);
+ return locks_conflict(caller_flc, sys_flc);
}
void
@@ -908,13 +959,13 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
ctx = locks_inode_context(inode);
if (!ctx || list_empty_careful(&ctx->flc_posix)) {
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
return;
}
retry:
spin_lock(&ctx->flc_lock);
- list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
+ list_for_each_entry(cfl, &ctx->flc_posix, c.flc_list) {
if (!posix_test_locks_conflict(fl, cfl))
continue;
if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable
@@ -930,7 +981,7 @@ retry:
locks_copy_conflock(fl, cfl);
goto out;
}
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
out:
spin_unlock(&ctx->flc_lock);
return;
@@ -972,25 +1023,27 @@ EXPORT_SYMBOL(posix_test_lock);
#define MAX_DEADLK_ITERATIONS 10
-/* Find a lock that the owner of the given block_fl is blocking on. */
-static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
+/* Find a lock that the owner of the given @blocker is blocking on. */
+static struct file_lock_core *what_owner_is_waiting_for(struct file_lock_core *blocker)
{
- struct file_lock *fl;
+ struct file_lock_core *flc;
- hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) {
- if (posix_same_owner(fl, block_fl)) {
- while (fl->fl_blocker)
- fl = fl->fl_blocker;
- return fl;
+ hash_for_each_possible(blocked_hash, flc, flc_link, posix_owner_key(blocker)) {
+ if (posix_same_owner(flc, blocker)) {
+ while (flc->flc_blocker)
+ flc = flc->flc_blocker;
+ return flc;
}
}
return NULL;
}
/* Must be called with the blocked_lock_lock held! */
-static int posix_locks_deadlock(struct file_lock *caller_fl,
- struct file_lock *block_fl)
+static bool posix_locks_deadlock(struct file_lock *caller_fl,
+ struct file_lock *block_fl)
{
+ struct file_lock_core *caller = &caller_fl->c;
+ struct file_lock_core *blocker = &block_fl->c;
int i = 0;
lockdep_assert_held(&blocked_lock_lock);
@@ -999,16 +1052,16 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
* This deadlock detector can't reasonably detect deadlocks with
* FL_OFDLCK locks, since they aren't owned by a process, per-se.
*/
- if (IS_OFDLCK(caller_fl))
- return 0;
+ if (caller->flc_flags & FL_OFDLCK)
+ return false;
- while ((block_fl = what_owner_is_waiting_for(block_fl))) {
+ while ((blocker = what_owner_is_waiting_for(blocker))) {
if (i++ > MAX_DEADLK_ITERATIONS)
- return 0;
- if (posix_same_owner(caller_fl, block_fl))
- return 1;
+ return false;
+ if (posix_same_owner(caller, blocker))
+ return true;
}
- return 0;
+ return false;
}
/* Try to create a FLOCK lock on filp. We always insert new FLOCK locks
@@ -1027,14 +1080,14 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
bool found = false;
LIST_HEAD(dispose);
- ctx = locks_get_lock_context(inode, request->fl_type);
+ ctx = locks_get_lock_context(inode, request->c.flc_type);
if (!ctx) {
- if (request->fl_type != F_UNLCK)
+ if (request->c.flc_type != F_UNLCK)
return -ENOMEM;
- return (request->fl_flags & FL_EXISTS) ? -ENOENT : 0;
+ return (request->c.flc_flags & FL_EXISTS) ? -ENOENT : 0;
}
- if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
+ if (!(request->c.flc_flags & FL_ACCESS) && (request->c.flc_type != F_UNLCK)) {
new_fl = locks_alloc_lock();
if (!new_fl)
return -ENOMEM;
@@ -1042,41 +1095,41 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
- if (request->fl_flags & FL_ACCESS)
+ if (request->c.flc_flags & FL_ACCESS)
goto find_conflict;
- list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
- if (request->fl_file != fl->fl_file)
+ list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) {
+ if (request->c.flc_file != fl->c.flc_file)
continue;
- if (request->fl_type == fl->fl_type)
+ if (request->c.flc_type == fl->c.flc_type)
goto out;
found = true;
- locks_delete_lock_ctx(fl, &dispose);
+ locks_delete_lock_ctx(&fl->c, &dispose);
break;
}
- if (request->fl_type == F_UNLCK) {
- if ((request->fl_flags & FL_EXISTS) && !found)
+ if (lock_is_unlock(request)) {
+ if ((request->c.flc_flags & FL_EXISTS) && !found)
error = -ENOENT;
goto out;
}
find_conflict:
- list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
- if (!flock_locks_conflict(request, fl))
+ list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) {
+ if (!flock_locks_conflict(&request->c, &fl->c))
continue;
error = -EAGAIN;
- if (!(request->fl_flags & FL_SLEEP))
+ if (!(request->c.flc_flags & FL_SLEEP))
goto out;
error = FILE_LOCK_DEFERRED;
- locks_insert_block(fl, request, flock_locks_conflict);
+ locks_insert_block(&fl->c, &request->c, flock_locks_conflict);
goto out;
}
- if (request->fl_flags & FL_ACCESS)
+ if (request->c.flc_flags & FL_ACCESS)
goto out;
locks_copy_lock(new_fl, request);
locks_move_blocks(new_fl, request);
- locks_insert_lock_ctx(new_fl, &ctx->flc_flock);
+ locks_insert_lock_ctx(&new_fl->c, &ctx->flc_flock);
new_fl = NULL;
error = 0;
@@ -1105,9 +1158,9 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
void *owner;
void (*func)(void);
- ctx = locks_get_lock_context(inode, request->fl_type);
+ ctx = locks_get_lock_context(inode, request->c.flc_type);
if (!ctx)
- return (request->fl_type == F_UNLCK) ? 0 : -ENOMEM;
+ return lock_is_unlock(request) ? 0 : -ENOMEM;
/*
* We may need two file_lock structures for this operation,
@@ -1115,8 +1168,8 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
*
* In some cases we can be sure, that no new locks will be needed
*/
- if (!(request->fl_flags & FL_ACCESS) &&
- (request->fl_type != F_UNLCK ||
+ if (!(request->c.flc_flags & FL_ACCESS) &&
+ (request->c.flc_type != F_UNLCK ||
request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
new_fl = locks_alloc_lock();
new_fl2 = locks_alloc_lock();
@@ -1130,9 +1183,9 @@ retry:
* there are any, either return error or put the request on the
* blocker's list of waiters and the global blocked_hash.
*/
- if (request->fl_type != F_UNLCK) {
- list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
- if (!posix_locks_conflict(request, fl))
+ if (request->c.flc_type != F_UNLCK) {
+ list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) {
+ if (!posix_locks_conflict(&request->c, &fl->c))
continue;
if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable
&& (*fl->fl_lmops->lm_lock_expirable)(fl)) {
@@ -1148,7 +1201,7 @@ retry:
if (conflock)
locks_copy_conflock(conflock, fl);
error = -EAGAIN;
- if (!(request->fl_flags & FL_SLEEP))
+ if (!(request->c.flc_flags & FL_SLEEP))
goto out;
/*
* Deadlock detection and insertion into the blocked
@@ -1160,10 +1213,10 @@ retry:
* Ensure that we don't find any locks blocked on this
* request during deadlock detection.
*/
- __locks_wake_up_blocks(request);
+ __locks_wake_up_blocks(&request->c);
if (likely(!posix_locks_deadlock(request, fl))) {
error = FILE_LOCK_DEFERRED;
- __locks_insert_block(fl, request,
+ __locks_insert_block(&fl->c, &request->c,
posix_locks_conflict);
}
spin_unlock(&blocked_lock_lock);
@@ -1173,22 +1226,22 @@ retry:
/* If we're just looking for a conflict, we're done. */
error = 0;
- if (request->fl_flags & FL_ACCESS)
+ if (request->c.flc_flags & FL_ACCESS)
goto out;
/* Find the first old lock with the same owner as the new lock */
- list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
- if (posix_same_owner(request, fl))
+ list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) {
+ if (posix_same_owner(&request->c, &fl->c))
break;
}
/* Process locks with this owner. */
- list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) {
- if (!posix_same_owner(request, fl))
+ list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, c.flc_list) {
+ if (!posix_same_owner(&request->c, &fl->c))
break;
/* Detect adjacent or overlapping regions (if same lock type) */
- if (request->fl_type == fl->fl_type) {
+ if (request->c.flc_type == fl->c.flc_type) {
/* In all comparisons of start vs end, use
* "start - 1" rather than "end + 1". If end
* is OFFSET_MAX, end + 1 will become negative.
@@ -1215,7 +1268,7 @@ retry:
else
request->fl_end = fl->fl_end;
if (added) {
- locks_delete_lock_ctx(fl, &dispose);
+ locks_delete_lock_ctx(&fl->c, &dispose);
continue;
}
request = fl;
@@ -1228,7 +1281,7 @@ retry:
continue;
if (fl->fl_start > request->fl_end)
break;
- if (request->fl_type == F_UNLCK)
+ if (lock_is_unlock(request))
added = true;
if (fl->fl_start < request->fl_start)
left = fl;
@@ -1244,7 +1297,7 @@ retry:
* one (This may happen several times).
*/
if (added) {
- locks_delete_lock_ctx(fl, &dispose);
+ locks_delete_lock_ctx(&fl->c, &dispose);
continue;
}
/*
@@ -1261,8 +1314,9 @@ retry:
locks_move_blocks(new_fl, request);
request = new_fl;
new_fl = NULL;
- locks_insert_lock_ctx(request, &fl->fl_list);
- locks_delete_lock_ctx(fl, &dispose);
+ locks_insert_lock_ctx(&request->c,
+ &fl->c.flc_list);
+ locks_delete_lock_ctx(&fl->c, &dispose);
added = true;
}
}
@@ -1279,8 +1333,8 @@ retry:
error = 0;
if (!added) {
- if (request->fl_type == F_UNLCK) {
- if (request->fl_flags & FL_EXISTS)
+ if (lock_is_unlock(request)) {
+ if (request->c.flc_flags & FL_EXISTS)
error = -ENOENT;
goto out;
}
@@ -1291,7 +1345,7 @@ retry:
}
locks_copy_lock(new_fl, request);
locks_move_blocks(new_fl, request);
- locks_insert_lock_ctx(new_fl, &fl->fl_list);
+ locks_insert_lock_ctx(&new_fl->c, &fl->c.flc_list);
fl = new_fl;
new_fl = NULL;
}
@@ -1303,14 +1357,14 @@ retry:
left = new_fl2;
new_fl2 = NULL;
locks_copy_lock(left, right);
- locks_insert_lock_ctx(left, &fl->fl_list);
+ locks_insert_lock_ctx(&left->c, &fl->c.flc_list);
}
right->fl_start = request->fl_end + 1;
- locks_wake_up_blocks(right);
+ locks_wake_up_blocks(&right->c);
}
if (left) {
left->fl_end = request->fl_start - 1;
- locks_wake_up_blocks(left);
+ locks_wake_up_blocks(&left->c);
}
out:
spin_unlock(&ctx->flc_lock);
@@ -1364,8 +1418,8 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
error = posix_lock_inode(inode, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait,
- list_empty(&fl->fl_blocked_member));
+ error = wait_event_interruptible(fl->c.flc_wait,
+ list_empty(&fl->c.flc_blocked_member));
if (error)
break;
}
@@ -1373,37 +1427,37 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
return error;
}
-static void lease_clear_pending(struct file_lock *fl, int arg)
+static void lease_clear_pending(struct file_lease *fl, int arg)
{
switch (arg) {
case F_UNLCK:
- fl->fl_flags &= ~FL_UNLOCK_PENDING;
+ fl->c.flc_flags &= ~FL_UNLOCK_PENDING;
fallthrough;
case F_RDLCK:
- fl->fl_flags &= ~FL_DOWNGRADE_PENDING;
+ fl->c.flc_flags &= ~FL_DOWNGRADE_PENDING;
}
}
/* We already had a lease on this file; just change its type */
-int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
+int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose)
{
- int error = assign_type(fl, arg);
+ int error = assign_type(&fl->c, arg);
if (error)
return error;
lease_clear_pending(fl, arg);
- locks_wake_up_blocks(fl);
+ locks_wake_up_blocks(&fl->c);
if (arg == F_UNLCK) {
- struct file *filp = fl->fl_file;
+ struct file *filp = fl->c.flc_file;
f_delown(filp);
filp->f_owner.signum = 0;
- fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync);
+ fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync);
if (fl->fl_fasync != NULL) {
printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
fl->fl_fasync = NULL;
}
- locks_delete_lock_ctx(fl, dispose);
+ locks_delete_lock_ctx(&fl->c, dispose);
}
return 0;
}
@@ -1420,11 +1474,11 @@ static bool past_time(unsigned long then)
static void time_out_leases(struct inode *inode, struct list_head *dispose)
{
struct file_lock_context *ctx = inode->i_flctx;
- struct file_lock *fl, *tmp;
+ struct file_lease *fl, *tmp;
lockdep_assert_held(&ctx->flc_lock);
- list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
+ list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) {
trace_time_out_leases(inode, fl);
if (past_time(fl->fl_downgrade_time))
lease_modify(fl, F_RDLCK, dispose);
@@ -1433,38 +1487,40 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
}
}
-static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
+static bool leases_conflict(struct file_lock_core *lc, struct file_lock_core *bc)
{
bool rc;
+ struct file_lease *lease = file_lease(lc);
+ struct file_lease *breaker = file_lease(bc);
if (lease->fl_lmops->lm_breaker_owns_lease
&& lease->fl_lmops->lm_breaker_owns_lease(lease))
return false;
- if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) {
+ if ((bc->flc_flags & FL_LAYOUT) != (lc->flc_flags & FL_LAYOUT)) {
rc = false;
goto trace;
}
- if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) {
+ if ((bc->flc_flags & FL_DELEG) && (lc->flc_flags & FL_LEASE)) {
rc = false;
goto trace;
}
- rc = locks_conflict(breaker, lease);
+ rc = locks_conflict(bc, lc);
trace:
trace_leases_conflict(rc, lease, breaker);
return rc;
}
static bool
-any_leases_conflict(struct inode *inode, struct file_lock *breaker)
+any_leases_conflict(struct inode *inode, struct file_lease *breaker)
{
struct file_lock_context *ctx = inode->i_flctx;
- struct file_lock *fl;
+ struct file_lock_core *flc;
lockdep_assert_held(&ctx->flc_lock);
- list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
- if (leases_conflict(fl, breaker))
+ list_for_each_entry(flc, &ctx->flc_lease, flc_list) {
+ if (leases_conflict(flc, &breaker->c))
return true;
}
return false;
@@ -1487,7 +1543,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
int error = 0;
struct file_lock_context *ctx;
- struct file_lock *new_fl, *fl, *tmp;
+ struct file_lease *new_fl, *fl, *tmp;
unsigned long break_time;
int want_write = (mode & O_ACCMODE) != O_RDONLY;
LIST_HEAD(dispose);
@@ -1495,7 +1551,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
if (IS_ERR(new_fl))
return PTR_ERR(new_fl);
- new_fl->fl_flags = type;
+ new_fl->c.flc_flags = type;
/* typically we will check that ctx is non-NULL before calling */
ctx = locks_inode_context(inode);
@@ -1519,22 +1575,22 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
break_time++; /* so that 0 means no break time */
}
- list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
- if (!leases_conflict(fl, new_fl))
+ list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) {
+ if (!leases_conflict(&fl->c, &new_fl->c))
continue;
if (want_write) {
- if (fl->fl_flags & FL_UNLOCK_PENDING)
+ if (fl->c.flc_flags & FL_UNLOCK_PENDING)
continue;
- fl->fl_flags |= FL_UNLOCK_PENDING;
+ fl->c.flc_flags |= FL_UNLOCK_PENDING;
fl->fl_break_time = break_time;
} else {
if (lease_breaking(fl))
continue;
- fl->fl_flags |= FL_DOWNGRADE_PENDING;
+ fl->c.flc_flags |= FL_DOWNGRADE_PENDING;
fl->fl_downgrade_time = break_time;
}
if (fl->fl_lmops->lm_break(fl))
- locks_delete_lock_ctx(fl, &dispose);
+ locks_delete_lock_ctx(&fl->c, &dispose);
}
if (list_empty(&ctx->flc_lease))
@@ -1547,26 +1603,26 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
}
restart:
- fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list);
+ fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list);
break_time = fl->fl_break_time;
if (break_time != 0)
break_time -= jiffies;
if (break_time == 0)
break_time++;
- locks_insert_block(fl, new_fl, leases_conflict);
+ locks_insert_block(&fl->c, &new_fl->c, leases_conflict);
trace_break_lease_block(inode, new_fl);
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
- error = wait_event_interruptible_timeout(new_fl->fl_wait,
- list_empty(&new_fl->fl_blocked_member),
- break_time);
+ error = wait_event_interruptible_timeout(new_fl->c.flc_wait,
+ list_empty(&new_fl->c.flc_blocked_member),
+ break_time);
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
trace_break_lease_unblock(inode, new_fl);
- locks_delete_block(new_fl);
+ __locks_delete_block(&new_fl->c);
if (error >= 0) {
/*
* Wait for the next conflicting lease that has not been
@@ -1583,7 +1639,7 @@ out:
percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
free_lock:
- locks_free_lock(new_fl);
+ locks_free_lease(new_fl);
return error;
}
EXPORT_SYMBOL(__break_lease);
@@ -1601,14 +1657,14 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time)
{
bool has_lease = false;
struct file_lock_context *ctx;
- struct file_lock *fl;
+ struct file_lock_core *flc;
ctx = locks_inode_context(inode);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
spin_lock(&ctx->flc_lock);
- fl = list_first_entry_or_null(&ctx->flc_lease,
- struct file_lock, fl_list);
- if (fl && (fl->fl_type == F_WRLCK))
+ flc = list_first_entry_or_null(&ctx->flc_lease,
+ struct file_lock_core, flc_list);
+ if (flc && flc->flc_type == F_WRLCK)
has_lease = true;
spin_unlock(&ctx->flc_lock);
}
@@ -1643,7 +1699,7 @@ EXPORT_SYMBOL(lease_get_mtime);
*/
int fcntl_getlease(struct file *filp)
{
- struct file_lock *fl;
+ struct file_lease *fl;
struct inode *inode = file_inode(filp);
struct file_lock_context *ctx;
int type = F_UNLCK;
@@ -1654,8 +1710,8 @@ int fcntl_getlease(struct file *filp)
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
- list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
- if (fl->fl_file != filp)
+ list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
+ if (fl->c.flc_file != filp)
continue;
type = target_leasetype(fl);
break;
@@ -1715,12 +1771,12 @@ check_conflicting_open(struct file *filp, const int arg, int flags)
}
static int
-generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **priv)
+generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **priv)
{
- struct file_lock *fl, *my_fl = NULL, *lease;
+ struct file_lease *fl, *my_fl = NULL, *lease;
struct inode *inode = file_inode(filp);
struct file_lock_context *ctx;
- bool is_deleg = (*flp)->fl_flags & FL_DELEG;
+ bool is_deleg = (*flp)->c.flc_flags & FL_DELEG;
int error;
LIST_HEAD(dispose);
@@ -1746,7 +1802,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
- error = check_conflicting_open(filp, arg, lease->fl_flags);
+ error = check_conflicting_open(filp, arg, lease->c.flc_flags);
if (error)
goto out;
@@ -1759,9 +1815,9 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri
* except for this filp.
*/
error = -EAGAIN;
- list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
- if (fl->fl_file == filp &&
- fl->fl_owner == lease->fl_owner) {
+ list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
+ if (fl->c.flc_file == filp &&
+ fl->c.flc_owner == lease->c.flc_owner) {
my_fl = fl;
continue;
}
@@ -1776,7 +1832,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri
* Modifying our existing lease is OK, but no getting a
* new lease if someone else is opening for write:
*/
- if (fl->fl_flags & FL_UNLOCK_PENDING)
+ if (fl->c.flc_flags & FL_UNLOCK_PENDING)
goto out;
}
@@ -1792,7 +1848,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri
if (!leases_enable)
goto out;
- locks_insert_lock_ctx(lease, &ctx->flc_lease);
+ locks_insert_lock_ctx(&lease->c, &ctx->flc_lease);
/*
* The check in break_lease() is lockless. It's possible for another
* open to race in after we did the earlier check for a conflicting
@@ -1803,9 +1859,9 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri
* precedes these checks.
*/
smp_mb();
- error = check_conflicting_open(filp, arg, lease->fl_flags);
+ error = check_conflicting_open(filp, arg, lease->c.flc_flags);
if (error) {
- locks_unlink_lock_ctx(lease);
+ locks_unlink_lock_ctx(&lease->c);
goto out;
}
@@ -1826,7 +1882,7 @@ out:
static int generic_delete_lease(struct file *filp, void *owner)
{
int error = -EAGAIN;
- struct file_lock *fl, *victim = NULL;
+ struct file_lease *fl, *victim = NULL;
struct inode *inode = file_inode(filp);
struct file_lock_context *ctx;
LIST_HEAD(dispose);
@@ -1839,9 +1895,9 @@ static int generic_delete_lease(struct file *filp, void *owner)
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
- list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
- if (fl->fl_file == filp &&
- fl->fl_owner == owner) {
+ list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
+ if (fl->c.flc_file == filp &&
+ fl->c.flc_owner == owner) {
victim = fl;
break;
}
@@ -1866,21 +1922,9 @@ static int generic_delete_lease(struct file *filp, void *owner)
* The (input) flp->fl_lmops->lm_break function is required
* by break_lease().
*/
-int generic_setlease(struct file *filp, int arg, struct file_lock **flp,
+int generic_setlease(struct file *filp, int arg, struct file_lease **flp,
void **priv)
{
- struct inode *inode = file_inode(filp);
- vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(filp), inode);
- int error;
-
- if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE))
- return -EACCES;
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
- error = security_file_lock(filp, arg);
- if (error)
- return error;
-
switch (arg) {
case F_UNLCK:
return generic_delete_lease(filp, *priv);
@@ -1913,7 +1957,7 @@ lease_notifier_chain_init(void)
}
static inline void
-setlease_notifier(int arg, struct file_lock *lease)
+setlease_notifier(int arg, struct file_lease *lease)
{
if (arg != F_UNLCK)
srcu_notifier_call_chain(&lease_notifier_chain, arg, lease);
@@ -1931,6 +1975,19 @@ void lease_unregister_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(lease_unregister_notifier);
+
+int
+kernel_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv)
+{
+ if (lease)
+ setlease_notifier(arg, *lease);
+ if (filp->f_op->setlease)
+ return filp->f_op->setlease(filp, arg, lease, priv);
+ else
+ return generic_setlease(filp, arg, lease, priv);
+}
+EXPORT_SYMBOL_GPL(kernel_setlease);
+
/**
* vfs_setlease - sets a lease on an open file
* @filp: file pointer
@@ -1949,20 +2006,26 @@ EXPORT_SYMBOL_GPL(lease_unregister_notifier);
* may be NULL if the lm_setup operation doesn't require it.
*/
int
-vfs_setlease(struct file *filp, int arg, struct file_lock **lease, void **priv)
+vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv)
{
- if (lease)
- setlease_notifier(arg, *lease);
- if (filp->f_op->setlease)
- return filp->f_op->setlease(filp, arg, lease, priv);
- else
- return generic_setlease(filp, arg, lease, priv);
+ struct inode *inode = file_inode(filp);
+ vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(filp), inode);
+ int error;
+
+ if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE))
+ return -EACCES;
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+ error = security_file_lock(filp, arg);
+ if (error)
+ return error;
+ return kernel_setlease(filp, arg, lease, priv);
}
EXPORT_SYMBOL_GPL(vfs_setlease);
static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg)
{
- struct file_lock *fl;
+ struct file_lease *fl;
struct fasync_struct *new;
int error;
@@ -1972,14 +2035,14 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg)
new = fasync_alloc();
if (!new) {
- locks_free_lock(fl);
+ locks_free_lease(fl);
return -ENOMEM;
}
new->fa_fd = fd;
error = vfs_setlease(filp, arg, &fl, (void **)&new);
if (fl)
- locks_free_lock(fl);
+ locks_free_lease(fl);
if (new)
fasync_free(new);
return error;
@@ -2017,8 +2080,8 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
error = flock_lock_inode(inode, fl);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait,
- list_empty(&fl->fl_blocked_member));
+ error = wait_event_interruptible(fl->c.flc_wait,
+ list_empty(&fl->c.flc_blocked_member));
if (error)
break;
}
@@ -2036,7 +2099,7 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
int res = 0;
- switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
+ switch (fl->c.flc_flags & (FL_POSIX|FL_FLOCK)) {
case FL_POSIX:
res = posix_lock_inode_wait(inode, fl);
break;
@@ -2098,13 +2161,13 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
flock_make_lock(f.file, &fl, type);
- error = security_file_lock(f.file, fl.fl_type);
+ error = security_file_lock(f.file, fl.c.flc_type);
if (error)
goto out_putf;
can_sleep = !(cmd & LOCK_NB);
if (can_sleep)
- fl.fl_flags |= FL_SLEEP;
+ fl.c.flc_flags |= FL_SLEEP;
if (f.file->f_op->flock)
error = f.file->f_op->flock(f.file,
@@ -2130,7 +2193,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
*/
int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
- WARN_ON_ONCE(filp != fl->fl_file);
+ WARN_ON_ONCE(filp != fl->c.flc_file);
if (filp->f_op->lock)
return filp->f_op->lock(filp, F_GETLK, fl);
posix_test_lock(filp, fl);
@@ -2145,25 +2208,28 @@ EXPORT_SYMBOL_GPL(vfs_test_lock);
*
* Used to translate a fl_pid into a namespace virtual pid number
*/
-static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns)
+static pid_t locks_translate_pid(struct file_lock_core *fl, struct pid_namespace *ns)
{
pid_t vnr;
struct pid *pid;
- if (IS_OFDLCK(fl))
+ if (fl->flc_flags & FL_OFDLCK)
return -1;
- if (IS_REMOTELCK(fl))
- return fl->fl_pid;
+
+ /* Remote locks report a negative pid value */
+ if (fl->flc_pid <= 0)
+ return fl->flc_pid;
+
/*
* If the flock owner process is dead and its pid has been already
* freed, the translation below won't work, but we still want to show
* flock owner pid number in init pidns.
*/
if (ns == &init_pid_ns)
- return (pid_t)fl->fl_pid;
+ return (pid_t) fl->flc_pid;
rcu_read_lock();
- pid = find_pid_ns(fl->fl_pid, &init_pid_ns);
+ pid = find_pid_ns(fl->flc_pid, &init_pid_ns);
vnr = pid_nr_ns(pid, ns);
rcu_read_unlock();
return vnr;
@@ -2171,7 +2237,7 @@ static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns)
static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
{
- flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current));
+ flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current));
#if BITS_PER_LONG == 32
/*
* Make sure we can represent the posix lock via
@@ -2186,19 +2252,19 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
fl->fl_end - fl->fl_start + 1;
flock->l_whence = 0;
- flock->l_type = fl->fl_type;
+ flock->l_type = fl->c.flc_type;
return 0;
}
#if BITS_PER_LONG == 32
static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
{
- flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current));
+ flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current));
flock->l_start = fl->fl_start;
flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
fl->fl_end - fl->fl_start + 1;
flock->l_whence = 0;
- flock->l_type = fl->fl_type;
+ flock->l_type = fl->c.flc_type;
}
#endif
@@ -2227,16 +2293,16 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock)
if (flock->l_pid != 0)
goto out;
- fl->fl_flags |= FL_OFDLCK;
- fl->fl_owner = filp;
+ fl->c.flc_flags |= FL_OFDLCK;
+ fl->c.flc_owner = filp;
}
error = vfs_test_lock(filp, fl);
if (error)
goto out;
- flock->l_type = fl->fl_type;
- if (fl->fl_type != F_UNLCK) {
+ flock->l_type = fl->c.flc_type;
+ if (fl->c.flc_type != F_UNLCK) {
error = posix_lock_to_flock(flock, fl);
if (error)
goto out;
@@ -2283,7 +2349,7 @@ out:
*/
int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
{
- WARN_ON_ONCE(filp != fl->fl_file);
+ WARN_ON_ONCE(filp != fl->c.flc_file);
if (filp->f_op->lock)
return filp->f_op->lock(filp, cmd, fl);
else
@@ -2296,7 +2362,7 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
{
int error;
- error = security_file_lock(filp, fl->fl_type);
+ error = security_file_lock(filp, fl->c.flc_type);
if (error)
return error;
@@ -2304,8 +2370,8 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
error = vfs_lock_file(filp, cmd, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait,
- list_empty(&fl->fl_blocked_member));
+ error = wait_event_interruptible(fl->c.flc_wait,
+ list_empty(&fl->c.flc_blocked_member));
if (error)
break;
}
@@ -2318,13 +2384,13 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
static int
check_fmode_for_setlk(struct file_lock *fl)
{
- switch (fl->fl_type) {
+ switch (fl->c.flc_type) {
case F_RDLCK:
- if (!(fl->fl_file->f_mode & FMODE_READ))
+ if (!(fl->c.flc_file->f_mode & FMODE_READ))
return -EBADF;
break;
case F_WRLCK:
- if (!(fl->fl_file->f_mode & FMODE_WRITE))
+ if (!(fl->c.flc_file->f_mode & FMODE_WRITE))
return -EBADF;
}
return 0;
@@ -2363,8 +2429,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
cmd = F_SETLK;
- file_lock->fl_flags |= FL_OFDLCK;
- file_lock->fl_owner = filp;
+ file_lock->c.flc_flags |= FL_OFDLCK;
+ file_lock->c.flc_owner = filp;
break;
case F_OFD_SETLKW:
error = -EINVAL;
@@ -2372,11 +2438,11 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
cmd = F_SETLKW;
- file_lock->fl_flags |= FL_OFDLCK;
- file_lock->fl_owner = filp;
+ file_lock->c.flc_flags |= FL_OFDLCK;
+ file_lock->c.flc_owner = filp;
fallthrough;
case F_SETLKW:
- file_lock->fl_flags |= FL_SLEEP;
+ file_lock->c.flc_flags |= FL_SLEEP;
}
error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2386,8 +2452,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
* lock that was just acquired. There is no need to do that when we're
* unlocking though, or for OFD locks.
*/
- if (!error && file_lock->fl_type != F_UNLCK &&
- !(file_lock->fl_flags & FL_OFDLCK)) {
+ if (!error && file_lock->c.flc_type != F_UNLCK &&
+ !(file_lock->c.flc_flags & FL_OFDLCK)) {
struct files_struct *files = current->files;
/*
* We need that spin_lock here - it prevents reordering between
@@ -2398,7 +2464,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
f = files_lookup_fd_locked(files, fd);
spin_unlock(&files->file_lock);
if (f != filp) {
- file_lock->fl_type = F_UNLCK;
+ file_lock->c.flc_type = F_UNLCK;
error = do_lock_file_wait(filp, cmd, file_lock);
WARN_ON_ONCE(error);
error = -EBADF;
@@ -2437,16 +2503,16 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock)
if (flock->l_pid != 0)
goto out;
- fl->fl_flags |= FL_OFDLCK;
- fl->fl_owner = filp;
+ fl->c.flc_flags |= FL_OFDLCK;
+ fl->c.flc_owner = filp;
}
error = vfs_test_lock(filp, fl);
if (error)
goto out;
- flock->l_type = fl->fl_type;
- if (fl->fl_type != F_UNLCK)
+ flock->l_type = fl->c.flc_type;
+ if (fl->c.flc_type != F_UNLCK)
posix_lock_to_flock64(flock, fl);
out:
@@ -2486,8 +2552,8 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
cmd = F_SETLK64;
- file_lock->fl_flags |= FL_OFDLCK;
- file_lock->fl_owner = filp;
+ file_lock->c.flc_flags |= FL_OFDLCK;
+ file_lock->c.flc_owner = filp;
break;
case F_OFD_SETLKW:
error = -EINVAL;
@@ -2495,11 +2561,11 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
cmd = F_SETLKW64;
- file_lock->fl_flags |= FL_OFDLCK;
- file_lock->fl_owner = filp;
+ file_lock->c.flc_flags |= FL_OFDLCK;
+ file_lock->c.flc_owner = filp;
fallthrough;
case F_SETLKW64:
- file_lock->fl_flags |= FL_SLEEP;
+ file_lock->c.flc_flags |= FL_SLEEP;
}
error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2509,8 +2575,8 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
* lock that was just acquired. There is no need to do that when we're
* unlocking though, or for OFD locks.
*/
- if (!error && file_lock->fl_type != F_UNLCK &&
- !(file_lock->fl_flags & FL_OFDLCK)) {
+ if (!error && file_lock->c.flc_type != F_UNLCK &&
+ !(file_lock->c.flc_flags & FL_OFDLCK)) {
struct files_struct *files = current->files;
/*
* We need that spin_lock here - it prevents reordering between
@@ -2521,7 +2587,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
f = files_lookup_fd_locked(files, fd);
spin_unlock(&files->file_lock);
if (f != filp) {
- file_lock->fl_type = F_UNLCK;
+ file_lock->c.flc_type = F_UNLCK;
error = do_lock_file_wait(filp, cmd, file_lock);
WARN_ON_ONCE(error);
error = -EBADF;
@@ -2555,13 +2621,13 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
return;
locks_init_lock(&lock);
- lock.fl_type = F_UNLCK;
- lock.fl_flags = FL_POSIX | FL_CLOSE;
+ lock.c.flc_type = F_UNLCK;
+ lock.c.flc_flags = FL_POSIX | FL_CLOSE;
lock.fl_start = 0;
lock.fl_end = OFFSET_MAX;
- lock.fl_owner = owner;
- lock.fl_pid = current->tgid;
- lock.fl_file = filp;
+ lock.c.flc_owner = owner;
+ lock.c.flc_pid = current->tgid;
+ lock.c.flc_file = filp;
lock.fl_ops = NULL;
lock.fl_lmops = NULL;
@@ -2584,7 +2650,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
return;
flock_make_lock(filp, &fl, F_UNLCK);
- fl.fl_flags |= FL_CLOSE;
+ fl.c.flc_flags |= FL_CLOSE;
if (filp->f_op->flock)
filp->f_op->flock(filp, F_SETLKW, &fl);
@@ -2599,7 +2665,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
static void
locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
{
- struct file_lock *fl, *tmp;
+ struct file_lease *fl, *tmp;
LIST_HEAD(dispose);
if (list_empty(&ctx->flc_lease))
@@ -2607,8 +2673,8 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
- list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
- if (filp == fl->fl_file)
+ list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list)
+ if (filp == fl->c.flc_file)
lease_modify(fl, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
@@ -2652,7 +2718,7 @@ void locks_remove_file(struct file *filp)
*/
int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
- WARN_ON_ONCE(filp != fl->fl_file);
+ WARN_ON_ONCE(filp != fl->c.flc_file);
if (filp->f_op->lock)
return filp->f_op->lock(filp, F_CANCELLK, fl);
return 0;
@@ -2691,69 +2757,73 @@ struct locks_iterator {
loff_t li_pos;
};
-static void lock_get_status(struct seq_file *f, struct file_lock *fl,
+static void lock_get_status(struct seq_file *f, struct file_lock_core *flc,
loff_t id, char *pfx, int repeat)
{
struct inode *inode = NULL;
- unsigned int fl_pid;
+ unsigned int pid;
struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
- int type;
+ int type = flc->flc_type;
+ struct file_lock *fl = file_lock(flc);
+
+ pid = locks_translate_pid(flc, proc_pidns);
- fl_pid = locks_translate_pid(fl, proc_pidns);
/*
* If lock owner is dead (and pid is freed) or not visible in current
* pidns, zero is shown as a pid value. Check lock info from
* init_pid_ns to get saved lock pid value.
*/
-
- if (fl->fl_file != NULL)
- inode = file_inode(fl->fl_file);
+ if (flc->flc_file != NULL)
+ inode = file_inode(flc->flc_file);
seq_printf(f, "%lld: ", id);
if (repeat)
seq_printf(f, "%*s", repeat - 1 + (int)strlen(pfx), pfx);
- if (IS_POSIX(fl)) {
- if (fl->fl_flags & FL_ACCESS)
+ if (flc->flc_flags & FL_POSIX) {
+ if (flc->flc_flags & FL_ACCESS)
seq_puts(f, "ACCESS");
- else if (IS_OFDLCK(fl))
+ else if (flc->flc_flags & FL_OFDLCK)
seq_puts(f, "OFDLCK");
else
seq_puts(f, "POSIX ");
seq_printf(f, " %s ",
(inode == NULL) ? "*NOINODE*" : "ADVISORY ");
- } else if (IS_FLOCK(fl)) {
+ } else if (flc->flc_flags & FL_FLOCK) {
seq_puts(f, "FLOCK ADVISORY ");
- } else if (IS_LEASE(fl)) {
- if (fl->fl_flags & FL_DELEG)
+ } else if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) {
+ struct file_lease *lease = file_lease(flc);
+
+ type = target_leasetype(lease);
+
+ if (flc->flc_flags & FL_DELEG)
seq_puts(f, "DELEG ");
else
seq_puts(f, "LEASE ");
- if (lease_breaking(fl))
+ if (lease_breaking(lease))
seq_puts(f, "BREAKING ");
- else if (fl->fl_file)
+ else if (flc->flc_file)
seq_puts(f, "ACTIVE ");
else
seq_puts(f, "BREAKER ");
} else {
seq_puts(f, "UNKNOWN UNKNOWN ");
}
- type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
(type == F_RDLCK) ? "READ" : "UNLCK");
if (inode) {
/* userspace relies on this representation of dev_t */
- seq_printf(f, "%d %02x:%02x:%lu ", fl_pid,
+ seq_printf(f, "%d %02x:%02x:%lu ", pid,
MAJOR(inode->i_sb->s_dev),
MINOR(inode->i_sb->s_dev), inode->i_ino);
} else {
- seq_printf(f, "%d <none>:0 ", fl_pid);
+ seq_printf(f, "%d <none>:0 ", pid);
}
- if (IS_POSIX(fl)) {
+ if (flc->flc_flags & FL_POSIX) {
if (fl->fl_end == OFFSET_MAX)
seq_printf(f, "%Ld EOF\n", fl->fl_start);
else
@@ -2763,17 +2833,18 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
}
}
-static struct file_lock *get_next_blocked_member(struct file_lock *node)
+static struct file_lock_core *get_next_blocked_member(struct file_lock_core *node)
{
- struct file_lock *tmp;
+ struct file_lock_core *tmp;
/* NULL node or root node */
- if (node == NULL || node->fl_blocker == NULL)
+ if (node == NULL || node->flc_blocker == NULL)
return NULL;
/* Next member in the linked list could be itself */
- tmp = list_next_entry(node, fl_blocked_member);
- if (list_entry_is_head(tmp, &node->fl_blocker->fl_blocked_requests, fl_blocked_member)
+ tmp = list_next_entry(node, flc_blocked_member);
+ if (list_entry_is_head(tmp, &node->flc_blocker->flc_blocked_requests,
+ flc_blocked_member)
|| tmp == node) {
return NULL;
}
@@ -2784,18 +2855,18 @@ static struct file_lock *get_next_blocked_member(struct file_lock *node)
static int locks_show(struct seq_file *f, void *v)
{
struct locks_iterator *iter = f->private;
- struct file_lock *cur, *tmp;
+ struct file_lock_core *cur, *tmp;
struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
int level = 0;
- cur = hlist_entry(v, struct file_lock, fl_link);
+ cur = hlist_entry(v, struct file_lock_core, flc_link);
if (locks_translate_pid(cur, proc_pidns) == 0)
return 0;
- /* View this crossed linked list as a binary tree, the first member of fl_blocked_requests
- * is the left child of current node, the next silibing in fl_blocked_member is the
- * right child, we can alse get the parent of current node from fl_blocker, so this
+ /* View this crossed linked list as a binary tree, the first member of flc_blocked_requests
+ * is the left child of current node, the next silibing in flc_blocked_member is the
+ * right child, we can alse get the parent of current node from flc_blocker, so this
* question becomes traversal of a binary tree
*/
while (cur != NULL) {
@@ -2804,17 +2875,18 @@ static int locks_show(struct seq_file *f, void *v)
else
lock_get_status(f, cur, iter->li_pos, "", level);
- if (!list_empty(&cur->fl_blocked_requests)) {
+ if (!list_empty(&cur->flc_blocked_requests)) {
/* Turn left */
- cur = list_first_entry_or_null(&cur->fl_blocked_requests,
- struct file_lock, fl_blocked_member);
+ cur = list_first_entry_or_null(&cur->flc_blocked_requests,
+ struct file_lock_core,
+ flc_blocked_member);
level++;
} else {
/* Turn right */
tmp = get_next_blocked_member(cur);
/* Fall back to parent node */
- while (tmp == NULL && cur->fl_blocker != NULL) {
- cur = cur->fl_blocker;
+ while (tmp == NULL && cur->flc_blocker != NULL) {
+ cur = cur->flc_blocker;
level--;
tmp = get_next_blocked_member(cur);
}
@@ -2829,14 +2901,13 @@ static void __show_fd_locks(struct seq_file *f,
struct list_head *head, int *id,
struct file *filp, struct files_struct *files)
{
- struct file_lock *fl;
+ struct file_lock_core *fl;
- list_for_each_entry(fl, head, fl_list) {
+ list_for_each_entry(fl, head, flc_list) {
- if (filp != fl->fl_file)
+ if (filp != fl->flc_file)
continue;
- if (fl->fl_owner != files &&
- fl->fl_owner != filp)
+ if (fl->flc_owner != files && fl->flc_owner != filp)
continue;
(*id)++;
@@ -2915,6 +2986,9 @@ static int __init filelock_init(void)
filelock_cache = kmem_cache_create("file_lock_cache",
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
+ filelease_cache = kmem_cache_create("file_lock_cache",
+ sizeof(struct file_lease), 0, SLAB_PANIC, NULL);
+
for_each_possible_cpu(i) {
struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 82aa7a35db26..e60a840999aa 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -426,9 +426,7 @@ EXPORT_SYMBOL(mb_cache_destroy);
static int __init mbcache_init(void)
{
- mb_entry_cache = kmem_cache_create("mbcache",
- sizeof(struct mb_cache_entry), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
+ mb_entry_cache = KMEM_CACHE(mb_cache_entry, SLAB_RECLAIM_ACCOUNT);
if (!mb_entry_cache)
return -ENOMEM;
return 0;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 73f37f298087..7cbd2b9f4d11 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -87,7 +87,7 @@ static int __init init_inodecache(void)
minix_inode_cachep = kmem_cache_create("minix_inode_cache",
sizeof(struct minix_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
init_once);
if (minix_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 64c5205e2b5e..3c60f1eaca61 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -214,7 +214,7 @@ static int copy_mnt_idmap(struct uid_gid_map *map_from,
* anything at all.
*/
if (nr_extents == 0)
- return 0;
+ return -EINVAL;
/*
* Here we know that nr_extents is greater than zero which means
diff --git a/fs/mpage.c b/fs/mpage.c
index 738882e0766d..fa8b99a199fa 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -605,6 +605,7 @@ alloc_new:
GFP_NOFS);
bio->bi_iter.bi_sector = first_block << (blkbits - 9);
wbc_init_bio(wbc, bio);
+ bio->bi_write_hint = inode->i_write_hint;
}
/*
diff --git a/fs/namei.c b/fs/namei.c
index 4e0de939fea1..d0c4a3e9278e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1717,7 +1717,11 @@ static inline int may_lookup(struct mnt_idmap *idmap,
{
if (nd->flags & LOOKUP_RCU) {
int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
- if (err != -ECHILD || !try_to_unlazy(nd))
+ if (!err) // success, keep going
+ return 0;
+ if (!try_to_unlazy(nd))
+ return -ECHILD; // redo it all non-lazy
+ if (err != -ECHILD) // hard error
return err;
}
return inode_permission(idmap, nd->inode, MAY_EXEC);
@@ -2676,10 +2680,8 @@ static int lookup_one_common(struct mnt_idmap *idmap,
if (!len)
return -EACCES;
- if (unlikely(name[0] == '.')) {
- if (len < 2 || (len == 2 && name[1] == '.'))
- return -EACCES;
- }
+ if (is_dot_dotdot(name, len))
+ return -EACCES;
while (len--) {
unsigned int c = *(const unsigned char *)name++;
diff --git a/fs/namespace.c b/fs/namespace.c
index ef1fd6829814..5a51315c6678 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4472,10 +4472,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
/*
* If this is an attached mount make sure it's located in the callers
* mount namespace. If it's not don't let the caller interact with it.
- * If this is a detached mount make sure it has an anonymous mount
- * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE.
+ *
+ * If this mount doesn't have a parent it's most often simply a
+ * detached mount with an anonymous mount namespace. IOW, something
+ * that's simply not attached yet. But there are apparently also users
+ * that do change mount properties on the rootfs itself. That obviously
+ * neither has a parent nor is it a detached mount so we cannot
+ * unconditionally check for detached mounts.
*/
- if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns)))
+ if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt))
goto out;
/*
@@ -5042,13 +5047,12 @@ static struct mount *listmnt_next(struct mount *curr)
return node_to_mount(rb_next(&curr->mnt_node));
}
-static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id,
- u64 __user *buf, size_t bufsize,
- const struct path *root)
+static ssize_t do_listmount(struct mount *first, struct path *orig,
+ u64 mnt_parent_id, u64 __user *mnt_ids,
+ size_t nr_mnt_ids, const struct path *root)
{
struct mount *r;
- ssize_t ctr;
- int err;
+ ssize_t ret;
/*
* Don't trigger audit denials. We just want to determine what
@@ -5058,50 +5062,57 @@ static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id,
!ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
return -EPERM;
- err = security_sb_statfs(orig->dentry);
- if (err)
- return err;
+ ret = security_sb_statfs(orig->dentry);
+ if (ret)
+ return ret;
- for (ctr = 0, r = first; r && ctr < bufsize; r = listmnt_next(r)) {
- if (r->mnt_id_unique == mnt_id)
+ for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r)) {
+ if (r->mnt_id_unique == mnt_parent_id)
continue;
if (!is_path_reachable(r, r->mnt.mnt_root, orig))
continue;
- ctr = array_index_nospec(ctr, bufsize);
- if (put_user(r->mnt_id_unique, buf + ctr))
+ if (put_user(r->mnt_id_unique, mnt_ids))
return -EFAULT;
- if (check_add_overflow(ctr, 1, &ctr))
- return -ERANGE;
+ mnt_ids++;
+ nr_mnt_ids--;
+ ret++;
}
- return ctr;
+ return ret;
}
-SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
- u64 __user *, buf, size_t, bufsize, unsigned int, flags)
+SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *,
+ mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
{
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
struct mnt_id_req kreq;
struct mount *first;
struct path root, orig;
- u64 mnt_id, last_mnt_id;
+ u64 mnt_parent_id, last_mnt_id;
+ const size_t maxcount = (size_t)-1 >> 3;
ssize_t ret;
if (flags)
return -EINVAL;
+ if (unlikely(nr_mnt_ids > maxcount))
+ return -EFAULT;
+
+ if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
+ return -EFAULT;
+
ret = copy_mnt_id_req(req, &kreq);
if (ret)
return ret;
- mnt_id = kreq.mnt_id;
+ mnt_parent_id = kreq.mnt_id;
last_mnt_id = kreq.param;
down_read(&namespace_sem);
get_fs_root(current->fs, &root);
- if (mnt_id == LSMT_ROOT) {
+ if (mnt_parent_id == LSMT_ROOT) {
orig = root;
} else {
ret = -ENOENT;
- orig.mnt = lookup_mnt_in_ns(mnt_id, ns);
+ orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
if (!orig.mnt)
goto err;
orig.dentry = orig.mnt->mnt_root;
@@ -5111,7 +5122,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
else
first = mnt_find_id_at(ns, last_mnt_id + 1);
- ret = do_listmount(first, &orig, mnt_id, buf, bufsize, &root);
+ ret = do_listmount(first, &orig, mnt_parent_id, mnt_ids, nr_mnt_ids, &root);
err:
path_put(&root);
up_read(&namespace_sem);
diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig
index b4db21022cb4..bec805e0c44c 100644
--- a/fs/netfs/Kconfig
+++ b/fs/netfs/Kconfig
@@ -21,3 +21,42 @@ config NETFS_STATS
multi-CPU system these may be on cachelines that keep bouncing
between CPUs. On the other hand, the stats are very useful for
debugging purposes. Saying 'Y' here is recommended.
+
+config FSCACHE
+ bool "General filesystem local caching manager"
+ depends on NETFS_SUPPORT
+ help
+ This option enables a generic filesystem caching manager that can be
+ used by various network and other filesystems to cache data locally.
+ Different sorts of caches can be plugged in, depending on the
+ resources available.
+
+ See Documentation/filesystems/caching/fscache.rst for more information.
+
+config FSCACHE_STATS
+ bool "Gather statistical information on local caching"
+ depends on FSCACHE && PROC_FS
+ select NETFS_STATS
+ help
+ This option causes statistical information to be gathered on local
+ caching and exported through file:
+
+ /proc/fs/fscache/stats
+
+ The gathering of statistics adds a certain amount of overhead to
+ execution as there are a quite a few stats gathered, and on a
+ multi-CPU system these may be on cachelines that keep bouncing
+ between CPUs. On the other hand, the stats are very useful for
+ debugging purposes. Saying 'Y' here is recommended.
+
+ See Documentation/filesystems/caching/fscache.rst for more information.
+
+config FSCACHE_DEBUG
+ bool "Debug FS-Cache"
+ depends on FSCACHE
+ help
+ This permits debugging to be dynamically enabled in the local caching
+ management module. If this is set, the debugging output may be
+ enabled by setting bits in /sys/modules/fscache/parameter/debug.
+
+ See Documentation/filesystems/caching/fscache.rst for more information.
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index 386d6fb92793..d4d1d799819e 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -2,11 +2,29 @@
netfs-y := \
buffered_read.o \
+ buffered_write.o \
+ direct_read.o \
+ direct_write.o \
io.o \
iterator.o \
+ locking.o \
main.o \
- objects.o
+ misc.o \
+ objects.o \
+ output.o
netfs-$(CONFIG_NETFS_STATS) += stats.o
-obj-$(CONFIG_NETFS_SUPPORT) := netfs.o
+netfs-$(CONFIG_FSCACHE) += \
+ fscache_cache.o \
+ fscache_cookie.o \
+ fscache_io.o \
+ fscache_main.o \
+ fscache_volume.o
+
+ifeq ($(CONFIG_PROC_FS),y)
+netfs-$(CONFIG_FSCACHE) += fscache_proc.o
+endif
+netfs-$(CONFIG_FSCACHE_STATS) += fscache_stats.o
+
+obj-$(CONFIG_NETFS_SUPPORT) += netfs.o
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 2cd3ccf4c439..3298c29b5548 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -16,6 +16,7 @@
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
+ struct netfs_folio *finfo;
struct folio *folio;
pgoff_t start_page = rreq->start / PAGE_SIZE;
pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
@@ -63,6 +64,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
break;
}
if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
folio_start_fscache(folio);
folio_started = true;
}
@@ -86,11 +88,20 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
if (!pg_failed) {
flush_dcache_folio(folio);
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
+ if (finfo->netfs_group)
+ folio_change_private(folio, finfo->netfs_group);
+ else
+ folio_detach_private(folio);
+ kfree(finfo);
+ }
folio_mark_uptodate(folio);
}
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
- if (folio_index(folio) == rreq->no_unlock_folio &&
+ if (folio->index == rreq->no_unlock_folio &&
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
_debug("no unlock");
else
@@ -147,6 +158,15 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq,
}
}
+/*
+ * Begin an operation, and fetch the stored zero point value from the cookie if
+ * available.
+ */
+static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
+{
+ return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
+}
+
/**
* netfs_readahead - Helper to manage a read request
* @ractl: The description of the readahead request
@@ -180,11 +200,9 @@ void netfs_readahead(struct readahead_control *ractl)
if (IS_ERR(rreq))
return;
- if (ctx->ops->begin_cache_operation) {
- ret = ctx->ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto cleanup_free;
- }
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto cleanup_free;
netfs_stat(&netfs_n_rh_readahead);
trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
@@ -192,6 +210,10 @@ void netfs_readahead(struct readahead_control *ractl)
netfs_rreq_expand(rreq, ractl);
+ /* Set up the output buffer */
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
+ rreq->start, rreq->len);
+
/* Drop the refs on the folios here rather than in the cache or
* filesystem. The locks will be dropped in netfs_rreq_unlock().
*/
@@ -199,6 +221,7 @@ void netfs_readahead(struct readahead_control *ractl)
;
netfs_begin_read(rreq, false);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
return;
cleanup_free:
@@ -223,12 +246,13 @@ EXPORT_SYMBOL(netfs_readahead);
*/
int netfs_read_folio(struct file *file, struct folio *folio)
{
- struct address_space *mapping = folio_file_mapping(folio);
+ struct address_space *mapping = folio->mapping;
struct netfs_io_request *rreq;
struct netfs_inode *ctx = netfs_inode(mapping->host);
+ struct folio *sink = NULL;
int ret;
- _enter("%lx", folio_index(folio));
+ _enter("%lx", folio->index);
rreq = netfs_alloc_request(mapping, file,
folio_file_pos(folio), folio_size(folio),
@@ -238,15 +262,64 @@ int netfs_read_folio(struct file *file, struct folio *folio)
goto alloc_error;
}
- if (ctx->ops->begin_cache_operation) {
- ret = ctx->ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto discard;
- }
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto discard;
netfs_stat(&netfs_n_rh_readpage);
trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
- return netfs_begin_read(rreq, true);
+
+ /* Set up the output buffer */
+ if (folio_test_dirty(folio)) {
+ /* Handle someone trying to read from an unflushed streaming
+ * write. We fiddle the buffer so that a gap at the beginning
+ * and/or a gap at the end get copied to, but the middle is
+ * discarded.
+ */
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ struct bio_vec *bvec;
+ unsigned int from = finfo->dirty_offset;
+ unsigned int to = from + finfo->dirty_len;
+ unsigned int off = 0, i = 0;
+ size_t flen = folio_size(folio);
+ size_t nr_bvec = flen / PAGE_SIZE + 2;
+ size_t part;
+
+ ret = -ENOMEM;
+ bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
+ if (!bvec)
+ goto discard;
+
+ sink = folio_alloc(GFP_KERNEL, 0);
+ if (!sink)
+ goto discard;
+
+ trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
+
+ rreq->direct_bv = bvec;
+ rreq->direct_bv_count = nr_bvec;
+ if (from > 0) {
+ bvec_set_folio(&bvec[i++], folio, from, 0);
+ off = from;
+ }
+ while (off < to) {
+ part = min_t(size_t, to - off, PAGE_SIZE);
+ bvec_set_folio(&bvec[i++], sink, part, 0);
+ off += part;
+ }
+ if (to < flen)
+ bvec_set_folio(&bvec[i++], folio, flen - to, to);
+ iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
+ } else {
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+ rreq->start, rreq->len);
+ }
+
+ ret = netfs_begin_read(rreq, true);
+ if (sink)
+ folio_put(sink);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ return ret < 0 ? ret : 0;
discard:
netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
@@ -387,14 +460,12 @@ retry:
ret = PTR_ERR(rreq);
goto error;
}
- rreq->no_unlock_folio = folio_index(folio);
+ rreq->no_unlock_folio = folio->index;
__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
- if (ctx->ops->begin_cache_operation) {
- ret = ctx->ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto error_put;
- }
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto error_put;
netfs_stat(&netfs_n_rh_write_begin);
trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
@@ -405,6 +476,10 @@ retry:
ractl._nr_pages = folio_nr_pages(folio);
netfs_rreq_expand(rreq, &ractl);
+ /* Set up the output buffer */
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+ rreq->start, rreq->len);
+
/* We hold the folio locks, so we can drop the references */
folio_get(folio);
while (readahead_folio(&ractl))
@@ -413,6 +488,7 @@ retry:
ret = netfs_begin_read(rreq, true);
if (ret < 0)
goto error;
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
have_folio:
ret = folio_wait_fscache_killable(folio);
@@ -434,3 +510,124 @@ error:
return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
+
+/*
+ * Preload the data into a page we're proposing to write into.
+ */
+int netfs_prefetch_for_write(struct file *file, struct folio *folio,
+ size_t offset, size_t len)
+{
+ struct netfs_io_request *rreq;
+ struct address_space *mapping = folio->mapping;
+ struct netfs_inode *ctx = netfs_inode(mapping->host);
+ unsigned long long start = folio_pos(folio);
+ size_t flen = folio_size(folio);
+ int ret;
+
+ _enter("%zx @%llx", flen, start);
+
+ ret = -ENOMEM;
+
+ rreq = netfs_alloc_request(mapping, file, start, flen,
+ NETFS_READ_FOR_WRITE);
+ if (IS_ERR(rreq)) {
+ ret = PTR_ERR(rreq);
+ goto error;
+ }
+
+ rreq->no_unlock_folio = folio->index;
+ __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto error_put;
+
+ netfs_stat(&netfs_n_rh_write_begin);
+ trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);
+
+ /* Set up the output buffer */
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+ rreq->start, rreq->len);
+
+ ret = netfs_begin_read(rreq, true);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ return ret;
+
+error_put:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/**
+ * netfs_buffered_read_iter - Filesystem buffered I/O read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the ->read_iter() routine for all filesystems that can use the page
+ * cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
+ * returned when no data can be read without waiting for I/O requests to
+ * complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
+ * shall be made for the read or for readahead. When no data can be read,
+ * -EAGAIN shall be returned. When readahead would be triggered, a partial,
+ * possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct netfs_inode *ictx = netfs_inode(inode);
+ ssize_t ret;
+
+ if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) ||
+ test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)))
+ return -EINVAL;
+
+ ret = netfs_start_io_read(inode);
+ if (ret == 0) {
+ ret = filemap_read(iocb, iter, 0);
+ netfs_end_io_read(inode);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(netfs_buffered_read_iter);
+
+/**
+ * netfs_file_read_iter - Generic filesystem read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the ->read_iter() routine for all filesystems that can use the page
+ * cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
+ * returned when no data can be read without waiting for I/O requests to
+ * complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
+ * shall be made for the read or for readahead. When no data can be read,
+ * -EAGAIN shall be returned. When readahead would be triggered, a partial,
+ * possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host);
+
+ if ((iocb->ki_flags & IOCB_DIRECT) ||
+ test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
+ return netfs_unbuffered_read_iter(iocb, iter);
+
+ return netfs_buffered_read_iter(iocb, iter);
+}
+EXPORT_SYMBOL(netfs_file_read_iter);
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
new file mode 100644
index 000000000000..9a0d32e4b422
--- /dev/null
+++ b/fs/netfs/buffered_write.c
@@ -0,0 +1,1257 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+/*
+ * Determined write method. Adjust netfs_folio_traces if this is changed.
+ */
+enum netfs_how_to_modify {
+ NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */
+ NETFS_JUST_PREFETCH, /* We have to read the folio anyway */
+ NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */
+ NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */
+ NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */
+ NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */
+ NETFS_FLUSH_CONTENT, /* Flush incompatible content. */
+};
+
+static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);
+
+static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
+{
+ if (netfs_group && !folio_get_private(folio))
+ folio_attach_private(folio, netfs_get_group(netfs_group));
+}
+
+#if IS_ENABLED(CONFIG_FSCACHE)
+static void netfs_folio_start_fscache(bool caching, struct folio *folio)
+{
+ if (caching)
+ folio_start_fscache(folio);
+}
+#else
+static void netfs_folio_start_fscache(bool caching, struct folio *folio)
+{
+}
+#endif
+
+/*
+ * Decide how we should modify a folio. We might be attempting to do
+ * write-streaming, in which case we don't want to a local RMW cycle if we can
+ * avoid it. If we're doing local caching or content crypto, we award that
+ * priority over avoiding RMW. If the file is open readably, then we also
+ * assume that we may want to read what we wrote.
+ */
+static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
+ struct file *file,
+ struct folio *folio,
+ void *netfs_group,
+ size_t flen,
+ size_t offset,
+ size_t len,
+ bool maybe_trouble)
+{
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ loff_t pos = folio_file_pos(folio);
+
+ _enter("");
+
+ if (netfs_folio_group(folio) != netfs_group)
+ return NETFS_FLUSH_CONTENT;
+
+ if (folio_test_uptodate(folio))
+ return NETFS_FOLIO_IS_UPTODATE;
+
+ if (pos >= ctx->zero_point)
+ return NETFS_MODIFY_AND_CLEAR;
+
+ if (!maybe_trouble && offset == 0 && len >= flen)
+ return NETFS_WHOLE_FOLIO_MODIFY;
+
+ if (file->f_mode & FMODE_READ)
+ goto no_write_streaming;
+ if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
+ goto no_write_streaming;
+
+ if (netfs_is_cache_enabled(ctx)) {
+ /* We don't want to get a streaming write on a file that loses
+ * caching service temporarily because the backing store got
+ * culled.
+ */
+ if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
+ set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags);
+ goto no_write_streaming;
+ }
+
+ if (!finfo)
+ return NETFS_STREAMING_WRITE;
+
+ /* We can continue a streaming write only if it continues on from the
+ * previous. If it overlaps, we must flush lest we suffer a partial
+ * copy and disjoint dirty regions.
+ */
+ if (offset == finfo->dirty_offset + finfo->dirty_len)
+ return NETFS_STREAMING_WRITE_CONT;
+ return NETFS_FLUSH_CONTENT;
+
+no_write_streaming:
+ if (finfo) {
+ netfs_stat(&netfs_n_wh_wstream_conflict);
+ return NETFS_FLUSH_CONTENT;
+ }
+ return NETFS_JUST_PREFETCH;
+}
+
+/*
+ * Grab a folio for writing and lock it. Attempt to allocate as large a folio
+ * as possible to hold as much of the remaining length as possible in one go.
+ */
+static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
+ loff_t pos, size_t part)
+{
+ pgoff_t index = pos / PAGE_SIZE;
+ fgf_t fgp_flags = FGP_WRITEBEGIN;
+
+ if (mapping_large_folio_support(mapping))
+ fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part);
+
+ return __filemap_get_folio(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping));
+}
+
+/**
+ * netfs_perform_write - Copy data into the pagecache.
+ * @iocb: The operation parameters
+ * @iter: The source buffer
+ * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ *
+ * Copy data into pagecache pages attached to the inode specified by @iocb.
+ * The caller must hold appropriate inode locks.
+ *
+ * Dirty pages are tagged with a netfs_folio struct if they're not up to date
+ * to indicate the range modified. Dirty pages may also be tagged with a
+ * netfs-specific grouping such that data from an old group gets flushed before
+ * a new one is started.
+ */
+ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
+ struct netfs_group *netfs_group)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ struct netfs_inode *ctx = netfs_inode(inode);
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .for_sync = true,
+ .nr_to_write = LONG_MAX,
+ .range_start = iocb->ki_pos,
+ .range_end = iocb->ki_pos + iter->count,
+ };
+ struct netfs_io_request *wreq = NULL;
+ struct netfs_folio *finfo;
+ struct folio *folio;
+ enum netfs_how_to_modify howto;
+ enum netfs_folio_trace trace;
+ unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
+ ssize_t written = 0, ret;
+ loff_t i_size, pos = iocb->ki_pos, from, to;
+ size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
+ bool maybe_trouble = false;
+
+ if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
+ iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
+ ) {
+ if (pos < i_size_read(inode)) {
+ ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
+ if (ret < 0) {
+ goto out;
+ }
+ }
+
+ wbc_attach_fdatawrite_inode(&wbc, mapping->host);
+
+ wreq = netfs_begin_writethrough(iocb, iter->count);
+ if (IS_ERR(wreq)) {
+ wbc_detach_inode(&wbc);
+ ret = PTR_ERR(wreq);
+ wreq = NULL;
+ goto out;
+ }
+ if (!is_sync_kiocb(iocb))
+ wreq->iocb = iocb;
+ wreq->cleanup = netfs_cleanup_buffered_write;
+ }
+
+ do {
+ size_t flen;
+ size_t offset; /* Offset into pagecache folio */
+ size_t part; /* Bytes to write to folio */
+ size_t copied; /* Bytes copied from user */
+
+ ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
+ if (unlikely(ret < 0))
+ break;
+
+ offset = pos & (max_chunk - 1);
+ part = min(max_chunk - offset, iov_iter_count(iter));
+
+ /* Bring in the user pages that we will copy from _first_ lest
+ * we hit a nasty deadlock on copying from the same page as
+ * we're writing to, without it being marked uptodate.
+ *
+ * Not only is this an optimisation, but it is also required to
+ * check that the address is actually valid, when atomic
+ * usercopies are used below.
+ *
+ * We rely on the page being held onto long enough by the LRU
+ * that we can grab it below if this causes it to be read.
+ */
+ ret = -EFAULT;
+ if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
+ break;
+
+ folio = netfs_grab_folio_for_write(mapping, pos, part);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ break;
+ }
+
+ flen = folio_size(folio);
+ offset = pos & (flen - 1);
+ part = min_t(size_t, flen - offset, part);
+
+ if (signal_pending(current)) {
+ ret = written ? -EINTR : -ERESTARTSYS;
+ goto error_folio_unlock;
+ }
+
+ /* See if we need to prefetch the area we're going to modify.
+ * We need to do this before we get a lock on the folio in case
+ * there's more than one writer competing for the same cache
+ * block.
+ */
+ howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
+ flen, offset, part, maybe_trouble);
+ _debug("howto %u", howto);
+ switch (howto) {
+ case NETFS_JUST_PREFETCH:
+ ret = netfs_prefetch_for_write(file, folio, offset, part);
+ if (ret < 0) {
+ _debug("prefetch = %zd", ret);
+ goto error_folio_unlock;
+ }
+ break;
+ case NETFS_FOLIO_IS_UPTODATE:
+ case NETFS_WHOLE_FOLIO_MODIFY:
+ case NETFS_STREAMING_WRITE_CONT:
+ break;
+ case NETFS_MODIFY_AND_CLEAR:
+ zero_user_segment(&folio->page, 0, offset);
+ break;
+ case NETFS_STREAMING_WRITE:
+ ret = -EIO;
+ if (WARN_ON(folio_get_private(folio)))
+ goto error_folio_unlock;
+ break;
+ case NETFS_FLUSH_CONTENT:
+ trace_netfs_folio(folio, netfs_flush_content);
+ from = folio_pos(folio);
+ to = from + folio_size(folio) - 1;
+ folio_unlock(folio);
+ folio_put(folio);
+ ret = filemap_write_and_wait_range(mapping, from, to);
+ if (ret < 0)
+ goto error_folio_unlock;
+ continue;
+ }
+
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_folio(folio);
+
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+
+ flush_dcache_folio(folio);
+
+ /* Deal with a (partially) failed copy */
+ if (copied == 0) {
+ ret = -EFAULT;
+ goto error_folio_unlock;
+ }
+
+ trace = (enum netfs_folio_trace)howto;
+ switch (howto) {
+ case NETFS_FOLIO_IS_UPTODATE:
+ case NETFS_JUST_PREFETCH:
+ netfs_set_group(folio, netfs_group);
+ break;
+ case NETFS_MODIFY_AND_CLEAR:
+ zero_user_segment(&folio->page, offset + copied, flen);
+ netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ break;
+ case NETFS_WHOLE_FOLIO_MODIFY:
+ if (unlikely(copied < part)) {
+ maybe_trouble = true;
+ iov_iter_revert(iter, copied);
+ copied = 0;
+ goto retry;
+ }
+ netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ break;
+ case NETFS_STREAMING_WRITE:
+ if (offset == 0 && copied == flen) {
+ netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ trace = netfs_streaming_filled_page;
+ break;
+ }
+ finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
+ if (!finfo) {
+ iov_iter_revert(iter, copied);
+ ret = -ENOMEM;
+ goto error_folio_unlock;
+ }
+ finfo->netfs_group = netfs_get_group(netfs_group);
+ finfo->dirty_offset = offset;
+ finfo->dirty_len = copied;
+ folio_attach_private(folio, (void *)((unsigned long)finfo |
+ NETFS_FOLIO_INFO));
+ break;
+ case NETFS_STREAMING_WRITE_CONT:
+ finfo = netfs_folio_info(folio);
+ finfo->dirty_len += copied;
+ if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
+ if (finfo->netfs_group)
+ folio_change_private(folio, finfo->netfs_group);
+ else
+ folio_detach_private(folio);
+ folio_mark_uptodate(folio);
+ kfree(finfo);
+ trace = netfs_streaming_cont_filled_page;
+ }
+ break;
+ default:
+ WARN(true, "Unexpected modify type %u ix=%lx\n",
+ howto, folio->index);
+ ret = -EIO;
+ goto error_folio_unlock;
+ }
+
+ trace_netfs_folio(folio, trace);
+
+ /* Update the inode size if we moved the EOF marker */
+ i_size = i_size_read(inode);
+ pos += copied;
+ if (pos > i_size) {
+ if (ctx->ops->update_i_size) {
+ ctx->ops->update_i_size(inode, pos);
+ } else {
+ i_size_write(inode, pos);
+#if IS_ENABLED(CONFIG_FSCACHE)
+ fscache_update_cookie(ctx->cache, NULL, &pos);
+#endif
+ }
+ }
+ written += copied;
+
+ if (likely(!wreq)) {
+ folio_mark_dirty(folio);
+ } else {
+ if (folio_test_dirty(folio))
+ /* Sigh. mmap. */
+ folio_clear_dirty_for_io(folio);
+ /* We make multiple writes to the folio... */
+ if (!folio_test_writeback(folio)) {
+ folio_wait_fscache(folio);
+ folio_start_writeback(folio);
+ folio_start_fscache(folio);
+ if (wreq->iter.count == 0)
+ trace_netfs_folio(folio, netfs_folio_trace_wthru);
+ else
+ trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
+ }
+ netfs_advance_writethrough(wreq, copied,
+ offset + copied == flen);
+ }
+ retry:
+ folio_unlock(folio);
+ folio_put(folio);
+ folio = NULL;
+
+ cond_resched();
+ } while (iov_iter_count(iter));
+
+out:
+ if (unlikely(wreq)) {
+ ret = netfs_end_writethrough(wreq, iocb);
+ wbc_detach_inode(&wbc);
+ if (ret == -EIOCBQUEUED)
+ return ret;
+ }
+
+ iocb->ki_pos += written;
+ _leave(" = %zd [%zd]", written, ret);
+ return written ? written : ret;
+
+error_folio_unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out;
+}
+EXPORT_SYMBOL(netfs_perform_write);
+
+/**
+ * netfs_buffered_write_iter_locked - write data to a file
+ * @iocb: IO state structure (file, offset, etc.)
+ * @from: iov_iter with data to write
+ * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ *
+ * This function does all the work needed for actually writing data to a
+ * file. It does all basic checks, removes SUID from the file, updates
+ * modification times and calls proper subroutines depending on whether we
+ * do direct IO or a standard buffered write.
+ *
+ * The caller must hold appropriate locks around this function and have called
+ * generic_write_checks() already. The caller is also responsible for doing
+ * any necessary syncing afterwards.
+ *
+ * This function does *not* take care of syncing data in case of O_SYNC write.
+ * A caller has to handle it. This is mainly due to the fact that we want to
+ * avoid syncing under i_rwsem.
+ *
+ * Return:
+ * * number of bytes written, even for truncated writes
+ * * negative error code if no data has been written at all
+ */
+ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
+ struct netfs_group *netfs_group)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t ret;
+
+ trace_netfs_write_iter(iocb, from);
+
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
+
+ ret = file_update_time(file);
+ if (ret)
+ return ret;
+
+ return netfs_perform_write(iocb, from, netfs_group);
+}
+EXPORT_SYMBOL(netfs_buffered_write_iter_locked);
+
+/**
+ * netfs_file_write_iter - write data to a file
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Perform a write to a file, writing into the pagecache if possible and doing
+ * an unbuffered write instead if not.
+ *
+ * Return:
+ * * Negative error code if no data has been written at all of
+ * vfs_fsync_range() failed for a synchronous write
+ * * Number of bytes written, even for truncated writes
+ */
+ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ ssize_t ret;
+
+ _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+
+ if (!iov_iter_count(from))
+ return 0;
+
+ if ((iocb->ki_flags & IOCB_DIRECT) ||
+ test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
+ return netfs_unbuffered_write_iter(iocb, from);
+
+ ret = netfs_start_io_write(inode);
+ if (ret < 0)
+ return ret;
+
+ ret = generic_write_checks(iocb, from);
+ if (ret > 0)
+ ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
+ netfs_end_io_write(inode);
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_file_write_iter);
+
+/*
+ * Notification that a previously read-only page is about to become writable.
+ * Note that the caller indicates a single page of a multipage folio.
+ */
+vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
+{
+ struct folio *folio = page_folio(vmf->page);
+ struct file *file = vmf->vma->vm_file;
+ struct inode *inode = file_inode(file);
+ vm_fault_t ret = VM_FAULT_RETRY;
+ int err;
+
+ _enter("%lx", folio->index);
+
+ sb_start_pagefault(inode->i_sb);
+
+ if (folio_wait_writeback_killable(folio))
+ goto out;
+
+ if (folio_lock_killable(folio) < 0)
+ goto out;
+
+ /* Can we see a streaming write here? */
+ if (WARN_ON(!folio_test_uptodate(folio))) {
+ ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED;
+ goto out;
+ }
+
+ if (netfs_folio_group(folio) != netfs_group) {
+ folio_unlock(folio);
+ err = filemap_fdatawait_range(inode->i_mapping,
+ folio_pos(folio),
+ folio_pos(folio) + folio_size(folio));
+ switch (err) {
+ case 0:
+ ret = VM_FAULT_RETRY;
+ goto out;
+ case -ENOMEM:
+ ret = VM_FAULT_OOM;
+ goto out;
+ default:
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ }
+
+ if (folio_test_dirty(folio))
+ trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
+ else
+ trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
+ netfs_set_group(folio, netfs_group);
+ file_update_time(file);
+ ret = VM_FAULT_LOCKED;
+out:
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_page_mkwrite);
+
+/*
+ * Kill all the pages in the given range
+ */
+static void netfs_kill_pages(struct address_space *mapping,
+ loff_t start, loff_t len)
+{
+ struct folio *folio;
+ pgoff_t index = start / PAGE_SIZE;
+ pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
+
+ _enter("%llx-%llx", start, start + len - 1);
+
+ do {
+ _debug("kill %lx (to %lx)", index, last);
+
+ folio = filemap_get_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ next = index + 1;
+ continue;
+ }
+
+ next = folio_next_index(folio);
+
+ trace_netfs_folio(folio, netfs_folio_trace_kill);
+ folio_clear_uptodate(folio);
+ if (folio_test_fscache(folio))
+ folio_end_fscache(folio);
+ folio_end_writeback(folio);
+ folio_lock(folio);
+ generic_error_remove_folio(mapping, folio);
+ folio_unlock(folio);
+ folio_put(folio);
+
+ } while (index = next, index <= last);
+
+ _leave("");
+}
+
+/*
+ * Redirty all the pages in a given range.
+ */
+static void netfs_redirty_pages(struct address_space *mapping,
+ loff_t start, loff_t len)
+{
+ struct folio *folio;
+ pgoff_t index = start / PAGE_SIZE;
+ pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
+
+ _enter("%llx-%llx", start, start + len - 1);
+
+ do {
+ _debug("redirty %llx @%llx", len, start);
+
+ folio = filemap_get_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ next = index + 1;
+ continue;
+ }
+
+ next = folio_next_index(folio);
+ trace_netfs_folio(folio, netfs_folio_trace_redirty);
+ filemap_dirty_folio(mapping, folio);
+ if (folio_test_fscache(folio))
+ folio_end_fscache(folio);
+ folio_end_writeback(folio);
+ folio_put(folio);
+ } while (index = next, index <= last);
+
+ balance_dirty_pages_ratelimited(mapping);
+
+ _leave("");
+}
+
+/*
+ * Completion of write to server
+ */
+static void netfs_pages_written_back(struct netfs_io_request *wreq)
+{
+ struct address_space *mapping = wreq->mapping;
+ struct netfs_folio *finfo;
+ struct netfs_group *group = NULL;
+ struct folio *folio;
+ pgoff_t last;
+ int gcount = 0;
+
+ XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE);
+
+ _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
+
+ rcu_read_lock();
+
+ last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
+ xas_for_each(&xas, folio, last) {
+ WARN(!folio_test_writeback(folio),
+ "bad %zx @%llx page %lx %lx\n",
+ wreq->len, wreq->start, folio->index, last);
+
+ if ((finfo = netfs_folio_info(folio))) {
+ /* Streaming writes cannot be redirtied whilst under
+ * writeback, so discard the streaming record.
+ */
+ folio_detach_private(folio);
+ group = finfo->netfs_group;
+ gcount++;
+ trace_netfs_folio(folio, netfs_folio_trace_clear_s);
+ kfree(finfo);
+ } else if ((group = netfs_folio_group(folio))) {
+ /* Need to detach the group pointer if the page didn't
+ * get redirtied. If it has been redirtied, then it
+ * must be within the same group.
+ */
+ if (folio_test_dirty(folio)) {
+ trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+ goto end_wb;
+ }
+ if (folio_trylock(folio)) {
+ if (!folio_test_dirty(folio)) {
+ folio_detach_private(folio);
+ gcount++;
+ trace_netfs_folio(folio, netfs_folio_trace_clear_g);
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+ }
+ folio_unlock(folio);
+ goto end_wb;
+ }
+
+ xas_pause(&xas);
+ rcu_read_unlock();
+ folio_lock(folio);
+ if (!folio_test_dirty(folio)) {
+ folio_detach_private(folio);
+ gcount++;
+ trace_netfs_folio(folio, netfs_folio_trace_clear_g);
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+ }
+ folio_unlock(folio);
+ rcu_read_lock();
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_clear);
+ }
+ end_wb:
+ if (folio_test_fscache(folio))
+ folio_end_fscache(folio);
+ xas_advance(&xas, folio_next_index(folio) - 1);
+ folio_end_writeback(folio);
+ }
+
+ rcu_read_unlock();
+ netfs_put_group_many(group, gcount);
+ _leave("");
+}
+
+/*
+ * Deal with the disposition of the folios that are under writeback to close
+ * out the operation.
+ */
+static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq)
+{
+ struct address_space *mapping = wreq->mapping;
+
+ _enter("");
+
+ switch (wreq->error) {
+ case 0:
+ netfs_pages_written_back(wreq);
+ break;
+
+ default:
+ pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error);
+ fallthrough;
+ case -EACCES:
+ case -EPERM:
+ case -ENOKEY:
+ case -EKEYEXPIRED:
+ case -EKEYREJECTED:
+ case -EKEYREVOKED:
+ case -ENETRESET:
+ case -EDQUOT:
+ case -ENOSPC:
+ netfs_redirty_pages(mapping, wreq->start, wreq->len);
+ break;
+
+ case -EROFS:
+ case -EIO:
+ case -EREMOTEIO:
+ case -EFBIG:
+ case -ENOENT:
+ case -ENOMEDIUM:
+ case -ENXIO:
+ netfs_kill_pages(mapping, wreq->start, wreq->len);
+ break;
+ }
+
+ if (wreq->error)
+ mapping_set_error(mapping, wreq->error);
+ if (wreq->netfs_ops->done)
+ wreq->netfs_ops->done(wreq);
+}
+
+/*
+ * Extend the region to be written back to include subsequent contiguously
+ * dirty pages if possible, but don't sleep while doing so.
+ *
+ * If this page holds new content, then we can include filler zeros in the
+ * writeback.
+ */
+static void netfs_extend_writeback(struct address_space *mapping,
+ struct netfs_group *group,
+ struct xa_state *xas,
+ long *_count,
+ loff_t start,
+ loff_t max_len,
+ bool caching,
+ size_t *_len,
+ size_t *_top)
+{
+ struct netfs_folio *finfo;
+ struct folio_batch fbatch;
+ struct folio *folio;
+ unsigned int i;
+ pgoff_t index = (start + *_len) / PAGE_SIZE;
+ size_t len;
+ void *priv;
+ bool stop = true;
+
+ folio_batch_init(&fbatch);
+
+ do {
+ /* Firstly, we gather up a batch of contiguous dirty pages
+ * under the RCU read lock - but we can't clear the dirty flags
+ * there if any of those pages are mapped.
+ */
+ rcu_read_lock();
+
+ xas_for_each(xas, folio, ULONG_MAX) {
+ stop = true;
+ if (xas_retry(xas, folio))
+ continue;
+ if (xa_is_value(folio))
+ break;
+ if (folio->index != index) {
+ xas_reset(xas);
+ break;
+ }
+
+ if (!folio_try_get_rcu(folio)) {
+ xas_reset(xas);
+ continue;
+ }
+
+ /* Has the folio moved or been split? */
+ if (unlikely(folio != xas_reload(xas))) {
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+ if (!folio_test_dirty(folio) ||
+ folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+
+ stop = false;
+ len = folio_size(folio);
+ priv = folio_get_private(folio);
+ if ((const struct netfs_group *)priv != group) {
+ stop = true;
+ finfo = netfs_folio_info(folio);
+ if (finfo->netfs_group != group ||
+ finfo->dirty_offset > 0) {
+ folio_unlock(folio);
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+ len = finfo->dirty_len;
+ }
+
+ *_top += folio_size(folio);
+ index += folio_nr_pages(folio);
+ *_count -= folio_nr_pages(folio);
+ *_len += len;
+ if (*_len >= max_len || *_count <= 0)
+ stop = true;
+
+ if (!folio_batch_add(&fbatch, folio))
+ break;
+ if (stop)
+ break;
+ }
+
+ xas_pause(xas);
+ rcu_read_unlock();
+
+ /* Now, if we obtained any folios, we can shift them to being
+ * writable and mark them for caching.
+ */
+ if (!folio_batch_count(&fbatch))
+ break;
+
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ folio = fbatch.folios[i];
+ trace_netfs_folio(folio, netfs_folio_trace_store_plus);
+
+ if (!folio_clear_dirty_for_io(folio))
+ BUG();
+ folio_start_writeback(folio);
+ netfs_folio_start_fscache(caching, folio);
+ folio_unlock(folio);
+ }
+
+ folio_batch_release(&fbatch);
+ cond_resched();
+ } while (!stop);
+}
+
+/*
+ * Synchronously write back the locked page and any subsequent non-locked dirty
+ * pages.
+ */
+static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct netfs_group *group,
+ struct xa_state *xas,
+ struct folio *folio,
+ unsigned long long start,
+ unsigned long long end)
+{
+ struct netfs_io_request *wreq;
+ struct netfs_folio *finfo;
+ struct netfs_inode *ctx = netfs_inode(mapping->host);
+ unsigned long long i_size = i_size_read(&ctx->inode);
+ size_t len, max_len;
+ bool caching = netfs_is_cache_enabled(ctx);
+ long count = wbc->nr_to_write;
+ int ret;
+
+ _enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching);
+
+ wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
+ NETFS_WRITEBACK);
+ if (IS_ERR(wreq)) {
+ folio_unlock(folio);
+ return PTR_ERR(wreq);
+ }
+
+ if (!folio_clear_dirty_for_io(folio))
+ BUG();
+ folio_start_writeback(folio);
+ netfs_folio_start_fscache(caching, folio);
+
+ count -= folio_nr_pages(folio);
+
+ /* Find all consecutive lockable dirty pages that have contiguous
+ * written regions, stopping when we find a page that is not
+ * immediately lockable, is not dirty or is missing, or we reach the
+ * end of the range.
+ */
+ trace_netfs_folio(folio, netfs_folio_trace_store);
+
+ len = wreq->len;
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ start += finfo->dirty_offset;
+ if (finfo->dirty_offset + finfo->dirty_len != len) {
+ len = finfo->dirty_len;
+ goto cant_expand;
+ }
+ len = finfo->dirty_len;
+ }
+
+ if (start < i_size) {
+ /* Trim the write to the EOF; the extra data is ignored. Also
+ * put an upper limit on the size of a single storedata op.
+ */
+ max_len = 65536 * 4096;
+ max_len = min_t(unsigned long long, max_len, end - start + 1);
+ max_len = min_t(unsigned long long, max_len, i_size - start);
+
+ if (len < max_len)
+ netfs_extend_writeback(mapping, group, xas, &count, start,
+ max_len, caching, &len, &wreq->upper_len);
+ }
+
+cant_expand:
+ len = min_t(unsigned long long, len, i_size - start);
+
+ /* We now have a contiguous set of dirty pages, each with writeback
+ * set; the first page is still locked at this point, but all the rest
+ * have been unlocked.
+ */
+ folio_unlock(folio);
+ wreq->start = start;
+ wreq->len = len;
+
+ if (start < i_size) {
+ _debug("write back %zx @%llx [%llx]", len, start, i_size);
+
+ /* Speculatively write to the cache. We have to fix this up
+ * later if the store fails.
+ */
+ wreq->cleanup = netfs_cleanup_buffered_write;
+
+ iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
+ wreq->upper_len);
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
+ if (ret == 0 || ret == -EIOCBQUEUED)
+ wbc->nr_to_write -= len / PAGE_SIZE;
+ } else {
+ _debug("write discard %zx @%llx [%llx]", len, start, i_size);
+
+ /* The dirty region was entirely beyond the EOF. */
+ fscache_clear_page_bits(mapping, start, len, caching);
+ netfs_pages_written_back(wreq);
+ ret = 0;
+ }
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ _leave(" = 1");
+ return 1;
+}
+
+/*
+ * Write a region of pages back to the server
+ */
+static ssize_t netfs_writepages_begin(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct netfs_group *group,
+ struct xa_state *xas,
+ unsigned long long *_start,
+ unsigned long long end)
+{
+ const struct netfs_folio *finfo;
+ struct folio *folio;
+ unsigned long long start = *_start;
+ ssize_t ret;
+ void *priv;
+ int skips = 0;
+
+ _enter("%llx,%llx,", start, end);
+
+search_again:
+ /* Find the first dirty page in the group. */
+ rcu_read_lock();
+
+ for (;;) {
+ folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
+ if (xas_retry(xas, folio) || xa_is_value(folio))
+ continue;
+ if (!folio)
+ break;
+
+ if (!folio_try_get_rcu(folio)) {
+ xas_reset(xas);
+ continue;
+ }
+
+ if (unlikely(folio != xas_reload(xas))) {
+ folio_put(folio);
+ xas_reset(xas);
+ continue;
+ }
+
+ /* Skip any dirty folio that's not in the group of interest. */
+ priv = folio_get_private(folio);
+ if ((const struct netfs_group *)priv != group) {
+ finfo = netfs_folio_info(folio);
+ if (finfo->netfs_group != group) {
+ folio_put(folio);
+ continue;
+ }
+ }
+
+ xas_pause(xas);
+ break;
+ }
+ rcu_read_unlock();
+ if (!folio)
+ return 0;
+
+ start = folio_pos(folio); /* May regress with THPs */
+
+ _debug("wback %lx", folio->index);
+
+ /* At this point we hold neither the i_pages lock nor the page lock:
+ * the page may be truncated or invalidated (changing page->mapping to
+ * NULL), or even swizzled back from swapper_space to tmpfs file
+ * mapping
+ */
+lock_again:
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ ret = folio_lock_killable(folio);
+ if (ret < 0)
+ return ret;
+ } else {
+ if (!folio_trylock(folio))
+ goto search_again;
+ }
+
+ if (folio->mapping != mapping ||
+ !folio_test_dirty(folio)) {
+ start += folio_size(folio);
+ folio_unlock(folio);
+ goto search_again;
+ }
+
+ if (folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
+ folio_unlock(folio);
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ folio_wait_writeback(folio);
+#ifdef CONFIG_FSCACHE
+ folio_wait_fscache(folio);
+#endif
+ goto lock_again;
+ }
+
+ start += folio_size(folio);
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ if (skips >= 5 || need_resched()) {
+ ret = 0;
+ goto out;
+ }
+ skips++;
+ }
+ goto search_again;
+ }
+
+ ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas,
+ folio, start, end);
+out:
+ if (ret > 0)
+ *_start = start + ret;
+ _leave(" = %zd [%llx]", ret, *_start);
+ return ret;
+}
+
+/*
+ * Write a region of pages back to the server
+ */
+static int netfs_writepages_region(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct netfs_group *group,
+ unsigned long long *_start,
+ unsigned long long end)
+{
+ ssize_t ret;
+
+ XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);
+
+ do {
+ ret = netfs_writepages_begin(mapping, wbc, group, &xas,
+ _start, end);
+ if (ret > 0 && wbc->nr_to_write > 0)
+ cond_resched();
+ } while (ret > 0 && wbc->nr_to_write > 0);
+
+ return ret > 0 ? 0 : ret;
+}
+
+/*
+ * write some of the pending data back to the server
+ */
+int netfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct netfs_group *group = NULL;
+ loff_t start, end;
+ int ret;
+
+ _enter("");
+
+ /* We have to be careful as we can end up racing with setattr()
+ * truncating the pagecache since the caller doesn't take a lock here
+ * to prevent it.
+ */
+
+ if (wbc->range_cyclic && mapping->writeback_index) {
+ start = mapping->writeback_index * PAGE_SIZE;
+ ret = netfs_writepages_region(mapping, wbc, group,
+ &start, LLONG_MAX);
+ if (ret < 0)
+ goto out;
+
+ if (wbc->nr_to_write <= 0) {
+ mapping->writeback_index = start / PAGE_SIZE;
+ goto out;
+ }
+
+ start = 0;
+ end = mapping->writeback_index * PAGE_SIZE;
+ mapping->writeback_index = 0;
+ ret = netfs_writepages_region(mapping, wbc, group, &start, end);
+ if (ret == 0)
+ mapping->writeback_index = start / PAGE_SIZE;
+ } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
+ start = 0;
+ ret = netfs_writepages_region(mapping, wbc, group,
+ &start, LLONG_MAX);
+ if (wbc->nr_to_write > 0 && ret == 0)
+ mapping->writeback_index = start / PAGE_SIZE;
+ } else {
+ start = wbc->range_start;
+ ret = netfs_writepages_region(mapping, wbc, group,
+ &start, wbc->range_end);
+ }
+
+out:
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_writepages);
+
+/*
+ * Deal with the disposition of a laundered folio.
+ */
+static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq)
+{
+ if (wreq->error) {
+ pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error);
+ mapping_set_error(wreq->mapping, wreq->error);
+ }
+}
+
+/**
+ * netfs_launder_folio - Clean up a dirty folio that's being invalidated
+ * @folio: The folio to clean
+ *
+ * This is called to write back a folio that's being invalidated when an inode
+ * is getting torn down. Ideally, writepages would be used instead.
+ */
+int netfs_launder_folio(struct folio *folio)
+{
+ struct netfs_io_request *wreq;
+ struct address_space *mapping = folio->mapping;
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ struct netfs_group *group = netfs_folio_group(folio);
+ struct bio_vec bvec;
+ unsigned long long i_size = i_size_read(mapping->host);
+ unsigned long long start = folio_pos(folio);
+ size_t offset = 0, len;
+ int ret = 0;
+
+ if (finfo) {
+ offset = finfo->dirty_offset;
+ start += offset;
+ len = finfo->dirty_len;
+ } else {
+ len = folio_size(folio);
+ }
+ len = min_t(unsigned long long, len, i_size - start);
+
+ wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE);
+ if (IS_ERR(wreq)) {
+ ret = PTR_ERR(wreq);
+ goto out;
+ }
+
+ if (!folio_clear_dirty_for_io(folio))
+ goto out_put;
+
+ trace_netfs_folio(folio, netfs_folio_trace_launder);
+
+ _debug("launder %llx-%llx", start, start + len - 1);
+
+ /* Speculatively write to the cache. We have to fix this up later if
+ * the store fails.
+ */
+ wreq->cleanup = netfs_cleanup_launder_folio;
+
+ bvec_set_folio(&bvec, folio, len, offset);
+ iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);
+
+out_put:
+ folio_detach_private(folio);
+ netfs_put_group(group);
+ kfree(finfo);
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+out:
+ folio_wait_fscache(folio);
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_launder_folio);
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
new file mode 100644
index 000000000000..ad4370b3935d
--- /dev/null
+++ b/fs/netfs/direct_read.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Direct I/O support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/**
+ * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read
+ * @iocb: The I/O control descriptor describing the read
+ * @iter: The output buffer (also specifies read length)
+ *
+ * Perform an unbuffered I/O or direct I/O from the file in @iocb to the
+ * output buffer. No use is made of the pagecache.
+ *
+ * The caller must hold any appropriate locks.
+ */
+static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct netfs_io_request *rreq;
+ ssize_t ret;
+ size_t orig_count = iov_iter_count(iter);
+ bool async = !is_sync_kiocb(iocb);
+
+ _enter("");
+
+ if (!orig_count)
+ return 0; /* Don't update atime */
+
+ ret = kiocb_write_and_wait(iocb, orig_count);
+ if (ret < 0)
+ return ret;
+ file_accessed(iocb->ki_filp);
+
+ rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
+ iocb->ki_pos, orig_count,
+ NETFS_DIO_READ);
+ if (IS_ERR(rreq))
+ return PTR_ERR(rreq);
+
+ netfs_stat(&netfs_n_rh_dio_read);
+ trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_dio_read);
+
+ /* If this is an async op, we have to keep track of the destination
+ * buffer for ourselves as the caller's iterator will be trashed when
+ * we return.
+ *
+ * In such a case, extract an iterator to represent as much of the the
+ * output buffer as we can manage. Note that the extraction might not
+ * be able to allocate a sufficiently large bvec array and may shorten
+ * the request.
+ */
+ if (user_backed_iter(iter)) {
+ ret = netfs_extract_user_iter(iter, rreq->len, &rreq->iter, 0);
+ if (ret < 0)
+ goto out;
+ rreq->direct_bv = (struct bio_vec *)rreq->iter.bvec;
+ rreq->direct_bv_count = ret;
+ rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
+ rreq->len = iov_iter_count(&rreq->iter);
+ } else {
+ rreq->iter = *iter;
+ rreq->len = orig_count;
+ rreq->direct_bv_unpin = false;
+ iov_iter_advance(iter, orig_count);
+ }
+
+ // TODO: Set up bounce buffer if needed
+
+ if (async)
+ rreq->iocb = iocb;
+
+ ret = netfs_begin_read(rreq, is_sync_kiocb(iocb));
+ if (ret < 0)
+ goto out; /* May be -EIOCBQUEUED */
+ if (!async) {
+ // TODO: Copy from bounce buffer
+ iocb->ki_pos += rreq->transferred;
+ ret = rreq->transferred;
+ }
+
+out:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ if (ret > 0)
+ orig_count -= ret;
+ if (ret != -EIOCBQUEUED)
+ iov_iter_revert(iter, orig_count - iov_iter_count(iter));
+ return ret;
+}
+
+/**
+ * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read
+ * @iocb: The I/O control descriptor describing the read
+ * @iter: The output buffer (also specifies read length)
+ *
+ * Perform an unbuffered I/O or direct I/O from the file in @iocb to the
+ * output buffer. No use is made of the pagecache.
+ */
+ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (!iter->count)
+ return 0; /* Don't update atime */
+
+ ret = netfs_start_io_direct(inode);
+ if (ret == 0) {
+ ret = netfs_unbuffered_read_iter_locked(iocb, iter);
+ netfs_end_io_direct(inode);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(netfs_unbuffered_read_iter);
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
new file mode 100644
index 000000000000..bee047e20f5d
--- /dev/null
+++ b/fs/netfs/direct_write.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Unbuffered and direct write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/uio.h>
+#include "internal.h"
+
+static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
+{
+ struct inode *inode = wreq->inode;
+ unsigned long long end = wreq->start + wreq->len;
+
+ if (!wreq->error &&
+ i_size_read(inode) < end) {
+ if (wreq->netfs_ops->update_i_size)
+ wreq->netfs_ops->update_i_size(inode, end);
+ else
+ i_size_write(inode, end);
+ }
+}
+
+/*
+ * Perform an unbuffered write where we may have to do an RMW operation on an
+ * encrypted file. This can also be used for direct I/O writes.
+ */
+static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
+ struct netfs_group *netfs_group)
+{
+ struct netfs_io_request *wreq;
+ unsigned long long start = iocb->ki_pos;
+ unsigned long long end = start + iov_iter_count(iter);
+ ssize_t ret, n;
+ bool async = !is_sync_kiocb(iocb);
+
+ _enter("");
+
+ /* We're going to need a bounce buffer if what we transmit is going to
+ * be different in some way to the source buffer, e.g. because it gets
+ * encrypted/compressed or because it needs expanding to a block size.
+ */
+ // TODO
+
+ _debug("uw %llx-%llx", start, end);
+
+ wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
+ start, end - start,
+ iocb->ki_flags & IOCB_DIRECT ?
+ NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
+ if (IS_ERR(wreq))
+ return PTR_ERR(wreq);
+
+ {
+ /* If this is an async op and we're not using a bounce buffer,
+ * we have to save the source buffer as the iterator is only
+ * good until we return. In such a case, extract an iterator
+ * to represent as much of the the output buffer as we can
+ * manage. Note that the extraction might not be able to
+ * allocate a sufficiently large bvec array and may shorten the
+ * request.
+ */
+ if (async || user_backed_iter(iter)) {
+ n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0);
+ if (n < 0) {
+ ret = n;
+ goto out;
+ }
+ wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec;
+ wreq->direct_bv_count = n;
+ wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
+ wreq->len = iov_iter_count(&wreq->iter);
+ } else {
+ wreq->iter = *iter;
+ }
+
+ wreq->io_iter = wreq->iter;
+ }
+
+ /* Copy the data into the bounce buffer and encrypt it. */
+ // TODO
+
+ /* Dispatch the write. */
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ if (async)
+ wreq->iocb = iocb;
+ wreq->cleanup = netfs_cleanup_dio_write;
+ ret = netfs_begin_write(wreq, is_sync_kiocb(iocb),
+ iocb->ki_flags & IOCB_DIRECT ?
+ netfs_write_trace_dio_write :
+ netfs_write_trace_unbuffered_write);
+ if (ret < 0) {
+ _debug("begin = %zd", ret);
+ goto out;
+ }
+
+ if (!async) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+
+ ret = wreq->error;
+ _debug("waited = %zd", ret);
+ if (ret == 0) {
+ ret = wreq->transferred;
+ iocb->ki_pos += ret;
+ }
+ } else {
+ ret = -EIOCBQUEUED;
+ }
+
+out:
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ return ret;
+}
+
+/**
+ * netfs_unbuffered_write_iter - Unbuffered write to a file
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Do an unbuffered write to a file, writing the data directly to the server
+ * and not lodging the data in the pagecache.
+ *
+ * Return:
+ * * Negative error code if no data has been written at all of
+ * vfs_fsync_range() failed for a synchronous write
+ * * Number of bytes written, even for truncated writes
+ */
+ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ unsigned long long end;
+ ssize_t ret;
+
+ _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+
+ if (!iov_iter_count(from))
+ return 0;
+
+ trace_netfs_write_iter(iocb, from);
+ netfs_stat(&netfs_n_rh_dio_write);
+
+ ret = netfs_start_io_direct(inode);
+ if (ret < 0)
+ return ret;
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+ ret = file_remove_privs(file);
+ if (ret < 0)
+ goto out;
+ ret = file_update_time(file);
+ if (ret < 0)
+ goto out;
+ ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));
+ if (ret < 0)
+ goto out;
+ end = iocb->ki_pos + iov_iter_count(from);
+ if (end > ictx->zero_point)
+ ictx->zero_point = end;
+
+ fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode),
+ FSCACHE_INVAL_DIO_WRITE);
+ ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL);
+out:
+ netfs_end_io_direct(inode);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_unbuffered_write_iter);
diff --git a/fs/fscache/cache.c b/fs/netfs/fscache_cache.c
index d645f8b302a2..9397ed39b0b4 100644
--- a/fs/fscache/cache.c
+++ b/fs/netfs/fscache_cache.c
@@ -179,13 +179,14 @@ EXPORT_SYMBOL(fscache_acquire_cache);
void fscache_put_cache(struct fscache_cache *cache,
enum fscache_cache_trace where)
{
- unsigned int debug_id = cache->debug_id;
+ unsigned int debug_id;
bool zero;
int ref;
if (IS_ERR_OR_NULL(cache))
return;
+ debug_id = cache->debug_id;
zero = __refcount_dec_and_test(&cache->ref, &ref);
trace_fscache_cache(debug_id, ref - 1, where);
diff --git a/fs/fscache/cookie.c b/fs/netfs/fscache_cookie.c
index bce2492186d0..bce2492186d0 100644
--- a/fs/fscache/cookie.c
+++ b/fs/netfs/fscache_cookie.c
diff --git a/fs/netfs/fscache_internal.h b/fs/netfs/fscache_internal.h
new file mode 100644
index 000000000000..a09b948fcef2
--- /dev/null
+++ b/fs/netfs/fscache_internal.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Internal definitions for FS-Cache
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include "internal.h"
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) "FS-Cache: " fmt
diff --git a/fs/fscache/io.c b/fs/netfs/fscache_io.c
index 0d2b8dec8f82..ad572f7ee897 100644
--- a/fs/fscache/io.c
+++ b/fs/netfs/fscache_io.c
@@ -158,46 +158,6 @@ int __fscache_begin_write_operation(struct netfs_cache_resources *cres,
}
EXPORT_SYMBOL(__fscache_begin_write_operation);
-/**
- * fscache_dirty_folio - Mark folio dirty and pin a cache object for writeback
- * @mapping: The mapping the folio belongs to.
- * @folio: The folio being dirtied.
- * @cookie: The cookie referring to the cache object
- *
- * Set the dirty flag on a folio and pin an in-use cache object in memory
- * so that writeback can later write to it. This is intended
- * to be called from the filesystem's ->dirty_folio() method.
- *
- * Return: true if the dirty flag was set on the folio, false otherwise.
- */
-bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio,
- struct fscache_cookie *cookie)
-{
- struct inode *inode = mapping->host;
- bool need_use = false;
-
- _enter("");
-
- if (!filemap_dirty_folio(mapping, folio))
- return false;
- if (!fscache_cookie_valid(cookie))
- return true;
-
- if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
- spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
- inode->i_state |= I_PINNING_FSCACHE_WB;
- need_use = true;
- }
- spin_unlock(&inode->i_lock);
-
- if (need_use)
- fscache_use_cookie(cookie, true);
- }
- return true;
-}
-EXPORT_SYMBOL(fscache_dirty_folio);
-
struct fscache_write_request {
struct netfs_cache_resources cache_resources;
struct address_space *mapping;
@@ -277,7 +237,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
fscache_access_io_write) < 0)
goto abandon_free;
- ret = cres->ops->prepare_write(cres, &start, &len, i_size, false);
+ ret = cres->ops->prepare_write(cres, &start, &len, len, i_size, false);
if (ret < 0)
goto abandon_end;
diff --git a/fs/fscache/main.c b/fs/netfs/fscache_main.c
index dad85fd84f6f..42e98bb523e3 100644
--- a/fs/fscache/main.c
+++ b/fs/netfs/fscache_main.c
@@ -8,18 +8,9 @@
#define FSCACHE_DEBUG_LEVEL CACHE
#include <linux/module.h>
#include <linux/init.h>
-#define CREATE_TRACE_POINTS
#include "internal.h"
-
-MODULE_DESCRIPTION("FS Cache Manager");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
-
-unsigned fscache_debug;
-module_param_named(debug, fscache_debug, uint,
- S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(fscache_debug,
- "FS-Cache debugging mask");
+#define CREATE_TRACE_POINTS
+#include <trace/events/fscache.h>
EXPORT_TRACEPOINT_SYMBOL(fscache_access_cache);
EXPORT_TRACEPOINT_SYMBOL(fscache_access_volume);
@@ -71,7 +62,7 @@ unsigned int fscache_hash(unsigned int salt, const void *data, size_t len)
/*
* initialise the fs caching module
*/
-static int __init fscache_init(void)
+int __init fscache_init(void)
{
int ret = -ENOMEM;
@@ -92,7 +83,7 @@ static int __init fscache_init(void)
goto error_cookie_jar;
}
- pr_notice("Loaded\n");
+ pr_notice("FS-Cache loaded\n");
return 0;
error_cookie_jar:
@@ -103,19 +94,15 @@ error_wq:
return ret;
}
-fs_initcall(fscache_init);
-
/*
* clean up on module removal
*/
-static void __exit fscache_exit(void)
+void __exit fscache_exit(void)
{
_enter("");
kmem_cache_destroy(fscache_cookie_jar);
fscache_proc_cleanup();
destroy_workqueue(fscache_wq);
- pr_notice("Unloaded\n");
+ pr_notice("FS-Cache unloaded\n");
}
-
-module_exit(fscache_exit);
diff --git a/fs/fscache/proc.c b/fs/netfs/fscache_proc.c
index dc3b0e9c8cce..874d951bc390 100644
--- a/fs/fscache/proc.c
+++ b/fs/netfs/fscache_proc.c
@@ -12,41 +12,34 @@
#include "internal.h"
/*
- * initialise the /proc/fs/fscache/ directory
+ * Add files to /proc/fs/netfs/.
*/
int __init fscache_proc_init(void)
{
- if (!proc_mkdir("fs/fscache", NULL))
- goto error_dir;
+ if (!proc_symlink("fs/fscache", NULL, "netfs"))
+ goto error_sym;
- if (!proc_create_seq("fs/fscache/caches", S_IFREG | 0444, NULL,
+ if (!proc_create_seq("fs/netfs/caches", S_IFREG | 0444, NULL,
&fscache_caches_seq_ops))
goto error;
- if (!proc_create_seq("fs/fscache/volumes", S_IFREG | 0444, NULL,
+ if (!proc_create_seq("fs/netfs/volumes", S_IFREG | 0444, NULL,
&fscache_volumes_seq_ops))
goto error;
- if (!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL,
+ if (!proc_create_seq("fs/netfs/cookies", S_IFREG | 0444, NULL,
&fscache_cookies_seq_ops))
goto error;
-
-#ifdef CONFIG_FSCACHE_STATS
- if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL,
- fscache_stats_show))
- goto error;
-#endif
-
return 0;
error:
remove_proc_entry("fs/fscache", NULL);
-error_dir:
+error_sym:
return -ENOMEM;
}
/*
- * clean up the /proc/fs/fscache/ directory
+ * Clean up the /proc/fs/fscache symlink.
*/
void fscache_proc_cleanup(void)
{
diff --git a/fs/fscache/stats.c b/fs/netfs/fscache_stats.c
index fc94e5e79f1c..add21abdf713 100644
--- a/fs/fscache/stats.c
+++ b/fs/netfs/fscache_stats.c
@@ -48,13 +48,15 @@ atomic_t fscache_n_no_create_space;
EXPORT_SYMBOL(fscache_n_no_create_space);
atomic_t fscache_n_culled;
EXPORT_SYMBOL(fscache_n_culled);
+atomic_t fscache_n_dio_misfit;
+EXPORT_SYMBOL(fscache_n_dio_misfit);
/*
* display the general statistics
*/
-int fscache_stats_show(struct seq_file *m, void *v)
+int fscache_stats_show(struct seq_file *m)
{
- seq_puts(m, "FS-Cache statistics\n");
+ seq_puts(m, "-- FS-Cache statistics --\n");
seq_printf(m, "Cookies: n=%d v=%d vcol=%u voom=%u\n",
atomic_read(&fscache_n_cookies),
atomic_read(&fscache_n_volumes),
@@ -93,10 +95,9 @@ int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_no_create_space),
atomic_read(&fscache_n_culled));
- seq_printf(m, "IO : rd=%u wr=%u\n",
+ seq_printf(m, "IO : rd=%u wr=%u mis=%u\n",
atomic_read(&fscache_n_read),
- atomic_read(&fscache_n_write));
-
- netfs_stats_show(m);
+ atomic_read(&fscache_n_write),
+ atomic_read(&fscache_n_dio_misfit));
return 0;
}
diff --git a/fs/fscache/volume.c b/fs/netfs/fscache_volume.c
index cdf991bdd9de..cdf991bdd9de 100644
--- a/fs/fscache/volume.c
+++ b/fs/netfs/fscache_volume.c
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 43fac1b14e40..ec7045d24400 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -5,9 +5,13 @@
* Written by David Howells (dhowells@redhat.com)
*/
+#include <linux/slab.h>
+#include <linux/seq_file.h>
#include <linux/netfs.h>
#include <linux/fscache.h>
+#include <linux/fscache-cache.h>
#include <trace/events/netfs.h>
+#include <trace/events/fscache.h>
#ifdef pr_fmt
#undef pr_fmt
@@ -19,6 +23,8 @@
* buffered_read.c
*/
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq);
+int netfs_prefetch_for_write(struct file *file, struct folio *folio,
+ size_t offset, size_t len);
/*
* io.c
@@ -29,6 +35,41 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
* main.c
*/
extern unsigned int netfs_debug;
+extern struct list_head netfs_io_requests;
+extern spinlock_t netfs_proc_lock;
+
+#ifdef CONFIG_PROC_FS
+static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq)
+{
+ spin_lock(&netfs_proc_lock);
+ list_add_tail_rcu(&rreq->proc_link, &netfs_io_requests);
+ spin_unlock(&netfs_proc_lock);
+}
+static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq)
+{
+ if (!list_empty(&rreq->proc_link)) {
+ spin_lock(&netfs_proc_lock);
+ list_del_rcu(&rreq->proc_link);
+ spin_unlock(&netfs_proc_lock);
+ }
+}
+#else
+static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) {}
+static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
+#endif
+
+/*
+ * misc.c
+ */
+#define NETFS_FLAG_PUT_MARK BIT(0)
+#define NETFS_FLAG_PAGECACHE_MARK BIT(1)
+int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
+ struct folio *folio, unsigned int flags,
+ gfp_t gfp_mask);
+int netfs_add_folios_to_buffer(struct xarray *buffer,
+ struct address_space *mapping,
+ pgoff_t index, pgoff_t to, gfp_t gfp_mask);
+void netfs_clear_buffer(struct xarray *buffer);
/*
* objects.c
@@ -50,9 +91,20 @@ static inline void netfs_see_request(struct netfs_io_request *rreq,
}
/*
+ * output.c
+ */
+int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
+ enum netfs_write_trace what);
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
+int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end);
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb);
+
+/*
* stats.c
*/
#ifdef CONFIG_NETFS_STATS
+extern atomic_t netfs_n_rh_dio_read;
+extern atomic_t netfs_n_rh_dio_write;
extern atomic_t netfs_n_rh_readahead;
extern atomic_t netfs_n_rh_readpage;
extern atomic_t netfs_n_rh_rreq;
@@ -71,7 +123,15 @@ extern atomic_t netfs_n_rh_write_begin;
extern atomic_t netfs_n_rh_write_done;
extern atomic_t netfs_n_rh_write_failed;
extern atomic_t netfs_n_rh_write_zskip;
+extern atomic_t netfs_n_wh_wstream_conflict;
+extern atomic_t netfs_n_wh_upload;
+extern atomic_t netfs_n_wh_upload_done;
+extern atomic_t netfs_n_wh_upload_failed;
+extern atomic_t netfs_n_wh_write;
+extern atomic_t netfs_n_wh_write_done;
+extern atomic_t netfs_n_wh_write_failed;
+int netfs_stats_show(struct seq_file *m, void *v);
static inline void netfs_stat(atomic_t *stat)
{
@@ -103,6 +163,176 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
#endif
}
+/*
+ * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
+{
+ if (netfs_group)
+ refcount_inc(&netfs_group->ref);
+ return netfs_group;
+}
+
+/*
+ * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline void netfs_put_group(struct netfs_group *netfs_group)
+{
+ if (netfs_group && refcount_dec_and_test(&netfs_group->ref))
+ netfs_group->free(netfs_group);
+}
+
+/*
+ * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
+{
+ if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref))
+ netfs_group->free(netfs_group);
+}
+
+/*
+ * fscache-cache.c
+ */
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_caches_seq_ops;
+#endif
+bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
+void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
+struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache);
+void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where);
+
+static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache)
+{
+ return smp_load_acquire(&cache->state);
+}
+
+static inline bool fscache_cache_is_live(const struct fscache_cache *cache)
+{
+ return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE;
+}
+
+static inline void fscache_set_cache_state(struct fscache_cache *cache,
+ enum fscache_cache_state new_state)
+{
+ smp_store_release(&cache->state, new_state);
+
+}
+
+static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache,
+ enum fscache_cache_state old_state,
+ enum fscache_cache_state new_state)
+{
+ return try_cmpxchg_release(&cache->state, &old_state, new_state);
+}
+
+/*
+ * fscache-cookie.c
+ */
+extern struct kmem_cache *fscache_cookie_jar;
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_cookies_seq_ops;
+#endif
+extern struct timer_list fscache_cookie_lru_timer;
+
+extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
+extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
+ enum fscache_access_trace why);
+
+static inline void fscache_see_cookie(struct fscache_cookie *cookie,
+ enum fscache_cookie_trace where)
+{
+ trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref),
+ where);
+}
+
+/*
+ * fscache-main.c
+ */
+extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len);
+#ifdef CONFIG_FSCACHE
+int __init fscache_init(void);
+void __exit fscache_exit(void);
+#else
+static inline int fscache_init(void) { return 0; }
+static inline void fscache_exit(void) {}
+#endif
+
+/*
+ * fscache-proc.c
+ */
+#ifdef CONFIG_PROC_FS
+extern int __init fscache_proc_init(void);
+extern void fscache_proc_cleanup(void);
+#else
+#define fscache_proc_init() (0)
+#define fscache_proc_cleanup() do {} while (0)
+#endif
+
+/*
+ * fscache-stats.c
+ */
+#ifdef CONFIG_FSCACHE_STATS
+extern atomic_t fscache_n_volumes;
+extern atomic_t fscache_n_volumes_collision;
+extern atomic_t fscache_n_volumes_nomem;
+extern atomic_t fscache_n_cookies;
+extern atomic_t fscache_n_cookies_lru;
+extern atomic_t fscache_n_cookies_lru_expired;
+extern atomic_t fscache_n_cookies_lru_removed;
+extern atomic_t fscache_n_cookies_lru_dropped;
+
+extern atomic_t fscache_n_acquires;
+extern atomic_t fscache_n_acquires_ok;
+extern atomic_t fscache_n_acquires_oom;
+
+extern atomic_t fscache_n_invalidates;
+
+extern atomic_t fscache_n_relinquishes;
+extern atomic_t fscache_n_relinquishes_retire;
+extern atomic_t fscache_n_relinquishes_dropped;
+
+extern atomic_t fscache_n_resizes;
+extern atomic_t fscache_n_resizes_null;
+
+static inline void fscache_stat(atomic_t *stat)
+{
+ atomic_inc(stat);
+}
+
+static inline void fscache_stat_d(atomic_t *stat)
+{
+ atomic_dec(stat);
+}
+
+#define __fscache_stat(stat) (stat)
+
+int fscache_stats_show(struct seq_file *m);
+#else
+
+#define __fscache_stat(stat) (NULL)
+#define fscache_stat(stat) do {} while (0)
+#define fscache_stat_d(stat) do {} while (0)
+
+static inline int fscache_stats_show(struct seq_file *m) { return 0; }
+#endif
+
+/*
+ * fscache-volume.c
+ */
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_volumes_seq_ops;
+#endif
+
+struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where);
+void fscache_put_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where);
+bool fscache_begin_volume_access(struct fscache_volume *volume,
+ struct fscache_cookie *cookie,
+ enum fscache_access_trace why);
+void fscache_create_volume(struct fscache_volume *volume, bool wait);
+
/*****************************************************************************/
/*
* debug tracing
@@ -143,3 +373,57 @@ do { \
#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
#endif
+
+/*
+ * assertions
+ */
+#if 1 /* defined(__KDEBUGALL) */
+
+#define ASSERT(X) \
+do { \
+ if (unlikely(!(X))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTCMP(X, OP, Y) \
+do { \
+ if (unlikely(!((X) OP (Y)))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTIF(C, X) \
+do { \
+ if (unlikely((C) && !(X))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTIFCMP(C, X, OP, Y) \
+do { \
+ if (unlikely((C) && !((X) OP (Y)))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while (0)
+
+#else
+
+#define ASSERT(X) do {} while (0)
+#define ASSERTCMP(X, OP, Y) do {} while (0)
+#define ASSERTIF(C, X) do {} while (0)
+#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
+
+#endif /* assert or not */
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 7f753380e047..4261ad6c55b6 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -21,12 +21,7 @@
*/
static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
- struct iov_iter iter;
-
- iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages,
- subreq->start + subreq->transferred,
- subreq->len - subreq->transferred);
- iov_iter_zero(iov_iter_count(&iter), &iter);
+ iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
}
static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
@@ -46,14 +41,9 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq,
enum netfs_read_from_hole read_hole)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct iov_iter iter;
netfs_stat(&netfs_n_rh_read);
- iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages,
- subreq->start + subreq->transferred,
- subreq->len - subreq->transferred);
-
- cres->ops->read(cres, subreq->start, &iter, read_hole,
+ cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole,
netfs_cache_read_terminated, subreq);
}
@@ -88,6 +78,13 @@ static void netfs_read_from_server(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
{
netfs_stat(&netfs_n_rh_download);
+
+ if (rreq->origin != NETFS_DIO_READ &&
+ iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
+ pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n",
+ rreq->debug_id, subreq->debug_index,
+ iov_iter_count(&subreq->io_iter), subreq->len,
+ subreq->transferred, subreq->flags);
rreq->netfs_ops->issue_read(subreq);
}
@@ -127,9 +124,10 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
/* We might have multiple writes from the same huge
* folio, but we mustn't unlock a folio more than once.
*/
- if (have_unlocked && folio_index(folio) <= unlocked)
+ if (have_unlocked && folio->index <= unlocked)
continue;
- unlocked = folio_index(folio);
+ unlocked = folio_next_index(folio) - 1;
+ trace_netfs_folio(folio, netfs_folio_trace_end_copy);
folio_end_fscache(folio);
have_unlocked = true;
}
@@ -201,7 +199,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
}
ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
- rreq->i_size, true);
+ subreq->len, rreq->i_size, true);
if (ret < 0) {
trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
@@ -260,6 +258,30 @@ static void netfs_rreq_short_read(struct netfs_io_request *rreq,
}
/*
+ * Reset the subrequest iterator prior to resubmission.
+ */
+static void netfs_reset_subreq_iter(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ size_t remaining = subreq->len - subreq->transferred;
+ size_t count = iov_iter_count(&subreq->io_iter);
+
+ if (count == remaining)
+ return;
+
+ _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n",
+ rreq->debug_id, subreq->debug_index,
+ iov_iter_count(&subreq->io_iter), subreq->transferred,
+ subreq->len, rreq->i_size,
+ subreq->io_iter.iter_type);
+
+ if (count < remaining)
+ iov_iter_revert(&subreq->io_iter, remaining - count);
+ else
+ iov_iter_advance(&subreq->io_iter, count - remaining);
+}
+
+/*
* Resubmit any short or failed operations. Returns true if we got the rreq
* ref back.
*/
@@ -287,6 +309,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq)
trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
atomic_inc(&rreq->nr_outstanding);
+ netfs_reset_subreq_iter(rreq, subreq);
netfs_read_from_server(rreq, subreq);
} else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) {
netfs_rreq_short_read(rreq, subreq);
@@ -321,6 +344,43 @@ static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq)
}
/*
+ * Determine how much we can admit to having read from a DIO read.
+ */
+static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ unsigned int i;
+ size_t transferred = 0;
+
+ for (i = 0; i < rreq->direct_bv_count; i++)
+ flush_dcache_page(rreq->direct_bv[i].bv_page);
+
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ if (subreq->error || subreq->transferred == 0)
+ break;
+ transferred += subreq->transferred;
+ if (subreq->transferred < subreq->len)
+ break;
+ }
+
+ for (i = 0; i < rreq->direct_bv_count; i++)
+ flush_dcache_page(rreq->direct_bv[i].bv_page);
+
+ rreq->transferred = transferred;
+ task_io_account_read(transferred);
+
+ if (rreq->iocb) {
+ rreq->iocb->ki_pos += transferred;
+ if (rreq->iocb->ki_complete)
+ rreq->iocb->ki_complete(
+ rreq->iocb, rreq->error ? rreq->error : transferred);
+ }
+ if (rreq->netfs_ops->done)
+ rreq->netfs_ops->done(rreq);
+ inode_dio_end(rreq->inode);
+}
+
+/*
* Assess the state of a read request and decide what to do next.
*
* Note that we could be in an ordinary kernel thread, on a workqueue or in
@@ -340,8 +400,12 @@ again:
return;
}
- netfs_rreq_unlock_folios(rreq);
+ if (rreq->origin != NETFS_DIO_READ)
+ netfs_rreq_unlock_folios(rreq);
+ else
+ netfs_rreq_assess_dio(rreq);
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
@@ -399,9 +463,9 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq = subreq->rreq;
int u;
- _enter("[%u]{%llx,%lx},%zd",
- subreq->debug_index, subreq->start, subreq->flags,
- transferred_or_error);
+ _enter("R=%x[%x]{%llx,%lx},%zd",
+ rreq->debug_id, subreq->debug_index,
+ subreq->start, subreq->flags, transferred_or_error);
switch (subreq->source) {
case NETFS_READ_FROM_CACHE:
@@ -501,15 +565,20 @@ static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest
*/
static enum netfs_io_source
netfs_rreq_prepare_read(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq)
+ struct netfs_io_subrequest *subreq,
+ struct iov_iter *io_iter)
{
- enum netfs_io_source source;
+ enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER;
+ struct netfs_inode *ictx = netfs_inode(rreq->inode);
+ size_t lsize;
_enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
- source = netfs_cache_prepare_read(subreq, rreq->i_size);
- if (source == NETFS_INVALID_READ)
- goto out;
+ if (rreq->origin != NETFS_DIO_READ) {
+ source = netfs_cache_prepare_read(subreq, rreq->i_size);
+ if (source == NETFS_INVALID_READ)
+ goto out;
+ }
if (source == NETFS_DOWNLOAD_FROM_SERVER) {
/* Call out to the netfs to let it shrink the request to fit
@@ -518,19 +587,52 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
* to make serial calls, it can indicate a short read and then
* we will call it again.
*/
+ if (rreq->origin != NETFS_DIO_READ) {
+ if (subreq->start >= ictx->zero_point) {
+ source = NETFS_FILL_WITH_ZEROES;
+ goto set;
+ }
+ if (subreq->len > ictx->zero_point - subreq->start)
+ subreq->len = ictx->zero_point - subreq->start;
+ }
if (subreq->len > rreq->i_size - subreq->start)
subreq->len = rreq->i_size - subreq->start;
+ if (rreq->rsize && subreq->len > rreq->rsize)
+ subreq->len = rreq->rsize;
if (rreq->netfs_ops->clamp_length &&
!rreq->netfs_ops->clamp_length(subreq)) {
source = NETFS_INVALID_READ;
goto out;
}
+
+ if (subreq->max_nr_segs) {
+ lsize = netfs_limit_iter(io_iter, 0, subreq->len,
+ subreq->max_nr_segs);
+ if (subreq->len > lsize) {
+ subreq->len = lsize;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
+ }
+ }
}
- if (WARN_ON(subreq->len == 0))
+set:
+ if (subreq->len > rreq->len)
+ pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n",
+ rreq->debug_id, subreq->debug_index,
+ subreq->len, rreq->len);
+
+ if (WARN_ON(subreq->len == 0)) {
source = NETFS_INVALID_READ;
+ goto out;
+ }
+ subreq->source = source;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+
+ subreq->io_iter = *io_iter;
+ iov_iter_truncate(&subreq->io_iter, subreq->len);
+ iov_iter_advance(io_iter, subreq->len);
out:
subreq->source = source;
trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
@@ -541,6 +643,7 @@ out:
* Slice off a piece of a read request and submit an I/O request for it.
*/
static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
+ struct iov_iter *io_iter,
unsigned int *_debug_index)
{
struct netfs_io_subrequest *subreq;
@@ -552,7 +655,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
subreq->debug_index = (*_debug_index)++;
subreq->start = rreq->start + rreq->submitted;
- subreq->len = rreq->len - rreq->submitted;
+ subreq->len = io_iter->count;
_debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
@@ -565,7 +668,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
* (the starts must coincide), in which case, we go around the loop
* again and ask it to download the next piece.
*/
- source = netfs_rreq_prepare_read(rreq, subreq);
+ source = netfs_rreq_prepare_read(rreq, subreq, io_iter);
if (source == NETFS_INVALID_READ)
goto subreq_failed;
@@ -603,6 +706,7 @@ subreq_failed:
*/
int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
{
+ struct iov_iter io_iter;
unsigned int debug_index = 0;
int ret;
@@ -611,50 +715,73 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
if (rreq->len == 0) {
pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
- netfs_put_request(rreq, false, netfs_rreq_trace_put_zero_len);
return -EIO;
}
- INIT_WORK(&rreq->work, netfs_rreq_work);
+ if (rreq->origin == NETFS_DIO_READ)
+ inode_dio_begin(rreq->inode);
- if (sync)
- netfs_get_request(rreq, netfs_rreq_trace_get_hold);
+ // TODO: Use bounce buffer if requested
+ rreq->io_iter = rreq->iter;
+
+ INIT_WORK(&rreq->work, netfs_rreq_work);
/* Chop the read into slices according to what the cache and the netfs
* want and submit each one.
*/
+ netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding);
atomic_set(&rreq->nr_outstanding, 1);
+ io_iter = rreq->io_iter;
do {
- if (!netfs_rreq_submit_slice(rreq, &debug_index))
+ _debug("submit %llx + %zx >= %llx",
+ rreq->start, rreq->submitted, rreq->i_size);
+ if (rreq->origin == NETFS_DIO_READ &&
+ rreq->start + rreq->submitted >= rreq->i_size)
+ break;
+ if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index))
+ break;
+ if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
+ test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
break;
} while (rreq->submitted < rreq->len);
+ if (!rreq->submitted) {
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
+ if (rreq->origin == NETFS_DIO_READ)
+ inode_dio_end(rreq->inode);
+ ret = 0;
+ goto out;
+ }
+
if (sync) {
- /* Keep nr_outstanding incremented so that the ref always belongs to
- * us, and the service code isn't punted off to a random thread pool to
- * process.
+ /* Keep nr_outstanding incremented so that the ref always
+ * belongs to us, and the service code isn't punted off to a
+ * random thread pool to process. Note that this might start
+ * further work, such as writing to the cache.
*/
- for (;;) {
- wait_var_event(&rreq->nr_outstanding,
- atomic_read(&rreq->nr_outstanding) == 1);
+ wait_var_event(&rreq->nr_outstanding,
+ atomic_read(&rreq->nr_outstanding) == 1);
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_assess(rreq, false);
- if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
- break;
- cond_resched();
- }
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
+ wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
ret = rreq->error;
- if (ret == 0 && rreq->submitted < rreq->len) {
+ if (ret == 0 && rreq->submitted < rreq->len &&
+ rreq->origin != NETFS_DIO_READ) {
trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
ret = -EIO;
}
- netfs_put_request(rreq, false, netfs_rreq_trace_put_hold);
} else {
/* If we decrement nr_outstanding to 0, the ref belongs to us. */
if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_assess(rreq, false);
- ret = 0;
+ ret = -EIOCBQUEUED;
}
+
+out:
return ret;
}
diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c
index 2ff07ba655a0..b781bbbf1d8d 100644
--- a/fs/netfs/iterator.c
+++ b/fs/netfs/iterator.c
@@ -101,3 +101,100 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
return npages;
}
EXPORT_SYMBOL_GPL(netfs_extract_user_iter);
+
+/*
+ * Select the span of a bvec iterator we're going to use. Limit it by both maximum
+ * size and maximum number of segments. Returns the size of the span in bytes.
+ */
+static size_t netfs_limit_bvec(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ const struct bio_vec *bvecs = iter->bvec;
+ unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0;
+ size_t len, span = 0, n = iter->count;
+ size_t skip = iter->iov_offset + start_offset;
+
+ if (WARN_ON(!iov_iter_is_bvec(iter)) ||
+ WARN_ON(start_offset > n) ||
+ n == 0)
+ return 0;
+
+ while (n && ix < nbv && skip) {
+ len = bvecs[ix].bv_len;
+ if (skip < len)
+ break;
+ skip -= len;
+ n -= len;
+ ix++;
+ }
+
+ while (n && ix < nbv) {
+ len = min3(n, bvecs[ix].bv_len - skip, max_size);
+ span += len;
+ nsegs++;
+ ix++;
+ if (span >= max_size || nsegs >= max_segs)
+ break;
+ skip = 0;
+ n -= len;
+ }
+
+ return min(span, max_size);
+}
+
+/*
+ * Select the span of an xarray iterator we're going to use. Limit it by both
+ * maximum size and maximum number of segments. It is assumed that segments
+ * can be larger than a page in size, provided they're physically contiguous.
+ * Returns the size of the span in bytes.
+ */
+static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ struct folio *folio;
+ unsigned int nsegs = 0;
+ loff_t pos = iter->xarray_start + iter->iov_offset;
+ pgoff_t index = pos / PAGE_SIZE;
+ size_t span = 0, n = iter->count;
+
+ XA_STATE(xas, iter->xarray, index);
+
+ if (WARN_ON(!iov_iter_is_xarray(iter)) ||
+ WARN_ON(start_offset > n) ||
+ n == 0)
+ return 0;
+ max_size = min(max_size, n - start_offset);
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, ULONG_MAX) {
+ size_t offset, flen, len;
+ if (xas_retry(&xas, folio))
+ continue;
+ if (WARN_ON(xa_is_value(folio)))
+ break;
+ if (WARN_ON(folio_test_hugetlb(folio)))
+ break;
+
+ flen = folio_size(folio);
+ offset = offset_in_folio(folio, pos);
+ len = min(max_size, flen - offset);
+ span += len;
+ nsegs++;
+ if (span >= max_size || nsegs >= max_segs)
+ break;
+ }
+
+ rcu_read_unlock();
+ return min(span, max_size);
+}
+
+size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ if (iov_iter_is_bvec(iter))
+ return netfs_limit_bvec(iter, start_offset, max_size, max_segs);
+ if (iov_iter_is_xarray(iter))
+ return netfs_limit_xarray(iter, start_offset, max_size, max_segs);
+ BUG();
+}
+EXPORT_SYMBOL(netfs_limit_iter);
diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c
new file mode 100644
index 000000000000..75dc52a49b3a
--- /dev/null
+++ b/fs/netfs/locking.c
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * I/O and data path helper functionality.
+ *
+ * Borrowed from NFS Copyright (c) 2016 Trond Myklebust
+ */
+
+#include <linux/kernel.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/*
+ * inode_dio_wait_interruptible - wait for outstanding DIO requests to finish
+ * @inode: inode to wait for
+ *
+ * Waits for all pending direct I/O requests to finish so that we can
+ * proceed with a truncate or equivalent operation.
+ *
+ * Must be called under a lock that serializes taking new references
+ * to i_dio_count, usually by inode->i_mutex.
+ */
+static int inode_dio_wait_interruptible(struct inode *inode)
+{
+ if (!atomic_read(&inode->i_dio_count))
+ return 0;
+
+ wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
+ DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
+
+ for (;;) {
+ prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE);
+ if (!atomic_read(&inode->i_dio_count))
+ break;
+ if (signal_pending(current))
+ break;
+ schedule();
+ }
+ finish_wait(wq, &q.wq_entry);
+
+ return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0;
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static int netfs_block_o_direct(struct netfs_inode *ictx)
+{
+ if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags))
+ return 0;
+ clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+ return inode_dio_wait_interruptible(&ictx->inode);
+}
+
+/**
+ * netfs_start_io_read - declare the file is being used for buffered reads
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * NETFS_ICTX_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+int netfs_start_io_read(struct inode *inode)
+ __acquires(inode->i_rwsem)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+
+ /* Be an optimist! */
+ if (down_read_interruptible(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) == 0)
+ return 0;
+ up_read(&inode->i_rwsem);
+
+ /* Slow path.... */
+ if (down_write_killable(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (netfs_block_o_direct(ictx) < 0) {
+ up_write(&inode->i_rwsem);
+ return -ERESTARTSYS;
+ }
+ downgrade_write(&inode->i_rwsem);
+ return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_read);
+
+/**
+ * netfs_end_io_read - declare that the buffered read operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_read(struct inode *inode)
+ __releases(inode->i_rwsem)
+{
+ up_read(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_read);
+
+/**
+ * netfs_start_io_write - declare the file is being used for buffered writes
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+int netfs_start_io_write(struct inode *inode)
+ __acquires(inode->i_rwsem)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+
+ if (down_write_killable(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (netfs_block_o_direct(ictx) < 0) {
+ up_write(&inode->i_rwsem);
+ return -ERESTARTSYS;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_write);
+
+/**
+ * netfs_end_io_write - declare that the buffered write operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_write(struct inode *inode)
+ __releases(inode->i_rwsem)
+{
+ up_write(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_write);
+
+/* Call with exclusively locked inode->i_rwsem */
+static int netfs_block_buffered(struct inode *inode)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+ int ret;
+
+ if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) {
+ set_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+ if (inode->i_mapping->nrpages != 0) {
+ unmap_mapping_range(inode->i_mapping, 0, 0, 0);
+ ret = filemap_fdatawait(inode->i_mapping);
+ if (ret < 0) {
+ clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+/**
+ * netfs_start_io_direct - declare the file is being used for direct i/o
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * NETFS_ICTX_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+int netfs_start_io_direct(struct inode *inode)
+ __acquires(inode->i_rwsem)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+ int ret;
+
+ /* Be an optimist! */
+ if (down_read_interruptible(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) != 0)
+ return 0;
+ up_read(&inode->i_rwsem);
+
+ /* Slow path.... */
+ if (down_write_killable(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ ret = netfs_block_buffered(inode);
+ if (ret < 0) {
+ up_write(&inode->i_rwsem);
+ return ret;
+ }
+ downgrade_write(&inode->i_rwsem);
+ return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_direct);
+
+/**
+ * netfs_end_io_direct - declare that the direct i/o operation is done
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_direct(struct inode *inode)
+ __releases(inode->i_rwsem)
+{
+ up_read(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_direct);
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 068568702957..5e77618a7940 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -7,6 +7,8 @@
#include <linux/module.h>
#include <linux/export.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/netfs.h>
@@ -15,6 +17,113 @@ MODULE_DESCRIPTION("Network fs support");
MODULE_AUTHOR("Red Hat, Inc.");
MODULE_LICENSE("GPL");
+EXPORT_TRACEPOINT_SYMBOL(netfs_sreq);
+
unsigned netfs_debug;
module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
+
+#ifdef CONFIG_PROC_FS
+LIST_HEAD(netfs_io_requests);
+DEFINE_SPINLOCK(netfs_proc_lock);
+
+static const char *netfs_origins[nr__netfs_io_origin] = {
+ [NETFS_READAHEAD] = "RA",
+ [NETFS_READPAGE] = "RP",
+ [NETFS_READ_FOR_WRITE] = "RW",
+ [NETFS_WRITEBACK] = "WB",
+ [NETFS_WRITETHROUGH] = "WT",
+ [NETFS_LAUNDER_WRITE] = "LW",
+ [NETFS_UNBUFFERED_WRITE] = "UW",
+ [NETFS_DIO_READ] = "DR",
+ [NETFS_DIO_WRITE] = "DW",
+};
+
+/*
+ * Generate a list of I/O requests in /proc/fs/netfs/requests
+ */
+static int netfs_requests_seq_show(struct seq_file *m, void *v)
+{
+ struct netfs_io_request *rreq;
+
+ if (v == &netfs_io_requests) {
+ seq_puts(m,
+ "REQUEST OR REF FL ERR OPS COVERAGE\n"
+ "======== == === == ==== === =========\n"
+ );
+ return 0;
+ }
+
+ rreq = list_entry(v, struct netfs_io_request, proc_link);
+ seq_printf(m,
+ "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx",
+ rreq->debug_id,
+ netfs_origins[rreq->origin],
+ refcount_read(&rreq->ref),
+ rreq->flags,
+ rreq->error,
+ atomic_read(&rreq->nr_outstanding),
+ rreq->start, rreq->submitted, rreq->len);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static void *netfs_requests_seq_start(struct seq_file *m, loff_t *_pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ return seq_list_start_head(&netfs_io_requests, *_pos);
+}
+
+static void *netfs_requests_seq_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+ return seq_list_next(v, &netfs_io_requests, _pos);
+}
+
+static void netfs_requests_seq_stop(struct seq_file *m, void *v)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+static const struct seq_operations netfs_requests_seq_ops = {
+ .start = netfs_requests_seq_start,
+ .next = netfs_requests_seq_next,
+ .stop = netfs_requests_seq_stop,
+ .show = netfs_requests_seq_show,
+};
+#endif /* CONFIG_PROC_FS */
+
+static int __init netfs_init(void)
+{
+ int ret = -ENOMEM;
+
+ if (!proc_mkdir("fs/netfs", NULL))
+ goto error;
+ if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL,
+ &netfs_requests_seq_ops))
+ goto error_proc;
+#ifdef CONFIG_FSCACHE_STATS
+ if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,
+ netfs_stats_show))
+ goto error_proc;
+#endif
+
+ ret = fscache_init();
+ if (ret < 0)
+ goto error_proc;
+ return 0;
+
+error_proc:
+ remove_proc_entry("fs/netfs", NULL);
+error:
+ return ret;
+}
+fs_initcall(netfs_init);
+
+static void __exit netfs_exit(void)
+{
+ fscache_exit();
+ remove_proc_entry("fs/netfs", NULL);
+}
+module_exit(netfs_exit);
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
new file mode 100644
index 000000000000..90051ced8e2a
--- /dev/null
+++ b/fs/netfs/misc.c
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Miscellaneous routines.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/swap.h>
+#include "internal.h"
+
+/*
+ * Attach a folio to the buffer and maybe set marks on it to say that we need
+ * to put the folio later and twiddle the pagecache flags.
+ */
+int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
+ struct folio *folio, unsigned int flags,
+ gfp_t gfp_mask)
+{
+ XA_STATE_ORDER(xas, xa, index, folio_order(folio));
+
+retry:
+ xas_lock(&xas);
+ for (;;) {
+ xas_store(&xas, folio);
+ if (!xas_error(&xas))
+ break;
+ xas_unlock(&xas);
+ if (!xas_nomem(&xas, gfp_mask))
+ return xas_error(&xas);
+ goto retry;
+ }
+
+ if (flags & NETFS_FLAG_PUT_MARK)
+ xas_set_mark(&xas, NETFS_BUF_PUT_MARK);
+ if (flags & NETFS_FLAG_PAGECACHE_MARK)
+ xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK);
+ xas_unlock(&xas);
+ return xas_error(&xas);
+}
+
+/*
+ * Create the specified range of folios in the buffer attached to the read
+ * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that
+ * these need freeing later.
+ */
+int netfs_add_folios_to_buffer(struct xarray *buffer,
+ struct address_space *mapping,
+ pgoff_t index, pgoff_t to, gfp_t gfp_mask)
+{
+ struct folio *folio;
+ int ret;
+
+ if (to + 1 == index) /* Page range is inclusive */
+ return 0;
+
+ do {
+ /* TODO: Figure out what order folio can be allocated here */
+ folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0);
+ if (!folio)
+ return -ENOMEM;
+ folio->index = index;
+ ret = netfs_xa_store_and_mark(buffer, index, folio,
+ NETFS_FLAG_PUT_MARK, gfp_mask);
+ if (ret < 0) {
+ folio_put(folio);
+ return ret;
+ }
+
+ index += folio_nr_pages(folio);
+ } while (index <= to && index != 0);
+
+ return 0;
+}
+
+/*
+ * Clear an xarray buffer, putting a ref on the folios that have
+ * NETFS_BUF_PUT_MARK set.
+ */
+void netfs_clear_buffer(struct xarray *buffer)
+{
+ struct folio *folio;
+ XA_STATE(xas, buffer, 0);
+
+ rcu_read_lock();
+ xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) {
+ folio_put(folio);
+ }
+ rcu_read_unlock();
+ xa_destroy(buffer);
+}
+
+/**
+ * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback
+ * @mapping: The mapping the folio belongs to.
+ * @folio: The folio being dirtied.
+ *
+ * Set the dirty flag on a folio and pin an in-use cache object in memory so
+ * that writeback can later write to it. This is intended to be called from
+ * the filesystem's ->dirty_folio() method.
+ *
+ * Return: true if the dirty flag was set on the folio, false otherwise.
+ */
+bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ struct fscache_cookie *cookie = netfs_i_cookie(ictx);
+ bool need_use = false;
+
+ _enter("");
+
+ if (!filemap_dirty_folio(mapping, folio))
+ return false;
+ if (!fscache_cookie_valid(cookie))
+ return true;
+
+ if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+ inode->i_state |= I_PINNING_NETFS_WB;
+ need_use = true;
+ }
+ spin_unlock(&inode->i_lock);
+
+ if (need_use)
+ fscache_use_cookie(cookie, true);
+ }
+ return true;
+}
+EXPORT_SYMBOL(netfs_dirty_folio);
+
+/**
+ * netfs_unpin_writeback - Unpin writeback resources
+ * @inode: The inode on which the cookie resides
+ * @wbc: The writeback control
+ *
+ * Unpin the writeback resources pinned by netfs_dirty_folio(). This is
+ * intended to be called as/by the netfs's ->write_inode() method.
+ */
+int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc)
+{
+ struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
+
+ if (wbc->unpinned_netfs_wb)
+ fscache_unuse_cookie(cookie, NULL, NULL);
+ return 0;
+}
+EXPORT_SYMBOL(netfs_unpin_writeback);
+
+/**
+ * netfs_clear_inode_writeback - Clear writeback resources pinned by an inode
+ * @inode: The inode to clean up
+ * @aux: Auxiliary data to apply to the inode
+ *
+ * Clear any writeback resources held by an inode when the inode is evicted.
+ * This must be called before clear_inode() is called.
+ */
+void netfs_clear_inode_writeback(struct inode *inode, const void *aux)
+{
+ struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
+
+ if (inode->i_state & I_PINNING_NETFS_WB) {
+ loff_t i_size = i_size_read(inode);
+ fscache_unuse_cookie(cookie, aux, &i_size);
+ }
+}
+EXPORT_SYMBOL(netfs_clear_inode_writeback);
+
+/**
+ * netfs_invalidate_folio - Invalidate or partially invalidate a folio
+ * @folio: Folio proposed for release
+ * @offset: Offset of the invalidated region
+ * @length: Length of the invalidated region
+ *
+ * Invalidate part or all of a folio for a network filesystem. The folio will
+ * be removed afterwards if the invalidated region covers the entire folio.
+ */
+void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
+{
+ struct netfs_folio *finfo = NULL;
+ size_t flen = folio_size(folio);
+
+ _enter("{%lx},%zx,%zx", folio->index, offset, length);
+
+ folio_wait_fscache(folio);
+
+ if (!folio_test_private(folio))
+ return;
+
+ finfo = netfs_folio_info(folio);
+
+ if (offset == 0 && length >= flen)
+ goto erase_completely;
+
+ if (finfo) {
+ /* We have a partially uptodate page from a streaming write. */
+ unsigned int fstart = finfo->dirty_offset;
+ unsigned int fend = fstart + finfo->dirty_len;
+ unsigned int end = offset + length;
+
+ if (offset >= fend)
+ return;
+ if (end <= fstart)
+ return;
+ if (offset <= fstart && end >= fend)
+ goto erase_completely;
+ if (offset <= fstart && end > fstart)
+ goto reduce_len;
+ if (offset > fstart && end >= fend)
+ goto move_start;
+ /* A partial write was split. The caller has already zeroed
+ * it, so just absorb the hole.
+ */
+ }
+ return;
+
+erase_completely:
+ netfs_put_group(netfs_folio_group(folio));
+ folio_detach_private(folio);
+ folio_clear_uptodate(folio);
+ kfree(finfo);
+ return;
+reduce_len:
+ finfo->dirty_len = offset + length - finfo->dirty_offset;
+ return;
+move_start:
+ finfo->dirty_len -= offset - finfo->dirty_offset;
+ finfo->dirty_offset = offset;
+}
+EXPORT_SYMBOL(netfs_invalidate_folio);
+
+/**
+ * netfs_release_folio - Try to release a folio
+ * @folio: Folio proposed for release
+ * @gfp: Flags qualifying the release
+ *
+ * Request release of a folio and clean up its private state if it's not busy.
+ * Returns true if the folio can now be released, false if not
+ */
+bool netfs_release_folio(struct folio *folio, gfp_t gfp)
+{
+ struct netfs_inode *ctx = netfs_inode(folio_inode(folio));
+ unsigned long long end;
+
+ end = folio_pos(folio) + folio_size(folio);
+ if (end > ctx->zero_point)
+ ctx->zero_point = end;
+
+ if (folio_test_private(folio))
+ return false;
+ if (folio_test_fscache(folio)) {
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
+ return false;
+ folio_wait_fscache(folio);
+ }
+
+ fscache_note_page_release(netfs_i_cookie(ctx));
+ return true;
+}
+EXPORT_SYMBOL(netfs_release_folio);
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index e17cdf53f6a7..610ceb5bd86c 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -20,14 +20,20 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
struct inode *inode = file ? file_inode(file) : mapping->host;
struct netfs_inode *ctx = netfs_inode(inode);
struct netfs_io_request *rreq;
+ bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE ||
+ origin == NETFS_DIO_READ ||
+ origin == NETFS_DIO_WRITE);
+ bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx);
int ret;
- rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
+ rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request),
+ GFP_KERNEL);
if (!rreq)
return ERR_PTR(-ENOMEM);
rreq->start = start;
rreq->len = len;
+ rreq->upper_len = len;
rreq->origin = origin;
rreq->netfs_ops = ctx->ops;
rreq->mapping = mapping;
@@ -35,8 +41,14 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
rreq->i_size = i_size_read(inode);
rreq->debug_id = atomic_inc_return(&debug_ids);
INIT_LIST_HEAD(&rreq->subrequests);
+ INIT_WORK(&rreq->work, NULL);
refcount_set(&rreq->ref, 1);
+
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+ if (cached)
+ __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
+ if (file && file->f_flags & O_NONBLOCK)
+ __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
if (rreq->netfs_ops->init_request) {
ret = rreq->netfs_ops->init_request(rreq, file);
if (ret < 0) {
@@ -45,6 +57,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
}
}
+ trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new);
+ netfs_proc_add_rreq(rreq);
netfs_stat(&netfs_n_rh_rreq);
return rreq;
}
@@ -74,33 +88,47 @@ static void netfs_free_request(struct work_struct *work)
{
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
+ unsigned int i;
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+ netfs_proc_del_rreq(rreq);
netfs_clear_subrequests(rreq, false);
if (rreq->netfs_ops->free_request)
rreq->netfs_ops->free_request(rreq);
if (rreq->cache_resources.ops)
rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
- kfree(rreq);
+ if (rreq->direct_bv) {
+ for (i = 0; i < rreq->direct_bv_count; i++) {
+ if (rreq->direct_bv[i].bv_page) {
+ if (rreq->direct_bv_unpin)
+ unpin_user_page(rreq->direct_bv[i].bv_page);
+ }
+ }
+ kvfree(rreq->direct_bv);
+ }
+ kfree_rcu(rreq, rcu);
netfs_stat_d(&netfs_n_rh_rreq);
}
void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
enum netfs_rreq_ref_trace what)
{
- unsigned int debug_id = rreq->debug_id;
+ unsigned int debug_id;
bool dead;
int r;
- dead = __refcount_dec_and_test(&rreq->ref, &r);
- trace_netfs_rreq_ref(debug_id, r - 1, what);
- if (dead) {
- if (was_async) {
- rreq->work.func = netfs_free_request;
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
- } else {
- netfs_free_request(&rreq->work);
+ if (rreq) {
+ debug_id = rreq->debug_id;
+ dead = __refcount_dec_and_test(&rreq->ref, &r);
+ trace_netfs_rreq_ref(debug_id, r - 1, what);
+ if (dead) {
+ if (was_async) {
+ rreq->work.func = netfs_free_request;
+ if (!queue_work(system_unbound_wq, &rreq->work))
+ BUG();
+ } else {
+ netfs_free_request(&rreq->work);
+ }
}
}
}
@@ -112,8 +140,11 @@ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq
{
struct netfs_io_subrequest *subreq;
- subreq = kzalloc(sizeof(struct netfs_io_subrequest), GFP_KERNEL);
+ subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?:
+ sizeof(struct netfs_io_subrequest),
+ GFP_KERNEL);
if (subreq) {
+ INIT_WORK(&subreq->work, NULL);
INIT_LIST_HEAD(&subreq->rreq_link);
refcount_set(&subreq->ref, 2);
subreq->rreq = rreq;
@@ -140,6 +171,8 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq = subreq->rreq;
trace_netfs_sreq(subreq, netfs_sreq_trace_free);
+ if (rreq->netfs_ops->free_subrequest)
+ rreq->netfs_ops->free_subrequest(subreq);
kfree(subreq);
netfs_stat_d(&netfs_n_rh_sreq);
netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq);
diff --git a/fs/netfs/output.c b/fs/netfs/output.c
new file mode 100644
index 000000000000..625eb68f3e5a
--- /dev/null
+++ b/fs/netfs/output.c
@@ -0,0 +1,478 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+/**
+ * netfs_create_write_request - Create a write operation.
+ * @wreq: The write request this is storing from.
+ * @dest: The destination type
+ * @start: Start of the region this write will modify
+ * @len: Length of the modification
+ * @worker: The worker function to handle the write(s)
+ *
+ * Allocate a write operation, set it up and add it to the list on a write
+ * request.
+ */
+struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *wreq,
+ enum netfs_io_source dest,
+ loff_t start, size_t len,
+ work_func_t worker)
+{
+ struct netfs_io_subrequest *subreq;
+
+ subreq = netfs_alloc_subrequest(wreq);
+ if (subreq) {
+ INIT_WORK(&subreq->work, worker);
+ subreq->source = dest;
+ subreq->start = start;
+ subreq->len = len;
+ subreq->debug_index = wreq->subreq_counter++;
+
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write);
+ break;
+ default:
+ BUG();
+ }
+
+ subreq->io_iter = wreq->io_iter;
+ iov_iter_advance(&subreq->io_iter, subreq->start - wreq->start);
+ iov_iter_truncate(&subreq->io_iter, subreq->len);
+
+ trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+ refcount_read(&subreq->ref),
+ netfs_sreq_trace_new);
+ atomic_inc(&wreq->nr_outstanding);
+ list_add_tail(&subreq->rreq_link, &wreq->subrequests);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+ }
+
+ return subreq;
+}
+EXPORT_SYMBOL(netfs_create_write_request);
+
+/*
+ * Process a completed write request once all the component operations have
+ * been completed.
+ */
+static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async)
+{
+ struct netfs_io_subrequest *subreq;
+ struct netfs_inode *ctx = netfs_inode(wreq->inode);
+ size_t transferred = 0;
+
+ _enter("R=%x[]", wreq->debug_id);
+
+ trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
+
+ list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
+ if (subreq->error || subreq->transferred == 0)
+ break;
+ transferred += subreq->transferred;
+ if (subreq->transferred < subreq->len)
+ break;
+ }
+ wreq->transferred = transferred;
+
+ list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
+ if (!subreq->error)
+ continue;
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ /* Depending on the type of failure, this may prevent
+ * writeback completion unless we're in disconnected
+ * mode.
+ */
+ if (!wreq->error)
+ wreq->error = subreq->error;
+ break;
+
+ case NETFS_WRITE_TO_CACHE:
+ /* Failure doesn't prevent writeback completion unless
+ * we're in disconnected mode.
+ */
+ if (subreq->error != -ENOBUFS)
+ ctx->ops->invalidate_cache(wreq);
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ if (!wreq->error)
+ wreq->error = -EIO;
+ return;
+ }
+ }
+
+ wreq->cleanup(wreq);
+
+ if (wreq->origin == NETFS_DIO_WRITE &&
+ wreq->mapping->nrpages) {
+ pgoff_t first = wreq->start >> PAGE_SHIFT;
+ pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
+ invalidate_inode_pages2_range(wreq->mapping, first, last);
+ }
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_end(wreq->inode);
+
+ _debug("finished");
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
+ clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+ wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+ if (wreq->iocb) {
+ wreq->iocb->ki_pos += transferred;
+ if (wreq->iocb->ki_complete)
+ wreq->iocb->ki_complete(
+ wreq->iocb, wreq->error ? wreq->error : transferred);
+ }
+
+ netfs_clear_subrequests(wreq, was_async);
+ netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete);
+}
+
+/*
+ * Deal with the completion of writing the data to the cache.
+ */
+void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_io_subrequest *subreq = _op;
+ struct netfs_io_request *wreq = subreq->rreq;
+ unsigned int u;
+
+ _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
+
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_done);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_done);
+ break;
+ case NETFS_INVALID_WRITE:
+ break;
+ default:
+ BUG();
+ }
+
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ subreq->error = transferred_or_error;
+ trace_netfs_failure(wreq, subreq, transferred_or_error,
+ netfs_fail_write);
+ goto failed;
+ }
+
+ if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+ "Subreq excess write: R%x[%x] %zd > %zu - %zu",
+ wreq->debug_id, subreq->debug_index,
+ transferred_or_error, subreq->len, subreq->transferred))
+ transferred_or_error = subreq->len - subreq->transferred;
+
+ subreq->error = 0;
+ subreq->transferred += transferred_or_error;
+
+ if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
+ pr_warn("R=%08x[%u] ITER POST-MISMATCH %zx != %zx-%zx %x\n",
+ wreq->debug_id, subreq->debug_index,
+ iov_iter_count(&subreq->io_iter), subreq->len,
+ subreq->transferred, subreq->io_iter.iter_type);
+
+ if (subreq->transferred < subreq->len)
+ goto incomplete;
+
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+out:
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+ /* If we decrement nr_outstanding to 0, the ref belongs to us. */
+ u = atomic_dec_return(&wreq->nr_outstanding);
+ if (u == 0)
+ netfs_write_terminated(wreq, was_async);
+ else if (u == 1)
+ wake_up_var(&wreq->nr_outstanding);
+
+ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+ return;
+
+incomplete:
+ if (transferred_or_error == 0) {
+ if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
+ subreq->error = -ENODATA;
+ goto failed;
+ }
+ } else {
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ }
+
+ __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
+ goto out;
+
+failed:
+ switch (subreq->source) {
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_failed);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
+ break;
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_failed);
+ set_bit(NETFS_RREQ_FAILED, &wreq->flags);
+ wreq->error = subreq->error;
+ break;
+ default:
+ break;
+ }
+ goto out;
+}
+EXPORT_SYMBOL(netfs_write_subrequest_terminated);
+
+static void netfs_write_to_cache_op(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *wreq = subreq->rreq;
+ struct netfs_cache_resources *cres = &wreq->cache_resources;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+
+ cres->ops->write(cres, subreq->start, &subreq->io_iter,
+ netfs_write_subrequest_terminated, subreq);
+}
+
+static void netfs_write_to_cache_op_worker(struct work_struct *work)
+{
+ struct netfs_io_subrequest *subreq =
+ container_of(work, struct netfs_io_subrequest, work);
+
+ netfs_write_to_cache_op(subreq);
+}
+
+/**
+ * netfs_queue_write_request - Queue a write request for attention
+ * @subreq: The write request to be queued
+ *
+ * Queue the specified write request for processing by a worker thread. We
+ * pass the caller's ref on the request to the worker thread.
+ */
+void netfs_queue_write_request(struct netfs_io_subrequest *subreq)
+{
+ if (!queue_work(system_unbound_wq, &subreq->work))
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_wip);
+}
+EXPORT_SYMBOL(netfs_queue_write_request);
+
+/*
+ * Set up a op for writing to the cache.
+ */
+static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq)
+{
+ struct netfs_cache_resources *cres = &wreq->cache_resources;
+ struct netfs_io_subrequest *subreq;
+ struct netfs_inode *ctx = netfs_inode(wreq->inode);
+ struct fscache_cookie *cookie = netfs_i_cookie(ctx);
+ loff_t start = wreq->start;
+ size_t len = wreq->len;
+ int ret;
+
+ if (!fscache_cookie_enabled(cookie)) {
+ clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags);
+ return;
+ }
+
+ _debug("write to cache");
+ ret = fscache_begin_write_operation(cres, cookie);
+ if (ret < 0)
+ return;
+
+ ret = cres->ops->prepare_write(cres, &start, &len, wreq->upper_len,
+ i_size_read(wreq->inode), true);
+ if (ret < 0)
+ return;
+
+ subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len,
+ netfs_write_to_cache_op_worker);
+ if (!subreq)
+ return;
+
+ netfs_write_to_cache_op(subreq);
+}
+
+/*
+ * Begin the process of writing out a chunk of data.
+ *
+ * We are given a write request that holds a series of dirty regions and
+ * (partially) covers a sequence of folios, all of which are present. The
+ * pages must have been marked as writeback as appropriate.
+ *
+ * We need to perform the following steps:
+ *
+ * (1) If encrypting, create an output buffer and encrypt each block of the
+ * data into it, otherwise the output buffer will point to the original
+ * folios.
+ *
+ * (2) If the data is to be cached, set up a write op for the entire output
+ * buffer to the cache, if the cache wants to accept it.
+ *
+ * (3) If the data is to be uploaded (ie. not merely cached):
+ *
+ * (a) If the data is to be compressed, create a compression buffer and
+ * compress the data into it.
+ *
+ * (b) For each destination we want to upload to, set up write ops to write
+ * to that destination. We may need multiple writes if the data is not
+ * contiguous or the span exceeds wsize for a server.
+ */
+int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
+ enum netfs_write_trace what)
+{
+ struct netfs_inode *ctx = netfs_inode(wreq->inode);
+
+ _enter("R=%x %llx-%llx f=%lx",
+ wreq->debug_id, wreq->start, wreq->start + wreq->len - 1,
+ wreq->flags);
+
+ trace_netfs_write(wreq, what);
+ if (wreq->len == 0 || wreq->iter.count == 0) {
+ pr_err("Zero-sized write [R=%x]\n", wreq->debug_id);
+ return -EIO;
+ }
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_begin(wreq->inode);
+
+ wreq->io_iter = wreq->iter;
+
+ /* ->outstanding > 0 carries a ref */
+ netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
+ atomic_set(&wreq->nr_outstanding, 1);
+
+ /* Start the encryption/compression going. We can do that in the
+ * background whilst we generate a list of write ops that we want to
+ * perform.
+ */
+ // TODO: Encrypt or compress the region as appropriate
+
+ /* We need to write all of the region to the cache */
+ if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
+ netfs_set_up_write_to_cache(wreq);
+
+ /* However, we don't necessarily write all of the region to the server.
+ * Caching of reads is being managed this way also.
+ */
+ if (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
+ ctx->ops->create_write_requests(wreq, wreq->start, wreq->len);
+
+ if (atomic_dec_and_test(&wreq->nr_outstanding))
+ netfs_write_terminated(wreq, false);
+
+ if (!may_wait)
+ return -EIOCBQUEUED;
+
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+ return wreq->error;
+}
+
+/*
+ * Begin a write operation for writing through the pagecache.
+ */
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
+{
+ struct netfs_io_request *wreq;
+ struct file *file = iocb->ki_filp;
+
+ wreq = netfs_alloc_request(file->f_mapping, file, iocb->ki_pos, len,
+ NETFS_WRITETHROUGH);
+ if (IS_ERR(wreq))
+ return wreq;
+
+ trace_netfs_write(wreq, netfs_write_trace_writethrough);
+
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ iov_iter_xarray(&wreq->iter, ITER_SOURCE, &wreq->mapping->i_pages, wreq->start, 0);
+ wreq->io_iter = wreq->iter;
+
+ /* ->outstanding > 0 carries a ref */
+ netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
+ atomic_set(&wreq->nr_outstanding, 1);
+ return wreq;
+}
+
+static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final)
+{
+ struct netfs_inode *ictx = netfs_inode(wreq->inode);
+ unsigned long long start;
+ size_t len;
+
+ if (!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
+ return;
+
+ start = wreq->start + wreq->submitted;
+ len = wreq->iter.count - wreq->submitted;
+ if (!final) {
+ len /= wreq->wsize; /* Round to number of maximum packets */
+ len *= wreq->wsize;
+ }
+
+ ictx->ops->create_write_requests(wreq, start, len);
+ wreq->submitted += len;
+}
+
+/*
+ * Advance the state of the write operation used when writing through the
+ * pagecache. Data has been copied into the pagecache that we need to append
+ * to the request. If we've added more than wsize then we need to create a new
+ * subrequest.
+ */
+int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end)
+{
+ _enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u",
+ wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end);
+
+ wreq->iter.count += copied;
+ wreq->io_iter.count += copied;
+ if (to_page_end && wreq->io_iter.count - wreq->submitted >= wreq->wsize)
+ netfs_submit_writethrough(wreq, false);
+
+ return wreq->error;
+}
+
+/*
+ * End a write operation used when writing through the pagecache.
+ */
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb)
+{
+ int ret = -EIOCBQUEUED;
+
+ _enter("ic=%zu sb=%zu ws=%u",
+ wreq->iter.count, wreq->submitted, wreq->wsize);
+
+ if (wreq->submitted < wreq->io_iter.count)
+ netfs_submit_writethrough(wreq, true);
+
+ if (atomic_dec_and_test(&wreq->nr_outstanding))
+ netfs_write_terminated(wreq, false);
+
+ if (is_sync_kiocb(iocb)) {
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+ ret = wreq->error;
+ }
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ return ret;
+}
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index 5510a7a14a40..deeba9f9dcf5 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -9,6 +9,8 @@
#include <linux/seq_file.h>
#include "internal.h"
+atomic_t netfs_n_rh_dio_read;
+atomic_t netfs_n_rh_dio_write;
atomic_t netfs_n_rh_readahead;
atomic_t netfs_n_rh_readpage;
atomic_t netfs_n_rh_rreq;
@@ -27,32 +29,48 @@ atomic_t netfs_n_rh_write_begin;
atomic_t netfs_n_rh_write_done;
atomic_t netfs_n_rh_write_failed;
atomic_t netfs_n_rh_write_zskip;
+atomic_t netfs_n_wh_wstream_conflict;
+atomic_t netfs_n_wh_upload;
+atomic_t netfs_n_wh_upload_done;
+atomic_t netfs_n_wh_upload_failed;
+atomic_t netfs_n_wh_write;
+atomic_t netfs_n_wh_write_done;
+atomic_t netfs_n_wh_write_failed;
-void netfs_stats_show(struct seq_file *m)
+int netfs_stats_show(struct seq_file *m, void *v)
{
- seq_printf(m, "RdHelp : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n",
+ seq_printf(m, "Netfs : DR=%u DW=%u RA=%u RP=%u WB=%u WBZ=%u\n",
+ atomic_read(&netfs_n_rh_dio_read),
+ atomic_read(&netfs_n_rh_dio_write),
atomic_read(&netfs_n_rh_readahead),
atomic_read(&netfs_n_rh_readpage),
atomic_read(&netfs_n_rh_write_begin),
- atomic_read(&netfs_n_rh_write_zskip),
- atomic_read(&netfs_n_rh_rreq),
- atomic_read(&netfs_n_rh_sreq));
- seq_printf(m, "RdHelp : ZR=%u sh=%u sk=%u\n",
+ atomic_read(&netfs_n_rh_write_zskip));
+ seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n",
atomic_read(&netfs_n_rh_zero),
atomic_read(&netfs_n_rh_short_read),
atomic_read(&netfs_n_rh_write_zskip));
- seq_printf(m, "RdHelp : DL=%u ds=%u df=%u di=%u\n",
+ seq_printf(m, "Netfs : DL=%u ds=%u df=%u di=%u\n",
atomic_read(&netfs_n_rh_download),
atomic_read(&netfs_n_rh_download_done),
atomic_read(&netfs_n_rh_download_failed),
atomic_read(&netfs_n_rh_download_instead));
- seq_printf(m, "RdHelp : RD=%u rs=%u rf=%u\n",
+ seq_printf(m, "Netfs : RD=%u rs=%u rf=%u\n",
atomic_read(&netfs_n_rh_read),
atomic_read(&netfs_n_rh_read_done),
atomic_read(&netfs_n_rh_read_failed));
- seq_printf(m, "RdHelp : WR=%u ws=%u wf=%u\n",
- atomic_read(&netfs_n_rh_write),
- atomic_read(&netfs_n_rh_write_done),
- atomic_read(&netfs_n_rh_write_failed));
+ seq_printf(m, "Netfs : UL=%u us=%u uf=%u\n",
+ atomic_read(&netfs_n_wh_upload),
+ atomic_read(&netfs_n_wh_upload_done),
+ atomic_read(&netfs_n_wh_upload_failed));
+ seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n",
+ atomic_read(&netfs_n_wh_write),
+ atomic_read(&netfs_n_wh_write_done),
+ atomic_read(&netfs_n_wh_write_failed));
+ seq_printf(m, "Netfs : rr=%u sr=%u wsc=%u\n",
+ atomic_read(&netfs_n_rh_rreq),
+ atomic_read(&netfs_n_rh_sreq),
+ atomic_read(&netfs_n_wh_wstream_conflict));
+ return fscache_stats_show(m);
}
EXPORT_SYMBOL(netfs_stats_show);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 01ac733a6320..f7e32d76e34d 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -169,8 +169,8 @@ config ROOT_NFS
config NFS_FSCACHE
bool "Provide NFS client caching support"
- depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
- select NETFS_SUPPORT
+ depends on NFS_FS=m && NETFS_SUPPORT || NFS_FS=y && NETFS_SUPPORT=y
+ select FSCACHE
help
Say Y here if you want NFS data to be cached locally on disc through
the general filesystem cache manager
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index b4294a8aa2d4..f1eeb4914199 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -108,7 +108,7 @@ struct pnfs_block_dev {
struct pnfs_block_dev *children;
u64 chunk_size;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
u64 disk_offset;
u64 pr_key;
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index c97ebc42ec0f..93ef7f864980 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -25,17 +25,17 @@ bl_free_device(struct pnfs_block_dev *dev)
} else {
if (dev->pr_registered) {
const struct pr_ops *ops =
- dev->bdev_handle->bdev->bd_disk->fops->pr_ops;
+ file_bdev(dev->bdev_file)->bd_disk->fops->pr_ops;
int error;
- error = ops->pr_register(dev->bdev_handle->bdev,
+ error = ops->pr_register(file_bdev(dev->bdev_file),
dev->pr_key, 0, false);
if (error)
pr_err("failed to unregister PR key.\n");
}
- if (dev->bdev_handle)
- bdev_release(dev->bdev_handle);
+ if (dev->bdev_file)
+ fput(dev->bdev_file);
}
}
@@ -169,7 +169,7 @@ static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
map->start = dev->start;
map->len = dev->len;
map->disk_offset = dev->disk_offset;
- map->bdev = dev->bdev_handle->bdev;
+ map->bdev = file_bdev(dev->bdev_file);
return true;
}
@@ -236,26 +236,26 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
dev_t dev;
dev = bl_resolve_deviceid(server, v, gfp_mask);
if (!dev)
return -EIO;
- bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ bdev_file = bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
NULL, NULL);
- if (IS_ERR(bdev_handle)) {
+ if (IS_ERR(bdev_file)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
- MAJOR(dev), MINOR(dev), PTR_ERR(bdev_handle));
- return PTR_ERR(bdev_handle);
+ MAJOR(dev), MINOR(dev), PTR_ERR(bdev_file));
+ return PTR_ERR(bdev_file);
}
- d->bdev_handle = bdev_handle;
- d->len = bdev_nr_bytes(bdev_handle->bdev);
+ d->bdev_file = bdev_file;
+ d->len = bdev_nr_bytes(file_bdev(bdev_file));
d->map = bl_map_simple;
printk(KERN_INFO "pNFS: using block device %s\n",
- bdev_handle->bdev->bd_disk->disk_name);
+ file_bdev(bdev_file)->bd_disk->disk_name);
return 0;
}
@@ -300,10 +300,10 @@ bl_validate_designator(struct pnfs_block_volume *v)
}
}
-static struct bdev_handle *
+static struct file *
bl_open_path(struct pnfs_block_volume *v, const char *prefix)
{
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
const char *devname;
devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN",
@@ -311,15 +311,15 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix)
if (!devname)
return ERR_PTR(-ENOMEM);
- bdev_handle = bdev_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ bdev_file = bdev_file_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE,
NULL, NULL);
- if (IS_ERR(bdev_handle)) {
+ if (IS_ERR(bdev_file)) {
pr_warn("pNFS: failed to open device %s (%ld)\n",
- devname, PTR_ERR(bdev_handle));
+ devname, PTR_ERR(bdev_file));
}
kfree(devname);
- return bdev_handle;
+ return bdev_file;
}
static int
@@ -327,7 +327,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
const struct pr_ops *ops;
int error;
@@ -340,14 +340,14 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
* On other distributions like Debian, the default SCSI by-id path will
* point to the dm-multipath device if one exists.
*/
- bdev_handle = bl_open_path(v, "dm-uuid-mpath-0x");
- if (IS_ERR(bdev_handle))
- bdev_handle = bl_open_path(v, "wwn-0x");
- if (IS_ERR(bdev_handle))
- return PTR_ERR(bdev_handle);
- d->bdev_handle = bdev_handle;
-
- d->len = bdev_nr_bytes(d->bdev_handle->bdev);
+ bdev_file = bl_open_path(v, "dm-uuid-mpath-0x");
+ if (IS_ERR(bdev_file))
+ bdev_file = bl_open_path(v, "wwn-0x");
+ if (IS_ERR(bdev_file))
+ return PTR_ERR(bdev_file);
+ d->bdev_file = bdev_file;
+
+ d->len = bdev_nr_bytes(file_bdev(d->bdev_file));
d->map = bl_map_simple;
d->pr_key = v->scsi.pr_key;
@@ -355,20 +355,20 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return -ENODEV;
pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
- d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key);
+ file_bdev(d->bdev_file)->bd_disk->disk_name, d->pr_key);
- ops = d->bdev_handle->bdev->bd_disk->fops->pr_ops;
+ ops = file_bdev(d->bdev_file)->bd_disk->fops->pr_ops;
if (!ops) {
pr_err("pNFS: block device %s does not support reservations.",
- d->bdev_handle->bdev->bd_disk->disk_name);
+ file_bdev(d->bdev_file)->bd_disk->disk_name);
error = -EINVAL;
goto out_blkdev_put;
}
- error = ops->pr_register(d->bdev_handle->bdev, 0, d->pr_key, true);
+ error = ops->pr_register(file_bdev(d->bdev_file), 0, d->pr_key, true);
if (error) {
pr_err("pNFS: failed to register key for block device %s.",
- d->bdev_handle->bdev->bd_disk->disk_name);
+ file_bdev(d->bdev_file)->bd_disk->disk_name);
goto out_blkdev_put;
}
@@ -376,7 +376,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return 0;
out_blkdev_put:
- bdev_release(d->bdev_handle);
+ fput(d->bdev_file);
return error;
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 44eca51b2808..fbdc9ca80f71 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -246,7 +246,7 @@ void nfs_free_client(struct nfs_client *clp)
put_nfs_version(clp->cl_nfs_mod);
kfree(clp->cl_hostname);
kfree(clp->cl_acceptor);
- kfree(clp);
+ kfree_rcu(clp, rcu);
}
EXPORT_SYMBOL_GPL(nfs_free_client);
@@ -1006,6 +1006,14 @@ struct nfs_server *nfs_alloc_server(void)
}
EXPORT_SYMBOL_GPL(nfs_alloc_server);
+static void delayed_free(struct rcu_head *p)
+{
+ struct nfs_server *server = container_of(p, struct nfs_server, rcu);
+
+ nfs_free_iostats(server->io_stats);
+ kfree(server);
+}
+
/*
* Free up a server record
*/
@@ -1031,10 +1039,9 @@ void nfs_free_server(struct nfs_server *server)
ida_destroy(&server->lockowner_id);
ida_destroy(&server->openowner_id);
- nfs_free_iostats(server->io_stats);
put_cred(server->cred);
- kfree(server);
nfs_release_automount_timer();
+ call_rcu(&server->rcu, delayed_free);
}
EXPORT_SYMBOL_GPL(nfs_free_server);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index fa1a14def45c..d4a42ce0c7e3 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -156,8 +156,8 @@ static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_state
list = &flctx->flc_posix;
spin_lock(&flctx->flc_lock);
restart:
- list_for_each_entry(fl, list, fl_list) {
- if (nfs_file_open_context(fl->fl_file)->state != state)
+ for_each_file_lock(fl, list) {
+ if (nfs_file_open_context(fl->c.flc_file)->state != state)
continue;
spin_unlock(&flctx->flc_lock);
status = nfs4_lock_delegation_recall(fl, state, stateid);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index c8ecbe999059..ac505671efbd 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1431,9 +1431,9 @@ static bool nfs_verifier_is_delegated(struct dentry *dentry)
static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf)
{
struct inode *inode = d_inode(dentry);
- struct inode *dir = d_inode(dentry->d_parent);
+ struct inode *dir = d_inode_rcu(dentry->d_parent);
- if (!nfs_verify_change_attribute(dir, verf))
+ if (!dir || !nfs_verify_change_attribute(dir, verf))
return;
if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
nfs_set_verifier_delegated(&verf);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8577ccf621f5..407c6e15afe2 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -720,15 +720,15 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
{
struct inode *inode = filp->f_mapping->host;
int status = 0;
- unsigned int saved_type = fl->fl_type;
+ unsigned int saved_type = fl->c.flc_type;
/* Try local locking first */
posix_test_lock(filp, fl);
- if (fl->fl_type != F_UNLCK) {
+ if (fl->c.flc_type != F_UNLCK) {
/* found a conflict */
goto out;
}
- fl->fl_type = saved_type;
+ fl->c.flc_type = saved_type;
if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
goto out_noconflict;
@@ -740,7 +740,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
out:
return status;
out_noconflict:
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
goto out;
}
@@ -765,7 +765,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
* If we're signalled while cleaning up locks on process exit, we
* still need to complete the unlock.
*/
- if (status < 0 && !(fl->fl_flags & FL_CLOSE))
+ if (status < 0 && !(fl->c.flc_flags & FL_CLOSE))
return status;
}
@@ -832,12 +832,12 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
int is_local = 0;
dprintk("NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld)\n",
- filp, fl->fl_type, fl->fl_flags,
+ filp, fl->c.flc_type, fl->c.flc_flags,
(long long)fl->fl_start, (long long)fl->fl_end);
nfs_inc_stats(inode, NFSIOS_VFSLOCK);
- if (fl->fl_flags & FL_RECLAIM)
+ if (fl->c.flc_flags & FL_RECLAIM)
return -ENOGRACE;
if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
@@ -851,7 +851,7 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
if (IS_GETLK(cmd))
ret = do_getlk(filp, cmd, fl, is_local);
- else if (fl->fl_type == F_UNLCK)
+ else if (lock_is_unlock(fl))
ret = do_unlk(filp, cmd, fl, is_local);
else
ret = do_setlk(filp, cmd, fl, is_local);
@@ -869,16 +869,16 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
int is_local = 0;
dprintk("NFS: flock(%pD2, t=%x, fl=%x)\n",
- filp, fl->fl_type, fl->fl_flags);
+ filp, fl->c.flc_type, fl->c.flc_flags);
- if (!(fl->fl_flags & FL_FLOCK))
+ if (!(fl->c.flc_flags & FL_FLOCK))
return -ENOLCK;
if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
is_local = 1;
/* We're simulating flock() locks using posix locks on the server */
- if (fl->fl_type == F_UNLCK)
+ if (lock_is_unlock(fl))
return do_unlk(filp, cmd, fl, is_local);
return do_setlk(filp, cmd, fl, is_local);
}
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index b05717fe0d4e..2d1bfee225c3 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -274,12 +274,6 @@ static void nfs_netfs_free_request(struct netfs_io_request *rreq)
put_nfs_open_context(rreq->netfs_priv);
}
-static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq)
-{
- return fscache_begin_read_operation(&rreq->cache_resources,
- netfs_i_cookie(netfs_inode(rreq->inode)));
-}
-
static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq)
{
struct nfs_netfs_io_data *netfs;
@@ -387,7 +381,6 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr)
const struct netfs_request_ops nfs_netfs_ops = {
.init_request = nfs_netfs_init_request,
.free_request = nfs_netfs_free_request,
- .begin_cache_operation = nfs_netfs_begin_cache_operation,
.issue_read = nfs_netfs_issue_read,
.clamp_length = nfs_netfs_clamp_length
};
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 5407ab8c8783..e3cb4923316b 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -80,7 +80,7 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
}
static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)
{
- netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops);
+ netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false);
}
extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr);
extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 2de66e4e8280..cbbe3f0193b8 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -963,7 +963,7 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
struct nfs_open_context *ctx = nfs_file_open_context(filp);
int status;
- if (fl->fl_flags & FL_CLOSE) {
+ if (fl->c.flc_flags & FL_CLOSE) {
l_ctx = nfs_get_lock_context(ctx);
if (IS_ERR(l_ctx))
l_ctx = NULL;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 581698f1b7b2..6ff41ceb9f1c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -330,7 +330,7 @@ extern int update_open_stateid(struct nfs4_state *state,
const nfs4_stateid *deleg_stateid,
fmode_t fmode);
extern int nfs4_proc_setlease(struct file *file, int arg,
- struct file_lock **lease, void **priv);
+ struct file_lease **lease, void **priv);
extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
struct nfs_fsinfo *fsinfo);
extern void nfs4_update_changeattr(struct inode *dir,
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e238abc78a13..1cd9652f3c28 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -439,7 +439,7 @@ void nfs42_ssc_unregister_ops(void)
}
#endif /* CONFIG_NFS_V4_2 */
-static int nfs4_setlease(struct file *file, int arg, struct file_lock **lease,
+static int nfs4_setlease(struct file *file, int arg, struct file_lease **lease,
void **priv)
{
return nfs4_proc_setlease(file, arg, lease, priv);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 23819a756508..815996cb27fc 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6800,7 +6800,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
switch (status) {
case 0:
- request->fl_type = F_UNLCK;
+ request->c.flc_type = F_UNLCK;
break;
case -NFS4ERR_DENIED:
status = 0;
@@ -7018,8 +7018,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
/* Ensure this is an unlock - when canceling a lock, the
* canceled lock is passed in, and it won't be an unlock.
*/
- fl->fl_type = F_UNLCK;
- if (fl->fl_flags & FL_CLOSE)
+ fl->c.flc_type = F_UNLCK;
+ if (fl->c.flc_flags & FL_CLOSE)
set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags);
data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid);
@@ -7045,11 +7045,11 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
struct rpc_task *task;
struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
int status = 0;
- unsigned char fl_flags = request->fl_flags;
+ unsigned char saved_flags = request->c.flc_flags;
status = nfs4_set_lock_state(state, request);
/* Unlock _before_ we do the RPC call */
- request->fl_flags |= FL_EXISTS;
+ request->c.flc_flags |= FL_EXISTS;
/* Exclude nfs_delegation_claim_locks() */
mutex_lock(&sp->so_delegreturn_mutex);
/* Exclude nfs4_reclaim_open_stateid() - note nesting! */
@@ -7073,14 +7073,16 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
status = -ENOMEM;
if (IS_ERR(seqid))
goto out;
- task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
+ task = nfs4_do_unlck(request,
+ nfs_file_open_context(request->c.flc_file),
+ lsp, seqid);
status = PTR_ERR(task);
if (IS_ERR(task))
goto out;
status = rpc_wait_for_completion_task(task);
rpc_put_task(task);
out:
- request->fl_flags = fl_flags;
+ request->c.flc_flags = saved_flags;
trace_nfs4_unlock(request, state, F_SETLK, status);
return status;
}
@@ -7191,7 +7193,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)),
data->timestamp);
if (data->arg.new_lock && !data->cancelled) {
- data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
+ data->fl.c.flc_flags &= ~(FL_SLEEP | FL_ACCESS);
if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0)
goto out_restart;
}
@@ -7292,7 +7294,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE))
task_setup_data.flags |= RPC_TASK_MOVEABLE;
- data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
+ data = nfs4_alloc_lockdata(fl,
+ nfs_file_open_context(fl->c.flc_file),
fl->fl_u.nfs4_fl.owner, GFP_KERNEL);
if (data == NULL)
return -ENOMEM;
@@ -7398,10 +7401,10 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
{
struct nfs_inode *nfsi = NFS_I(state->inode);
struct nfs4_state_owner *sp = state->owner;
- unsigned char fl_flags = request->fl_flags;
+ unsigned char flags = request->c.flc_flags;
int status;
- request->fl_flags |= FL_ACCESS;
+ request->c.flc_flags |= FL_ACCESS;
status = locks_lock_inode_wait(state->inode, request);
if (status < 0)
goto out;
@@ -7410,7 +7413,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
/* Yes: cache locks! */
/* ...but avoid races with delegation recall... */
- request->fl_flags = fl_flags & ~FL_SLEEP;
+ request->c.flc_flags = flags & ~FL_SLEEP;
status = locks_lock_inode_wait(state->inode, request);
up_read(&nfsi->rwsem);
mutex_unlock(&sp->so_delegreturn_mutex);
@@ -7420,7 +7423,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
mutex_unlock(&sp->so_delegreturn_mutex);
status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
out:
- request->fl_flags = fl_flags;
+ request->c.flc_flags = flags;
return status;
}
@@ -7562,7 +7565,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
if (!(IS_SETLK(cmd) || IS_SETLKW(cmd)))
return -EINVAL;
- if (request->fl_type == F_UNLCK) {
+ if (lock_is_unlock(request)) {
if (state != NULL)
return nfs4_proc_unlck(state, cmd, request);
return 0;
@@ -7571,7 +7574,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
if (state == NULL)
return -ENOLCK;
- if ((request->fl_flags & FL_POSIX) &&
+ if ((request->c.flc_flags & FL_POSIX) &&
!test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
return -ENOLCK;
@@ -7579,7 +7582,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
* Don't rely on the VFS having checked the file open mode,
* since it won't do this for flock() locks.
*/
- switch (request->fl_type) {
+ switch (request->c.flc_type) {
case F_RDLCK:
if (!(filp->f_mode & FMODE_READ))
return -EBADF;
@@ -7601,7 +7604,7 @@ static int nfs4_delete_lease(struct file *file, void **priv)
return generic_setlease(file, F_UNLCK, NULL, priv);
}
-static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease,
+static int nfs4_add_lease(struct file *file, int arg, struct file_lease **lease,
void **priv)
{
struct inode *inode = file_inode(file);
@@ -7619,7 +7622,7 @@ static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease,
return -EAGAIN;
}
-int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease,
+int nfs4_proc_setlease(struct file *file, int arg, struct file_lease **lease,
void **priv)
{
switch (arg) {
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 9a5d911a7edc..8cfabdbda336 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -847,15 +847,15 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
*/
static struct nfs4_lock_state *
__nfs4_find_lock_state(struct nfs4_state *state,
- fl_owner_t fl_owner, fl_owner_t fl_owner2)
+ fl_owner_t owner, fl_owner_t owner2)
{
struct nfs4_lock_state *pos, *ret = NULL;
list_for_each_entry(pos, &state->lock_states, ls_locks) {
- if (pos->ls_owner == fl_owner) {
+ if (pos->ls_owner == owner) {
ret = pos;
break;
}
- if (pos->ls_owner == fl_owner2)
+ if (pos->ls_owner == owner2)
ret = pos;
}
if (ret)
@@ -868,7 +868,7 @@ __nfs4_find_lock_state(struct nfs4_state *state,
* exists, return an uninitialized one.
*
*/
-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t owner)
{
struct nfs4_lock_state *lsp;
struct nfs_server *server = state->owner->so_server;
@@ -879,7 +879,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
nfs4_init_seqid_counter(&lsp->ls_seqid);
refcount_set(&lsp->ls_count, 1);
lsp->ls_state = state;
- lsp->ls_owner = fl_owner;
+ lsp->ls_owner = owner;
lsp->ls_seqid.owner_id = ida_alloc(&server->lockowner_id, GFP_KERNEL_ACCOUNT);
if (lsp->ls_seqid.owner_id < 0)
goto out_free;
@@ -980,7 +980,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
if (fl->fl_ops != NULL)
return 0;
- lsp = nfs4_get_lock_state(state, fl->fl_owner);
+ lsp = nfs4_get_lock_state(state, fl->c.flc_owner);
if (lsp == NULL)
return -ENOMEM;
fl->fl_u.nfs4_fl.owner = lsp;
@@ -993,7 +993,7 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
const struct nfs_lock_context *l_ctx)
{
struct nfs4_lock_state *lsp;
- fl_owner_t fl_owner, fl_flock_owner;
+ fl_owner_t owner, fl_flock_owner;
int ret = -ENOENT;
if (l_ctx == NULL)
@@ -1002,11 +1002,11 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
goto out;
- fl_owner = l_ctx->lockowner;
+ owner = l_ctx->lockowner;
fl_flock_owner = l_ctx->open_context->flock_owner;
spin_lock(&state->state_lock);
- lsp = __nfs4_find_lock_state(state, fl_owner, fl_flock_owner);
+ lsp = __nfs4_find_lock_state(state, owner, fl_flock_owner);
if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
ret = -EIO;
else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
@@ -1529,8 +1529,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
down_write(&nfsi->rwsem);
spin_lock(&flctx->flc_lock);
restart:
- list_for_each_entry(fl, list, fl_list) {
- if (nfs_file_open_context(fl->fl_file)->state != state)
+ for_each_file_lock(fl, list) {
+ if (nfs_file_open_context(fl->c.flc_file)->state != state)
continue;
spin_unlock(&flctx->flc_lock);
status = ops->recover_lock(state, fl);
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index d27919d7241d..fd7cb15b08b2 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -699,7 +699,7 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
__entry->error = error < 0 ? -error : 0;
__entry->cmd = cmd;
- __entry->type = request->fl_type;
+ __entry->type = request->c.flc_type;
__entry->start = request->fl_start;
__entry->end = request->fl_end;
__entry->dev = inode->i_sb->s_dev;
@@ -771,7 +771,7 @@ TRACE_EVENT(nfs4_set_lock,
__entry->error = error < 0 ? -error : 0;
__entry->cmd = cmd;
- __entry->type = request->fl_type;
+ __entry->type = request->c.flc_type;
__entry->start = request->fl_start;
__entry->end = request->fl_end;
__entry->dev = inode->i_sb->s_dev;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 69406e60f391..1416099dfcd1 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1305,7 +1305,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
static inline int nfs4_lock_type(struct file_lock *fl, int block)
{
- if (fl->fl_type == F_RDLCK)
+ if (lock_is_read(fl))
return block ? NFS4_READW_LT : NFS4_READ_LT;
return block ? NFS4_WRITEW_LT : NFS4_WRITE_LT;
}
@@ -5052,10 +5052,10 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
fl->fl_end = fl->fl_start + (loff_t)length - 1;
if (length == ~(uint64_t)0)
fl->fl_end = OFFSET_MAX;
- fl->fl_type = F_WRLCK;
+ fl->c.flc_type = F_WRLCK;
if (type & 1)
- fl->fl_type = F_RDLCK;
- fl->fl_pid = 0;
+ fl->c.flc_type = F_RDLCK;
+ fl->c.flc_pid = 0;
}
p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index bb79d3a886ae..84bb85264572 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1301,7 +1301,7 @@ static bool
is_whole_file_wrlock(struct file_lock *fl)
{
return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX &&
- fl->fl_type == F_WRLCK;
+ lock_is_write(fl);
}
/* If we know the page is up to date, and we're not using byte range locks (or
@@ -1335,13 +1335,13 @@ static int nfs_can_extend_write(struct file *file, struct folio *folio,
spin_lock(&flctx->flc_lock);
if (!list_empty(&flctx->flc_posix)) {
fl = list_first_entry(&flctx->flc_posix, struct file_lock,
- fl_list);
+ c.flc_list);
if (is_whole_file_wrlock(fl))
ret = 1;
} else if (!list_empty(&flctx->flc_flock)) {
fl = list_first_entry(&flctx->flc_flock, struct file_lock,
- fl_list);
- if (fl->fl_type == F_WRLCK)
+ c.flc_list);
+ if (lock_is_write(fl))
ret = 1;
}
spin_unlock(&flctx->flc_lock);
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 9cb7f0c33df5..b86d8494052c 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -662,8 +662,8 @@ nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
struct file_lock *fl = data;
/* Only close files for F_SETLEASE leases */
- if (fl->fl_flags & FL_LEASE)
- nfsd_file_close_inode(file_inode(fl->fl_file));
+ if (fl->c.flc_flags & FL_LEASE)
+ nfsd_file_close_inode(file_inode(fl->c.flc_file));
return 0;
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 926c29879c6a..32d23ef3e5de 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -674,7 +674,7 @@ static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req,
const struct nfsd4_callback *cb = data;
const struct nfsd4_blocked_lock *nbl =
container_of(cb, struct nfsd4_blocked_lock, nbl_cb);
- struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.fl_owner;
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.c.flc_owner;
struct nfs4_cb_compound_hdr hdr = {
.ident = 0,
.minorversion = cb->cb_clp->cl_minorversion,
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 5e8096bc5eaa..4c0d00bdfbb1 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -25,7 +25,7 @@ static struct kmem_cache *nfs4_layout_cache;
static struct kmem_cache *nfs4_layout_stateid_cache;
static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
-static const struct lock_manager_operations nfsd4_layouts_lm_ops;
+static const struct lease_manager_operations nfsd4_layouts_lm_ops;
const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
#ifdef CONFIG_NFSD_FLEXFILELAYOUT
@@ -170,7 +170,7 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
spin_unlock(&fp->fi_lock);
if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
- vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls);
+ kernel_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls);
nfsd_file_put(ls->ls_file);
if (ls->ls_recalled)
@@ -182,27 +182,26 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
static int
nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
{
- struct file_lock *fl;
+ struct file_lease *fl;
int status;
if (nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
return 0;
- fl = locks_alloc_lock();
+ fl = locks_alloc_lease();
if (!fl)
return -ENOMEM;
- locks_init_lock(fl);
+ locks_init_lease(fl);
fl->fl_lmops = &nfsd4_layouts_lm_ops;
- fl->fl_flags = FL_LAYOUT;
- fl->fl_type = F_RDLCK;
- fl->fl_end = OFFSET_MAX;
- fl->fl_owner = ls;
- fl->fl_pid = current->tgid;
- fl->fl_file = ls->ls_file->nf_file;
-
- status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
+ fl->c.flc_flags = FL_LAYOUT;
+ fl->c.flc_type = F_RDLCK;
+ fl->c.flc_owner = ls;
+ fl->c.flc_pid = current->tgid;
+ fl->c.flc_file = ls->ls_file->nf_file;
+
+ status = kernel_setlease(fl->c.flc_file, fl->c.flc_type, &fl, NULL);
if (status) {
- locks_free_lock(fl);
+ locks_free_lease(fl);
return status;
}
BUG_ON(fl != NULL);
@@ -723,7 +722,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
};
static bool
-nfsd4_layout_lm_break(struct file_lock *fl)
+nfsd4_layout_lm_break(struct file_lease *fl)
{
/*
* We don't want the locks code to timeout the lease for us;
@@ -731,19 +730,19 @@ nfsd4_layout_lm_break(struct file_lock *fl)
* in time:
*/
fl->fl_break_time = 0;
- nfsd4_recall_file_layout(fl->fl_owner);
+ nfsd4_recall_file_layout(fl->c.flc_owner);
return false;
}
static int
-nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
+nfsd4_layout_lm_change(struct file_lease *onlist, int arg,
struct list_head *dispose)
{
BUG_ON(!(arg & F_UNLCK));
return lease_modify(onlist, arg, dispose);
}
-static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
+static const struct lease_manager_operations nfsd4_layouts_lm_ops = {
.lm_break = nfsd4_layout_lm_break,
.lm_change = nfsd4_layout_lm_change,
};
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2fa54cfd4882..9257425cbd1a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1249,7 +1249,7 @@ static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp)
WARN_ON_ONCE(!fp->fi_delegees);
- vfs_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp);
+ kernel_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp);
put_deleg_file(fp);
}
@@ -4922,9 +4922,9 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
/* Called from break_lease() with flc_lock held. */
static bool
-nfsd_break_deleg_cb(struct file_lock *fl)
+nfsd_break_deleg_cb(struct file_lease *fl)
{
- struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
+ struct nfs4_delegation *dp = (struct nfs4_delegation *) fl->c.flc_owner;
struct nfs4_file *fp = dp->dl_stid.sc_file;
struct nfs4_client *clp = dp->dl_stid.sc_client;
struct nfsd_net *nn;
@@ -4945,10 +4945,8 @@ nfsd_break_deleg_cb(struct file_lock *fl)
*/
fl->fl_break_time = 0;
- spin_lock(&fp->fi_lock);
fp->fi_had_conflict = true;
nfsd_break_one_deleg(dp);
- spin_unlock(&fp->fi_lock);
return false;
}
@@ -4960,9 +4958,9 @@ nfsd_break_deleg_cb(struct file_lock *fl)
* %true: Lease conflict was resolved
* %false: Lease conflict was not resolved.
*/
-static bool nfsd_breaker_owns_lease(struct file_lock *fl)
+static bool nfsd_breaker_owns_lease(struct file_lease *fl)
{
- struct nfs4_delegation *dl = fl->fl_owner;
+ struct nfs4_delegation *dl = fl->c.flc_owner;
struct svc_rqst *rqst;
struct nfs4_client *clp;
@@ -4977,10 +4975,10 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl)
}
static int
-nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
+nfsd_change_deleg_cb(struct file_lease *onlist, int arg,
struct list_head *dispose)
{
- struct nfs4_delegation *dp = (struct nfs4_delegation *)onlist->fl_owner;
+ struct nfs4_delegation *dp = (struct nfs4_delegation *) onlist->c.flc_owner;
struct nfs4_client *clp = dp->dl_stid.sc_client;
if (arg & F_UNLCK) {
@@ -4991,7 +4989,7 @@ nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
return -EAGAIN;
}
-static const struct lock_manager_operations nfsd_lease_mng_ops = {
+static const struct lease_manager_operations nfsd_lease_mng_ops = {
.lm_breaker_owns_lease = nfsd_breaker_owns_lease,
.lm_break = nfsd_break_deleg_cb,
.lm_change = nfsd_change_deleg_cb,
@@ -5331,21 +5329,20 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
}
-static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
+static struct file_lease *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
int flag)
{
- struct file_lock *fl;
+ struct file_lease *fl;
- fl = locks_alloc_lock();
+ fl = locks_alloc_lease();
if (!fl)
return NULL;
fl->fl_lmops = &nfsd_lease_mng_ops;
- fl->fl_flags = FL_DELEG;
- fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
- fl->fl_end = OFFSET_MAX;
- fl->fl_owner = (fl_owner_t)dp;
- fl->fl_pid = current->tgid;
- fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file;
+ fl->c.flc_flags = FL_DELEG;
+ fl->c.flc_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
+ fl->c.flc_owner = (fl_owner_t)dp;
+ fl->c.flc_pid = current->tgid;
+ fl->c.flc_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file;
return fl;
}
@@ -5463,7 +5460,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate;
struct nfs4_delegation *dp;
struct nfsd_file *nf = NULL;
- struct file_lock *fl;
+ struct file_lease *fl;
u32 dl_type;
/*
@@ -5533,9 +5530,10 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
if (!fl)
goto out_clnt_odstate;
- status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL);
+ status = kernel_setlease(fp->fi_deleg_file->nf_file,
+ fl->c.flc_type, &fl, NULL);
if (fl)
- locks_free_lock(fl);
+ locks_free_lease(fl);
if (status)
goto out_clnt_odstate;
@@ -5557,12 +5555,13 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
if (status)
goto out_unlock;
+ status = -EAGAIN;
+ if (fp->fi_had_conflict)
+ goto out_unlock;
+
spin_lock(&state_lock);
spin_lock(&fp->fi_lock);
- if (fp->fi_had_conflict)
- status = -EAGAIN;
- else
- status = hash_delegation_locked(dp, fp);
+ status = hash_delegation_locked(dp, fp);
spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
@@ -5571,7 +5570,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
return dp;
out_unlock:
- vfs_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp);
+ kernel_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp);
out_clnt_odstate:
put_clnt_odstate(dp->dl_clnt_odstate);
nfs4_put_stid(&dp->dl_stid);
@@ -7149,7 +7148,7 @@ nfsd4_lm_put_owner(fl_owner_t owner)
static bool
nfsd4_lm_lock_expirable(struct file_lock *cfl)
{
- struct nfs4_lockowner *lo = (struct nfs4_lockowner *)cfl->fl_owner;
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *) cfl->c.flc_owner;
struct nfs4_client *clp = lo->lo_owner.so_client;
struct nfsd_net *nn;
@@ -7171,7 +7170,7 @@ nfsd4_lm_expire_lock(void)
static void
nfsd4_lm_notify(struct file_lock *fl)
{
- struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner;
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *) fl->c.flc_owner;
struct net *net = lo->lo_owner.so_client->net;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct nfsd4_blocked_lock *nbl = container_of(fl,
@@ -7208,7 +7207,7 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
struct nfs4_lockowner *lo;
if (fl->fl_lmops == &nfsd_posix_mng_ops) {
- lo = (struct nfs4_lockowner *) fl->fl_owner;
+ lo = (struct nfs4_lockowner *) fl->c.flc_owner;
xdr_netobj_dup(&deny->ld_owner, &lo->lo_owner.so_owner,
GFP_KERNEL);
if (!deny->ld_owner.data)
@@ -7227,7 +7226,7 @@ nevermind:
if (fl->fl_end != NFS4_MAX_UINT64)
deny->ld_length = fl->fl_end - fl->fl_start + 1;
deny->ld_type = NFS4_READ_LT;
- if (fl->fl_type != F_RDLCK)
+ if (fl->c.flc_type != F_RDLCK)
deny->ld_type = NFS4_WRITE_LT;
}
@@ -7493,8 +7492,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
int lkflg;
int err;
bool new = false;
- unsigned char fl_type;
- unsigned int fl_flags = FL_POSIX;
+ unsigned char type;
+ unsigned int flags = FL_POSIX;
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -7557,14 +7556,14 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
if (lock->lk_reclaim)
- fl_flags |= FL_RECLAIM;
+ flags |= FL_RECLAIM;
fp = lock_stp->st_stid.sc_file;
switch (lock->lk_type) {
case NFS4_READW_LT:
if (nfsd4_has_session(cstate) ||
exportfs_lock_op_is_async(sb->s_export_op))
- fl_flags |= FL_SLEEP;
+ flags |= FL_SLEEP;
fallthrough;
case NFS4_READ_LT:
spin_lock(&fp->fi_lock);
@@ -7572,12 +7571,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (nf)
get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
spin_unlock(&fp->fi_lock);
- fl_type = F_RDLCK;
+ type = F_RDLCK;
break;
case NFS4_WRITEW_LT:
if (nfsd4_has_session(cstate) ||
exportfs_lock_op_is_async(sb->s_export_op))
- fl_flags |= FL_SLEEP;
+ flags |= FL_SLEEP;
fallthrough;
case NFS4_WRITE_LT:
spin_lock(&fp->fi_lock);
@@ -7585,7 +7584,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (nf)
get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
spin_unlock(&fp->fi_lock);
- fl_type = F_WRLCK;
+ type = F_WRLCK;
break;
default:
status = nfserr_inval;
@@ -7605,7 +7604,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* on those filesystems:
*/
if (!exportfs_lock_op_is_async(sb->s_export_op))
- fl_flags &= ~FL_SLEEP;
+ flags &= ~FL_SLEEP;
nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
if (!nbl) {
@@ -7615,11 +7614,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
file_lock = &nbl->nbl_lock;
- file_lock->fl_type = fl_type;
- file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
- file_lock->fl_pid = current->tgid;
- file_lock->fl_file = nf->nf_file;
- file_lock->fl_flags = fl_flags;
+ file_lock->c.flc_type = type;
+ file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
+ file_lock->c.flc_pid = current->tgid;
+ file_lock->c.flc_file = nf->nf_file;
+ file_lock->c.flc_flags = flags;
file_lock->fl_lmops = &nfsd_posix_mng_ops;
file_lock->fl_start = lock->lk_offset;
file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
@@ -7632,7 +7631,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- if (fl_flags & FL_SLEEP) {
+ if (flags & FL_SLEEP) {
nbl->nbl_time = ktime_get_boottime_seconds();
spin_lock(&nn->blocked_locks_lock);
list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked);
@@ -7669,7 +7668,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
out:
if (nbl) {
/* dequeue it if we queued it before */
- if (fl_flags & FL_SLEEP) {
+ if (flags & FL_SLEEP) {
spin_lock(&nn->blocked_locks_lock);
if (!list_empty(&nbl->nbl_list) &&
!list_empty(&nbl->nbl_lru)) {
@@ -7737,9 +7736,9 @@ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct
err = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
if (err)
goto out;
- lock->fl_file = nf->nf_file;
+ lock->c.flc_file = nf->nf_file;
err = nfserrno(vfs_test_lock(nf->nf_file, lock));
- lock->fl_file = NULL;
+ lock->c.flc_file = NULL;
out:
inode_unlock(inode);
nfsd_file_put(nf);
@@ -7784,11 +7783,11 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
switch (lockt->lt_type) {
case NFS4_READ_LT:
case NFS4_READW_LT:
- file_lock->fl_type = F_RDLCK;
+ file_lock->c.flc_type = F_RDLCK;
break;
case NFS4_WRITE_LT:
case NFS4_WRITEW_LT:
- file_lock->fl_type = F_WRLCK;
+ file_lock->c.flc_type = F_WRLCK;
break;
default:
dprintk("NFSD: nfs4_lockt: bad lock type!\n");
@@ -7798,9 +7797,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
lo = find_lockowner_str(cstate->clp, &lockt->lt_owner);
if (lo)
- file_lock->fl_owner = (fl_owner_t)lo;
- file_lock->fl_pid = current->tgid;
- file_lock->fl_flags = FL_POSIX;
+ file_lock->c.flc_owner = (fl_owner_t)lo;
+ file_lock->c.flc_pid = current->tgid;
+ file_lock->c.flc_flags = FL_POSIX;
file_lock->fl_start = lockt->lt_offset;
file_lock->fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
@@ -7811,7 +7810,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
- if (file_lock->fl_type != F_UNLCK) {
+ if (file_lock->c.flc_type != F_UNLCK) {
status = nfserr_denied;
nfs4_set_lock_denied(file_lock, &lockt->lt_denied);
}
@@ -7867,11 +7866,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto put_file;
}
- file_lock->fl_type = F_UNLCK;
- file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner));
- file_lock->fl_pid = current->tgid;
- file_lock->fl_file = nf->nf_file;
- file_lock->fl_flags = FL_POSIX;
+ file_lock->c.flc_type = F_UNLCK;
+ file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner));
+ file_lock->c.flc_pid = current->tgid;
+ file_lock->c.flc_file = nf->nf_file;
+ file_lock->c.flc_flags = FL_POSIX;
file_lock->fl_lmops = &nfsd_posix_mng_ops;
file_lock->fl_start = locku->lu_offset;
@@ -7911,14 +7910,16 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
{
struct file_lock *fl;
int status = false;
- struct nfsd_file *nf = find_any_file(fp);
+ struct nfsd_file *nf;
struct inode *inode;
struct file_lock_context *flctx;
+ spin_lock(&fp->fi_lock);
+ nf = find_any_file_locked(fp);
if (!nf) {
/* Any valid lock stateid should have some sort of access */
WARN_ON_ONCE(1);
- return status;
+ goto out;
}
inode = file_inode(nf->nf_file);
@@ -7926,15 +7927,16 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
if (flctx && !list_empty_careful(&flctx->flc_posix)) {
spin_lock(&flctx->flc_lock);
- list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
- if (fl->fl_owner == (fl_owner_t)lowner) {
+ for_each_file_lock(fl, &flctx->flc_posix) {
+ if (fl->c.flc_owner == (fl_owner_t)lowner) {
status = true;
break;
}
}
spin_unlock(&flctx->flc_lock);
}
- nfsd_file_put(nf);
+out:
+ spin_unlock(&fp->fi_lock);
return status;
}
@@ -7944,10 +7946,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
* @cstate: NFSv4 COMPOUND state
* @u: RELEASE_LOCKOWNER arguments
*
- * The lockowner's so_count is bumped when a lock record is added
- * or when copying a conflicting lock. The latter case is brief,
- * but can lead to fleeting false positives when looking for
- * locks-in-use.
+ * Check if theree are any locks still held and if not - free the lockowner
+ * and any lock state that is owned.
*
* Return values:
* %nfs_ok: lockowner released or not found
@@ -7983,10 +7983,13 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
spin_unlock(&clp->cl_lock);
return nfs_ok;
}
- if (atomic_read(&lo->lo_owner.so_count) != 2) {
- spin_unlock(&clp->cl_lock);
- nfs4_put_stateowner(&lo->lo_owner);
- return nfserr_locks_held;
+
+ list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) {
+ if (check_for_locks(stp->st_stid.sc_file, lo)) {
+ spin_unlock(&clp->cl_lock);
+ nfs4_put_stateowner(&lo->lo_owner);
+ return nfserr_locks_held;
+ }
}
unhash_lockowner_locked(lo);
while (!list_empty(&lo->lo_owner.so_stateids)) {
@@ -8448,15 +8451,17 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
{
__be32 status;
struct file_lock_context *ctx;
- struct file_lock *fl;
+ struct file_lease *fl;
struct nfs4_delegation *dp;
ctx = locks_inode_context(inode);
if (!ctx)
return 0;
spin_lock(&ctx->flc_lock);
- list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
- if (fl->fl_flags == FL_LAYOUT)
+ for_each_file_lock(fl, &ctx->flc_lease) {
+ unsigned char type = fl->c.flc_type;
+
+ if (fl->c.flc_flags == FL_LAYOUT)
continue;
if (fl->fl_lmops != &nfsd_lease_mng_ops) {
/*
@@ -8464,12 +8469,12 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
* we are done; there isn't any write delegation
* on this inode
*/
- if (fl->fl_type == F_RDLCK)
+ if (type == F_RDLCK)
break;
goto break_lease;
}
- if (fl->fl_type == F_WRLCK) {
- dp = fl->fl_owner;
+ if (type == F_WRLCK) {
+ dp = fl->c.flc_owner;
if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) {
spin_unlock(&ctx->flc_lock);
return 0;
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index bec33b89a075..0e3fc5ba33c7 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -107,7 +107,13 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf)
nilfs_transaction_commit(inode->i_sb);
mapped:
- folio_wait_stable(folio);
+ /*
+ * Since checksumming including data blocks is performed to determine
+ * the validity of the log to be written and used for recovery, it is
+ * necessary to wait for writeback to finish here, regardless of the
+ * stable write requirement of the backing device.
+ */
+ folio_wait_writeback(folio);
out:
sb_end_pagefault(inode->i_sb);
return vmf_fs_error(ret);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 0955b657938f..a9b8d77c8c1d 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -472,9 +472,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
struct nilfs_recovery_block *rb,
- struct page *page)
+ loff_t pos, struct page *page)
{
struct buffer_head *bh_org;
+ size_t from = pos & ~PAGE_MASK;
void *kaddr;
bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize);
@@ -482,7 +483,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
return -EIO;
kaddr = kmap_atomic(page);
- memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size);
+ memcpy(kaddr + from, bh_org->b_data, bh_org->b_size);
kunmap_atomic(kaddr);
brelse(bh_org);
return 0;
@@ -521,7 +522,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
goto failed_inode;
}
- err = nilfs_recovery_copy_block(nilfs, rb, page);
+ err = nilfs_recovery_copy_block(nilfs, rb, pos, page);
if (unlikely(err))
goto failed_page;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 2590a0860eab..2bfb08052d39 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1703,7 +1703,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- set_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
if (bh->b_folio != bd_folio) {
folio_lock(bd_folio);
@@ -1714,6 +1713,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
}
break;
}
+ set_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_begin_folio_io(fs_folio);
fs_folio = bh->b_folio;
@@ -1800,7 +1800,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
clear_buffer_uptodate(bh);
if (bh->b_folio != bd_folio) {
@@ -1809,6 +1808,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
}
break;
}
+ clear_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, err);
fs_folio = bh->b_folio;
@@ -1896,8 +1896,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
BIT(BH_NILFS_Redirected));
- set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
if (bh->b_folio != bd_folio) {
folio_end_writeback(bd_folio);
bd_folio = bh->b_folio;
@@ -1905,6 +1906,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
update_sr = true;
break;
}
+ set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, 0);
fs_folio = bh->b_folio;
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 34e1e3e36733..7aaafb5cb9fc 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -27,26 +27,17 @@ static const struct file_operations ns_file_operations = {
static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
{
struct inode *inode = d_inode(dentry);
- const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+ struct ns_common *ns = inode->i_private;
+ const struct proc_ns_operations *ns_ops = ns->ops;
return dynamic_dname(buffer, buflen, "%s:[%lu]",
ns_ops->name, inode->i_ino);
}
-static void ns_prune_dentry(struct dentry *dentry)
-{
- struct inode *inode = d_inode(dentry);
- if (inode) {
- struct ns_common *ns = inode->i_private;
- atomic_long_set(&ns->stashed, 0);
- }
-}
-
-const struct dentry_operations ns_dentry_operations =
-{
- .d_prune = ns_prune_dentry,
+const struct dentry_operations ns_dentry_operations = {
.d_delete = always_delete_dentry,
.d_dname = ns_dname,
+ .d_prune = stashed_dentry_prune,
};
static void nsfs_evict(struct inode *inode)
@@ -56,67 +47,16 @@ static void nsfs_evict(struct inode *inode)
ns->ops->put(ns);
}
-static int __ns_get_path(struct path *path, struct ns_common *ns)
-{
- struct vfsmount *mnt = nsfs_mnt;
- struct dentry *dentry;
- struct inode *inode;
- unsigned long d;
-
- rcu_read_lock();
- d = atomic_long_read(&ns->stashed);
- if (!d)
- goto slow;
- dentry = (struct dentry *)d;
- if (!lockref_get_not_dead(&dentry->d_lockref))
- goto slow;
- rcu_read_unlock();
- ns->ops->put(ns);
-got_it:
- path->mnt = mntget(mnt);
- path->dentry = dentry;
- return 0;
-slow:
- rcu_read_unlock();
- inode = new_inode_pseudo(mnt->mnt_sb);
- if (!inode) {
- ns->ops->put(ns);
- return -ENOMEM;
- }
- inode->i_ino = ns->inum;
- simple_inode_init_ts(inode);
- inode->i_flags |= S_IMMUTABLE;
- inode->i_mode = S_IFREG | S_IRUGO;
- inode->i_fop = &ns_file_operations;
- inode->i_private = ns;
-
- dentry = d_make_root(inode); /* not the normal use, but... */
- if (!dentry)
- return -ENOMEM;
- dentry->d_fsdata = (void *)ns->ops;
- d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
- if (d) {
- d_delete(dentry); /* make sure ->d_prune() does nothing */
- dput(dentry);
- cpu_relax();
- return -EAGAIN;
- }
- goto got_it;
-}
-
int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
void *private_data)
{
- int ret;
+ struct ns_common *ns;
- do {
- struct ns_common *ns = ns_get_cb(private_data);
- if (!ns)
- return -ENOENT;
- ret = __ns_get_path(path, ns);
- } while (ret == -EAGAIN);
+ ns = ns_get_cb(private_data);
+ if (!ns)
+ return -ENOENT;
- return ret;
+ return path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt, ns, path);
}
struct ns_get_path_task_args {
@@ -146,6 +86,7 @@ int open_related_ns(struct ns_common *ns,
struct ns_common *(*get_ns)(struct ns_common *ns))
{
struct path path = {};
+ struct ns_common *relative;
struct file *f;
int err;
int fd;
@@ -154,19 +95,15 @@ int open_related_ns(struct ns_common *ns,
if (fd < 0)
return fd;
- do {
- struct ns_common *relative;
-
- relative = get_ns(ns);
- if (IS_ERR(relative)) {
- put_unused_fd(fd);
- return PTR_ERR(relative);
- }
-
- err = __ns_get_path(&path, relative);
- } while (err == -EAGAIN);
+ relative = get_ns(ns);
+ if (IS_ERR(relative)) {
+ put_unused_fd(fd);
+ return PTR_ERR(relative);
+ }
- if (err) {
+ err = path_from_stashed(&relative->stashed, relative->inum, nsfs_mnt,
+ relative, &path);
+ if (err < 0) {
put_unused_fd(fd);
return err;
}
@@ -249,7 +186,8 @@ bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino)
static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+ const struct ns_common *ns = inode->i_private;
+ const struct proc_ns_operations *ns_ops = ns->ops;
seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
return 0;
@@ -261,6 +199,24 @@ static const struct super_operations nsfs_ops = {
.show_path = nsfs_show_path,
};
+static void nsfs_init_inode(struct inode *inode, void *data)
+{
+ inode->i_private = data;
+ inode->i_mode |= S_IRUGO;
+ inode->i_fop = &ns_file_operations;
+}
+
+static void nsfs_put_data(void *data)
+{
+ struct ns_common *ns = data;
+ ns->ops->put(ns);
+}
+
+static const struct stashed_operations nsfs_stashed_ops = {
+ .init_inode = nsfs_init_inode,
+ .put_data = nsfs_put_data,
+};
+
static int nsfs_init_fs_context(struct fs_context *fc)
{
struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
@@ -268,6 +224,7 @@ static int nsfs_init_fs_context(struct fs_context *fc)
return -ENOMEM;
ctx->ops = &nsfs_ops;
ctx->dops = &ns_dentry_operations;
+ fc->s_fs_info = (void *)&nsfs_stashed_ops;
return 0;
}
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
deleted file mode 100644
index 7b2509741735..000000000000
--- a/fs/ntfs/Kconfig
+++ /dev/null
@@ -1,81 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config NTFS_FS
- tristate "NTFS file system support"
- select BUFFER_HEAD
- select NLS
- help
- NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
-
- Saying Y or M here enables read support. There is partial, but
- safe, write support available. For write support you must also
- say Y to "NTFS write support" below.
-
- There are also a number of user-space tools available, called
- ntfsprogs. These include ntfsundelete and ntfsresize, that work
- without NTFS support enabled in the kernel.
-
- This is a rewrite from scratch of Linux NTFS support and replaced
- the old NTFS code starting with Linux 2.5.11. A backport to
- the Linux 2.4 kernel series is separately available as a patch
- from the project web site.
-
- For more information see <file:Documentation/filesystems/ntfs.rst>
- and <http://www.linux-ntfs.org/>.
-
- To compile this file system support as a module, choose M here: the
- module will be called ntfs.
-
- If you are not using Windows NT, 2000, XP or 2003 in addition to
- Linux on your computer it is safe to say N.
-
-config NTFS_DEBUG
- bool "NTFS debugging support"
- depends on NTFS_FS
- help
- If you are experiencing any problems with the NTFS file system, say
- Y here. This will result in additional consistency checks to be
- performed by the driver as well as additional debugging messages to
- be written to the system log. Note that debugging messages are
- disabled by default. To enable them, supply the option debug_msgs=1
- at the kernel command line when booting the kernel or as an option
- to insmod when loading the ntfs module. Once the driver is active,
- you can enable debugging messages by doing (as root):
- echo 1 > /proc/sys/fs/ntfs-debug
- Replacing the "1" with "0" would disable debug messages.
-
- If you leave debugging messages disabled, this results in little
- overhead, but enabling debug messages results in very significant
- slowdown of the system.
-
- When reporting bugs, please try to have available a full dump of
- debugging messages while the misbehaviour was occurring.
-
-config NTFS_RW
- bool "NTFS write support"
- depends on NTFS_FS
- depends on PAGE_SIZE_LESS_THAN_64KB
- help
- This enables the partial, but safe, write support in the NTFS driver.
-
- The only supported operation is overwriting existing files, without
- changing the file length. No file or directory creation, deletion or
- renaming is possible. Note only non-resident files can be written to
- so you may find that some very small files (<500 bytes or so) cannot
- be written to.
-
- While we cannot guarantee that it will not damage any data, we have
- so far not received a single report where the driver would have
- damaged someones data so we assume it is perfectly safe to use.
-
- Note: While write support is safe in this version (a rewrite from
- scratch of the NTFS support), it should be noted that the old NTFS
- write support, included in Linux 2.5.10 and before (since 1997),
- is not safe.
-
- This is currently useful with TopologiLinux. TopologiLinux is run
- on top of any DOS/Microsoft Windows system without partitioning your
- hard disk. Unlike other Linux distributions TopologiLinux does not
- need its own partition. For more information see
- <http://topologi-linux.sourceforge.net/>
-
- It is perfectly safe to say N here.
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
deleted file mode 100644
index 3e736572ed00..000000000000
--- a/fs/ntfs/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# Rules for making the NTFS driver.
-
-obj-$(CONFIG_NTFS_FS) += ntfs.o
-
-ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
- index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
- unistr.o upcase.o
-
-ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
-
-ccflags-y := -DNTFS_VERSION=\"2.1.32\"
-ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG
-ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW
-
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
deleted file mode 100644
index 2d01517a2d59..000000000000
--- a/fs/ntfs/aops.c
+++ /dev/null
@@ -1,1744 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * aops.c - NTFS kernel address space operations and page cache handling.
- *
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-#include <linux/buffer_head.h>
-#include <linux/writeback.h>
-#include <linux/bit_spinlock.h>
-#include <linux/bio.h>
-
-#include "aops.h"
-#include "attrib.h"
-#include "debug.h"
-#include "inode.h"
-#include "mft.h"
-#include "runlist.h"
-#include "types.h"
-#include "ntfs.h"
-
-/**
- * ntfs_end_buffer_async_read - async io completion for reading attributes
- * @bh: buffer head on which io is completed
- * @uptodate: whether @bh is now uptodate or not
- *
- * Asynchronous I/O completion handler for reading pages belonging to the
- * attribute address space of an inode. The inodes can either be files or
- * directories or they can be fake inodes describing some attribute.
- *
- * If NInoMstProtected(), perform the post read mst fixups when all IO on the
- * page has been completed and mark the page uptodate or set the error bit on
- * the page. To determine the size of the records that need fixing up, we
- * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs
- * record size, and index_block_size_bits, to the log(base 2) of the ntfs
- * record size.
- */
-static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
-{
- unsigned long flags;
- struct buffer_head *first, *tmp;
- struct page *page;
- struct inode *vi;
- ntfs_inode *ni;
- int page_uptodate = 1;
-
- page = bh->b_page;
- vi = page->mapping->host;
- ni = NTFS_I(vi);
-
- if (likely(uptodate)) {
- loff_t i_size;
- s64 file_ofs, init_size;
-
- set_buffer_uptodate(bh);
-
- file_ofs = ((s64)page->index << PAGE_SHIFT) +
- bh_offset(bh);
- read_lock_irqsave(&ni->size_lock, flags);
- init_size = ni->initialized_size;
- i_size = i_size_read(vi);
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (unlikely(init_size > i_size)) {
- /* Race with shrinking truncate. */
- init_size = i_size;
- }
- /* Check for the current buffer head overflowing. */
- if (unlikely(file_ofs + bh->b_size > init_size)) {
- int ofs;
- void *kaddr;
-
- ofs = 0;
- if (file_ofs < init_size)
- ofs = init_size - file_ofs;
- kaddr = kmap_atomic(page);
- memset(kaddr + bh_offset(bh) + ofs, 0,
- bh->b_size - ofs);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
- }
- } else {
- clear_buffer_uptodate(bh);
- SetPageError(page);
- ntfs_error(ni->vol->sb, "Buffer I/O error, logical block "
- "0x%llx.", (unsigned long long)bh->b_blocknr);
- }
- first = page_buffers(page);
- spin_lock_irqsave(&first->b_uptodate_lock, flags);
- clear_buffer_async_read(bh);
- unlock_buffer(bh);
- tmp = bh;
- do {
- if (!buffer_uptodate(tmp))
- page_uptodate = 0;
- if (buffer_async_read(tmp)) {
- if (likely(buffer_locked(tmp)))
- goto still_busy;
- /* Async buffers must be locked. */
- BUG();
- }
- tmp = tmp->b_this_page;
- } while (tmp != bh);
- spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- /*
- * If none of the buffers had errors then we can set the page uptodate,
- * but we first have to perform the post read mst fixups, if the
- * attribute is mst protected, i.e. if NInoMstProteced(ni) is true.
- * Note we ignore fixup errors as those are detected when
- * map_mft_record() is called which gives us per record granularity
- * rather than per page granularity.
- */
- if (!NInoMstProtected(ni)) {
- if (likely(page_uptodate && !PageError(page)))
- SetPageUptodate(page);
- } else {
- u8 *kaddr;
- unsigned int i, recs;
- u32 rec_size;
-
- rec_size = ni->itype.index.block_size;
- recs = PAGE_SIZE / rec_size;
- /* Should have been verified before we got here... */
- BUG_ON(!recs);
- kaddr = kmap_atomic(page);
- for (i = 0; i < recs; i++)
- post_read_mst_fixup((NTFS_RECORD*)(kaddr +
- i * rec_size), rec_size);
- kunmap_atomic(kaddr);
- flush_dcache_page(page);
- if (likely(page_uptodate && !PageError(page)))
- SetPageUptodate(page);
- }
- unlock_page(page);
- return;
-still_busy:
- spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- return;
-}
-
-/**
- * ntfs_read_block - fill a @folio of an address space with data
- * @folio: page cache folio to fill with data
- *
- * We read each buffer asynchronously and when all buffers are read in, our io
- * completion handler ntfs_end_buffer_read_async(), if required, automatically
- * applies the mst fixups to the folio before finally marking it uptodate and
- * unlocking it.
- *
- * We only enforce allocated_size limit because i_size is checked for in
- * generic_file_read().
- *
- * Return 0 on success and -errno on error.
- *
- * Contains an adapted version of fs/buffer.c::block_read_full_folio().
- */
-static int ntfs_read_block(struct folio *folio)
-{
- loff_t i_size;
- VCN vcn;
- LCN lcn;
- s64 init_size;
- struct inode *vi;
- ntfs_inode *ni;
- ntfs_volume *vol;
- runlist_element *rl;
- struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
- sector_t iblock, lblock, zblock;
- unsigned long flags;
- unsigned int blocksize, vcn_ofs;
- int i, nr;
- unsigned char blocksize_bits;
-
- vi = folio->mapping->host;
- ni = NTFS_I(vi);
- vol = ni->vol;
-
- /* $MFT/$DATA must have its complete runlist in memory at all times. */
- BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));
-
- blocksize = vol->sb->s_blocksize;
- blocksize_bits = vol->sb->s_blocksize_bits;
-
- head = folio_buffers(folio);
- if (!head)
- head = create_empty_buffers(folio, blocksize, 0);
- bh = head;
-
- /*
- * We may be racing with truncate. To avoid some of the problems we
- * now take a snapshot of the various sizes and use those for the whole
- * of the function. In case of an extending truncate it just means we
- * may leave some buffers unmapped which are now allocated. This is
- * not a problem since these buffers will just get mapped when a write
- * occurs. In case of a shrinking truncate, we will detect this later
- * on due to the runlist being incomplete and if the folio is being
- * fully truncated, truncate will throw it away as soon as we unlock
- * it so no need to worry what we do with it.
- */
- iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
- read_lock_irqsave(&ni->size_lock, flags);
- lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
- init_size = ni->initialized_size;
- i_size = i_size_read(vi);
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (unlikely(init_size > i_size)) {
- /* Race with shrinking truncate. */
- init_size = i_size;
- }
- zblock = (init_size + blocksize - 1) >> blocksize_bits;
-
- /* Loop through all the buffers in the folio. */
- rl = NULL;
- nr = i = 0;
- do {
- int err = 0;
-
- if (unlikely(buffer_uptodate(bh)))
- continue;
- if (unlikely(buffer_mapped(bh))) {
- arr[nr++] = bh;
- continue;
- }
- bh->b_bdev = vol->sb->s_bdev;
- /* Is the block within the allowed limits? */
- if (iblock < lblock) {
- bool is_retry = false;
-
- /* Convert iblock into corresponding vcn and offset. */
- vcn = (VCN)iblock << blocksize_bits >>
- vol->cluster_size_bits;
- vcn_ofs = ((VCN)iblock << blocksize_bits) &
- vol->cluster_size_mask;
- if (!rl) {
-lock_retry_remap:
- down_read(&ni->runlist.lock);
- rl = ni->runlist.rl;
- }
- if (likely(rl != NULL)) {
- /* Seek to element containing target vcn. */
- while (rl->length && rl[1].vcn <= vcn)
- rl++;
- lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
- } else
- lcn = LCN_RL_NOT_MAPPED;
- /* Successful remap. */
- if (lcn >= 0) {
- /* Setup buffer head to correct block. */
- bh->b_blocknr = ((lcn << vol->cluster_size_bits)
- + vcn_ofs) >> blocksize_bits;
- set_buffer_mapped(bh);
- /* Only read initialized data blocks. */
- if (iblock < zblock) {
- arr[nr++] = bh;
- continue;
- }
- /* Fully non-initialized data block, zero it. */
- goto handle_zblock;
- }
- /* It is a hole, need to zero it. */
- if (lcn == LCN_HOLE)
- goto handle_hole;
- /* If first try and runlist unmapped, map and retry. */
- if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
- is_retry = true;
- /*
- * Attempt to map runlist, dropping lock for
- * the duration.
- */
- up_read(&ni->runlist.lock);
- err = ntfs_map_runlist(ni, vcn);
- if (likely(!err))
- goto lock_retry_remap;
- rl = NULL;
- } else if (!rl)
- up_read(&ni->runlist.lock);
- /*
- * If buffer is outside the runlist, treat it as a
- * hole. This can happen due to concurrent truncate
- * for example.
- */
- if (err == -ENOENT || lcn == LCN_ENOENT) {
- err = 0;
- goto handle_hole;
- }
- /* Hard error, zero out region. */
- if (!err)
- err = -EIO;
- bh->b_blocknr = -1;
- folio_set_error(folio);
- ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
- "attribute type 0x%x, vcn 0x%llx, "
- "offset 0x%x because its location on "
- "disk could not be determined%s "
- "(error code %i).", ni->mft_no,
- ni->type, (unsigned long long)vcn,
- vcn_ofs, is_retry ? " even after "
- "retrying" : "", err);
- }
- /*
- * Either iblock was outside lblock limits or
- * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion
- * of the folio and set the buffer uptodate.
- */
-handle_hole:
- bh->b_blocknr = -1UL;
- clear_buffer_mapped(bh);
-handle_zblock:
- folio_zero_range(folio, i * blocksize, blocksize);
- if (likely(!err))
- set_buffer_uptodate(bh);
- } while (i++, iblock++, (bh = bh->b_this_page) != head);
-
- /* Release the lock if we took it. */
- if (rl)
- up_read(&ni->runlist.lock);
-
- /* Check we have at least one buffer ready for i/o. */
- if (nr) {
- struct buffer_head *tbh;
-
- /* Lock the buffers. */
- for (i = 0; i < nr; i++) {
- tbh = arr[i];
- lock_buffer(tbh);
- tbh->b_end_io = ntfs_end_buffer_async_read;
- set_buffer_async_read(tbh);
- }
- /* Finally, start i/o on the buffers. */
- for (i = 0; i < nr; i++) {
- tbh = arr[i];
- if (likely(!buffer_uptodate(tbh)))
- submit_bh(REQ_OP_READ, tbh);
- else
- ntfs_end_buffer_async_read(tbh, 1);
- }
- return 0;
- }
- /* No i/o was scheduled on any of the buffers. */
- if (likely(!folio_test_error(folio)))
- folio_mark_uptodate(folio);
- else /* Signal synchronous i/o error. */
- nr = -EIO;
- folio_unlock(folio);
- return nr;
-}
-
-/**
- * ntfs_read_folio - fill a @folio of a @file with data from the device
- * @file: open file to which the folio @folio belongs or NULL
- * @folio: page cache folio to fill with data
- *
- * For non-resident attributes, ntfs_read_folio() fills the @folio of the open
- * file @file by calling the ntfs version of the generic block_read_full_folio()
- * function, ntfs_read_block(), which in turn creates and reads in the buffers
- * associated with the folio asynchronously.
- *
- * For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the
- * data from the mft record (which at this stage is most likely in memory) and
- * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
- * even if the mft record is not cached at this point in time, we need to wait
- * for it to be read in before we can do the copy.
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_read_folio(struct file *file, struct folio *folio)
-{
- struct page *page = &folio->page;
- loff_t i_size;
- struct inode *vi;
- ntfs_inode *ni, *base_ni;
- u8 *addr;
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *mrec;
- unsigned long flags;
- u32 attr_len;
- int err = 0;
-
-retry_readpage:
- BUG_ON(!PageLocked(page));
- vi = page->mapping->host;
- i_size = i_size_read(vi);
- /* Is the page fully outside i_size? (truncate in progress) */
- if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
- PAGE_SHIFT)) {
- zero_user(page, 0, PAGE_SIZE);
- ntfs_debug("Read outside i_size - truncated?");
- goto done;
- }
- /*
- * This can potentially happen because we clear PageUptodate() during
- * ntfs_writepage() of MstProtected() attributes.
- */
- if (PageUptodate(page)) {
- unlock_page(page);
- return 0;
- }
- ni = NTFS_I(vi);
- /*
- * Only $DATA attributes can be encrypted and only unnamed $DATA
- * attributes can be compressed. Index root can have the flags set but
- * this means to create compressed/encrypted files, not that the
- * attribute is compressed/encrypted. Note we need to check for
- * AT_INDEX_ALLOCATION since this is the type of both directory and
- * index inodes.
- */
- if (ni->type != AT_INDEX_ALLOCATION) {
- /* If attribute is encrypted, deny access, just like NT4. */
- if (NInoEncrypted(ni)) {
- BUG_ON(ni->type != AT_DATA);
- err = -EACCES;
- goto err_out;
- }
- /* Compressed data streams are handled in compress.c. */
- if (NInoNonResident(ni) && NInoCompressed(ni)) {
- BUG_ON(ni->type != AT_DATA);
- BUG_ON(ni->name_len);
- return ntfs_read_compressed_block(page);
- }
- }
- /* NInoNonResident() == NInoIndexAllocPresent() */
- if (NInoNonResident(ni)) {
- /* Normal, non-resident data stream. */
- return ntfs_read_block(folio);
- }
- /*
- * Attribute is resident, implying it is not compressed or encrypted.
- * This also means the attribute is smaller than an mft record and
- * hence smaller than a page, so can simply zero out any pages with
- * index above 0. Note the attribute can actually be marked compressed
- * but if it is resident the actual data is not compressed so we are
- * ok to ignore the compressed flag here.
- */
- if (unlikely(page->index > 0)) {
- zero_user(page, 0, PAGE_SIZE);
- goto done;
- }
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- /* Map, pin, and lock the mft record. */
- mrec = map_mft_record(base_ni);
- if (IS_ERR(mrec)) {
- err = PTR_ERR(mrec);
- goto err_out;
- }
- /*
- * If a parallel write made the attribute non-resident, drop the mft
- * record and retry the read_folio.
- */
- if (unlikely(NInoNonResident(ni))) {
- unmap_mft_record(base_ni);
- goto retry_readpage;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto unm_err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err))
- goto put_unm_err_out;
- attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
- read_lock_irqsave(&ni->size_lock, flags);
- if (unlikely(attr_len > ni->initialized_size))
- attr_len = ni->initialized_size;
- i_size = i_size_read(vi);
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (unlikely(attr_len > i_size)) {
- /* Race with shrinking truncate. */
- attr_len = i_size;
- }
- addr = kmap_atomic(page);
- /* Copy the data to the page. */
- memcpy(addr, (u8*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset),
- attr_len);
- /* Zero the remainder of the page. */
- memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
- flush_dcache_page(page);
- kunmap_atomic(addr);
-put_unm_err_out:
- ntfs_attr_put_search_ctx(ctx);
-unm_err_out:
- unmap_mft_record(base_ni);
-done:
- SetPageUptodate(page);
-err_out:
- unlock_page(page);
- return err;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_write_block - write a @folio to the backing store
- * @folio: page cache folio to write out
- * @wbc: writeback control structure
- *
- * This function is for writing folios belonging to non-resident, non-mst
- * protected attributes to their backing store.
- *
- * For a folio with buffers, map and write the dirty buffers asynchronously
- * under folio writeback. For a folio without buffers, create buffers for the
- * folio, then proceed as above.
- *
- * If a folio doesn't have buffers the folio dirty state is definitive. If
- * a folio does have buffers, the folio dirty state is just a hint,
- * and the buffer dirty state is definitive. (A hint which has rules:
- * dirty buffers against a clean folio is illegal. Other combinations are
- * legal and need to be handled. In particular a dirty folio containing
- * clean buffers for example.)
- *
- * Return 0 on success and -errno on error.
- *
- * Based on ntfs_read_block() and __block_write_full_folio().
- */
-static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc)
-{
- VCN vcn;
- LCN lcn;
- s64 initialized_size;
- loff_t i_size;
- sector_t block, dblock, iblock;
- struct inode *vi;
- ntfs_inode *ni;
- ntfs_volume *vol;
- runlist_element *rl;
- struct buffer_head *bh, *head;
- unsigned long flags;
- unsigned int blocksize, vcn_ofs;
- int err;
- bool need_end_writeback;
- unsigned char blocksize_bits;
-
- vi = folio->mapping->host;
- ni = NTFS_I(vi);
- vol = ni->vol;
-
- ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
- "0x%lx.", ni->mft_no, ni->type, folio->index);
-
- BUG_ON(!NInoNonResident(ni));
- BUG_ON(NInoMstProtected(ni));
- blocksize = vol->sb->s_blocksize;
- blocksize_bits = vol->sb->s_blocksize_bits;
- head = folio_buffers(folio);
- if (!head) {
- BUG_ON(!folio_test_uptodate(folio));
- head = create_empty_buffers(folio, blocksize,
- (1 << BH_Uptodate) | (1 << BH_Dirty));
- }
- bh = head;
-
- /* NOTE: Different naming scheme to ntfs_read_block()! */
-
- /* The first block in the folio. */
- block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
-
- read_lock_irqsave(&ni->size_lock, flags);
- i_size = i_size_read(vi);
- initialized_size = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
-
- /* The first out of bounds block for the data size. */
- dblock = (i_size + blocksize - 1) >> blocksize_bits;
-
- /* The last (fully or partially) initialized block. */
- iblock = initialized_size >> blocksize_bits;
-
- /*
- * Be very careful. We have no exclusion from block_dirty_folio
- * here, and the (potentially unmapped) buffers may become dirty at
- * any time. If a buffer becomes dirty here after we've inspected it
- * then we just miss that fact, and the folio stays dirty.
- *
- * Buffers outside i_size may be dirtied by block_dirty_folio;
- * handle that here by just cleaning them.
- */
-
- /*
- * Loop through all the buffers in the folio, mapping all the dirty
- * buffers to disk addresses and handling any aliases from the
- * underlying block device's mapping.
- */
- rl = NULL;
- err = 0;
- do {
- bool is_retry = false;
-
- if (unlikely(block >= dblock)) {
- /*
- * Mapped buffers outside i_size will occur, because
- * this folio can be outside i_size when there is a
- * truncate in progress. The contents of such buffers
- * were zeroed by ntfs_writepage().
- *
- * FIXME: What about the small race window where
- * ntfs_writepage() has not done any clearing because
- * the folio was within i_size but before we get here,
- * vmtruncate() modifies i_size?
- */
- clear_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- continue;
- }
-
- /* Clean buffers are not written out, so no need to map them. */
- if (!buffer_dirty(bh))
- continue;
-
- /* Make sure we have enough initialized size. */
- if (unlikely((block >= iblock) &&
- (initialized_size < i_size))) {
- /*
- * If this folio is fully outside initialized
- * size, zero out all folios between the current
- * initialized size and the current folio. Just
- * use ntfs_read_folio() to do the zeroing
- * transparently.
- */
- if (block > iblock) {
- // TODO:
- // For each folio do:
- // - read_cache_folio()
- // Again for each folio do:
- // - wait_on_folio_locked()
- // - Check (folio_test_uptodate(folio) &&
- // !folio_test_error(folio))
- // Update initialized size in the attribute and
- // in the inode.
- // Again, for each folio do:
- // block_dirty_folio();
- // folio_put()
- // We don't need to wait on the writes.
- // Update iblock.
- }
- /*
- * The current folio straddles initialized size. Zero
- * all non-uptodate buffers and set them uptodate (and
- * dirty?). Note, there aren't any non-uptodate buffers
- * if the folio is uptodate.
- * FIXME: For an uptodate folio, the buffers may need to
- * be written out because they were not initialized on
- * disk before.
- */
- if (!folio_test_uptodate(folio)) {
- // TODO:
- // Zero any non-uptodate buffers up to i_size.
- // Set them uptodate and dirty.
- }
- // TODO:
- // Update initialized size in the attribute and in the
- // inode (up to i_size).
- // Update iblock.
- // FIXME: This is inefficient. Try to batch the two
- // size changes to happen in one go.
- ntfs_error(vol->sb, "Writing beyond initialized size "
- "is not supported yet. Sorry.");
- err = -EOPNOTSUPP;
- break;
- // Do NOT set_buffer_new() BUT DO clear buffer range
- // outside write request range.
- // set_buffer_uptodate() on complete buffers as well as
- // set_buffer_dirty().
- }
-
- /* No need to map buffers that are already mapped. */
- if (buffer_mapped(bh))
- continue;
-
- /* Unmapped, dirty buffer. Need to map it. */
- bh->b_bdev = vol->sb->s_bdev;
-
- /* Convert block into corresponding vcn and offset. */
- vcn = (VCN)block << blocksize_bits;
- vcn_ofs = vcn & vol->cluster_size_mask;
- vcn >>= vol->cluster_size_bits;
- if (!rl) {
-lock_retry_remap:
- down_read(&ni->runlist.lock);
- rl = ni->runlist.rl;
- }
- if (likely(rl != NULL)) {
- /* Seek to element containing target vcn. */
- while (rl->length && rl[1].vcn <= vcn)
- rl++;
- lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
- } else
- lcn = LCN_RL_NOT_MAPPED;
- /* Successful remap. */
- if (lcn >= 0) {
- /* Setup buffer head to point to correct block. */
- bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
- vcn_ofs) >> blocksize_bits;
- set_buffer_mapped(bh);
- continue;
- }
- /* It is a hole, need to instantiate it. */
- if (lcn == LCN_HOLE) {
- u8 *kaddr;
- unsigned long *bpos, *bend;
-
- /* Check if the buffer is zero. */
- kaddr = kmap_local_folio(folio, bh_offset(bh));
- bpos = (unsigned long *)kaddr;
- bend = (unsigned long *)(kaddr + blocksize);
- do {
- if (unlikely(*bpos))
- break;
- } while (likely(++bpos < bend));
- kunmap_local(kaddr);
- if (bpos == bend) {
- /*
- * Buffer is zero and sparse, no need to write
- * it.
- */
- bh->b_blocknr = -1;
- clear_buffer_dirty(bh);
- continue;
- }
- // TODO: Instantiate the hole.
- // clear_buffer_new(bh);
- // clean_bdev_bh_alias(bh);
- ntfs_error(vol->sb, "Writing into sparse regions is "
- "not supported yet. Sorry.");
- err = -EOPNOTSUPP;
- break;
- }
- /* If first try and runlist unmapped, map and retry. */
- if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
- is_retry = true;
- /*
- * Attempt to map runlist, dropping lock for
- * the duration.
- */
- up_read(&ni->runlist.lock);
- err = ntfs_map_runlist(ni, vcn);
- if (likely(!err))
- goto lock_retry_remap;
- rl = NULL;
- } else if (!rl)
- up_read(&ni->runlist.lock);
- /*
- * If buffer is outside the runlist, truncate has cut it out
- * of the runlist. Just clean and clear the buffer and set it
- * uptodate so it can get discarded by the VM.
- */
- if (err == -ENOENT || lcn == LCN_ENOENT) {
- bh->b_blocknr = -1;
- clear_buffer_dirty(bh);
- folio_zero_range(folio, bh_offset(bh), blocksize);
- set_buffer_uptodate(bh);
- err = 0;
- continue;
- }
- /* Failed to map the buffer, even after retrying. */
- if (!err)
- err = -EIO;
- bh->b_blocknr = -1;
- ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
- "attribute type 0x%x, vcn 0x%llx, offset 0x%x "
- "because its location on disk could not be "
- "determined%s (error code %i).", ni->mft_no,
- ni->type, (unsigned long long)vcn,
- vcn_ofs, is_retry ? " even after "
- "retrying" : "", err);
- break;
- } while (block++, (bh = bh->b_this_page) != head);
-
- /* Release the lock if we took it. */
- if (rl)
- up_read(&ni->runlist.lock);
-
- /* For the error case, need to reset bh to the beginning. */
- bh = head;
-
- /* Just an optimization, so ->read_folio() is not called later. */
- if (unlikely(!folio_test_uptodate(folio))) {
- int uptodate = 1;
- do {
- if (!buffer_uptodate(bh)) {
- uptodate = 0;
- bh = head;
- break;
- }
- } while ((bh = bh->b_this_page) != head);
- if (uptodate)
- folio_mark_uptodate(folio);
- }
-
- /* Setup all mapped, dirty buffers for async write i/o. */
- do {
- if (buffer_mapped(bh) && buffer_dirty(bh)) {
- lock_buffer(bh);
- if (test_clear_buffer_dirty(bh)) {
- BUG_ON(!buffer_uptodate(bh));
- mark_buffer_async_write(bh);
- } else
- unlock_buffer(bh);
- } else if (unlikely(err)) {
- /*
- * For the error case. The buffer may have been set
- * dirty during attachment to a dirty folio.
- */
- if (err != -ENOMEM)
- clear_buffer_dirty(bh);
- }
- } while ((bh = bh->b_this_page) != head);
-
- if (unlikely(err)) {
- // TODO: Remove the -EOPNOTSUPP check later on...
- if (unlikely(err == -EOPNOTSUPP))
- err = 0;
- else if (err == -ENOMEM) {
- ntfs_warning(vol->sb, "Error allocating memory. "
- "Redirtying folio so we try again "
- "later.");
- /*
- * Put the folio back on mapping->dirty_pages, but
- * leave its buffer's dirty state as-is.
- */
- folio_redirty_for_writepage(wbc, folio);
- err = 0;
- } else
- folio_set_error(folio);
- }
-
- BUG_ON(folio_test_writeback(folio));
- folio_start_writeback(folio); /* Keeps try_to_free_buffers() away. */
-
- /* Submit the prepared buffers for i/o. */
- need_end_writeback = true;
- do {
- struct buffer_head *next = bh->b_this_page;
- if (buffer_async_write(bh)) {
- submit_bh(REQ_OP_WRITE, bh);
- need_end_writeback = false;
- }
- bh = next;
- } while (bh != head);
- folio_unlock(folio);
-
- /* If no i/o was started, need to end writeback here. */
- if (unlikely(need_end_writeback))
- folio_end_writeback(folio);
-
- ntfs_debug("Done.");
- return err;
-}
-
-/**
- * ntfs_write_mst_block - write a @page to the backing store
- * @page: page cache page to write out
- * @wbc: writeback control structure
- *
- * This function is for writing pages belonging to non-resident, mst protected
- * attributes to their backing store. The only supported attributes are index
- * allocation and $MFT/$DATA. Both directory inodes and index inodes are
- * supported for the index allocation case.
- *
- * The page must remain locked for the duration of the write because we apply
- * the mst fixups, write, and then undo the fixups, so if we were to unlock the
- * page before undoing the fixups, any other user of the page will see the
- * page contents as corrupt.
- *
- * We clear the page uptodate flag for the duration of the function to ensure
- * exclusion for the $MFT/$DATA case against someone mapping an mft record we
- * are about to apply the mst fixups to.
- *
- * Return 0 on success and -errno on error.
- *
- * Based on ntfs_write_block(), ntfs_mft_writepage(), and
- * write_mft_record_nolock().
- */
-static int ntfs_write_mst_block(struct page *page,
- struct writeback_control *wbc)
-{
- sector_t block, dblock, rec_block;
- struct inode *vi = page->mapping->host;
- ntfs_inode *ni = NTFS_I(vi);
- ntfs_volume *vol = ni->vol;
- u8 *kaddr;
- unsigned int rec_size = ni->itype.index.block_size;
- ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE];
- struct buffer_head *bh, *head, *tbh, *rec_start_bh;
- struct buffer_head *bhs[MAX_BUF_PER_PAGE];
- runlist_element *rl;
- int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
- unsigned bh_size, rec_size_bits;
- bool sync, is_mft, page_is_dirty, rec_is_dirty;
- unsigned char bh_size_bits;
-
- if (WARN_ON(rec_size < NTFS_BLOCK_SIZE))
- return -EINVAL;
-
- ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
- "0x%lx.", vi->i_ino, ni->type, page->index);
- BUG_ON(!NInoNonResident(ni));
- BUG_ON(!NInoMstProtected(ni));
- is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
- /*
- * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
- * in its page cache were to be marked dirty. However this should
- * never happen with the current driver and considering we do not
- * handle this case here we do want to BUG(), at least for now.
- */
- BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
- (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
- bh_size = vol->sb->s_blocksize;
- bh_size_bits = vol->sb->s_blocksize_bits;
- max_bhs = PAGE_SIZE / bh_size;
- BUG_ON(!max_bhs);
- BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
-
- /* Were we called for sync purposes? */
- sync = (wbc->sync_mode == WB_SYNC_ALL);
-
- /* Make sure we have mapped buffers. */
- bh = head = page_buffers(page);
- BUG_ON(!bh);
-
- rec_size_bits = ni->itype.index.block_size_bits;
- BUG_ON(!(PAGE_SIZE >> rec_size_bits));
- bhs_per_rec = rec_size >> bh_size_bits;
- BUG_ON(!bhs_per_rec);
-
- /* The first block in the page. */
- rec_block = block = (sector_t)page->index <<
- (PAGE_SHIFT - bh_size_bits);
-
- /* The first out of bounds block for the data size. */
- dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;
-
- rl = NULL;
- err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
- page_is_dirty = rec_is_dirty = false;
- rec_start_bh = NULL;
- do {
- bool is_retry = false;
-
- if (likely(block < rec_block)) {
- if (unlikely(block >= dblock)) {
- clear_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- continue;
- }
- /*
- * This block is not the first one in the record. We
- * ignore the buffer's dirty state because we could
- * have raced with a parallel mark_ntfs_record_dirty().
- */
- if (!rec_is_dirty)
- continue;
- if (unlikely(err2)) {
- if (err2 != -ENOMEM)
- clear_buffer_dirty(bh);
- continue;
- }
- } else /* if (block == rec_block) */ {
- BUG_ON(block > rec_block);
- /* This block is the first one in the record. */
- rec_block += bhs_per_rec;
- err2 = 0;
- if (unlikely(block >= dblock)) {
- clear_buffer_dirty(bh);
- continue;
- }
- if (!buffer_dirty(bh)) {
- /* Clean records are not written out. */
- rec_is_dirty = false;
- continue;
- }
- rec_is_dirty = true;
- rec_start_bh = bh;
- }
- /* Need to map the buffer if it is not mapped already. */
- if (unlikely(!buffer_mapped(bh))) {
- VCN vcn;
- LCN lcn;
- unsigned int vcn_ofs;
-
- bh->b_bdev = vol->sb->s_bdev;
- /* Obtain the vcn and offset of the current block. */
- vcn = (VCN)block << bh_size_bits;
- vcn_ofs = vcn & vol->cluster_size_mask;
- vcn >>= vol->cluster_size_bits;
- if (!rl) {
-lock_retry_remap:
- down_read(&ni->runlist.lock);
- rl = ni->runlist.rl;
- }
- if (likely(rl != NULL)) {
- /* Seek to element containing target vcn. */
- while (rl->length && rl[1].vcn <= vcn)
- rl++;
- lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
- } else
- lcn = LCN_RL_NOT_MAPPED;
- /* Successful remap. */
- if (likely(lcn >= 0)) {
- /* Setup buffer head to correct block. */
- bh->b_blocknr = ((lcn <<
- vol->cluster_size_bits) +
- vcn_ofs) >> bh_size_bits;
- set_buffer_mapped(bh);
- } else {
- /*
- * Remap failed. Retry to map the runlist once
- * unless we are working on $MFT which always
- * has the whole of its runlist in memory.
- */
- if (!is_mft && !is_retry &&
- lcn == LCN_RL_NOT_MAPPED) {
- is_retry = true;
- /*
- * Attempt to map runlist, dropping
- * lock for the duration.
- */
- up_read(&ni->runlist.lock);
- err2 = ntfs_map_runlist(ni, vcn);
- if (likely(!err2))
- goto lock_retry_remap;
- if (err2 == -ENOMEM)
- page_is_dirty = true;
- lcn = err2;
- } else {
- err2 = -EIO;
- if (!rl)
- up_read(&ni->runlist.lock);
- }
- /* Hard error. Abort writing this record. */
- if (!err || err == -ENOMEM)
- err = err2;
- bh->b_blocknr = -1;
- ntfs_error(vol->sb, "Cannot write ntfs record "
- "0x%llx (inode 0x%lx, "
- "attribute type 0x%x) because "
- "its location on disk could "
- "not be determined (error "
- "code %lli).",
- (long long)block <<
- bh_size_bits >>
- vol->mft_record_size_bits,
- ni->mft_no, ni->type,
- (long long)lcn);
- /*
- * If this is not the first buffer, remove the
- * buffers in this record from the list of
- * buffers to write and clear their dirty bit
- * if not error -ENOMEM.
- */
- if (rec_start_bh != bh) {
- while (bhs[--nr_bhs] != rec_start_bh)
- ;
- if (err2 != -ENOMEM) {
- do {
- clear_buffer_dirty(
- rec_start_bh);
- } while ((rec_start_bh =
- rec_start_bh->
- b_this_page) !=
- bh);
- }
- }
- continue;
- }
- }
- BUG_ON(!buffer_uptodate(bh));
- BUG_ON(nr_bhs >= max_bhs);
- bhs[nr_bhs++] = bh;
- } while (block++, (bh = bh->b_this_page) != head);
- if (unlikely(rl))
- up_read(&ni->runlist.lock);
- /* If there were no dirty buffers, we are done. */
- if (!nr_bhs)
- goto done;
- /* Map the page so we can access its contents. */
- kaddr = kmap(page);
- /* Clear the page uptodate flag whilst the mst fixups are applied. */
- BUG_ON(!PageUptodate(page));
- ClearPageUptodate(page);
- for (i = 0; i < nr_bhs; i++) {
- unsigned int ofs;
-
- /* Skip buffers which are not at the beginning of records. */
- if (i % bhs_per_rec)
- continue;
- tbh = bhs[i];
- ofs = bh_offset(tbh);
- if (is_mft) {
- ntfs_inode *tni;
- unsigned long mft_no;
-
- /* Get the mft record number. */
- mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
- >> rec_size_bits;
- /* Check whether to write this mft record. */
- tni = NULL;
- if (!ntfs_may_write_mft_record(vol, mft_no,
- (MFT_RECORD*)(kaddr + ofs), &tni)) {
- /*
- * The record should not be written. This
- * means we need to redirty the page before
- * returning.
- */
- page_is_dirty = true;
- /*
- * Remove the buffers in this mft record from
- * the list of buffers to write.
- */
- do {
- bhs[i] = NULL;
- } while (++i % bhs_per_rec);
- continue;
- }
- /*
- * The record should be written. If a locked ntfs
- * inode was returned, add it to the array of locked
- * ntfs inodes.
- */
- if (tni)
- locked_nis[nr_locked_nis++] = tni;
- }
- /* Apply the mst protection fixups. */
- err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
- rec_size);
- if (unlikely(err2)) {
- if (!err || err == -ENOMEM)
- err = -EIO;
- ntfs_error(vol->sb, "Failed to apply mst fixups "
- "(inode 0x%lx, attribute type 0x%x, "
- "page index 0x%lx, page offset 0x%x)!"
- " Unmount and run chkdsk.", vi->i_ino,
- ni->type, page->index, ofs);
- /*
- * Mark all the buffers in this record clean as we do
- * not want to write corrupt data to disk.
- */
- do {
- clear_buffer_dirty(bhs[i]);
- bhs[i] = NULL;
- } while (++i % bhs_per_rec);
- continue;
- }
- nr_recs++;
- }
- /* If no records are to be written out, we are done. */
- if (!nr_recs)
- goto unm_done;
- flush_dcache_page(page);
- /* Lock buffers and start synchronous write i/o on them. */
- for (i = 0; i < nr_bhs; i++) {
- tbh = bhs[i];
- if (!tbh)
- continue;
- if (!trylock_buffer(tbh))
- BUG();
- /* The buffer dirty state is now irrelevant, just clean it. */
- clear_buffer_dirty(tbh);
- BUG_ON(!buffer_uptodate(tbh));
- BUG_ON(!buffer_mapped(tbh));
- get_bh(tbh);
- tbh->b_end_io = end_buffer_write_sync;
- submit_bh(REQ_OP_WRITE, tbh);
- }
- /* Synchronize the mft mirror now if not @sync. */
- if (is_mft && !sync)
- goto do_mirror;
-do_wait:
- /* Wait on i/o completion of buffers. */
- for (i = 0; i < nr_bhs; i++) {
- tbh = bhs[i];
- if (!tbh)
- continue;
- wait_on_buffer(tbh);
- if (unlikely(!buffer_uptodate(tbh))) {
- ntfs_error(vol->sb, "I/O error while writing ntfs "
- "record buffer (inode 0x%lx, "
- "attribute type 0x%x, page index "
- "0x%lx, page offset 0x%lx)! Unmount "
- "and run chkdsk.", vi->i_ino, ni->type,
- page->index, bh_offset(tbh));
- if (!err || err == -ENOMEM)
- err = -EIO;
- /*
- * Set the buffer uptodate so the page and buffer
- * states do not become out of sync.
- */
- set_buffer_uptodate(tbh);
- }
- }
- /* If @sync, now synchronize the mft mirror. */
- if (is_mft && sync) {
-do_mirror:
- for (i = 0; i < nr_bhs; i++) {
- unsigned long mft_no;
- unsigned int ofs;
-
- /*
- * Skip buffers which are not at the beginning of
- * records.
- */
- if (i % bhs_per_rec)
- continue;
- tbh = bhs[i];
- /* Skip removed buffers (and hence records). */
- if (!tbh)
- continue;
- ofs = bh_offset(tbh);
- /* Get the mft record number. */
- mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
- >> rec_size_bits;
- if (mft_no < vol->mftmirr_size)
- ntfs_sync_mft_mirror(vol, mft_no,
- (MFT_RECORD*)(kaddr + ofs),
- sync);
- }
- if (!sync)
- goto do_wait;
- }
- /* Remove the mst protection fixups again. */
- for (i = 0; i < nr_bhs; i++) {
- if (!(i % bhs_per_rec)) {
- tbh = bhs[i];
- if (!tbh)
- continue;
- post_write_mst_fixup((NTFS_RECORD*)(kaddr +
- bh_offset(tbh)));
- }
- }
- flush_dcache_page(page);
-unm_done:
- /* Unlock any locked inodes. */
- while (nr_locked_nis-- > 0) {
- ntfs_inode *tni, *base_tni;
-
- tni = locked_nis[nr_locked_nis];
- /* Get the base inode. */
- mutex_lock(&tni->extent_lock);
- if (tni->nr_extents >= 0)
- base_tni = tni;
- else {
- base_tni = tni->ext.base_ntfs_ino;
- BUG_ON(!base_tni);
- }
- mutex_unlock(&tni->extent_lock);
- ntfs_debug("Unlocking %s inode 0x%lx.",
- tni == base_tni ? "base" : "extent",
- tni->mft_no);
- mutex_unlock(&tni->mrec_lock);
- atomic_dec(&tni->count);
- iput(VFS_I(base_tni));
- }
- SetPageUptodate(page);
- kunmap(page);
-done:
- if (unlikely(err && err != -ENOMEM)) {
- /*
- * Set page error if there is only one ntfs record in the page.
- * Otherwise we would loose per-record granularity.
- */
- if (ni->itype.index.block_size == PAGE_SIZE)
- SetPageError(page);
- NVolSetErrors(vol);
- }
- if (page_is_dirty) {
- ntfs_debug("Page still contains one or more dirty ntfs "
- "records. Redirtying the page starting at "
- "record 0x%lx.", page->index <<
- (PAGE_SHIFT - rec_size_bits));
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- } else {
- /*
- * Keep the VM happy. This must be done otherwise the
- * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
- * the page is clean.
- */
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
- end_page_writeback(page);
- }
- if (likely(!err))
- ntfs_debug("Done.");
- return err;
-}
-
-/**
- * ntfs_writepage - write a @page to the backing store
- * @page: page cache page to write out
- * @wbc: writeback control structure
- *
- * This is called from the VM when it wants to have a dirty ntfs page cache
- * page cleaned. The VM has already locked the page and marked it clean.
- *
- * For non-resident attributes, ntfs_writepage() writes the @page by calling
- * the ntfs version of the generic block_write_full_folio() function,
- * ntfs_write_block(), which in turn if necessary creates and writes the
- * buffers associated with the page asynchronously.
- *
- * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
- * the data to the mft record (which at this stage is most likely in memory).
- * The mft record is then marked dirty and written out asynchronously via the
- * vfs inode dirty code path for the inode the mft record belongs to or via the
- * vm page dirty code path for the page the mft record is in.
- *
- * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_folio().
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct folio *folio = page_folio(page);
- loff_t i_size;
- struct inode *vi = folio->mapping->host;
- ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
- char *addr;
- ntfs_attr_search_ctx *ctx = NULL;
- MFT_RECORD *m = NULL;
- u32 attr_len;
- int err;
-
-retry_writepage:
- BUG_ON(!folio_test_locked(folio));
- i_size = i_size_read(vi);
- /* Is the folio fully outside i_size? (truncate in progress) */
- if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >>
- PAGE_SHIFT)) {
- /*
- * The folio may have dirty, unmapped buffers. Make them
- * freeable here, so the page does not leak.
- */
- block_invalidate_folio(folio, 0, folio_size(folio));
- folio_unlock(folio);
- ntfs_debug("Write outside i_size - truncated?");
- return 0;
- }
- /*
- * Only $DATA attributes can be encrypted and only unnamed $DATA
- * attributes can be compressed. Index root can have the flags set but
- * this means to create compressed/encrypted files, not that the
- * attribute is compressed/encrypted. Note we need to check for
- * AT_INDEX_ALLOCATION since this is the type of both directory and
- * index inodes.
- */
- if (ni->type != AT_INDEX_ALLOCATION) {
- /* If file is encrypted, deny access, just like NT4. */
- if (NInoEncrypted(ni)) {
- folio_unlock(folio);
- BUG_ON(ni->type != AT_DATA);
- ntfs_debug("Denying write access to encrypted file.");
- return -EACCES;
- }
- /* Compressed data streams are handled in compress.c. */
- if (NInoNonResident(ni) && NInoCompressed(ni)) {
- BUG_ON(ni->type != AT_DATA);
- BUG_ON(ni->name_len);
- // TODO: Implement and replace this with
- // return ntfs_write_compressed_block(page);
- folio_unlock(folio);
- ntfs_error(vi->i_sb, "Writing to compressed files is "
- "not supported yet. Sorry.");
- return -EOPNOTSUPP;
- }
- // TODO: Implement and remove this check.
- if (NInoNonResident(ni) && NInoSparse(ni)) {
- folio_unlock(folio);
- ntfs_error(vi->i_sb, "Writing to sparse files is not "
- "supported yet. Sorry.");
- return -EOPNOTSUPP;
- }
- }
- /* NInoNonResident() == NInoIndexAllocPresent() */
- if (NInoNonResident(ni)) {
- /* We have to zero every time due to mmap-at-end-of-file. */
- if (folio->index >= (i_size >> PAGE_SHIFT)) {
- /* The folio straddles i_size. */
- unsigned int ofs = i_size & (folio_size(folio) - 1);
- folio_zero_segment(folio, ofs, folio_size(folio));
- }
- /* Handle mst protected attributes. */
- if (NInoMstProtected(ni))
- return ntfs_write_mst_block(page, wbc);
- /* Normal, non-resident data stream. */
- return ntfs_write_block(folio, wbc);
- }
- /*
- * Attribute is resident, implying it is not compressed, encrypted, or
- * mst protected. This also means the attribute is smaller than an mft
- * record and hence smaller than a folio, so can simply return error on
- * any folios with index above 0. Note the attribute can actually be
- * marked compressed but if it is resident the actual data is not
- * compressed so we are ok to ignore the compressed flag here.
- */
- BUG_ON(folio_buffers(folio));
- BUG_ON(!folio_test_uptodate(folio));
- if (unlikely(folio->index > 0)) {
- ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0. "
- "Aborting write.", folio->index);
- BUG_ON(folio_test_writeback(folio));
- folio_start_writeback(folio);
- folio_unlock(folio);
- folio_end_writeback(folio);
- return -EIO;
- }
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- /* Map, pin, and lock the mft record. */
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- ctx = NULL;
- goto err_out;
- }
- /*
- * If a parallel write made the attribute non-resident, drop the mft
- * record and retry the writepage.
- */
- if (unlikely(NInoNonResident(ni))) {
- unmap_mft_record(base_ni);
- goto retry_writepage;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err))
- goto err_out;
- /*
- * Keep the VM happy. This must be done otherwise
- * PAGECACHE_TAG_DIRTY remains set even though the folio is clean.
- */
- BUG_ON(folio_test_writeback(folio));
- folio_start_writeback(folio);
- folio_unlock(folio);
- attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
- i_size = i_size_read(vi);
- if (unlikely(attr_len > i_size)) {
- /* Race with shrinking truncate or a failed truncate. */
- attr_len = i_size;
- /*
- * If the truncate failed, fix it up now. If a concurrent
- * truncate, we do its job, so it does not have to do anything.
- */
- err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr,
- attr_len);
- /* Shrinking cannot fail. */
- BUG_ON(err);
- }
- addr = kmap_local_folio(folio, 0);
- /* Copy the data from the folio to the mft record. */
- memcpy((u8*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset),
- addr, attr_len);
- /* Zero out of bounds area in the page cache folio. */
- memset(addr + attr_len, 0, folio_size(folio) - attr_len);
- kunmap_local(addr);
- flush_dcache_folio(folio);
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- /* We are done with the folio. */
- folio_end_writeback(folio);
- /* Finally, mark the mft record dirty, so it gets written back. */
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- return 0;
-err_out:
- if (err == -ENOMEM) {
- ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
- "page so we try again later.");
- /*
- * Put the folio back on mapping->dirty_pages, but leave its
- * buffers' dirty state as-is.
- */
- folio_redirty_for_writepage(wbc, folio);
- err = 0;
- } else {
- ntfs_error(vi->i_sb, "Resident attribute write failed with "
- "error %i.", err);
- folio_set_error(folio);
- NVolSetErrors(ni->vol);
- }
- folio_unlock(folio);
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- return err;
-}
-
-#endif /* NTFS_RW */
-
-/**
- * ntfs_bmap - map logical file block to physical device block
- * @mapping: address space mapping to which the block to be mapped belongs
- * @block: logical block to map to its physical device block
- *
- * For regular, non-resident files (i.e. not compressed and not encrypted), map
- * the logical @block belonging to the file described by the address space
- * mapping @mapping to its physical device block.
- *
- * The size of the block is equal to the @s_blocksize field of the super block
- * of the mounted file system which is guaranteed to be smaller than or equal
- * to the cluster size thus the block is guaranteed to fit entirely inside the
- * cluster which means we do not need to care how many contiguous bytes are
- * available after the beginning of the block.
- *
- * Return the physical device block if the mapping succeeded or 0 if the block
- * is sparse or there was an error.
- *
- * Note: This is a problem if someone tries to run bmap() on $Boot system file
- * as that really is in block zero but there is nothing we can do. bmap() is
- * just broken in that respect (just like it cannot distinguish sparse from
- * not available or error).
- */
-static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
-{
- s64 ofs, size;
- loff_t i_size;
- LCN lcn;
- unsigned long blocksize, flags;
- ntfs_inode *ni = NTFS_I(mapping->host);
- ntfs_volume *vol = ni->vol;
- unsigned delta;
- unsigned char blocksize_bits, cluster_size_shift;
-
- ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.",
- ni->mft_no, (unsigned long long)block);
- if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) {
- ntfs_error(vol->sb, "BMAP does not make sense for %s "
- "attributes, returning 0.",
- (ni->type != AT_DATA) ? "non-data" :
- (!NInoNonResident(ni) ? "resident" :
- "encrypted"));
- return 0;
- }
- /* None of these can happen. */
- BUG_ON(NInoCompressed(ni));
- BUG_ON(NInoMstProtected(ni));
- blocksize = vol->sb->s_blocksize;
- blocksize_bits = vol->sb->s_blocksize_bits;
- ofs = (s64)block << blocksize_bits;
- read_lock_irqsave(&ni->size_lock, flags);
- size = ni->initialized_size;
- i_size = i_size_read(VFS_I(ni));
- read_unlock_irqrestore(&ni->size_lock, flags);
- /*
- * If the offset is outside the initialized size or the block straddles
- * the initialized size then pretend it is a hole unless the
- * initialized size equals the file size.
- */
- if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size)))
- goto hole;
- cluster_size_shift = vol->cluster_size_bits;
- down_read(&ni->runlist.lock);
- lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false);
- up_read(&ni->runlist.lock);
- if (unlikely(lcn < LCN_HOLE)) {
- /*
- * Step down to an integer to avoid gcc doing a long long
- * comparision in the switch when we know @lcn is between
- * LCN_HOLE and LCN_EIO (i.e. -1 to -5).
- *
- * Otherwise older gcc (at least on some architectures) will
- * try to use __cmpdi2() which is of course not available in
- * the kernel.
- */
- switch ((int)lcn) {
- case LCN_ENOENT:
- /*
- * If the offset is out of bounds then pretend it is a
- * hole.
- */
- goto hole;
- case LCN_ENOMEM:
- ntfs_error(vol->sb, "Not enough memory to complete "
- "mapping for inode 0x%lx. "
- "Returning 0.", ni->mft_no);
- break;
- default:
- ntfs_error(vol->sb, "Failed to complete mapping for "
- "inode 0x%lx. Run chkdsk. "
- "Returning 0.", ni->mft_no);
- break;
- }
- return 0;
- }
- if (lcn < 0) {
- /* It is a hole. */
-hole:
- ntfs_debug("Done (returning hole).");
- return 0;
- }
- /*
- * The block is really allocated and fullfils all our criteria.
- * Convert the cluster to units of block size and return the result.
- */
- delta = ofs & vol->cluster_size_mask;
- if (unlikely(sizeof(block) < sizeof(lcn))) {
- block = lcn = ((lcn << cluster_size_shift) + delta) >>
- blocksize_bits;
- /* If the block number was truncated return 0. */
- if (unlikely(block != lcn)) {
- ntfs_error(vol->sb, "Physical block 0x%llx is too "
- "large to be returned, returning 0.",
- (long long)lcn);
- return 0;
- }
- } else
- block = ((lcn << cluster_size_shift) + delta) >>
- blocksize_bits;
- ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn);
- return block;
-}
-
-/*
- * ntfs_normal_aops - address space operations for normal inodes and attributes
- *
- * Note these are not used for compressed or mst protected inodes and
- * attributes.
- */
-const struct address_space_operations ntfs_normal_aops = {
- .read_folio = ntfs_read_folio,
-#ifdef NTFS_RW
- .writepage = ntfs_writepage,
- .dirty_folio = block_dirty_folio,
-#endif /* NTFS_RW */
- .bmap = ntfs_bmap,
- .migrate_folio = buffer_migrate_folio,
- .is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_folio = generic_error_remove_folio,
-};
-
-/*
- * ntfs_compressed_aops - address space operations for compressed inodes
- */
-const struct address_space_operations ntfs_compressed_aops = {
- .read_folio = ntfs_read_folio,
-#ifdef NTFS_RW
- .writepage = ntfs_writepage,
- .dirty_folio = block_dirty_folio,
-#endif /* NTFS_RW */
- .migrate_folio = buffer_migrate_folio,
- .is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_folio = generic_error_remove_folio,
-};
-
-/*
- * ntfs_mst_aops - general address space operations for mst protecteed inodes
- * and attributes
- */
-const struct address_space_operations ntfs_mst_aops = {
- .read_folio = ntfs_read_folio, /* Fill page with data. */
-#ifdef NTFS_RW
- .writepage = ntfs_writepage, /* Write dirty page to disk. */
- .dirty_folio = filemap_dirty_folio,
-#endif /* NTFS_RW */
- .migrate_folio = buffer_migrate_folio,
- .is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_folio = generic_error_remove_folio,
-};
-
-#ifdef NTFS_RW
-
-/**
- * mark_ntfs_record_dirty - mark an ntfs record dirty
- * @page: page containing the ntfs record to mark dirty
- * @ofs: byte offset within @page at which the ntfs record begins
- *
- * Set the buffers and the page in which the ntfs record is located dirty.
- *
- * The latter also marks the vfs inode the ntfs record belongs to dirty
- * (I_DIRTY_PAGES only).
- *
- * If the page does not have buffers, we create them and set them uptodate.
- * The page may not be locked which is why we need to handle the buffers under
- * the mapping->i_private_lock. Once the buffers are marked dirty we no longer
- * need the lock since try_to_free_buffers() does not free dirty buffers.
- */
-void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
- struct address_space *mapping = page->mapping;
- ntfs_inode *ni = NTFS_I(mapping->host);
- struct buffer_head *bh, *head, *buffers_to_free = NULL;
- unsigned int end, bh_size, bh_ofs;
-
- BUG_ON(!PageUptodate(page));
- end = ofs + ni->itype.index.block_size;
- bh_size = VFS_I(ni)->i_sb->s_blocksize;
- spin_lock(&mapping->i_private_lock);
- if (unlikely(!page_has_buffers(page))) {
- spin_unlock(&mapping->i_private_lock);
- bh = head = alloc_page_buffers(page, bh_size, true);
- spin_lock(&mapping->i_private_lock);
- if (likely(!page_has_buffers(page))) {
- struct buffer_head *tail;
-
- do {
- set_buffer_uptodate(bh);
- tail = bh;
- bh = bh->b_this_page;
- } while (bh);
- tail->b_this_page = head;
- attach_page_private(page, head);
- } else
- buffers_to_free = bh;
- }
- bh = head = page_buffers(page);
- BUG_ON(!bh);
- do {
- bh_ofs = bh_offset(bh);
- if (bh_ofs + bh_size <= ofs)
- continue;
- if (unlikely(bh_ofs >= end))
- break;
- set_buffer_dirty(bh);
- } while ((bh = bh->b_this_page) != head);
- spin_unlock(&mapping->i_private_lock);
- filemap_dirty_folio(mapping, page_folio(page));
- if (unlikely(buffers_to_free)) {
- do {
- bh = buffers_to_free->b_this_page;
- free_buffer_head(buffers_to_free);
- buffers_to_free = bh;
- } while (buffers_to_free);
- }
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
deleted file mode 100644
index 8d0958a149cb..000000000000
--- a/fs/ntfs/aops.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * aops.h - Defines for NTFS kernel address space operations and page cache
- * handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_AOPS_H
-#define _LINUX_NTFS_AOPS_H
-
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-#include <linux/fs.h>
-
-#include "inode.h"
-
-/**
- * ntfs_unmap_page - release a page that was mapped using ntfs_map_page()
- * @page: the page to release
- *
- * Unpin, unmap and release a page that was obtained from ntfs_map_page().
- */
-static inline void ntfs_unmap_page(struct page *page)
-{
- kunmap(page);
- put_page(page);
-}
-
-/**
- * ntfs_map_page - map a page into accessible memory, reading it if necessary
- * @mapping: address space for which to obtain the page
- * @index: index into the page cache for @mapping of the page to map
- *
- * Read a page from the page cache of the address space @mapping at position
- * @index, where @index is in units of PAGE_SIZE, and not in bytes.
- *
- * If the page is not in memory it is loaded from disk first using the
- * read_folio method defined in the address space operations of @mapping
- * and the page is added to the page cache of @mapping in the process.
- *
- * If the page belongs to an mst protected attribute and it is marked as such
- * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no
- * error checking is performed. This means the caller has to verify whether
- * the ntfs record(s) contained in the page are valid or not using one of the
- * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are
- * expecting to see. (For details of the macros, see fs/ntfs/layout.h.)
- *
- * If the page is in high memory it is mapped into memory directly addressible
- * by the kernel.
- *
- * Finally the page count is incremented, thus pinning the page into place.
- *
- * The above means that page_address(page) can be used on all pages obtained
- * with ntfs_map_page() to get the kernel virtual address of the page.
- *
- * When finished with the page, the caller has to call ntfs_unmap_page() to
- * unpin, unmap and release the page.
- *
- * Note this does not grant exclusive access. If such is desired, the caller
- * must provide it independently of the ntfs_{un}map_page() calls by using
- * a {rw_}semaphore or other means of serialization. A spin lock cannot be
- * used as ntfs_map_page() can block.
- *
- * The unlocked and uptodate page is returned on success or an encoded error
- * on failure. Caller has to test for error using the IS_ERR() macro on the
- * return value. If that evaluates to 'true', the negative error code can be
- * obtained using PTR_ERR() on the return value of ntfs_map_page().
- */
-static inline struct page *ntfs_map_page(struct address_space *mapping,
- unsigned long index)
-{
- struct page *page = read_mapping_page(mapping, index, NULL);
-
- if (!IS_ERR(page))
- kmap(page);
- return page;
-}
-
-#ifdef NTFS_RW
-
-extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_AOPS_H */
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
deleted file mode 100644
index f79408f9127a..000000000000
--- a/fs/ntfs/attrib.c
+++ /dev/null
@@ -1,2624 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/buffer_head.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/writeback.h>
-
-#include "attrib.h"
-#include "debug.h"
-#include "layout.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "ntfs.h"
-#include "types.h"
-
-/**
- * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode
- * @ni: ntfs inode for which to map (part of) a runlist
- * @vcn: map runlist part containing this vcn
- * @ctx: active attribute search context if present or NULL if not
- *
- * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record. This is needed when ntfs_map_runlist_nolock() encounters unmapped
- * runlist fragments and allows their mapping. If you do not have the mft
- * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock()
- * will perform the necessary mapping and unmapping.
- *
- * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and
- * restores it before returning. Thus, @ctx will be left pointing to the same
- * attribute on return as on entry. However, the actual pointers in @ctx may
- * point to different memory locations on return, so you must remember to reset
- * any cached pointers from the @ctx, i.e. after the call to
- * ntfs_map_runlist_nolock(), you will probably want to do:
- * m = ctx->mrec;
- * a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- *
- * Return 0 on success and -errno on error. There is one special error code
- * which is not an error as such. This is -ENOENT. It means that @vcn is out
- * of bounds of the runlist.
- *
- * Note the runlist can be NULL after this function returns if @vcn is zero and
- * the attribute has zero allocated size, i.e. there simply is no runlist.
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- * is no longer valid, i.e. you need to either call
- * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- * In that case PTR_ERR(@ctx->mrec) will give you the error code for
- * why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- * and is locked on return. Note the runlist will be modified.
- * - If @ctx is NULL, the base mft record of @ni must not be mapped on
- * entry and it will be left unmapped on return.
- * - If @ctx is not NULL, the base mft record must be mapped on entry
- * and it will be left mapped on return.
- */
-int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
-{
- VCN end_vcn;
- unsigned long flags;
- ntfs_inode *base_ni;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- runlist_element *rl;
- struct page *put_this_page = NULL;
- int err = 0;
- bool ctx_is_temporary, ctx_needs_reset;
- ntfs_attr_search_ctx old_ctx = { NULL, };
-
- ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
- (unsigned long long)vcn);
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- if (!ctx) {
- ctx_is_temporary = ctx_needs_reset = true;
- m = map_mft_record(base_ni);
- if (IS_ERR(m))
- return PTR_ERR(m);
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- } else {
- VCN allocated_size_vcn;
-
- BUG_ON(IS_ERR(ctx->mrec));
- a = ctx->attr;
- BUG_ON(!a->non_resident);
- ctx_is_temporary = false;
- end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
- read_lock_irqsave(&ni->size_lock, flags);
- allocated_size_vcn = ni->allocated_size >>
- ni->vol->cluster_size_bits;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (!a->data.non_resident.lowest_vcn && end_vcn <= 0)
- end_vcn = allocated_size_vcn - 1;
- /*
- * If we already have the attribute extent containing @vcn in
- * @ctx, no need to look it up again. We slightly cheat in
- * that if vcn exceeds the allocated size, we will refuse to
- * map the runlist below, so there is definitely no need to get
- * the right attribute extent.
- */
- if (vcn >= allocated_size_vcn || (a->type == ni->type &&
- a->name_length == ni->name_len &&
- !memcmp((u8*)a + le16_to_cpu(a->name_offset),
- ni->name, ni->name_len) &&
- sle64_to_cpu(a->data.non_resident.lowest_vcn)
- <= vcn && end_vcn >= vcn))
- ctx_needs_reset = false;
- else {
- /* Save the old search context. */
- old_ctx = *ctx;
- /*
- * If the currently mapped (extent) inode is not the
- * base inode we will unmap it when we reinitialize the
- * search context which means we need to get a
- * reference to the page containing the mapped mft
- * record so we do not accidentally drop changes to the
- * mft record when it has not been marked dirty yet.
- */
- if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino !=
- old_ctx.base_ntfs_ino) {
- put_this_page = old_ctx.ntfs_ino->page;
- get_page(put_this_page);
- }
- /*
- * Reinitialize the search context so we can lookup the
- * needed attribute extent.
- */
- ntfs_attr_reinit_search_ctx(ctx);
- ctx_needs_reset = true;
- }
- }
- if (ctx_needs_reset) {
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, vcn, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- BUG_ON(!ctx->attr->non_resident);
- }
- a = ctx->attr;
- /*
- * Only decompress the mapping pairs if @vcn is inside it. Otherwise
- * we get into problems when we try to map an out of bounds vcn because
- * we then try to map the already mapped runlist fragment and
- * ntfs_mapping_pairs_decompress() fails.
- */
- end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1;
- if (unlikely(vcn && vcn >= end_vcn)) {
- err = -ENOENT;
- goto err_out;
- }
- rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl);
- if (IS_ERR(rl))
- err = PTR_ERR(rl);
- else
- ni->runlist.rl = rl;
-err_out:
- if (ctx_is_temporary) {
- if (likely(ctx))
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- } else if (ctx_needs_reset) {
- /*
- * If there is no attribute list, restoring the search context
- * is accomplished simply by copying the saved context back over
- * the caller supplied context. If there is an attribute list,
- * things are more complicated as we need to deal with mapping
- * of mft records and resulting potential changes in pointers.
- */
- if (NInoAttrList(base_ni)) {
- /*
- * If the currently mapped (extent) inode is not the
- * one we had before, we need to unmap it and map the
- * old one.
- */
- if (ctx->ntfs_ino != old_ctx.ntfs_ino) {
- /*
- * If the currently mapped inode is not the
- * base inode, unmap it.
- */
- if (ctx->base_ntfs_ino && ctx->ntfs_ino !=
- ctx->base_ntfs_ino) {
- unmap_extent_mft_record(ctx->ntfs_ino);
- ctx->mrec = ctx->base_mrec;
- BUG_ON(!ctx->mrec);
- }
- /*
- * If the old mapped inode is not the base
- * inode, map it.
- */
- if (old_ctx.base_ntfs_ino &&
- old_ctx.ntfs_ino !=
- old_ctx.base_ntfs_ino) {
-retry_map:
- ctx->mrec = map_mft_record(
- old_ctx.ntfs_ino);
- /*
- * Something bad has happened. If out
- * of memory retry till it succeeds.
- * Any other errors are fatal and we
- * return the error code in ctx->mrec.
- * Let the caller deal with it... We
- * just need to fudge things so the
- * caller can reinit and/or put the
- * search context safely.
- */
- if (IS_ERR(ctx->mrec)) {
- if (PTR_ERR(ctx->mrec) ==
- -ENOMEM) {
- schedule();
- goto retry_map;
- } else
- old_ctx.ntfs_ino =
- old_ctx.
- base_ntfs_ino;
- }
- }
- }
- /* Update the changed pointers in the saved context. */
- if (ctx->mrec != old_ctx.mrec) {
- if (!IS_ERR(ctx->mrec))
- old_ctx.attr = (ATTR_RECORD*)(
- (u8*)ctx->mrec +
- ((u8*)old_ctx.attr -
- (u8*)old_ctx.mrec));
- old_ctx.mrec = ctx->mrec;
- }
- }
- /* Restore the search context to the saved one. */
- *ctx = old_ctx;
- /*
- * We drop the reference on the page we took earlier. In the
- * case that IS_ERR(ctx->mrec) is true this means we might lose
- * some changes to the mft record that had been made between
- * the last time it was marked dirty/written out and now. This
- * at this stage is not a problem as the mapping error is fatal
- * enough that the mft record cannot be written out anyway and
- * the caller is very likely to shutdown the whole inode
- * immediately and mark the volume dirty for chkdsk to pick up
- * the pieces anyway.
- */
- if (put_this_page)
- put_page(put_this_page);
- }
- return err;
-}
-
-/**
- * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode
- * @ni: ntfs inode for which to map (part of) a runlist
- * @vcn: map runlist part containing this vcn
- *
- * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
- *
- * Return 0 on success and -errno on error. There is one special error code
- * which is not an error as such. This is -ENOENT. It means that @vcn is out
- * of bounds of the runlist.
- *
- * Locking: - The runlist must be unlocked on entry and is unlocked on return.
- * - This function takes the runlist lock for writing and may modify
- * the runlist.
- */
-int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
-{
- int err = 0;
-
- down_write(&ni->runlist.lock);
- /* Make sure someone else didn't do the work while we were sleeping. */
- if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <=
- LCN_RL_NOT_MAPPED))
- err = ntfs_map_runlist_nolock(ni, vcn, NULL);
- up_write(&ni->runlist.lock);
- return err;
-}
-
-/**
- * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode
- * @ni: ntfs inode of the attribute whose runlist to search
- * @vcn: vcn to convert
- * @write_locked: true if the runlist is locked for writing
- *
- * Find the virtual cluster number @vcn in the runlist of the ntfs attribute
- * described by the ntfs inode @ni and return the corresponding logical cluster
- * number (lcn).
- *
- * If the @vcn is not mapped yet, the attempt is made to map the attribute
- * extent containing the @vcn and the vcn to lcn conversion is retried.
- *
- * If @write_locked is true the caller has locked the runlist for writing and
- * if false for reading.
- *
- * Since lcns must be >= 0, we use negative return codes with special meaning:
- *
- * Return code Meaning / Description
- * ==========================================
- * LCN_HOLE Hole / not allocated on disk.
- * LCN_ENOENT There is no such vcn in the runlist, i.e. @vcn is out of bounds.
- * LCN_ENOMEM Not enough memory to map runlist.
- * LCN_EIO Critical error (runlist/file is corrupt, i/o error, etc).
- *
- * Locking: - The runlist must be locked on entry and is left locked on return.
- * - If @write_locked is 'false', i.e. the runlist is locked for reading,
- * the lock may be dropped inside the function so you cannot rely on
- * the runlist still being the same when this function returns.
- */
-LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
- const bool write_locked)
-{
- LCN lcn;
- unsigned long flags;
- bool is_retry = false;
-
- BUG_ON(!ni);
- ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",
- ni->mft_no, (unsigned long long)vcn,
- write_locked ? "write" : "read");
- BUG_ON(!NInoNonResident(ni));
- BUG_ON(vcn < 0);
- if (!ni->runlist.rl) {
- read_lock_irqsave(&ni->size_lock, flags);
- if (!ni->allocated_size) {
- read_unlock_irqrestore(&ni->size_lock, flags);
- return LCN_ENOENT;
- }
- read_unlock_irqrestore(&ni->size_lock, flags);
- }
-retry_remap:
- /* Convert vcn to lcn. If that fails map the runlist and retry once. */
- lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn);
- if (likely(lcn >= LCN_HOLE)) {
- ntfs_debug("Done, lcn 0x%llx.", (long long)lcn);
- return lcn;
- }
- if (lcn != LCN_RL_NOT_MAPPED) {
- if (lcn != LCN_ENOENT)
- lcn = LCN_EIO;
- } else if (!is_retry) {
- int err;
-
- if (!write_locked) {
- up_read(&ni->runlist.lock);
- down_write(&ni->runlist.lock);
- if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) !=
- LCN_RL_NOT_MAPPED)) {
- up_write(&ni->runlist.lock);
- down_read(&ni->runlist.lock);
- goto retry_remap;
- }
- }
- err = ntfs_map_runlist_nolock(ni, vcn, NULL);
- if (!write_locked) {
- up_write(&ni->runlist.lock);
- down_read(&ni->runlist.lock);
- }
- if (likely(!err)) {
- is_retry = true;
- goto retry_remap;
- }
- if (err == -ENOENT)
- lcn = LCN_ENOENT;
- else if (err == -ENOMEM)
- lcn = LCN_ENOMEM;
- else
- lcn = LCN_EIO;
- }
- if (lcn != LCN_ENOENT)
- ntfs_error(ni->vol->sb, "Failed with error code %lli.",
- (long long)lcn);
- return lcn;
-}
-
-/**
- * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode
- * @ni: ntfs inode describing the runlist to search
- * @vcn: vcn to find
- * @ctx: active attribute search context if present or NULL if not
- *
- * Find the virtual cluster number @vcn in the runlist described by the ntfs
- * inode @ni and return the address of the runlist element containing the @vcn.
- *
- * If the @vcn is not mapped yet, the attempt is made to map the attribute
- * extent containing the @vcn and the vcn to lcn conversion is retried.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record. This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped
- * runlist fragments and allows their mapping. If you do not have the mft
- * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock()
- * will perform the necessary mapping and unmapping.
- *
- * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and
- * restores it before returning. Thus, @ctx will be left pointing to the same
- * attribute on return as on entry. However, the actual pointers in @ctx may
- * point to different memory locations on return, so you must remember to reset
- * any cached pointers from the @ctx, i.e. after the call to
- * ntfs_attr_find_vcn_nolock(), you will probably want to do:
- * m = ctx->mrec;
- * a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- * Note you need to distinguish between the lcn of the returned runlist element
- * being >= 0 and LCN_HOLE. In the later case you have to return zeroes on
- * read and allocate clusters on write.
- *
- * Return the runlist element containing the @vcn on success and
- * ERR_PTR(-errno) on error. You need to test the return value with IS_ERR()
- * to decide if the return is success or failure and PTR_ERR() to get to the
- * error code if IS_ERR() is true.
- *
- * The possible error return codes are:
- * -ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds.
- * -ENOMEM - Not enough memory to map runlist.
- * -EIO - Critical error (runlist/file is corrupt, i/o error, etc).
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- * is no longer valid, i.e. you need to either call
- * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- * In that case PTR_ERR(@ctx->mrec) will give you the error code for
- * why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- * and is locked on return. Note the runlist may be modified when
- * needed runlist fragments need to be mapped.
- * - If @ctx is NULL, the base mft record of @ni must not be mapped on
- * entry and it will be left unmapped on return.
- * - If @ctx is not NULL, the base mft record must be mapped on entry
- * and it will be left mapped on return.
- */
-runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
- ntfs_attr_search_ctx *ctx)
-{
- unsigned long flags;
- runlist_element *rl;
- int err = 0;
- bool is_retry = false;
-
- BUG_ON(!ni);
- ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.",
- ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out");
- BUG_ON(!NInoNonResident(ni));
- BUG_ON(vcn < 0);
- if (!ni->runlist.rl) {
- read_lock_irqsave(&ni->size_lock, flags);
- if (!ni->allocated_size) {
- read_unlock_irqrestore(&ni->size_lock, flags);
- return ERR_PTR(-ENOENT);
- }
- read_unlock_irqrestore(&ni->size_lock, flags);
- }
-retry_remap:
- rl = ni->runlist.rl;
- if (likely(rl && vcn >= rl[0].vcn)) {
- while (likely(rl->length)) {
- if (unlikely(vcn < rl[1].vcn)) {
- if (likely(rl->lcn >= LCN_HOLE)) {
- ntfs_debug("Done.");
- return rl;
- }
- break;
- }
- rl++;
- }
- if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) {
- if (likely(rl->lcn == LCN_ENOENT))
- err = -ENOENT;
- else
- err = -EIO;
- }
- }
- if (!err && !is_retry) {
- /*
- * If the search context is invalid we cannot map the unmapped
- * region.
- */
- if (IS_ERR(ctx->mrec))
- err = PTR_ERR(ctx->mrec);
- else {
- /*
- * The @vcn is in an unmapped region, map the runlist
- * and retry.
- */
- err = ntfs_map_runlist_nolock(ni, vcn, ctx);
- if (likely(!err)) {
- is_retry = true;
- goto retry_remap;
- }
- }
- if (err == -EINVAL)
- err = -EIO;
- } else if (!err)
- err = -EIO;
- if (err != -ENOENT)
- ntfs_error(ni->vol->sb, "Failed with error code %i.", err);
- return ERR_PTR(err);
-}
-
-/**
- * ntfs_attr_find - find (next) attribute in mft record
- * @type: attribute type to find
- * @name: attribute name to find (optional, i.e. NULL means don't care)
- * @name_len: attribute name length (only needed if @name present)
- * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
- * @val: attribute value to find (optional, resident attributes only)
- * @val_len: attribute value length
- * @ctx: search context with mft record and attribute to search from
- *
- * You should not need to call this function directly. Use ntfs_attr_lookup()
- * instead.
- *
- * ntfs_attr_find() takes a search context @ctx as parameter and searches the
- * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an
- * attribute of @type, optionally @name and @val.
- *
- * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will
- * point to the found attribute.
- *
- * If the attribute is not found, ntfs_attr_find() returns -ENOENT and
- * @ctx->attr will point to the attribute before which the attribute being
- * searched for would need to be inserted if such an action were to be desired.
- *
- * On actual error, ntfs_attr_find() returns -EIO. In this case @ctx->attr is
- * undefined and in particular do not rely on it not changing.
- *
- * If @ctx->is_first is 'true', the search begins with @ctx->attr itself. If it
- * is 'false', the search begins after @ctx->attr.
- *
- * If @ic is IGNORE_CASE, the @name comparisson is not case sensitive and
- * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record
- * @ctx->mrec belongs. This is so we can get at the ntfs volume and hence at
- * the upcase table. If @ic is CASE_SENSITIVE, the comparison is case
- * sensitive. When @name is present, @name_len is the @name length in Unicode
- * characters.
- *
- * If @name is not present (NULL), we assume that the unnamed attribute is
- * being searched for.
- *
- * Finally, the resident attribute value @val is looked for, if present. If
- * @val is not present (NULL), @val_len is ignored.
- *
- * ntfs_attr_find() only searches the specified mft record and it ignores the
- * presence of an attribute list attribute (unless it is the one being searched
- * for, obviously). If you need to take attribute lists into consideration,
- * use ntfs_attr_lookup() instead (see below). This also means that you cannot
- * use ntfs_attr_find() to search for extent records of non-resident
- * attributes, as extents with lowest_vcn != 0 are usually described by the
- * attribute list attribute only. - Note that it is possible that the first
- * extent is only in the attribute list while the last extent is in the base
- * mft record, so do not rely on being able to find the first extent in the
- * base mft record.
- *
- * Warning: Never use @val when looking for attribute types which can be
- * non-resident as this most likely will result in a crash!
- */
-static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
- const u32 name_len, const IGNORE_CASE_BOOL ic,
- const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
-{
- ATTR_RECORD *a;
- ntfs_volume *vol = ctx->ntfs_ino->vol;
- ntfschar *upcase = vol->upcase;
- u32 upcase_len = vol->upcase_len;
-
- /*
- * Iterate over attributes in mft record starting at @ctx->attr, or the
- * attribute following that, if @ctx->is_first is 'true'.
- */
- if (ctx->is_first) {
- a = ctx->attr;
- ctx->is_first = false;
- } else
- a = (ATTR_RECORD*)((u8*)ctx->attr +
- le32_to_cpu(ctx->attr->length));
- for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
- u8 *mrec_end = (u8 *)ctx->mrec +
- le32_to_cpu(ctx->mrec->bytes_allocated);
- u8 *name_end;
-
- /* check whether ATTR_RECORD wrap */
- if ((u8 *)a < (u8 *)ctx->mrec)
- break;
-
- /* check whether Attribute Record Header is within bounds */
- if ((u8 *)a > mrec_end ||
- (u8 *)a + sizeof(ATTR_RECORD) > mrec_end)
- break;
-
- /* check whether ATTR_RECORD's name is within bounds */
- name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
- a->name_length * sizeof(ntfschar);
- if (name_end > mrec_end)
- break;
-
- ctx->attr = a;
- if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
- a->type == AT_END))
- return -ENOENT;
- if (unlikely(!a->length))
- break;
-
- /* check whether ATTR_RECORD's length wrap */
- if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a)
- break;
- /* check whether ATTR_RECORD's length is within bounds */
- if ((u8 *)a + le32_to_cpu(a->length) > mrec_end)
- break;
-
- if (a->type != type)
- continue;
- /*
- * If @name is present, compare the two names. If @name is
- * missing, assume we want an unnamed attribute.
- */
- if (!name) {
- /* The search failed if the found attribute is named. */
- if (a->name_length)
- return -ENOENT;
- } else if (!ntfs_are_names_equal(name, name_len,
- (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)),
- a->name_length, ic, upcase, upcase_len)) {
- register int rc;
-
- rc = ntfs_collate_names(name, name_len,
- (ntfschar*)((u8*)a +
- le16_to_cpu(a->name_offset)),
- a->name_length, 1, IGNORE_CASE,
- upcase, upcase_len);
- /*
- * If @name collates before a->name, there is no
- * matching attribute.
- */
- if (rc == -1)
- return -ENOENT;
- /* If the strings are not equal, continue search. */
- if (rc)
- continue;
- rc = ntfs_collate_names(name, name_len,
- (ntfschar*)((u8*)a +
- le16_to_cpu(a->name_offset)),
- a->name_length, 1, CASE_SENSITIVE,
- upcase, upcase_len);
- if (rc == -1)
- return -ENOENT;
- if (rc)
- continue;
- }
- /*
- * The names match or @name not present and attribute is
- * unnamed. If no @val specified, we have found the attribute
- * and are done.
- */
- if (!val)
- return 0;
- /* @val is present; compare values. */
- else {
- register int rc;
-
- rc = memcmp(val, (u8*)a + le16_to_cpu(
- a->data.resident.value_offset),
- min_t(u32, val_len, le32_to_cpu(
- a->data.resident.value_length)));
- /*
- * If @val collates before the current attribute's
- * value, there is no matching attribute.
- */
- if (!rc) {
- register u32 avl;
-
- avl = le32_to_cpu(
- a->data.resident.value_length);
- if (val_len == avl)
- return 0;
- if (val_len < avl)
- return -ENOENT;
- } else if (rc < 0)
- return -ENOENT;
- }
- }
- ntfs_error(vol->sb, "Inode is corrupt. Run chkdsk.");
- NVolSetErrors(vol);
- return -EIO;
-}
-
-/**
- * load_attribute_list - load an attribute list into memory
- * @vol: ntfs volume from which to read
- * @runlist: runlist of the attribute list
- * @al_start: destination buffer
- * @size: size of the destination buffer in bytes
- * @initialized_size: initialized size of the attribute list
- *
- * Walk the runlist @runlist and load all clusters from it copying them into
- * the linear buffer @al. The maximum number of bytes copied to @al is @size
- * bytes. Note, @size does not need to be a multiple of the cluster size. If
- * @initialized_size is less than @size, the region in @al between
- * @initialized_size and @size will be zeroed and not read from disk.
- *
- * Return 0 on success or -errno on error.
- */
-int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start,
- const s64 size, const s64 initialized_size)
-{
- LCN lcn;
- u8 *al = al_start;
- u8 *al_end = al + initialized_size;
- runlist_element *rl;
- struct buffer_head *bh;
- struct super_block *sb;
- unsigned long block_size;
- unsigned long block, max_block;
- int err = 0;
- unsigned char block_size_bits;
-
- ntfs_debug("Entering.");
- if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 ||
- initialized_size > size)
- return -EINVAL;
- if (!initialized_size) {
- memset(al, 0, size);
- return 0;
- }
- sb = vol->sb;
- block_size = sb->s_blocksize;
- block_size_bits = sb->s_blocksize_bits;
- down_read(&runlist->lock);
- rl = runlist->rl;
- if (!rl) {
- ntfs_error(sb, "Cannot read attribute list since runlist is "
- "missing.");
- goto err_out;
- }
- /* Read all clusters specified by the runlist one run at a time. */
- while (rl->length) {
- lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn);
- ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
- (unsigned long long)rl->vcn,
- (unsigned long long)lcn);
- /* The attribute list cannot be sparse. */
- if (lcn < 0) {
- ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed. Cannot "
- "read attribute list.");
- goto err_out;
- }
- block = lcn << vol->cluster_size_bits >> block_size_bits;
- /* Read the run from device in chunks of block_size bytes. */
- max_block = block + (rl->length << vol->cluster_size_bits >>
- block_size_bits);
- ntfs_debug("max_block = 0x%lx.", max_block);
- do {
- ntfs_debug("Reading block = 0x%lx.", block);
- bh = sb_bread(sb, block);
- if (!bh) {
- ntfs_error(sb, "sb_bread() failed. Cannot "
- "read attribute list.");
- goto err_out;
- }
- if (al + block_size >= al_end)
- goto do_final;
- memcpy(al, bh->b_data, block_size);
- brelse(bh);
- al += block_size;
- } while (++block < max_block);
- rl++;
- }
- if (initialized_size < size) {
-initialize:
- memset(al_start + initialized_size, 0, size - initialized_size);
- }
-done:
- up_read(&runlist->lock);
- return err;
-do_final:
- if (al < al_end) {
- /*
- * Partial block.
- *
- * Note: The attribute list can be smaller than its allocation
- * by multiple clusters. This has been encountered by at least
- * two people running Windows XP, thus we cannot do any
- * truncation sanity checking here. (AIA)
- */
- memcpy(al, bh->b_data, al_end - al);
- brelse(bh);
- if (initialized_size < size)
- goto initialize;
- goto done;
- }
- brelse(bh);
- /* Real overflow! */
- ntfs_error(sb, "Attribute list buffer overflow. Read attribute list "
- "is truncated.");
-err_out:
- err = -EIO;
- goto done;
-}
-
-/**
- * ntfs_external_attr_find - find an attribute in the attribute list of an inode
- * @type: attribute type to find
- * @name: attribute name to find (optional, i.e. NULL means don't care)
- * @name_len: attribute name length (only needed if @name present)
- * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
- * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only)
- * @val: attribute value to find (optional, resident attributes only)
- * @val_len: attribute value length
- * @ctx: search context with mft record and attribute to search from
- *
- * You should not need to call this function directly. Use ntfs_attr_lookup()
- * instead.
- *
- * Find an attribute by searching the attribute list for the corresponding
- * attribute list entry. Having found the entry, map the mft record if the
- * attribute is in a different mft record/inode, ntfs_attr_find() the attribute
- * in there and return it.
- *
- * On first search @ctx->ntfs_ino must be the base mft record and @ctx must
- * have been obtained from a call to ntfs_attr_get_search_ctx(). On subsequent
- * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is
- * then the base inode).
- *
- * After finishing with the attribute/mft record you need to call
- * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any
- * mapped inodes, etc).
- *
- * If the attribute is found, ntfs_external_attr_find() returns 0 and
- * @ctx->attr will point to the found attribute. @ctx->mrec will point to the
- * mft record in which @ctx->attr is located and @ctx->al_entry will point to
- * the attribute list entry for the attribute.
- *
- * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and
- * @ctx->attr will point to the attribute in the base mft record before which
- * the attribute being searched for would need to be inserted if such an action
- * were to be desired. @ctx->mrec will point to the mft record in which
- * @ctx->attr is located and @ctx->al_entry will point to the attribute list
- * entry of the attribute before which the attribute being searched for would
- * need to be inserted if such an action were to be desired.
- *
- * Thus to insert the not found attribute, one wants to add the attribute to
- * @ctx->mrec (the base mft record) and if there is not enough space, the
- * attribute should be placed in a newly allocated extent mft record. The
- * attribute list entry for the inserted attribute should be inserted in the
- * attribute list attribute at @ctx->al_entry.
- *
- * On actual error, ntfs_external_attr_find() returns -EIO. In this case
- * @ctx->attr is undefined and in particular do not rely on it not changing.
- */
-static int ntfs_external_attr_find(const ATTR_TYPE type,
- const ntfschar *name, const u32 name_len,
- const IGNORE_CASE_BOOL ic, const VCN lowest_vcn,
- const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
-{
- ntfs_inode *base_ni, *ni;
- ntfs_volume *vol;
- ATTR_LIST_ENTRY *al_entry, *next_al_entry;
- u8 *al_start, *al_end;
- ATTR_RECORD *a;
- ntfschar *al_name;
- u32 al_name_len;
- int err = 0;
- static const char *es = " Unmount and run chkdsk.";
-
- ni = ctx->ntfs_ino;
- base_ni = ctx->base_ntfs_ino;
- ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type);
- if (!base_ni) {
- /* First call happens with the base mft record. */
- base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino;
- ctx->base_mrec = ctx->mrec;
- }
- if (ni == base_ni)
- ctx->base_attr = ctx->attr;
- if (type == AT_END)
- goto not_found;
- vol = base_ni->vol;
- al_start = base_ni->attr_list;
- al_end = al_start + base_ni->attr_list_size;
- if (!ctx->al_entry)
- ctx->al_entry = (ATTR_LIST_ENTRY*)al_start;
- /*
- * Iterate over entries in attribute list starting at @ctx->al_entry,
- * or the entry following that, if @ctx->is_first is 'true'.
- */
- if (ctx->is_first) {
- al_entry = ctx->al_entry;
- ctx->is_first = false;
- } else
- al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry +
- le16_to_cpu(ctx->al_entry->length));
- for (;; al_entry = next_al_entry) {
- /* Out of bounds check. */
- if ((u8*)al_entry < base_ni->attr_list ||
- (u8*)al_entry > al_end)
- break; /* Inode is corrupt. */
- ctx->al_entry = al_entry;
- /* Catch the end of the attribute list. */
- if ((u8*)al_entry == al_end)
- goto not_found;
- if (!al_entry->length)
- break;
- if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
- le16_to_cpu(al_entry->length) > al_end)
- break;
- next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
- le16_to_cpu(al_entry->length));
- if (le32_to_cpu(al_entry->type) > le32_to_cpu(type))
- goto not_found;
- if (type != al_entry->type)
- continue;
- /*
- * If @name is present, compare the two names. If @name is
- * missing, assume we want an unnamed attribute.
- */
- al_name_len = al_entry->name_length;
- al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset);
- if (!name) {
- if (al_name_len)
- goto not_found;
- } else if (!ntfs_are_names_equal(al_name, al_name_len, name,
- name_len, ic, vol->upcase, vol->upcase_len)) {
- register int rc;
-
- rc = ntfs_collate_names(name, name_len, al_name,
- al_name_len, 1, IGNORE_CASE,
- vol->upcase, vol->upcase_len);
- /*
- * If @name collates before al_name, there is no
- * matching attribute.
- */
- if (rc == -1)
- goto not_found;
- /* If the strings are not equal, continue search. */
- if (rc)
- continue;
- /*
- * FIXME: Reverse engineering showed 0, IGNORE_CASE but
- * that is inconsistent with ntfs_attr_find(). The
- * subsequent rc checks were also different. Perhaps I
- * made a mistake in one of the two. Need to recheck
- * which is correct or at least see what is going on...
- * (AIA)
- */
- rc = ntfs_collate_names(name, name_len, al_name,
- al_name_len, 1, CASE_SENSITIVE,
- vol->upcase, vol->upcase_len);
- if (rc == -1)
- goto not_found;
- if (rc)
- continue;
- }
- /*
- * The names match or @name not present and attribute is
- * unnamed. Now check @lowest_vcn. Continue search if the
- * next attribute list entry still fits @lowest_vcn. Otherwise
- * we have reached the right one or the search has failed.
- */
- if (lowest_vcn && (u8*)next_al_entry >= al_start &&
- (u8*)next_al_entry + 6 < al_end &&
- (u8*)next_al_entry + le16_to_cpu(
- next_al_entry->length) <= al_end &&
- sle64_to_cpu(next_al_entry->lowest_vcn) <=
- lowest_vcn &&
- next_al_entry->type == al_entry->type &&
- next_al_entry->name_length == al_name_len &&
- ntfs_are_names_equal((ntfschar*)((u8*)
- next_al_entry +
- next_al_entry->name_offset),
- next_al_entry->name_length,
- al_name, al_name_len, CASE_SENSITIVE,
- vol->upcase, vol->upcase_len))
- continue;
- if (MREF_LE(al_entry->mft_reference) == ni->mft_no) {
- if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) {
- ntfs_error(vol->sb, "Found stale mft "
- "reference in attribute list "
- "of base inode 0x%lx.%s",
- base_ni->mft_no, es);
- err = -EIO;
- break;
- }
- } else { /* Mft references do not match. */
- /* If there is a mapped record unmap it first. */
- if (ni != base_ni)
- unmap_extent_mft_record(ni);
- /* Do we want the base record back? */
- if (MREF_LE(al_entry->mft_reference) ==
- base_ni->mft_no) {
- ni = ctx->ntfs_ino = base_ni;
- ctx->mrec = ctx->base_mrec;
- } else {
- /* We want an extent record. */
- ctx->mrec = map_extent_mft_record(base_ni,
- le64_to_cpu(
- al_entry->mft_reference), &ni);
- if (IS_ERR(ctx->mrec)) {
- ntfs_error(vol->sb, "Failed to map "
- "extent mft record "
- "0x%lx of base inode "
- "0x%lx.%s",
- MREF_LE(al_entry->
- mft_reference),
- base_ni->mft_no, es);
- err = PTR_ERR(ctx->mrec);
- if (err == -ENOENT)
- err = -EIO;
- /* Cause @ctx to be sanitized below. */
- ni = NULL;
- break;
- }
- ctx->ntfs_ino = ni;
- }
- ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
- le16_to_cpu(ctx->mrec->attrs_offset));
- }
- /*
- * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the
- * mft record containing the attribute represented by the
- * current al_entry.
- */
- /*
- * We could call into ntfs_attr_find() to find the right
- * attribute in this mft record but this would be less
- * efficient and not quite accurate as ntfs_attr_find() ignores
- * the attribute instance numbers for example which become
- * important when one plays with attribute lists. Also,
- * because a proper match has been found in the attribute list
- * entry above, the comparison can now be optimized. So it is
- * worth re-implementing a simplified ntfs_attr_find() here.
- */
- a = ctx->attr;
- /*
- * Use a manual loop so we can still use break and continue
- * with the same meanings as above.
- */
-do_next_attr_loop:
- if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
- le32_to_cpu(ctx->mrec->bytes_allocated))
- break;
- if (a->type == AT_END)
- break;
- if (!a->length)
- break;
- if (al_entry->instance != a->instance)
- goto do_next_attr;
- /*
- * If the type and/or the name are mismatched between the
- * attribute list entry and the attribute record, there is
- * corruption so we break and return error EIO.
- */
- if (al_entry->type != a->type)
- break;
- if (!ntfs_are_names_equal((ntfschar*)((u8*)a +
- le16_to_cpu(a->name_offset)), a->name_length,
- al_name, al_name_len, CASE_SENSITIVE,
- vol->upcase, vol->upcase_len))
- break;
- ctx->attr = a;
- /*
- * If no @val specified or @val specified and it matches, we
- * have found it!
- */
- if (!val || (!a->non_resident && le32_to_cpu(
- a->data.resident.value_length) == val_len &&
- !memcmp((u8*)a +
- le16_to_cpu(a->data.resident.value_offset),
- val, val_len))) {
- ntfs_debug("Done, found.");
- return 0;
- }
-do_next_attr:
- /* Proceed to the next attribute in the current mft record. */
- a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
- goto do_next_attr_loop;
- }
- if (!err) {
- ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt "
- "attribute list attribute.%s", base_ni->mft_no,
- es);
- err = -EIO;
- }
- if (ni != base_ni) {
- if (ni)
- unmap_extent_mft_record(ni);
- ctx->ntfs_ino = base_ni;
- ctx->mrec = ctx->base_mrec;
- ctx->attr = ctx->base_attr;
- }
- if (err != -ENOMEM)
- NVolSetErrors(vol);
- return err;
-not_found:
- /*
- * If we were looking for AT_END, we reset the search context @ctx and
- * use ntfs_attr_find() to seek to the end of the base mft record.
- */
- if (type == AT_END) {
- ntfs_attr_reinit_search_ctx(ctx);
- return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len,
- ctx);
- }
- /*
- * The attribute was not found. Before we return, we want to ensure
- * @ctx->mrec and @ctx->attr indicate the position at which the
- * attribute should be inserted in the base mft record. Since we also
- * want to preserve @ctx->al_entry we cannot reinitialize the search
- * context using ntfs_attr_reinit_search_ctx() as this would set
- * @ctx->al_entry to NULL. Thus we do the necessary bits manually (see
- * ntfs_attr_init_search_ctx() below). Note, we _only_ preserve
- * @ctx->al_entry as the remaining fields (base_*) are identical to
- * their non base_ counterparts and we cannot set @ctx->base_attr
- * correctly yet as we do not know what @ctx->attr will be set to by
- * the call to ntfs_attr_find() below.
- */
- if (ni != base_ni)
- unmap_extent_mft_record(ni);
- ctx->mrec = ctx->base_mrec;
- ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
- le16_to_cpu(ctx->mrec->attrs_offset));
- ctx->is_first = true;
- ctx->ntfs_ino = base_ni;
- ctx->base_ntfs_ino = NULL;
- ctx->base_mrec = NULL;
- ctx->base_attr = NULL;
- /*
- * In case there are multiple matches in the base mft record, need to
- * keep enumerating until we get an attribute not found response (or
- * another error), otherwise we would keep returning the same attribute
- * over and over again and all programs using us for enumeration would
- * lock up in a tight loop.
- */
- do {
- err = ntfs_attr_find(type, name, name_len, ic, val, val_len,
- ctx);
- } while (!err);
- ntfs_debug("Done, not found.");
- return err;
-}
-
-/**
- * ntfs_attr_lookup - find an attribute in an ntfs inode
- * @type: attribute type to find
- * @name: attribute name to find (optional, i.e. NULL means don't care)
- * @name_len: attribute name length (only needed if @name present)
- * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
- * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only)
- * @val: attribute value to find (optional, resident attributes only)
- * @val_len: attribute value length
- * @ctx: search context with mft record and attribute to search from
- *
- * Find an attribute in an ntfs inode. On first search @ctx->ntfs_ino must
- * be the base mft record and @ctx must have been obtained from a call to
- * ntfs_attr_get_search_ctx().
- *
- * This function transparently handles attribute lists and @ctx is used to
- * continue searches where they were left off at.
- *
- * After finishing with the attribute/mft record you need to call
- * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any
- * mapped inodes, etc).
- *
- * Return 0 if the search was successful and -errno if not.
- *
- * When 0, @ctx->attr is the found attribute and it is in mft record
- * @ctx->mrec. If an attribute list attribute is present, @ctx->al_entry is
- * the attribute list entry of the found attribute.
- *
- * When -ENOENT, @ctx->attr is the attribute which collates just after the
- * attribute being searched for, i.e. if one wants to add the attribute to the
- * mft record this is the correct place to insert it into. If an attribute
- * list attribute is present, @ctx->al_entry is the attribute list entry which
- * collates just after the attribute list entry of the attribute being searched
- * for, i.e. if one wants to add the attribute to the mft record this is the
- * correct place to insert its attribute list entry into.
- *
- * When -errno != -ENOENT, an error occurred during the lookup. @ctx->attr is
- * then undefined and in particular you should not rely on it not changing.
- */
-int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
- const u32 name_len, const IGNORE_CASE_BOOL ic,
- const VCN lowest_vcn, const u8 *val, const u32 val_len,
- ntfs_attr_search_ctx *ctx)
-{
- ntfs_inode *base_ni;
-
- ntfs_debug("Entering.");
- BUG_ON(IS_ERR(ctx->mrec));
- if (ctx->base_ntfs_ino)
- base_ni = ctx->base_ntfs_ino;
- else
- base_ni = ctx->ntfs_ino;
- /* Sanity check, just for debugging really. */
- BUG_ON(!base_ni);
- if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST)
- return ntfs_attr_find(type, name, name_len, ic, val, val_len,
- ctx);
- return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn,
- val, val_len, ctx);
-}
-
-/**
- * ntfs_attr_init_search_ctx - initialize an attribute search context
- * @ctx: attribute search context to initialize
- * @ni: ntfs inode with which to initialize the search context
- * @mrec: mft record with which to initialize the search context
- *
- * Initialize the attribute search context @ctx with @ni and @mrec.
- */
-static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx,
- ntfs_inode *ni, MFT_RECORD *mrec)
-{
- *ctx = (ntfs_attr_search_ctx) {
- .mrec = mrec,
- /* Sanity checks are performed elsewhere. */
- .attr = (ATTR_RECORD*)((u8*)mrec +
- le16_to_cpu(mrec->attrs_offset)),
- .is_first = true,
- .ntfs_ino = ni,
- };
-}
-
-/**
- * ntfs_attr_reinit_search_ctx - reinitialize an attribute search context
- * @ctx: attribute search context to reinitialize
- *
- * Reinitialize the attribute search context @ctx, unmapping an associated
- * extent mft record if present, and initialize the search context again.
- *
- * This is used when a search for a new attribute is being started to reset
- * the search context to the beginning.
- */
-void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx)
-{
- if (likely(!ctx->base_ntfs_ino)) {
- /* No attribute list. */
- ctx->is_first = true;
- /* Sanity checks are performed elsewhere. */
- ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
- le16_to_cpu(ctx->mrec->attrs_offset));
- /*
- * This needs resetting due to ntfs_external_attr_find() which
- * can leave it set despite having zeroed ctx->base_ntfs_ino.
- */
- ctx->al_entry = NULL;
- return;
- } /* Attribute list. */
- if (ctx->ntfs_ino != ctx->base_ntfs_ino)
- unmap_extent_mft_record(ctx->ntfs_ino);
- ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec);
- return;
-}
-
-/**
- * ntfs_attr_get_search_ctx - allocate/initialize a new attribute search context
- * @ni: ntfs inode with which to initialize the search context
- * @mrec: mft record with which to initialize the search context
- *
- * Allocate a new attribute search context, initialize it with @ni and @mrec,
- * and return it. Return NULL if allocation failed.
- */
-ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec)
-{
- ntfs_attr_search_ctx *ctx;
-
- ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS);
- if (ctx)
- ntfs_attr_init_search_ctx(ctx, ni, mrec);
- return ctx;
-}
-
-/**
- * ntfs_attr_put_search_ctx - release an attribute search context
- * @ctx: attribute search context to free
- *
- * Release the attribute search context @ctx, unmapping an associated extent
- * mft record if present.
- */
-void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx)
-{
- if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino)
- unmap_extent_mft_record(ctx->ntfs_ino);
- kmem_cache_free(ntfs_attr_ctx_cache, ctx);
- return;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file
- * @vol: ntfs volume to which the attribute belongs
- * @type: attribute type which to find
- *
- * Search for the attribute definition record corresponding to the attribute
- * @type in the $AttrDef system file.
- *
- * Return the attribute type definition record if found and NULL if not found.
- */
-static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol,
- const ATTR_TYPE type)
-{
- ATTR_DEF *ad;
-
- BUG_ON(!vol->attrdef);
- BUG_ON(!type);
- for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef <
- vol->attrdef_size && ad->type; ++ad) {
- /* We have not found it yet, carry on searching. */
- if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type)))
- continue;
- /* We found the attribute; return it. */
- if (likely(ad->type == type))
- return ad;
- /* We have gone too far already. No point in continuing. */
- break;
- }
- /* Attribute not found. */
- ntfs_debug("Attribute type 0x%x not found in $AttrDef.",
- le32_to_cpu(type));
- return NULL;
-}
-
-/**
- * ntfs_attr_size_bounds_check - check a size of an attribute type for validity
- * @vol: ntfs volume to which the attribute belongs
- * @type: attribute type which to check
- * @size: size which to check
- *
- * Check whether the @size in bytes is valid for an attribute of @type on the
- * ntfs volume @vol. This information is obtained from $AttrDef system file.
- *
- * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not
- * listed in $AttrDef.
- */
-int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type,
- const s64 size)
-{
- ATTR_DEF *ad;
-
- BUG_ON(size < 0);
- /*
- * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not
- * listed in $AttrDef.
- */
- if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024))
- return -ERANGE;
- /* Get the $AttrDef entry for the attribute @type. */
- ad = ntfs_attr_find_in_attrdef(vol, type);
- if (unlikely(!ad))
- return -ENOENT;
- /* Do the bounds check. */
- if (((sle64_to_cpu(ad->min_size) > 0) &&
- size < sle64_to_cpu(ad->min_size)) ||
- ((sle64_to_cpu(ad->max_size) > 0) && size >
- sle64_to_cpu(ad->max_size)))
- return -ERANGE;
- return 0;
-}
-
-/**
- * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident
- * @vol: ntfs volume to which the attribute belongs
- * @type: attribute type which to check
- *
- * Check whether the attribute of @type on the ntfs volume @vol is allowed to
- * be non-resident. This information is obtained from $AttrDef system file.
- *
- * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and
- * -ENOENT if the attribute is not listed in $AttrDef.
- */
-int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type)
-{
- ATTR_DEF *ad;
-
- /* Find the attribute definition record in $AttrDef. */
- ad = ntfs_attr_find_in_attrdef(vol, type);
- if (unlikely(!ad))
- return -ENOENT;
- /* Check the flags and return the result. */
- if (ad->flags & ATTR_DEF_RESIDENT)
- return -EPERM;
- return 0;
-}
-
-/**
- * ntfs_attr_can_be_resident - check if an attribute can be resident
- * @vol: ntfs volume to which the attribute belongs
- * @type: attribute type which to check
- *
- * Check whether the attribute of @type on the ntfs volume @vol is allowed to
- * be resident. This information is derived from our ntfs knowledge and may
- * not be completely accurate, especially when user defined attributes are
- * present. Basically we allow everything to be resident except for index
- * allocation and $EA attributes.
- *
- * Return 0 if the attribute is allowed to be non-resident and -EPERM if not.
- *
- * Warning: In the system file $MFT the attribute $Bitmap must be non-resident
- * otherwise windows will not boot (blue screen of death)! We cannot
- * check for this here as we do not know which inode's $Bitmap is
- * being asked about so the caller needs to special case this.
- */
-int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type)
-{
- if (type == AT_INDEX_ALLOCATION)
- return -EPERM;
- return 0;
-}
-
-/**
- * ntfs_attr_record_resize - resize an attribute record
- * @m: mft record containing attribute record
- * @a: attribute record to resize
- * @new_size: new size in bytes to which to resize the attribute record @a
- *
- * Resize the attribute record @a, i.e. the resident part of the attribute, in
- * the mft record @m to @new_size bytes.
- *
- * Return 0 on success and -errno on error. The following error codes are
- * defined:
- * -ENOSPC - Not enough space in the mft record @m to perform the resize.
- *
- * Note: On error, no modifications have been performed whatsoever.
- *
- * Warning: If you make a record smaller without having copied all the data you
- * are interested in the data may be overwritten.
- */
-int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
-{
- ntfs_debug("Entering for new_size %u.", new_size);
- /* Align to 8 bytes if it is not already done. */
- if (new_size & 7)
- new_size = (new_size + 7) & ~7;
- /* If the actual attribute length has changed, move things around. */
- if (new_size != le32_to_cpu(a->length)) {
- u32 new_muse = le32_to_cpu(m->bytes_in_use) -
- le32_to_cpu(a->length) + new_size;
- /* Not enough space in this mft record. */
- if (new_muse > le32_to_cpu(m->bytes_allocated))
- return -ENOSPC;
- /* Move attributes following @a to their new location. */
- memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length),
- le32_to_cpu(m->bytes_in_use) - ((u8*)a -
- (u8*)m) - le32_to_cpu(a->length));
- /* Adjust @m to reflect the change in used space. */
- m->bytes_in_use = cpu_to_le32(new_muse);
- /* Adjust @a to reflect the new size. */
- if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length))
- a->length = cpu_to_le32(new_size);
- }
- return 0;
-}
-
-/**
- * ntfs_resident_attr_value_resize - resize the value of a resident attribute
- * @m: mft record containing attribute record
- * @a: attribute record whose value to resize
- * @new_size: new size in bytes to which to resize the attribute value of @a
- *
- * Resize the value of the attribute @a in the mft record @m to @new_size bytes.
- * If the value is made bigger, the newly allocated space is cleared.
- *
- * Return 0 on success and -errno on error. The following error codes are
- * defined:
- * -ENOSPC - Not enough space in the mft record @m to perform the resize.
- *
- * Note: On error, no modifications have been performed whatsoever.
- *
- * Warning: If you make a record smaller without having copied all the data you
- * are interested in the data may be overwritten.
- */
-int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
- const u32 new_size)
-{
- u32 old_size;
-
- /* Resize the resident part of the attribute record. */
- if (ntfs_attr_record_resize(m, a,
- le16_to_cpu(a->data.resident.value_offset) + new_size))
- return -ENOSPC;
- /*
- * The resize succeeded! If we made the attribute value bigger, clear
- * the area between the old size and @new_size.
- */
- old_size = le32_to_cpu(a->data.resident.value_length);
- if (new_size > old_size)
- memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
- old_size, 0, new_size - old_size);
- /* Finally update the length of the attribute value. */
- a->data.resident.value_length = cpu_to_le32(new_size);
- return 0;
-}
-
-/**
- * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute
- * @ni: ntfs inode describing the attribute to convert
- * @data_size: size of the resident data to copy to the non-resident attribute
- *
- * Convert the resident ntfs attribute described by the ntfs inode @ni to a
- * non-resident one.
- *
- * @data_size must be equal to the attribute value size. This is needed since
- * we need to know the size before we can map the mft record and our callers
- * always know it. The reason we cannot simply read the size from the vfs
- * inode i_size is that this is not necessarily uptodate. This happens when
- * ntfs_attr_make_non_resident() is called in the ->truncate call path(s).
- *
- * Return 0 on success and -errno on error. The following error return codes
- * are defined:
- * -EPERM - The attribute is not allowed to be non-resident.
- * -ENOMEM - Not enough memory.
- * -ENOSPC - Not enough disk space.
- * -EINVAL - Attribute not defined on the volume.
- * -EIO - I/o error or other error.
- * Note that -ENOSPC is also returned in the case that there is not enough
- * space in the mft record to do the conversion. This can happen when the mft
- * record is already very full. The caller is responsible for trying to make
- * space in the mft record and trying again. FIXME: Do we need a separate
- * error return code for this kind of -ENOSPC or is it always worth trying
- * again in case the attribute may then fit in a resident state so no need to
- * make it non-resident at all? Ho-hum... (AIA)
- *
- * NOTE to self: No changes in the attribute list are required to move from
- * a resident to a non-resident attribute.
- *
- * Locking: - The caller must hold i_mutex on the inode.
- */
-int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
-{
- s64 new_size;
- struct inode *vi = VFS_I(ni);
- ntfs_volume *vol = ni->vol;
- ntfs_inode *base_ni;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- ntfs_attr_search_ctx *ctx;
- struct page *page;
- runlist_element *rl;
- u8 *kaddr;
- unsigned long flags;
- int mp_size, mp_ofs, name_ofs, arec_size, err, err2;
- u32 attr_size;
- u8 old_res_attr_flags;
-
- /* Check that the attribute is allowed to be non-resident. */
- err = ntfs_attr_can_be_non_resident(vol, ni->type);
- if (unlikely(err)) {
- if (err == -EPERM)
- ntfs_debug("Attribute is not allowed to be "
- "non-resident.");
- else
- ntfs_debug("Attribute not defined on the NTFS "
- "volume!");
- return err;
- }
- /*
- * FIXME: Compressed and encrypted attributes are not supported when
- * writing and we should never have gotten here for them.
- */
- BUG_ON(NInoCompressed(ni));
- BUG_ON(NInoEncrypted(ni));
- /*
- * The size needs to be aligned to a cluster boundary for allocation
- * purposes.
- */
- new_size = (data_size + vol->cluster_size - 1) &
- ~(vol->cluster_size - 1);
- if (new_size > 0) {
- /*
- * Will need the page later and since the page lock nests
- * outside all ntfs locks, we need to get the page now.
- */
- page = find_or_create_page(vi->i_mapping, 0,
- mapping_gfp_mask(vi->i_mapping));
- if (unlikely(!page))
- return -ENOMEM;
- /* Start by allocating clusters to hold the attribute value. */
- rl = ntfs_cluster_alloc(vol, 0, new_size >>
- vol->cluster_size_bits, -1, DATA_ZONE, true);
- if (IS_ERR(rl)) {
- err = PTR_ERR(rl);
- ntfs_debug("Failed to allocate cluster%s, error code "
- "%i.", (new_size >>
- vol->cluster_size_bits) > 1 ? "s" : "",
- err);
- goto page_err_out;
- }
- } else {
- rl = NULL;
- page = NULL;
- }
- /* Determine the size of the mapping pairs array. */
- mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1);
- if (unlikely(mp_size < 0)) {
- err = mp_size;
- ntfs_debug("Failed to get size for mapping pairs array, error "
- "code %i.", err);
- goto rl_err_out;
- }
- down_write(&ni->runlist.lock);
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- ctx = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- m = ctx->mrec;
- a = ctx->attr;
- BUG_ON(NInoNonResident(ni));
- BUG_ON(a->non_resident);
- /*
- * Calculate new offsets for the name and the mapping pairs array.
- */
- if (NInoSparse(ni) || NInoCompressed(ni))
- name_ofs = (offsetof(ATTR_REC,
- data.non_resident.compressed_size) +
- sizeof(a->data.non_resident.compressed_size) +
- 7) & ~7;
- else
- name_ofs = (offsetof(ATTR_REC,
- data.non_resident.compressed_size) + 7) & ~7;
- mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
- /*
- * Determine the size of the resident part of the now non-resident
- * attribute record.
- */
- arec_size = (mp_ofs + mp_size + 7) & ~7;
- /*
- * If the page is not uptodate bring it uptodate by copying from the
- * attribute value.
- */
- attr_size = le32_to_cpu(a->data.resident.value_length);
- BUG_ON(attr_size != data_size);
- if (page && !PageUptodate(page)) {
- kaddr = kmap_atomic(page);
- memcpy(kaddr, (u8*)a +
- le16_to_cpu(a->data.resident.value_offset),
- attr_size);
- memset(kaddr + attr_size, 0, PAGE_SIZE - attr_size);
- kunmap_atomic(kaddr);
- flush_dcache_page(page);
- SetPageUptodate(page);
- }
- /* Backup the attribute flag. */
- old_res_attr_flags = a->data.resident.flags;
- /* Resize the resident part of the attribute record. */
- err = ntfs_attr_record_resize(m, a, arec_size);
- if (unlikely(err))
- goto err_out;
- /*
- * Convert the resident part of the attribute record to describe a
- * non-resident attribute.
- */
- a->non_resident = 1;
- /* Move the attribute name if it exists and update the offset. */
- if (a->name_length)
- memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
- a->name_length * sizeof(ntfschar));
- a->name_offset = cpu_to_le16(name_ofs);
- /* Setup the fields specific to non-resident attributes. */
- a->data.non_resident.lowest_vcn = 0;
- a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >>
- vol->cluster_size_bits);
- a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs);
- memset(&a->data.non_resident.reserved, 0,
- sizeof(a->data.non_resident.reserved));
- a->data.non_resident.allocated_size = cpu_to_sle64(new_size);
- a->data.non_resident.data_size =
- a->data.non_resident.initialized_size =
- cpu_to_sle64(attr_size);
- if (NInoSparse(ni) || NInoCompressed(ni)) {
- a->data.non_resident.compression_unit = 0;
- if (NInoCompressed(ni) || vol->major_ver < 3)
- a->data.non_resident.compression_unit = 4;
- a->data.non_resident.compressed_size =
- a->data.non_resident.allocated_size;
- } else
- a->data.non_resident.compression_unit = 0;
- /* Generate the mapping pairs array into the attribute record. */
- err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs,
- arec_size - mp_ofs, rl, 0, -1, NULL);
- if (unlikely(err)) {
- ntfs_debug("Failed to build mapping pairs, error code %i.",
- err);
- goto undo_err_out;
- }
- /* Setup the in-memory attribute structure to be non-resident. */
- ni->runlist.rl = rl;
- write_lock_irqsave(&ni->size_lock, flags);
- ni->allocated_size = new_size;
- if (NInoSparse(ni) || NInoCompressed(ni)) {
- ni->itype.compressed.size = ni->allocated_size;
- if (a->data.non_resident.compression_unit) {
- ni->itype.compressed.block_size = 1U << (a->data.
- non_resident.compression_unit +
- vol->cluster_size_bits);
- ni->itype.compressed.block_size_bits =
- ffs(ni->itype.compressed.block_size) -
- 1;
- ni->itype.compressed.block_clusters = 1U <<
- a->data.non_resident.compression_unit;
- } else {
- ni->itype.compressed.block_size = 0;
- ni->itype.compressed.block_size_bits = 0;
- ni->itype.compressed.block_clusters = 0;
- }
- vi->i_blocks = ni->itype.compressed.size >> 9;
- } else
- vi->i_blocks = ni->allocated_size >> 9;
- write_unlock_irqrestore(&ni->size_lock, flags);
- /*
- * This needs to be last since the address space operations ->read_folio
- * and ->writepage can run concurrently with us as they are not
- * serialized on i_mutex. Note, we are not allowed to fail once we flip
- * this switch, which is another reason to do this last.
- */
- NInoSetNonResident(ni);
- /* Mark the mft record dirty, so it gets written back. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
- if (page) {
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
- }
- ntfs_debug("Done.");
- return 0;
-undo_err_out:
- /* Convert the attribute back into a resident attribute. */
- a->non_resident = 0;
- /* Move the attribute name if it exists and update the offset. */
- name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) +
- sizeof(a->data.resident.reserved) + 7) & ~7;
- if (a->name_length)
- memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
- a->name_length * sizeof(ntfschar));
- mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
- a->name_offset = cpu_to_le16(name_ofs);
- arec_size = (mp_ofs + attr_size + 7) & ~7;
- /* Resize the resident part of the attribute record. */
- err2 = ntfs_attr_record_resize(m, a, arec_size);
- if (unlikely(err2)) {
- /*
- * This cannot happen (well if memory corruption is at work it
- * could happen in theory), but deal with it as well as we can.
- * If the old size is too small, truncate the attribute,
- * otherwise simply give it a larger allocated size.
- * FIXME: Should check whether chkdsk complains when the
- * allocated size is much bigger than the resident value size.
- */
- arec_size = le32_to_cpu(a->length);
- if ((mp_ofs + attr_size) > arec_size) {
- err2 = attr_size;
- attr_size = arec_size - mp_ofs;
- ntfs_error(vol->sb, "Failed to undo partial resident "
- "to non-resident attribute "
- "conversion. Truncating inode 0x%lx, "
- "attribute type 0x%x from %i bytes to "
- "%i bytes to maintain metadata "
- "consistency. THIS MEANS YOU ARE "
- "LOSING %i BYTES DATA FROM THIS %s.",
- vi->i_ino,
- (unsigned)le32_to_cpu(ni->type),
- err2, attr_size, err2 - attr_size,
- ((ni->type == AT_DATA) &&
- !ni->name_len) ? "FILE": "ATTRIBUTE");
- write_lock_irqsave(&ni->size_lock, flags);
- ni->initialized_size = attr_size;
- i_size_write(vi, attr_size);
- write_unlock_irqrestore(&ni->size_lock, flags);
- }
- }
- /* Setup the fields specific to resident attributes. */
- a->data.resident.value_length = cpu_to_le32(attr_size);
- a->data.resident.value_offset = cpu_to_le16(mp_ofs);
- a->data.resident.flags = old_res_attr_flags;
- memset(&a->data.resident.reserved, 0,
- sizeof(a->data.resident.reserved));
- /* Copy the data from the page back to the attribute value. */
- if (page) {
- kaddr = kmap_atomic(page);
- memcpy((u8*)a + mp_ofs, kaddr, attr_size);
- kunmap_atomic(kaddr);
- }
- /* Setup the allocated size in the ntfs inode in case it changed. */
- write_lock_irqsave(&ni->size_lock, flags);
- ni->allocated_size = arec_size - mp_ofs;
- write_unlock_irqrestore(&ni->size_lock, flags);
- /* Mark the mft record dirty, so it gets written back. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
-err_out:
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- ni->runlist.rl = NULL;
- up_write(&ni->runlist.lock);
-rl_err_out:
- if (rl) {
- if (ntfs_cluster_free_from_rl(vol, rl) < 0) {
- ntfs_error(vol->sb, "Failed to release allocated "
- "cluster(s) in error code path. Run "
- "chkdsk to recover the lost "
- "cluster(s).");
- NVolSetErrors(vol);
- }
- ntfs_free(rl);
-page_err_out:
- unlock_page(page);
- put_page(page);
- }
- if (err == -EINVAL)
- err = -EIO;
- return err;
-}
-
-/**
- * ntfs_attr_extend_allocation - extend the allocated space of an attribute
- * @ni: ntfs inode of the attribute whose allocation to extend
- * @new_alloc_size: new size in bytes to which to extend the allocation to
- * @new_data_size: new size in bytes to which to extend the data to
- * @data_start: beginning of region which is required to be non-sparse
- *
- * Extend the allocated space of an attribute described by the ntfs inode @ni
- * to @new_alloc_size bytes. If @data_start is -1, the whole extension may be
- * implemented as a hole in the file (as long as both the volume and the ntfs
- * inode @ni have sparse support enabled). If @data_start is >= 0, then the
- * region between the old allocated size and @data_start - 1 may be made sparse
- * but the regions between @data_start and @new_alloc_size must be backed by
- * actual clusters.
- *
- * If @new_data_size is -1, it is ignored. If it is >= 0, then the data size
- * of the attribute is extended to @new_data_size. Note that the i_size of the
- * vfs inode is not updated. Only the data size in the base attribute record
- * is updated. The caller has to update i_size separately if this is required.
- * WARNING: It is a BUG() for @new_data_size to be smaller than the old data
- * size as well as for @new_data_size to be greater than @new_alloc_size.
- *
- * For resident attributes this involves resizing the attribute record and if
- * necessary moving it and/or other attributes into extent mft records and/or
- * converting the attribute to a non-resident attribute which in turn involves
- * extending the allocation of a non-resident attribute as described below.
- *
- * For non-resident attributes this involves allocating clusters in the data
- * zone on the volume (except for regions that are being made sparse) and
- * extending the run list to describe the allocated clusters as well as
- * updating the mapping pairs array of the attribute. This in turn involves
- * resizing the attribute record and if necessary moving it and/or other
- * attributes into extent mft records and/or splitting the attribute record
- * into multiple extent attribute records.
- *
- * Also, the attribute list attribute is updated if present and in some of the
- * above cases (the ones where extent mft records/attributes come into play),
- * an attribute list attribute is created if not already present.
- *
- * Return the new allocated size on success and -errno on error. In the case
- * that an error is encountered but a partial extension at least up to
- * @data_start (if present) is possible, the allocation is partially extended
- * and this is returned. This means the caller must check the returned size to
- * determine if the extension was partial. If @data_start is -1 then partial
- * allocations are not performed.
- *
- * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA.
- *
- * Locking: This function takes the runlist lock of @ni for writing as well as
- * locking the mft record of the base ntfs inode. These locks are maintained
- * throughout execution of the function. These locks are required so that the
- * attribute can be resized safely and so that it can for example be converted
- * from resident to non-resident safely.
- *
- * TODO: At present attribute list attribute handling is not implemented.
- *
- * TODO: At present it is not safe to call this function for anything other
- * than the $DATA attribute(s) of an uncompressed and unencrypted file.
- */
-s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
- const s64 new_data_size, const s64 data_start)
-{
- VCN vcn;
- s64 ll, allocated_size, start = data_start;
- struct inode *vi = VFS_I(ni);
- ntfs_volume *vol = ni->vol;
- ntfs_inode *base_ni;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- ntfs_attr_search_ctx *ctx;
- runlist_element *rl, *rl2;
- unsigned long flags;
- int err, mp_size;
- u32 attr_len = 0; /* Silence stupid gcc warning. */
- bool mp_rebuilt;
-
-#ifdef DEBUG
- read_lock_irqsave(&ni->size_lock, flags);
- allocated_size = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
- "old_allocated_size 0x%llx, "
- "new_allocated_size 0x%llx, new_data_size 0x%llx, "
- "data_start 0x%llx.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type),
- (unsigned long long)allocated_size,
- (unsigned long long)new_alloc_size,
- (unsigned long long)new_data_size,
- (unsigned long long)start);
-#endif
-retry_extend:
- /*
- * For non-resident attributes, @start and @new_size need to be aligned
- * to cluster boundaries for allocation purposes.
- */
- if (NInoNonResident(ni)) {
- if (start > 0)
- start &= ~(s64)vol->cluster_size_mask;
- new_alloc_size = (new_alloc_size + vol->cluster_size - 1) &
- ~(s64)vol->cluster_size_mask;
- }
- BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size);
- /* Check if new size is allowed in $AttrDef. */
- err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size);
- if (unlikely(err)) {
- /* Only emit errors when the write will fail completely. */
- read_lock_irqsave(&ni->size_lock, flags);
- allocated_size = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (start < 0 || start >= allocated_size) {
- if (err == -ERANGE) {
- ntfs_error(vol->sb, "Cannot extend allocation "
- "of inode 0x%lx, attribute "
- "type 0x%x, because the new "
- "allocation would exceed the "
- "maximum allowed size for "
- "this attribute type.",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type));
- } else {
- ntfs_error(vol->sb, "Cannot extend allocation "
- "of inode 0x%lx, attribute "
- "type 0x%x, because this "
- "attribute type is not "
- "defined on the NTFS volume. "
- "Possible corruption! You "
- "should run chkdsk!",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type));
- }
- }
- /* Translate error code to be POSIX conformant for write(2). */
- if (err == -ERANGE)
- err = -EFBIG;
- else
- err = -EIO;
- return err;
- }
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- /*
- * We will be modifying both the runlist (if non-resident) and the mft
- * record so lock them both down.
- */
- down_write(&ni->runlist.lock);
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- ctx = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- read_lock_irqsave(&ni->size_lock, flags);
- allocated_size = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- /*
- * If non-resident, seek to the last extent. If resident, there is
- * only one extent, so seek to that.
- */
- vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits :
- 0;
- /*
- * Abort if someone did the work whilst we waited for the locks. If we
- * just converted the attribute from resident to non-resident it is
- * likely that exactly this has happened already. We cannot quite
- * abort if we need to update the data size.
- */
- if (unlikely(new_alloc_size <= allocated_size)) {
- ntfs_debug("Allocated size already exceeds requested size.");
- new_alloc_size = allocated_size;
- if (new_data_size < 0)
- goto done;
- /*
- * We want the first attribute extent so that we can update the
- * data size.
- */
- vcn = 0;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, vcn, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- m = ctx->mrec;
- a = ctx->attr;
- /* Use goto to reduce indentation. */
- if (a->non_resident)
- goto do_non_resident_extend;
- BUG_ON(NInoNonResident(ni));
- /* The total length of the attribute value. */
- attr_len = le32_to_cpu(a->data.resident.value_length);
- /*
- * Extend the attribute record to be able to store the new attribute
- * size. ntfs_attr_record_resize() will not do anything if the size is
- * not changing.
- */
- if (new_alloc_size < vol->mft_record_size &&
- !ntfs_attr_record_resize(m, a,
- le16_to_cpu(a->data.resident.value_offset) +
- new_alloc_size)) {
- /* The resize succeeded! */
- write_lock_irqsave(&ni->size_lock, flags);
- ni->allocated_size = le32_to_cpu(a->length) -
- le16_to_cpu(a->data.resident.value_offset);
- write_unlock_irqrestore(&ni->size_lock, flags);
- if (new_data_size >= 0) {
- BUG_ON(new_data_size < attr_len);
- a->data.resident.value_length =
- cpu_to_le32((u32)new_data_size);
- }
- goto flush_done;
- }
- /*
- * We have to drop all the locks so we can call
- * ntfs_attr_make_non_resident(). This could be optimised by try-
- * locking the first page cache page and only if that fails dropping
- * the locks, locking the page, and redoing all the locking and
- * lookups. While this would be a huge optimisation, it is not worth
- * it as this is definitely a slow code path.
- */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
- /*
- * Not enough space in the mft record, try to make the attribute
- * non-resident and if successful restart the extension process.
- */
- err = ntfs_attr_make_non_resident(ni, attr_len);
- if (likely(!err))
- goto retry_extend;
- /*
- * Could not make non-resident. If this is due to this not being
- * permitted for this attribute type or there not being enough space,
- * try to make other attributes non-resident. Otherwise fail.
- */
- if (unlikely(err != -EPERM && err != -ENOSPC)) {
- /* Only emit errors when the write will fail completely. */
- read_lock_irqsave(&ni->size_lock, flags);
- allocated_size = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Cannot extend allocation of "
- "inode 0x%lx, attribute type 0x%x, "
- "because the conversion from resident "
- "to non-resident attribute failed "
- "with error code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- if (err != -ENOMEM)
- err = -EIO;
- goto conv_err_out;
- }
- /* TODO: Not implemented from here, abort. */
- read_lock_irqsave(&ni->size_lock, flags);
- allocated_size = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (start < 0 || start >= allocated_size) {
- if (err == -ENOSPC)
- ntfs_error(vol->sb, "Not enough space in the mft "
- "record/on disk for the non-resident "
- "attribute value. This case is not "
- "implemented yet.");
- else /* if (err == -EPERM) */
- ntfs_error(vol->sb, "This attribute type may not be "
- "non-resident. This case is not "
- "implemented yet.");
- }
- err = -EOPNOTSUPP;
- goto conv_err_out;
-#if 0
- // TODO: Attempt to make other attributes non-resident.
- if (!err)
- goto do_resident_extend;
- /*
- * Both the attribute list attribute and the standard information
- * attribute must remain in the base inode. Thus, if this is one of
- * these attributes, we have to try to move other attributes out into
- * extent mft records instead.
- */
- if (ni->type == AT_ATTRIBUTE_LIST ||
- ni->type == AT_STANDARD_INFORMATION) {
- // TODO: Attempt to move other attributes into extent mft
- // records.
- err = -EOPNOTSUPP;
- if (!err)
- goto do_resident_extend;
- goto err_out;
- }
- // TODO: Attempt to move this attribute to an extent mft record, but
- // only if it is not already the only attribute in an mft record in
- // which case there would be nothing to gain.
- err = -EOPNOTSUPP;
- if (!err)
- goto do_resident_extend;
- /* There is nothing we can do to make enough space. )-: */
- goto err_out;
-#endif
-do_non_resident_extend:
- BUG_ON(!NInoNonResident(ni));
- if (new_alloc_size == allocated_size) {
- BUG_ON(vcn);
- goto alloc_done;
- }
- /*
- * If the data starts after the end of the old allocation, this is a
- * $DATA attribute and sparse attributes are enabled on the volume and
- * for this inode, then create a sparse region between the old
- * allocated size and the start of the data. Otherwise simply proceed
- * with filling the whole space between the old allocated size and the
- * new allocated size with clusters.
- */
- if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA ||
- !NVolSparseEnabled(vol) || NInoSparseDisabled(ni))
- goto skip_sparse;
- // TODO: This is not implemented yet. We just fill in with real
- // clusters for now...
- ntfs_debug("Inserting holes is not-implemented yet. Falling back to "
- "allocating real clusters instead.");
-skip_sparse:
- rl = ni->runlist.rl;
- if (likely(rl)) {
- /* Seek to the end of the runlist. */
- while (rl->length)
- rl++;
- }
- /* If this attribute extent is not mapped, map it now. */
- if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED ||
- (rl->lcn == LCN_ENOENT && rl > ni->runlist.rl &&
- (rl-1)->lcn == LCN_RL_NOT_MAPPED))) {
- if (!rl && !allocated_size)
- goto first_alloc;
- rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
- if (IS_ERR(rl)) {
- err = PTR_ERR(rl);
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Cannot extend allocation "
- "of inode 0x%lx, attribute "
- "type 0x%x, because the "
- "mapping of a runlist "
- "fragment failed with error "
- "code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type),
- err);
- if (err != -ENOMEM)
- err = -EIO;
- goto err_out;
- }
- ni->runlist.rl = rl;
- /* Seek to the end of the runlist. */
- while (rl->length)
- rl++;
- }
- /*
- * We now know the runlist of the last extent is mapped and @rl is at
- * the end of the runlist. We want to begin allocating clusters
- * starting at the last allocated cluster to reduce fragmentation. If
- * there are no valid LCNs in the attribute we let the cluster
- * allocator choose the starting cluster.
- */
- /* If the last LCN is a hole or simillar seek back to last real LCN. */
- while (rl->lcn < 0 && rl > ni->runlist.rl)
- rl--;
-first_alloc:
- // FIXME: Need to implement partial allocations so at least part of the
- // write can be performed when start >= 0. (Needed for POSIX write(2)
- // conformance.)
- rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits,
- (new_alloc_size - allocated_size) >>
- vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ?
- rl->lcn + rl->length : -1, DATA_ZONE, true);
- if (IS_ERR(rl2)) {
- err = PTR_ERR(rl2);
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Cannot extend allocation of "
- "inode 0x%lx, attribute type 0x%x, "
- "because the allocation of clusters "
- "failed with error code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- if (err != -ENOMEM && err != -ENOSPC)
- err = -EIO;
- goto err_out;
- }
- rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
- if (IS_ERR(rl)) {
- err = PTR_ERR(rl);
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Cannot extend allocation of "
- "inode 0x%lx, attribute type 0x%x, "
- "because the runlist merge failed "
- "with error code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- if (err != -ENOMEM)
- err = -EIO;
- if (ntfs_cluster_free_from_rl(vol, rl2)) {
- ntfs_error(vol->sb, "Failed to release allocated "
- "cluster(s) in error code path. Run "
- "chkdsk to recover the lost "
- "cluster(s).");
- NVolSetErrors(vol);
- }
- ntfs_free(rl2);
- goto err_out;
- }
- ni->runlist.rl = rl;
- ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size -
- allocated_size) >> vol->cluster_size_bits);
- /* Find the runlist element with which the attribute extent starts. */
- ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
- rl2 = ntfs_rl_find_vcn_nolock(rl, ll);
- BUG_ON(!rl2);
- BUG_ON(!rl2->length);
- BUG_ON(rl2->lcn < LCN_HOLE);
- mp_rebuilt = false;
- /* Get the size for the new mapping pairs array for this extent. */
- mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
- if (unlikely(mp_size <= 0)) {
- err = mp_size;
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Cannot extend allocation of "
- "inode 0x%lx, attribute type 0x%x, "
- "because determining the size for the "
- "mapping pairs failed with error code "
- "%i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- err = -EIO;
- goto undo_alloc;
- }
- /* Extend the attribute record to fit the bigger mapping pairs array. */
- attr_len = le32_to_cpu(a->length);
- err = ntfs_attr_record_resize(m, a, mp_size +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
- if (unlikely(err)) {
- BUG_ON(err != -ENOSPC);
- // TODO: Deal with this by moving this extent to a new mft
- // record or by starting a new extent in a new mft record,
- // possibly by extending this extent partially and filling it
- // and creating a new extent for the remainder, or by making
- // other attributes non-resident and/or by moving other
- // attributes out of this mft record.
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Not enough space in the mft "
- "record for the extended attribute "
- "record. This case is not "
- "implemented yet.");
- err = -EOPNOTSUPP;
- goto undo_alloc;
- }
- mp_rebuilt = true;
- /* Generate the mapping pairs array directly into the attr record. */
- err = ntfs_mapping_pairs_build(vol, (u8*)a +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
- mp_size, rl2, ll, -1, NULL);
- if (unlikely(err)) {
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Cannot extend allocation of "
- "inode 0x%lx, attribute type 0x%x, "
- "because building the mapping pairs "
- "failed with error code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- err = -EIO;
- goto undo_alloc;
- }
- /* Update the highest_vcn. */
- a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
- vol->cluster_size_bits) - 1);
- /*
- * We now have extended the allocated size of the attribute. Reflect
- * this in the ntfs_inode structure and the attribute record.
- */
- if (a->data.non_resident.lowest_vcn) {
- /*
- * We are not in the first attribute extent, switch to it, but
- * first ensure the changes will make it to disk later.
- */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_reinit_search_ctx(ctx);
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err))
- goto restore_undo_alloc;
- /* @m is not used any more so no need to set it. */
- a = ctx->attr;
- }
- write_lock_irqsave(&ni->size_lock, flags);
- ni->allocated_size = new_alloc_size;
- a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
- /*
- * FIXME: This would fail if @ni is a directory, $MFT, or an index,
- * since those can have sparse/compressed set. For example can be
- * set compressed even though it is not compressed itself and in that
- * case the bit means that files are to be created compressed in the
- * directory... At present this is ok as this code is only called for
- * regular files, and only for their $DATA attribute(s).
- * FIXME: The calculation is wrong if we created a hole above. For now
- * it does not matter as we never create holes.
- */
- if (NInoSparse(ni) || NInoCompressed(ni)) {
- ni->itype.compressed.size += new_alloc_size - allocated_size;
- a->data.non_resident.compressed_size =
- cpu_to_sle64(ni->itype.compressed.size);
- vi->i_blocks = ni->itype.compressed.size >> 9;
- } else
- vi->i_blocks = new_alloc_size >> 9;
- write_unlock_irqrestore(&ni->size_lock, flags);
-alloc_done:
- if (new_data_size >= 0) {
- BUG_ON(new_data_size <
- sle64_to_cpu(a->data.non_resident.data_size));
- a->data.non_resident.data_size = cpu_to_sle64(new_data_size);
- }
-flush_done:
- /* Ensure the changes make it to disk. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
-done:
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
- ntfs_debug("Done, new_allocated_size 0x%llx.",
- (unsigned long long)new_alloc_size);
- return new_alloc_size;
-restore_undo_alloc:
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Cannot complete extension of allocation "
- "of inode 0x%lx, attribute type 0x%x, because "
- "lookup of first attribute extent failed with "
- "error code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- if (err == -ENOENT)
- err = -EIO;
- ntfs_attr_reinit_search_ctx(ctx);
- if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE,
- allocated_size >> vol->cluster_size_bits, NULL, 0,
- ctx)) {
- ntfs_error(vol->sb, "Failed to find last attribute extent of "
- "attribute in error code path. Run chkdsk to "
- "recover.");
- write_lock_irqsave(&ni->size_lock, flags);
- ni->allocated_size = new_alloc_size;
- /*
- * FIXME: This would fail if @ni is a directory... See above.
- * FIXME: The calculation is wrong if we created a hole above.
- * For now it does not matter as we never create holes.
- */
- if (NInoSparse(ni) || NInoCompressed(ni)) {
- ni->itype.compressed.size += new_alloc_size -
- allocated_size;
- vi->i_blocks = ni->itype.compressed.size >> 9;
- } else
- vi->i_blocks = new_alloc_size >> 9;
- write_unlock_irqrestore(&ni->size_lock, flags);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
- /*
- * The only thing that is now wrong is the allocated size of the
- * base attribute extent which chkdsk should be able to fix.
- */
- NVolSetErrors(vol);
- return err;
- }
- ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64(
- (allocated_size >> vol->cluster_size_bits) - 1);
-undo_alloc:
- ll = allocated_size >> vol->cluster_size_bits;
- if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) {
- ntfs_error(vol->sb, "Failed to release allocated cluster(s) "
- "in error code path. Run chkdsk to recover "
- "the lost cluster(s).");
- NVolSetErrors(vol);
- }
- m = ctx->mrec;
- a = ctx->attr;
- /*
- * If the runlist truncation fails and/or the search context is no
- * longer valid, we cannot resize the attribute record or build the
- * mapping pairs array thus we mark the inode bad so that no access to
- * the freed clusters can happen.
- */
- if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) {
- ntfs_error(vol->sb, "Failed to %s in error code path. Run "
- "chkdsk to recover.", IS_ERR(m) ?
- "restore attribute search context" :
- "truncate attribute runlist");
- NVolSetErrors(vol);
- } else if (mp_rebuilt) {
- if (ntfs_attr_record_resize(m, a, attr_len)) {
- ntfs_error(vol->sb, "Failed to restore attribute "
- "record in error code path. Run "
- "chkdsk to recover.");
- NVolSetErrors(vol);
- } else /* if (success) */ {
- if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
- a->data.non_resident.
- mapping_pairs_offset), attr_len -
- le16_to_cpu(a->data.non_resident.
- mapping_pairs_offset), rl2, ll, -1,
- NULL)) {
- ntfs_error(vol->sb, "Failed to restore "
- "mapping pairs array in error "
- "code path. Run chkdsk to "
- "recover.");
- NVolSetErrors(vol);
- }
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- }
- }
-err_out:
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
-conv_err_out:
- ntfs_debug("Failed. Returning error code %i.", err);
- return err;
-}
-
-/**
- * ntfs_attr_set - fill (a part of) an attribute with a byte
- * @ni: ntfs inode describing the attribute to fill
- * @ofs: offset inside the attribute at which to start to fill
- * @cnt: number of bytes to fill
- * @val: the unsigned 8-bit value with which to fill the attribute
- *
- * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at
- * byte offset @ofs inside the attribute with the constant byte @val.
- *
- * This function is effectively like memset() applied to an ntfs attribute.
- * Note this function actually only operates on the page cache pages belonging
- * to the ntfs attribute and it marks them dirty after doing the memset().
- * Thus it relies on the vm dirty page write code paths to cause the modified
- * pages to be written to the mft record/disk.
- *
- * Return 0 on success and -errno on error. An error code of -ESPIPE means
- * that @ofs + @cnt were outside the end of the attribute and no write was
- * performed.
- */
-int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
-{
- ntfs_volume *vol = ni->vol;
- struct address_space *mapping;
- struct page *page;
- u8 *kaddr;
- pgoff_t idx, end;
- unsigned start_ofs, end_ofs, size;
-
- ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.",
- (long long)ofs, (long long)cnt, val);
- BUG_ON(ofs < 0);
- BUG_ON(cnt < 0);
- if (!cnt)
- goto done;
- /*
- * FIXME: Compressed and encrypted attributes are not supported when
- * writing and we should never have gotten here for them.
- */
- BUG_ON(NInoCompressed(ni));
- BUG_ON(NInoEncrypted(ni));
- mapping = VFS_I(ni)->i_mapping;
- /* Work out the starting index and page offset. */
- idx = ofs >> PAGE_SHIFT;
- start_ofs = ofs & ~PAGE_MASK;
- /* Work out the ending index and page offset. */
- end = ofs + cnt;
- end_ofs = end & ~PAGE_MASK;
- /* If the end is outside the inode size return -ESPIPE. */
- if (unlikely(end > i_size_read(VFS_I(ni)))) {
- ntfs_error(vol->sb, "Request exceeds end of attribute.");
- return -ESPIPE;
- }
- end >>= PAGE_SHIFT;
- /* If there is a first partial page, need to do it the slow way. */
- if (start_ofs) {
- page = read_mapping_page(mapping, idx, NULL);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to read first partial "
- "page (error, index 0x%lx).", idx);
- return PTR_ERR(page);
- }
- /*
- * If the last page is the same as the first page, need to
- * limit the write to the end offset.
- */
- size = PAGE_SIZE;
- if (idx == end)
- size = end_ofs;
- kaddr = kmap_atomic(page);
- memset(kaddr + start_ofs, val, size - start_ofs);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
- set_page_dirty(page);
- put_page(page);
- balance_dirty_pages_ratelimited(mapping);
- cond_resched();
- if (idx == end)
- goto done;
- idx++;
- }
- /* Do the whole pages the fast way. */
- for (; idx < end; idx++) {
- /* Find or create the current page. (The page is locked.) */
- page = grab_cache_page(mapping, idx);
- if (unlikely(!page)) {
- ntfs_error(vol->sb, "Insufficient memory to grab "
- "page (index 0x%lx).", idx);
- return -ENOMEM;
- }
- kaddr = kmap_atomic(page);
- memset(kaddr, val, PAGE_SIZE);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
- /*
- * If the page has buffers, mark them uptodate since buffer
- * state and not page state is definitive in 2.6 kernels.
- */
- if (page_has_buffers(page)) {
- struct buffer_head *bh, *head;
-
- bh = head = page_buffers(page);
- do {
- set_buffer_uptodate(bh);
- } while ((bh = bh->b_this_page) != head);
- }
- /* Now that buffers are uptodate, set the page uptodate, too. */
- SetPageUptodate(page);
- /*
- * Set the page and all its buffers dirty and mark the inode
- * dirty, too. The VM will write the page later on.
- */
- set_page_dirty(page);
- /* Finally unlock and release the page. */
- unlock_page(page);
- put_page(page);
- balance_dirty_pages_ratelimited(mapping);
- cond_resched();
- }
- /* If there is a last partial page, need to do it the slow way. */
- if (end_ofs) {
- page = read_mapping_page(mapping, idx, NULL);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to read last partial page "
- "(error, index 0x%lx).", idx);
- return PTR_ERR(page);
- }
- kaddr = kmap_atomic(page);
- memset(kaddr, val, end_ofs);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
- set_page_dirty(page);
- put_page(page);
- balance_dirty_pages_ratelimited(mapping);
- cond_resched();
- }
-done:
- ntfs_debug("Done.");
- return 0;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
deleted file mode 100644
index fe0890d3d072..000000000000
--- a/fs/ntfs/attrib.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * attrib.h - Defines for attribute handling in NTFS Linux kernel driver.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_ATTRIB_H
-#define _LINUX_NTFS_ATTRIB_H
-
-#include "endian.h"
-#include "types.h"
-#include "layout.h"
-#include "inode.h"
-#include "runlist.h"
-#include "volume.h"
-
-/**
- * ntfs_attr_search_ctx - used in attribute search functions
- * @mrec: buffer containing mft record to search
- * @attr: attribute record in @mrec where to begin/continue search
- * @is_first: if true ntfs_attr_lookup() begins search with @attr, else after
- *
- * Structure must be initialized to zero before the first call to one of the
- * attribute search functions. Initialize @mrec to point to the mft record to
- * search, and @attr to point to the first attribute within @mrec (not necessary
- * if calling the _first() functions), and set @is_first to 'true' (not necessary
- * if calling the _first() functions).
- *
- * If @is_first is 'true', the search begins with @attr. If @is_first is 'false',
- * the search begins after @attr. This is so that, after the first call to one
- * of the search attribute functions, we can call the function again, without
- * any modification of the search context, to automagically get the next
- * matching attribute.
- */
-typedef struct {
- MFT_RECORD *mrec;
- ATTR_RECORD *attr;
- bool is_first;
- ntfs_inode *ntfs_ino;
- ATTR_LIST_ENTRY *al_entry;
- ntfs_inode *base_ntfs_ino;
- MFT_RECORD *base_mrec;
- ATTR_RECORD *base_attr;
-} ntfs_attr_search_ctx;
-
-extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn,
- ntfs_attr_search_ctx *ctx);
-extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
-
-extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
- const bool write_locked);
-
-extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni,
- const VCN vcn, ntfs_attr_search_ctx *ctx);
-
-int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
- const u32 name_len, const IGNORE_CASE_BOOL ic,
- const VCN lowest_vcn, const u8 *val, const u32 val_len,
- ntfs_attr_search_ctx *ctx);
-
-extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start,
- const s64 size, const s64 initialized_size);
-
-static inline s64 ntfs_attr_size(const ATTR_RECORD *a)
-{
- if (!a->non_resident)
- return (s64)le32_to_cpu(a->data.resident.value_length);
- return sle64_to_cpu(a->data.non_resident.data_size);
-}
-
-extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx);
-extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni,
- MFT_RECORD *mrec);
-extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx);
-
-#ifdef NTFS_RW
-
-extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol,
- const ATTR_TYPE type, const s64 size);
-extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol,
- const ATTR_TYPE type);
-extern int ntfs_attr_can_be_resident(const ntfs_volume *vol,
- const ATTR_TYPE type);
-
-extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
-extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
- const u32 new_size);
-
-extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size);
-
-extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
- const s64 new_data_size, const s64 data_start);
-
-extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt,
- const u8 val);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_ATTRIB_H */
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
deleted file mode 100644
index 0675b2400873..000000000000
--- a/fs/ntfs/bitmap.c
+++ /dev/null
@@ -1,179 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * bitmap.c - NTFS kernel bitmap handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/pagemap.h>
-
-#include "bitmap.h"
-#include "debug.h"
-#include "aops.h"
-#include "ntfs.h"
-
-/**
- * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
- * @vi: vfs inode describing the bitmap
- * @start_bit: first bit to set
- * @count: number of bits to set
- * @value: value to set the bits to (i.e. 0 or 1)
- * @is_rollback: if 'true' this is a rollback operation
- *
- * Set @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi to @value, where @value is either 0 or 1.
- *
- * @is_rollback should always be 'false', it is for internal use to rollback
- * errors. You probably want to use ntfs_bitmap_set_bits_in_run() instead.
- *
- * Return 0 on success and -errno on error.
- */
-int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
- const s64 count, const u8 value, const bool is_rollback)
-{
- s64 cnt = count;
- pgoff_t index, end_index;
- struct address_space *mapping;
- struct page *page;
- u8 *kaddr;
- int pos, len;
- u8 bit;
-
- BUG_ON(!vi);
- ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, "
- "value %u.%s", vi->i_ino, (unsigned long long)start_bit,
- (unsigned long long)cnt, (unsigned int)value,
- is_rollback ? " (rollback)" : "");
- BUG_ON(start_bit < 0);
- BUG_ON(cnt < 0);
- BUG_ON(value > 1);
- /*
- * Calculate the indices for the pages containing the first and last
- * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively.
- */
- index = start_bit >> (3 + PAGE_SHIFT);
- end_index = (start_bit + cnt - 1) >> (3 + PAGE_SHIFT);
-
- /* Get the page containing the first bit (@start_bit). */
- mapping = vi->i_mapping;
- page = ntfs_map_page(mapping, index);
- if (IS_ERR(page)) {
- if (!is_rollback)
- ntfs_error(vi->i_sb, "Failed to map first page (error "
- "%li), aborting.", PTR_ERR(page));
- return PTR_ERR(page);
- }
- kaddr = page_address(page);
-
- /* Set @pos to the position of the byte containing @start_bit. */
- pos = (start_bit >> 3) & ~PAGE_MASK;
-
- /* Calculate the position of @start_bit in the first byte. */
- bit = start_bit & 7;
-
- /* If the first byte is partial, modify the appropriate bits in it. */
- if (bit) {
- u8 *byte = kaddr + pos;
- while ((bit & 7) && cnt) {
- cnt--;
- if (value)
- *byte |= 1 << bit++;
- else
- *byte &= ~(1 << bit++);
- }
- /* If we are done, unmap the page and return success. */
- if (!cnt)
- goto done;
-
- /* Update @pos to the new position. */
- pos++;
- }
- /*
- * Depending on @value, modify all remaining whole bytes in the page up
- * to @cnt.
- */
- len = min_t(s64, cnt >> 3, PAGE_SIZE - pos);
- memset(kaddr + pos, value ? 0xff : 0, len);
- cnt -= len << 3;
-
- /* Update @len to point to the first not-done byte in the page. */
- if (cnt < 8)
- len += pos;
-
- /* If we are not in the last page, deal with all subsequent pages. */
- while (index < end_index) {
- BUG_ON(cnt <= 0);
-
- /* Update @index and get the next page. */
- flush_dcache_page(page);
- set_page_dirty(page);
- ntfs_unmap_page(page);
- page = ntfs_map_page(mapping, ++index);
- if (IS_ERR(page))
- goto rollback;
- kaddr = page_address(page);
- /*
- * Depending on @value, modify all remaining whole bytes in the
- * page up to @cnt.
- */
- len = min_t(s64, cnt >> 3, PAGE_SIZE);
- memset(kaddr, value ? 0xff : 0, len);
- cnt -= len << 3;
- }
- /*
- * The currently mapped page is the last one. If the last byte is
- * partial, modify the appropriate bits in it. Note, @len is the
- * position of the last byte inside the page.
- */
- if (cnt) {
- u8 *byte;
-
- BUG_ON(cnt > 7);
-
- bit = cnt;
- byte = kaddr + len;
- while (bit--) {
- if (value)
- *byte |= 1 << bit;
- else
- *byte &= ~(1 << bit);
- }
- }
-done:
- /* We are done. Unmap the page and return success. */
- flush_dcache_page(page);
- set_page_dirty(page);
- ntfs_unmap_page(page);
- ntfs_debug("Done.");
- return 0;
-rollback:
- /*
- * Current state:
- * - no pages are mapped
- * - @count - @cnt is the number of bits that have been modified
- */
- if (is_rollback)
- return PTR_ERR(page);
- if (count != cnt)
- pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt,
- value ? 0 : 1, true);
- else
- pos = 0;
- if (!pos) {
- /* Rollback was successful. */
- ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
- "%li), aborting.", PTR_ERR(page));
- } else {
- /* Rollback failed. */
- ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
- "%li) and rollback failed (error %i). "
- "Aborting and leaving inconsistent metadata. "
- "Unmount and run chkdsk.", PTR_ERR(page), pos);
- NVolSetErrors(NTFS_SB(vi->i_sb));
- }
- return PTR_ERR(page);
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h
deleted file mode 100644
index 9dd2224ca9c4..000000000000
--- a/fs/ntfs/bitmap.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * bitmap.h - Defines for NTFS kernel bitmap handling. Part of the Linux-NTFS
- * project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_BITMAP_H
-#define _LINUX_NTFS_BITMAP_H
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-
-#include "types.h"
-
-extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
- const s64 count, const u8 value, const bool is_rollback);
-
-/**
- * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
- * @vi: vfs inode describing the bitmap
- * @start_bit: first bit to set
- * @count: number of bits to set
- * @value: value to set the bits to (i.e. 0 or 1)
- *
- * Set @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi to @value, where @value is either 0 or 1.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi,
- const s64 start_bit, const s64 count, const u8 value)
-{
- return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value,
- false);
-}
-
-/**
- * ntfs_bitmap_set_run - set a run of bits in a bitmap
- * @vi: vfs inode describing the bitmap
- * @start_bit: first bit to set
- * @count: number of bits to set
- *
- * Set @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit,
- const s64 count)
-{
- return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1);
-}
-
-/**
- * ntfs_bitmap_clear_run - clear a run of bits in a bitmap
- * @vi: vfs inode describing the bitmap
- * @start_bit: first bit to clear
- * @count: number of bits to clear
- *
- * Clear @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit,
- const s64 count)
-{
- return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0);
-}
-
-/**
- * ntfs_bitmap_set_bit - set a bit in a bitmap
- * @vi: vfs inode describing the bitmap
- * @bit: bit to set
- *
- * Set bit @bit in the bitmap described by the vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit)
-{
- return ntfs_bitmap_set_run(vi, bit, 1);
-}
-
-/**
- * ntfs_bitmap_clear_bit - clear a bit in a bitmap
- * @vi: vfs inode describing the bitmap
- * @bit: bit to clear
- *
- * Clear bit @bit in the bitmap described by the vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit)
-{
- return ntfs_bitmap_clear_run(vi, bit, 1);
-}
-
-#endif /* NTFS_RW */
-
-#endif /* defined _LINUX_NTFS_BITMAP_H */
diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c
deleted file mode 100644
index 3ab6ec96abfe..000000000000
--- a/fs/ntfs/collate.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * collate.c - NTFS kernel collation handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#include "collate.h"
-#include "debug.h"
-#include "ntfs.h"
-
-static int ntfs_collate_binary(ntfs_volume *vol,
- const void *data1, const int data1_len,
- const void *data2, const int data2_len)
-{
- int rc;
-
- ntfs_debug("Entering.");
- rc = memcmp(data1, data2, min(data1_len, data2_len));
- if (!rc && (data1_len != data2_len)) {
- if (data1_len < data2_len)
- rc = -1;
- else
- rc = 1;
- }
- ntfs_debug("Done, returning %i", rc);
- return rc;
-}
-
-static int ntfs_collate_ntofs_ulong(ntfs_volume *vol,
- const void *data1, const int data1_len,
- const void *data2, const int data2_len)
-{
- int rc;
- u32 d1, d2;
-
- ntfs_debug("Entering.");
- // FIXME: We don't really want to bug here.
- BUG_ON(data1_len != data2_len);
- BUG_ON(data1_len != 4);
- d1 = le32_to_cpup(data1);
- d2 = le32_to_cpup(data2);
- if (d1 < d2)
- rc = -1;
- else {
- if (d1 == d2)
- rc = 0;
- else
- rc = 1;
- }
- ntfs_debug("Done, returning %i", rc);
- return rc;
-}
-
-typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int,
- const void *, const int);
-
-static ntfs_collate_func_t ntfs_do_collate0x0[3] = {
- ntfs_collate_binary,
- NULL/*ntfs_collate_file_name*/,
- NULL/*ntfs_collate_unicode_string*/,
-};
-
-static ntfs_collate_func_t ntfs_do_collate0x1[4] = {
- ntfs_collate_ntofs_ulong,
- NULL/*ntfs_collate_ntofs_sid*/,
- NULL/*ntfs_collate_ntofs_security_hash*/,
- NULL/*ntfs_collate_ntofs_ulongs*/,
-};
-
-/**
- * ntfs_collate - collate two data items using a specified collation rule
- * @vol: ntfs volume to which the data items belong
- * @cr: collation rule to use when comparing the items
- * @data1: first data item to collate
- * @data1_len: length in bytes of @data1
- * @data2: second data item to collate
- * @data2_len: length in bytes of @data2
- *
- * Collate the two data items @data1 and @data2 using the collation rule @cr
- * and return -1, 0, ir 1 if @data1 is found, respectively, to collate before,
- * to match, or to collate after @data2.
- *
- * For speed we use the collation rule @cr as an index into two tables of
- * function pointers to call the appropriate collation function.
- */
-int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
- const void *data1, const int data1_len,
- const void *data2, const int data2_len) {
- int i;
-
- ntfs_debug("Entering.");
- /*
- * FIXME: At the moment we only support COLLATION_BINARY and
- * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now.
- */
- BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG);
- i = le32_to_cpu(cr);
- BUG_ON(i < 0);
- if (i <= 0x02)
- return ntfs_do_collate0x0[i](vol, data1, data1_len,
- data2, data2_len);
- BUG_ON(i < 0x10);
- i -= 0x10;
- if (likely(i <= 3))
- return ntfs_do_collate0x1[i](vol, data1, data1_len,
- data2, data2_len);
- BUG();
- return 0;
-}
diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h
deleted file mode 100644
index f2255619b4f4..000000000000
--- a/fs/ntfs/collate.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * collate.h - Defines for NTFS kernel collation handling. Part of the
- * Linux-NTFS project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_COLLATE_H
-#define _LINUX_NTFS_COLLATE_H
-
-#include "types.h"
-#include "volume.h"
-
-static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
- int i;
-
- /*
- * FIXME: At the moment we only support COLLATION_BINARY and
- * COLLATION_NTOFS_ULONG, so we return false for everything else for
- * now.
- */
- if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG))
- return false;
- i = le32_to_cpu(cr);
- if (likely(((i >= 0) && (i <= 0x02)) ||
- ((i >= 0x10) && (i <= 0x13))))
- return true;
- return false;
-}
-
-extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
- const void *data1, const int data1_len,
- const void *data2, const int data2_len);
-
-#endif /* _LINUX_NTFS_COLLATE_H */
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
deleted file mode 100644
index 761aaa0195d6..000000000000
--- a/fs/ntfs/compress.c
+++ /dev/null
@@ -1,950 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * compress.c - NTFS kernel compressed attributes handling.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/blkdev.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-
-#include "attrib.h"
-#include "inode.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/**
- * ntfs_compression_constants - enum of constants used in the compression code
- */
-typedef enum {
- /* Token types and access mask. */
- NTFS_SYMBOL_TOKEN = 0,
- NTFS_PHRASE_TOKEN = 1,
- NTFS_TOKEN_MASK = 1,
-
- /* Compression sub-block constants. */
- NTFS_SB_SIZE_MASK = 0x0fff,
- NTFS_SB_SIZE = 0x1000,
- NTFS_SB_IS_COMPRESSED = 0x8000,
-
- /*
- * The maximum compression block size is by definition 16 * the cluster
- * size, with the maximum supported cluster size being 4kiB. Thus the
- * maximum compression buffer size is 64kiB, so we use this when
- * initializing the compression buffer.
- */
- NTFS_MAX_CB_SIZE = 64 * 1024,
-} ntfs_compression_constants;
-
-/*
- * ntfs_compression_buffer - one buffer for the decompression engine
- */
-static u8 *ntfs_compression_buffer;
-
-/*
- * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer
- */
-static DEFINE_SPINLOCK(ntfs_cb_lock);
-
-/**
- * allocate_compression_buffers - allocate the decompression buffers
- *
- * Caller has to hold the ntfs_lock mutex.
- *
- * Return 0 on success or -ENOMEM if the allocations failed.
- */
-int allocate_compression_buffers(void)
-{
- BUG_ON(ntfs_compression_buffer);
-
- ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE);
- if (!ntfs_compression_buffer)
- return -ENOMEM;
- return 0;
-}
-
-/**
- * free_compression_buffers - free the decompression buffers
- *
- * Caller has to hold the ntfs_lock mutex.
- */
-void free_compression_buffers(void)
-{
- BUG_ON(!ntfs_compression_buffer);
- vfree(ntfs_compression_buffer);
- ntfs_compression_buffer = NULL;
-}
-
-/**
- * zero_partial_compressed_page - zero out of bounds compressed page region
- */
-static void zero_partial_compressed_page(struct page *page,
- const s64 initialized_size)
-{
- u8 *kp = page_address(page);
- unsigned int kp_ofs;
-
- ntfs_debug("Zeroing page region outside initialized size.");
- if (((s64)page->index << PAGE_SHIFT) >= initialized_size) {
- clear_page(kp);
- return;
- }
- kp_ofs = initialized_size & ~PAGE_MASK;
- memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs);
- return;
-}
-
-/**
- * handle_bounds_compressed_page - test for&handle out of bounds compressed page
- */
-static inline void handle_bounds_compressed_page(struct page *page,
- const loff_t i_size, const s64 initialized_size)
-{
- if ((page->index >= (initialized_size >> PAGE_SHIFT)) &&
- (initialized_size < i_size))
- zero_partial_compressed_page(page, initialized_size);
- return;
-}
-
-/**
- * ntfs_decompress - decompress a compression block into an array of pages
- * @dest_pages: destination array of pages
- * @completed_pages: scratch space to track completed pages
- * @dest_index: current index into @dest_pages (IN/OUT)
- * @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT)
- * @dest_max_index: maximum index into @dest_pages (IN)
- * @dest_max_ofs: maximum offset within @dest_pages[@dest_max_index] (IN)
- * @xpage: the target page (-1 if none) (IN)
- * @xpage_done: set to 1 if xpage was completed successfully (IN/OUT)
- * @cb_start: compression block to decompress (IN)
- * @cb_size: size of compression block @cb_start in bytes (IN)
- * @i_size: file size when we started the read (IN)
- * @initialized_size: initialized file size when we started the read (IN)
- *
- * The caller must have disabled preemption. ntfs_decompress() reenables it when
- * the critical section is finished.
- *
- * This decompresses the compression block @cb_start into the array of
- * destination pages @dest_pages starting at index @dest_index into @dest_pages
- * and at offset @dest_pos into the page @dest_pages[@dest_index].
- *
- * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1.
- * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified.
- *
- * @cb_start is a pointer to the compression block which needs decompressing
- * and @cb_size is the size of @cb_start in bytes (8-64kiB).
- *
- * Return 0 if success or -EOVERFLOW on error in the compressed stream.
- * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was
- * completed during the decompression of the compression block (@cb_start).
- *
- * Warning: This function *REQUIRES* PAGE_SIZE >= 4096 or it will blow up
- * unpredicatbly! You have been warned!
- *
- * Note to hackers: This function may not sleep until it has finished accessing
- * the compression block @cb_start as it is a per-CPU buffer.
- */
-static int ntfs_decompress(struct page *dest_pages[], int completed_pages[],
- int *dest_index, int *dest_ofs, const int dest_max_index,
- const int dest_max_ofs, const int xpage, char *xpage_done,
- u8 *const cb_start, const u32 cb_size, const loff_t i_size,
- const s64 initialized_size)
-{
- /*
- * Pointers into the compressed data, i.e. the compression block (cb),
- * and the therein contained sub-blocks (sb).
- */
- u8 *cb_end = cb_start + cb_size; /* End of cb. */
- u8 *cb = cb_start; /* Current position in cb. */
- u8 *cb_sb_start; /* Beginning of the current sb in the cb. */
- u8 *cb_sb_end; /* End of current sb / beginning of next sb. */
-
- /* Variables for uncompressed data / destination. */
- struct page *dp; /* Current destination page being worked on. */
- u8 *dp_addr; /* Current pointer into dp. */
- u8 *dp_sb_start; /* Start of current sub-block in dp. */
- u8 *dp_sb_end; /* End of current sb in dp (dp_sb_start +
- NTFS_SB_SIZE). */
- u16 do_sb_start; /* @dest_ofs when starting this sub-block. */
- u16 do_sb_end; /* @dest_ofs of end of this sb (do_sb_start +
- NTFS_SB_SIZE). */
-
- /* Variables for tag and token parsing. */
- u8 tag; /* Current tag. */
- int token; /* Loop counter for the eight tokens in tag. */
- int nr_completed_pages = 0;
-
- /* Default error code. */
- int err = -EOVERFLOW;
-
- ntfs_debug("Entering, cb_size = 0x%x.", cb_size);
-do_next_sb:
- ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.",
- cb - cb_start);
- /*
- * Have we reached the end of the compression block or the end of the
- * decompressed data? The latter can happen for example if the current
- * position in the compression block is one byte before its end so the
- * first two checks do not detect it.
- */
- if (cb == cb_end || !le16_to_cpup((le16*)cb) ||
- (*dest_index == dest_max_index &&
- *dest_ofs == dest_max_ofs)) {
- int i;
-
- ntfs_debug("Completed. Returning success (0).");
- err = 0;
-return_error:
- /* We can sleep from now on, so we drop lock. */
- spin_unlock(&ntfs_cb_lock);
- /* Second stage: finalize completed pages. */
- if (nr_completed_pages > 0) {
- for (i = 0; i < nr_completed_pages; i++) {
- int di = completed_pages[i];
-
- dp = dest_pages[di];
- /*
- * If we are outside the initialized size, zero
- * the out of bounds page range.
- */
- handle_bounds_compressed_page(dp, i_size,
- initialized_size);
- flush_dcache_page(dp);
- kunmap(dp);
- SetPageUptodate(dp);
- unlock_page(dp);
- if (di == xpage)
- *xpage_done = 1;
- else
- put_page(dp);
- dest_pages[di] = NULL;
- }
- }
- return err;
- }
-
- /* Setup offsets for the current sub-block destination. */
- do_sb_start = *dest_ofs;
- do_sb_end = do_sb_start + NTFS_SB_SIZE;
-
- /* Check that we are still within allowed boundaries. */
- if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs)
- goto return_overflow;
-
- /* Does the minimum size of a compressed sb overflow valid range? */
- if (cb + 6 > cb_end)
- goto return_overflow;
-
- /* Setup the current sub-block source pointers and validate range. */
- cb_sb_start = cb;
- cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK)
- + 3;
- if (cb_sb_end > cb_end)
- goto return_overflow;
-
- /* Get the current destination page. */
- dp = dest_pages[*dest_index];
- if (!dp) {
- /* No page present. Skip decompression of this sub-block. */
- cb = cb_sb_end;
-
- /* Advance destination position to next sub-block. */
- *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK;
- if (!*dest_ofs && (++*dest_index > dest_max_index))
- goto return_overflow;
- goto do_next_sb;
- }
-
- /* We have a valid destination page. Setup the destination pointers. */
- dp_addr = (u8*)page_address(dp) + do_sb_start;
-
- /* Now, we are ready to process the current sub-block (sb). */
- if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) {
- ntfs_debug("Found uncompressed sub-block.");
- /* This sb is not compressed, just copy it into destination. */
-
- /* Advance source position to first data byte. */
- cb += 2;
-
- /* An uncompressed sb must be full size. */
- if (cb_sb_end - cb != NTFS_SB_SIZE)
- goto return_overflow;
-
- /* Copy the block and advance the source position. */
- memcpy(dp_addr, cb, NTFS_SB_SIZE);
- cb += NTFS_SB_SIZE;
-
- /* Advance destination position to next sub-block. */
- *dest_ofs += NTFS_SB_SIZE;
- if (!(*dest_ofs &= ~PAGE_MASK)) {
-finalize_page:
- /*
- * First stage: add current page index to array of
- * completed pages.
- */
- completed_pages[nr_completed_pages++] = *dest_index;
- if (++*dest_index > dest_max_index)
- goto return_overflow;
- }
- goto do_next_sb;
- }
- ntfs_debug("Found compressed sub-block.");
- /* This sb is compressed, decompress it into destination. */
-
- /* Setup destination pointers. */
- dp_sb_start = dp_addr;
- dp_sb_end = dp_sb_start + NTFS_SB_SIZE;
-
- /* Forward to the first tag in the sub-block. */
- cb += 2;
-do_next_tag:
- if (cb == cb_sb_end) {
- /* Check if the decompressed sub-block was not full-length. */
- if (dp_addr < dp_sb_end) {
- int nr_bytes = do_sb_end - *dest_ofs;
-
- ntfs_debug("Filling incomplete sub-block with "
- "zeroes.");
- /* Zero remainder and update destination position. */
- memset(dp_addr, 0, nr_bytes);
- *dest_ofs += nr_bytes;
- }
- /* We have finished the current sub-block. */
- if (!(*dest_ofs &= ~PAGE_MASK))
- goto finalize_page;
- goto do_next_sb;
- }
-
- /* Check we are still in range. */
- if (cb > cb_sb_end || dp_addr > dp_sb_end)
- goto return_overflow;
-
- /* Get the next tag and advance to first token. */
- tag = *cb++;
-
- /* Parse the eight tokens described by the tag. */
- for (token = 0; token < 8; token++, tag >>= 1) {
- u16 lg, pt, length, max_non_overlap;
- register u16 i;
- u8 *dp_back_addr;
-
- /* Check if we are done / still in range. */
- if (cb >= cb_sb_end || dp_addr > dp_sb_end)
- break;
-
- /* Determine token type and parse appropriately.*/
- if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) {
- /*
- * We have a symbol token, copy the symbol across, and
- * advance the source and destination positions.
- */
- *dp_addr++ = *cb++;
- ++*dest_ofs;
-
- /* Continue with the next token. */
- continue;
- }
-
- /*
- * We have a phrase token. Make sure it is not the first tag in
- * the sb as this is illegal and would confuse the code below.
- */
- if (dp_addr == dp_sb_start)
- goto return_overflow;
-
- /*
- * Determine the number of bytes to go back (p) and the number
- * of bytes to copy (l). We use an optimized algorithm in which
- * we first calculate log2(current destination position in sb),
- * which allows determination of l and p in O(1) rather than
- * O(n). We just need an arch-optimized log2() function now.
- */
- lg = 0;
- for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1)
- lg++;
-
- /* Get the phrase token into i. */
- pt = le16_to_cpup((le16*)cb);
-
- /*
- * Calculate starting position of the byte sequence in
- * the destination using the fact that p = (pt >> (12 - lg)) + 1
- * and make sure we don't go too far back.
- */
- dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1;
- if (dp_back_addr < dp_sb_start)
- goto return_overflow;
-
- /* Now calculate the length of the byte sequence. */
- length = (pt & (0xfff >> lg)) + 3;
-
- /* Advance destination position and verify it is in range. */
- *dest_ofs += length;
- if (*dest_ofs > do_sb_end)
- goto return_overflow;
-
- /* The number of non-overlapping bytes. */
- max_non_overlap = dp_addr - dp_back_addr;
-
- if (length <= max_non_overlap) {
- /* The byte sequence doesn't overlap, just copy it. */
- memcpy(dp_addr, dp_back_addr, length);
-
- /* Advance destination pointer. */
- dp_addr += length;
- } else {
- /*
- * The byte sequence does overlap, copy non-overlapping
- * part and then do a slow byte by byte copy for the
- * overlapping part. Also, advance the destination
- * pointer.
- */
- memcpy(dp_addr, dp_back_addr, max_non_overlap);
- dp_addr += max_non_overlap;
- dp_back_addr += max_non_overlap;
- length -= max_non_overlap;
- while (length--)
- *dp_addr++ = *dp_back_addr++;
- }
-
- /* Advance source position and continue with the next token. */
- cb += 2;
- }
-
- /* No tokens left in the current tag. Continue with the next tag. */
- goto do_next_tag;
-
-return_overflow:
- ntfs_error(NULL, "Failed. Returning -EOVERFLOW.");
- goto return_error;
-}
-
-/**
- * ntfs_read_compressed_block - read a compressed block into the page cache
- * @page: locked page in the compression block(s) we need to read
- *
- * When we are called the page has already been verified to be locked and the
- * attribute is known to be non-resident, not encrypted, but compressed.
- *
- * 1. Determine which compression block(s) @page is in.
- * 2. Get hold of all pages corresponding to this/these compression block(s).
- * 3. Read the (first) compression block.
- * 4. Decompress it into the corresponding pages.
- * 5. Throw the compressed data away and proceed to 3. for the next compression
- * block or return success if no more compression blocks left.
- *
- * Warning: We have to be careful what we do about existing pages. They might
- * have been written to so that we would lose data if we were to just overwrite
- * them with the out-of-date uncompressed data.
- *
- * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at
- * the end of the file I think. We need to detect this case and zero the out
- * of bounds remainder of the page in question and mark it as handled. At the
- * moment we would just return -EIO on such a page. This bug will only become
- * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte
- * clusters so is probably not going to be seen by anyone. Still this should
- * be fixed. (AIA)
- *
- * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in
- * handling sparse and compressed cbs. (AIA)
- *
- * FIXME: At the moment we don't do any zeroing out in the case that
- * initialized_size is less than data_size. This should be safe because of the
- * nature of the compression algorithm used. Just in case we check and output
- * an error message in read inode if the two sizes are not equal for a
- * compressed file. (AIA)
- */
-int ntfs_read_compressed_block(struct page *page)
-{
- loff_t i_size;
- s64 initialized_size;
- struct address_space *mapping = page->mapping;
- ntfs_inode *ni = NTFS_I(mapping->host);
- ntfs_volume *vol = ni->vol;
- struct super_block *sb = vol->sb;
- runlist_element *rl;
- unsigned long flags, block_size = sb->s_blocksize;
- unsigned char block_size_bits = sb->s_blocksize_bits;
- u8 *cb, *cb_pos, *cb_end;
- struct buffer_head **bhs;
- unsigned long offset, index = page->index;
- u32 cb_size = ni->itype.compressed.block_size;
- u64 cb_size_mask = cb_size - 1UL;
- VCN vcn;
- LCN lcn;
- /* The first wanted vcn (minimum alignment is PAGE_SIZE). */
- VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >>
- vol->cluster_size_bits;
- /*
- * The first vcn after the last wanted vcn (minimum alignment is again
- * PAGE_SIZE.
- */
- VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1)
- & ~cb_size_mask) >> vol->cluster_size_bits;
- /* Number of compression blocks (cbs) in the wanted vcn range. */
- unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits
- >> ni->itype.compressed.block_size_bits;
- /*
- * Number of pages required to store the uncompressed data from all
- * compression blocks (cbs) overlapping @page. Due to alignment
- * guarantees of start_vcn and end_vcn, no need to round up here.
- */
- unsigned int nr_pages = (end_vcn - start_vcn) <<
- vol->cluster_size_bits >> PAGE_SHIFT;
- unsigned int xpage, max_page, cur_page, cur_ofs, i;
- unsigned int cb_clusters, cb_max_ofs;
- int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0;
- struct page **pages;
- int *completed_pages;
- unsigned char xpage_done = 0;
-
- ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = "
- "%i.", index, cb_size, nr_pages);
- /*
- * Bad things happen if we get here for anything that is not an
- * unnamed $DATA attribute.
- */
- BUG_ON(ni->type != AT_DATA);
- BUG_ON(ni->name_len);
-
- pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
- completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS);
-
- /* Allocate memory to store the buffer heads we need. */
- bhs_size = cb_size / block_size * sizeof(struct buffer_head *);
- bhs = kmalloc(bhs_size, GFP_NOFS);
-
- if (unlikely(!pages || !bhs || !completed_pages)) {
- kfree(bhs);
- kfree(pages);
- kfree(completed_pages);
- unlock_page(page);
- ntfs_error(vol->sb, "Failed to allocate internal buffers.");
- return -ENOMEM;
- }
-
- /*
- * We have already been given one page, this is the one we must do.
- * Once again, the alignment guarantees keep it simple.
- */
- offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT;
- xpage = index - offset;
- pages[xpage] = page;
- /*
- * The remaining pages need to be allocated and inserted into the page
- * cache, alignment guarantees keep all the below much simpler. (-8
- */
- read_lock_irqsave(&ni->size_lock, flags);
- i_size = i_size_read(VFS_I(ni));
- initialized_size = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) -
- offset;
- /* Is the page fully outside i_size? (truncate in progress) */
- if (xpage >= max_page) {
- kfree(bhs);
- kfree(pages);
- kfree(completed_pages);
- zero_user(page, 0, PAGE_SIZE);
- ntfs_debug("Compressed read outside i_size - truncated?");
- SetPageUptodate(page);
- unlock_page(page);
- return 0;
- }
- if (nr_pages < max_page)
- max_page = nr_pages;
- for (i = 0; i < max_page; i++, offset++) {
- if (i != xpage)
- pages[i] = grab_cache_page_nowait(mapping, offset);
- page = pages[i];
- if (page) {
- /*
- * We only (re)read the page if it isn't already read
- * in and/or dirty or we would be losing data or at
- * least wasting our time.
- */
- if (!PageDirty(page) && (!PageUptodate(page) ||
- PageError(page))) {
- ClearPageError(page);
- kmap(page);
- continue;
- }
- unlock_page(page);
- put_page(page);
- pages[i] = NULL;
- }
- }
-
- /*
- * We have the runlist, and all the destination pages we need to fill.
- * Now read the first compression block.
- */
- cur_page = 0;
- cur_ofs = 0;
- cb_clusters = ni->itype.compressed.block_clusters;
-do_next_cb:
- nr_cbs--;
- nr_bhs = 0;
-
- /* Read all cb buffer heads one cluster at a time. */
- rl = NULL;
- for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn;
- vcn++) {
- bool is_retry = false;
-
- if (!rl) {
-lock_retry_remap:
- down_read(&ni->runlist.lock);
- rl = ni->runlist.rl;
- }
- if (likely(rl != NULL)) {
- /* Seek to element containing target vcn. */
- while (rl->length && rl[1].vcn <= vcn)
- rl++;
- lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
- } else
- lcn = LCN_RL_NOT_MAPPED;
- ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
- (unsigned long long)vcn,
- (unsigned long long)lcn);
- if (lcn < 0) {
- /*
- * When we reach the first sparse cluster we have
- * finished with the cb.
- */
- if (lcn == LCN_HOLE)
- break;
- if (is_retry || lcn != LCN_RL_NOT_MAPPED)
- goto rl_err;
- is_retry = true;
- /*
- * Attempt to map runlist, dropping lock for the
- * duration.
- */
- up_read(&ni->runlist.lock);
- if (!ntfs_map_runlist(ni, vcn))
- goto lock_retry_remap;
- goto map_rl_err;
- }
- block = lcn << vol->cluster_size_bits >> block_size_bits;
- /* Read the lcn from device in chunks of block_size bytes. */
- max_block = block + (vol->cluster_size >> block_size_bits);
- do {
- ntfs_debug("block = 0x%x.", block);
- if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block))))
- goto getblk_err;
- nr_bhs++;
- } while (++block < max_block);
- }
-
- /* Release the lock if we took it. */
- if (rl)
- up_read(&ni->runlist.lock);
-
- /* Setup and initiate io on all buffer heads. */
- for (i = 0; i < nr_bhs; i++) {
- struct buffer_head *tbh = bhs[i];
-
- if (!trylock_buffer(tbh))
- continue;
- if (unlikely(buffer_uptodate(tbh))) {
- unlock_buffer(tbh);
- continue;
- }
- get_bh(tbh);
- tbh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, tbh);
- }
-
- /* Wait for io completion on all buffer heads. */
- for (i = 0; i < nr_bhs; i++) {
- struct buffer_head *tbh = bhs[i];
-
- if (buffer_uptodate(tbh))
- continue;
- wait_on_buffer(tbh);
- /*
- * We need an optimization barrier here, otherwise we start
- * hitting the below fixup code when accessing a loopback
- * mounted ntfs partition. This indicates either there is a
- * race condition in the loop driver or, more likely, gcc
- * overoptimises the code without the barrier and it doesn't
- * do the Right Thing(TM).
- */
- barrier();
- if (unlikely(!buffer_uptodate(tbh))) {
- ntfs_warning(vol->sb, "Buffer is unlocked but not "
- "uptodate! Unplugging the disk queue "
- "and rescheduling.");
- get_bh(tbh);
- io_schedule();
- put_bh(tbh);
- if (unlikely(!buffer_uptodate(tbh)))
- goto read_err;
- ntfs_warning(vol->sb, "Buffer is now uptodate. Good.");
- }
- }
-
- /*
- * Get the compression buffer. We must not sleep any more
- * until we are finished with it.
- */
- spin_lock(&ntfs_cb_lock);
- cb = ntfs_compression_buffer;
-
- BUG_ON(!cb);
-
- cb_pos = cb;
- cb_end = cb + cb_size;
-
- /* Copy the buffer heads into the contiguous buffer. */
- for (i = 0; i < nr_bhs; i++) {
- memcpy(cb_pos, bhs[i]->b_data, block_size);
- cb_pos += block_size;
- }
-
- /* Just a precaution. */
- if (cb_pos + 2 <= cb + cb_size)
- *(u16*)cb_pos = 0;
-
- /* Reset cb_pos back to the beginning. */
- cb_pos = cb;
-
- /* We now have both source (if present) and destination. */
- ntfs_debug("Successfully read the compression block.");
-
- /* The last page and maximum offset within it for the current cb. */
- cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size;
- cb_max_ofs = cb_max_page & ~PAGE_MASK;
- cb_max_page >>= PAGE_SHIFT;
-
- /* Catch end of file inside a compression block. */
- if (cb_max_page > max_page)
- cb_max_page = max_page;
-
- if (vcn == start_vcn - cb_clusters) {
- /* Sparse cb, zero out page range overlapping the cb. */
- ntfs_debug("Found sparse compression block.");
- /* We can sleep from now on, so we drop lock. */
- spin_unlock(&ntfs_cb_lock);
- if (cb_max_ofs)
- cb_max_page--;
- for (; cur_page < cb_max_page; cur_page++) {
- page = pages[cur_page];
- if (page) {
- if (likely(!cur_ofs))
- clear_page(page_address(page));
- else
- memset(page_address(page) + cur_ofs, 0,
- PAGE_SIZE -
- cur_ofs);
- flush_dcache_page(page);
- kunmap(page);
- SetPageUptodate(page);
- unlock_page(page);
- if (cur_page == xpage)
- xpage_done = 1;
- else
- put_page(page);
- pages[cur_page] = NULL;
- }
- cb_pos += PAGE_SIZE - cur_ofs;
- cur_ofs = 0;
- if (cb_pos >= cb_end)
- break;
- }
- /* If we have a partial final page, deal with it now. */
- if (cb_max_ofs && cb_pos < cb_end) {
- page = pages[cur_page];
- if (page)
- memset(page_address(page) + cur_ofs, 0,
- cb_max_ofs - cur_ofs);
- /*
- * No need to update cb_pos at this stage:
- * cb_pos += cb_max_ofs - cur_ofs;
- */
- cur_ofs = cb_max_ofs;
- }
- } else if (vcn == start_vcn) {
- /* We can't sleep so we need two stages. */
- unsigned int cur2_page = cur_page;
- unsigned int cur_ofs2 = cur_ofs;
- u8 *cb_pos2 = cb_pos;
-
- ntfs_debug("Found uncompressed compression block.");
- /* Uncompressed cb, copy it to the destination pages. */
- /*
- * TODO: As a big optimization, we could detect this case
- * before we read all the pages and use block_read_full_folio()
- * on all full pages instead (we still have to treat partial
- * pages especially but at least we are getting rid of the
- * synchronous io for the majority of pages.
- * Or if we choose not to do the read-ahead/-behind stuff, we
- * could just return block_read_full_folio(pages[xpage]) as long
- * as PAGE_SIZE <= cb_size.
- */
- if (cb_max_ofs)
- cb_max_page--;
- /* First stage: copy data into destination pages. */
- for (; cur_page < cb_max_page; cur_page++) {
- page = pages[cur_page];
- if (page)
- memcpy(page_address(page) + cur_ofs, cb_pos,
- PAGE_SIZE - cur_ofs);
- cb_pos += PAGE_SIZE - cur_ofs;
- cur_ofs = 0;
- if (cb_pos >= cb_end)
- break;
- }
- /* If we have a partial final page, deal with it now. */
- if (cb_max_ofs && cb_pos < cb_end) {
- page = pages[cur_page];
- if (page)
- memcpy(page_address(page) + cur_ofs, cb_pos,
- cb_max_ofs - cur_ofs);
- cb_pos += cb_max_ofs - cur_ofs;
- cur_ofs = cb_max_ofs;
- }
- /* We can sleep from now on, so drop lock. */
- spin_unlock(&ntfs_cb_lock);
- /* Second stage: finalize pages. */
- for (; cur2_page < cb_max_page; cur2_page++) {
- page = pages[cur2_page];
- if (page) {
- /*
- * If we are outside the initialized size, zero
- * the out of bounds page range.
- */
- handle_bounds_compressed_page(page, i_size,
- initialized_size);
- flush_dcache_page(page);
- kunmap(page);
- SetPageUptodate(page);
- unlock_page(page);
- if (cur2_page == xpage)
- xpage_done = 1;
- else
- put_page(page);
- pages[cur2_page] = NULL;
- }
- cb_pos2 += PAGE_SIZE - cur_ofs2;
- cur_ofs2 = 0;
- if (cb_pos2 >= cb_end)
- break;
- }
- } else {
- /* Compressed cb, decompress it into the destination page(s). */
- unsigned int prev_cur_page = cur_page;
-
- ntfs_debug("Found compressed compression block.");
- err = ntfs_decompress(pages, completed_pages, &cur_page,
- &cur_ofs, cb_max_page, cb_max_ofs, xpage,
- &xpage_done, cb_pos, cb_size - (cb_pos - cb),
- i_size, initialized_size);
- /*
- * We can sleep from now on, lock already dropped by
- * ntfs_decompress().
- */
- if (err) {
- ntfs_error(vol->sb, "ntfs_decompress() failed in inode "
- "0x%lx with error code %i. Skipping "
- "this compression block.",
- ni->mft_no, -err);
- /* Release the unfinished pages. */
- for (; prev_cur_page < cur_page; prev_cur_page++) {
- page = pages[prev_cur_page];
- if (page) {
- flush_dcache_page(page);
- kunmap(page);
- unlock_page(page);
- if (prev_cur_page != xpage)
- put_page(page);
- pages[prev_cur_page] = NULL;
- }
- }
- }
- }
-
- /* Release the buffer heads. */
- for (i = 0; i < nr_bhs; i++)
- brelse(bhs[i]);
-
- /* Do we have more work to do? */
- if (nr_cbs)
- goto do_next_cb;
-
- /* We no longer need the list of buffer heads. */
- kfree(bhs);
-
- /* Clean up if we have any pages left. Should never happen. */
- for (cur_page = 0; cur_page < max_page; cur_page++) {
- page = pages[cur_page];
- if (page) {
- ntfs_error(vol->sb, "Still have pages left! "
- "Terminating them with extreme "
- "prejudice. Inode 0x%lx, page index "
- "0x%lx.", ni->mft_no, page->index);
- flush_dcache_page(page);
- kunmap(page);
- unlock_page(page);
- if (cur_page != xpage)
- put_page(page);
- pages[cur_page] = NULL;
- }
- }
-
- /* We no longer need the list of pages. */
- kfree(pages);
- kfree(completed_pages);
-
- /* If we have completed the requested page, we return success. */
- if (likely(xpage_done))
- return 0;
-
- ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
- "EOVERFLOW" : (!err ? "EIO" : "unknown error"));
- return err < 0 ? err : -EIO;
-
-read_err:
- ntfs_error(vol->sb, "IO error while reading compressed data.");
- /* Release the buffer heads. */
- for (i = 0; i < nr_bhs; i++)
- brelse(bhs[i]);
- goto err_out;
-
-map_rl_err:
- ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read "
- "compression block.");
- goto err_out;
-
-rl_err:
- up_read(&ni->runlist.lock);
- ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read "
- "compression block.");
- goto err_out;
-
-getblk_err:
- up_read(&ni->runlist.lock);
- ntfs_error(vol->sb, "getblk() failed. Cannot read compression block.");
-
-err_out:
- kfree(bhs);
- for (i = cur_page; i < max_page; i++) {
- page = pages[i];
- if (page) {
- flush_dcache_page(page);
- kunmap(page);
- unlock_page(page);
- if (i != xpage)
- put_page(page);
- }
- }
- kfree(pages);
- kfree(completed_pages);
- return -EIO;
-}
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
deleted file mode 100644
index a3c1c5656f8f..000000000000
--- a/fs/ntfs/debug.c
+++ /dev/null
@@ -1,159 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include "debug.h"
-
-/**
- * __ntfs_warning - output a warning to the syslog
- * @function: name of function outputting the warning
- * @sb: super block of mounted ntfs filesystem
- * @fmt: warning string containing format specifications
- * @...: a variable number of arguments specified in @fmt
- *
- * Outputs a warning to the syslog for the mounted ntfs filesystem described
- * by @sb.
- *
- * @fmt and the corresponding @... is printf style format string containing
- * the warning string and the corresponding format arguments, respectively.
- *
- * @function is the name of the function from which __ntfs_warning is being
- * called.
- *
- * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead
- * as this provides the @function parameter automatically.
- */
-void __ntfs_warning(const char *function, const struct super_block *sb,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
- int flen = 0;
-
-#ifndef DEBUG
- if (!printk_ratelimit())
- return;
-#endif
- if (function)
- flen = strlen(function);
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- if (sb)
- pr_warn("(device %s): %s(): %pV\n",
- sb->s_id, flen ? function : "", &vaf);
- else
- pr_warn("%s(): %pV\n", flen ? function : "", &vaf);
- va_end(args);
-}
-
-/**
- * __ntfs_error - output an error to the syslog
- * @function: name of function outputting the error
- * @sb: super block of mounted ntfs filesystem
- * @fmt: error string containing format specifications
- * @...: a variable number of arguments specified in @fmt
- *
- * Outputs an error to the syslog for the mounted ntfs filesystem described
- * by @sb.
- *
- * @fmt and the corresponding @... is printf style format string containing
- * the error string and the corresponding format arguments, respectively.
- *
- * @function is the name of the function from which __ntfs_error is being
- * called.
- *
- * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead
- * as this provides the @function parameter automatically.
- */
-void __ntfs_error(const char *function, const struct super_block *sb,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
- int flen = 0;
-
-#ifndef DEBUG
- if (!printk_ratelimit())
- return;
-#endif
- if (function)
- flen = strlen(function);
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- if (sb)
- pr_err("(device %s): %s(): %pV\n",
- sb->s_id, flen ? function : "", &vaf);
- else
- pr_err("%s(): %pV\n", flen ? function : "", &vaf);
- va_end(args);
-}
-
-#ifdef DEBUG
-
-/* If 1, output debug messages, and if 0, don't. */
-int debug_msgs = 0;
-
-void __ntfs_debug(const char *file, int line, const char *function,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
- int flen = 0;
-
- if (!debug_msgs)
- return;
- if (function)
- flen = strlen(function);
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf);
- va_end(args);
-}
-
-/* Dump a runlist. Caller has to provide synchronisation for @rl. */
-void ntfs_debug_dump_runlist(const runlist_element *rl)
-{
- int i;
- const char *lcn_str[5] = { "LCN_HOLE ", "LCN_RL_NOT_MAPPED",
- "LCN_ENOENT ", "LCN_unknown " };
-
- if (!debug_msgs)
- return;
- pr_debug("Dumping runlist (values in hex):\n");
- if (!rl) {
- pr_debug("Run list not present.\n");
- return;
- }
- pr_debug("VCN LCN Run length\n");
- for (i = 0; ; i++) {
- LCN lcn = (rl + i)->lcn;
-
- if (lcn < (LCN)0) {
- int index = -lcn - 1;
-
- if (index > -LCN_ENOENT - 1)
- index = 3;
- pr_debug("%-16Lx %s %-16Lx%s\n",
- (long long)(rl + i)->vcn, lcn_str[index],
- (long long)(rl + i)->length,
- (rl + i)->length ? "" :
- " (runlist end)");
- } else
- pr_debug("%-16Lx %-16Lx %-16Lx%s\n",
- (long long)(rl + i)->vcn,
- (long long)(rl + i)->lcn,
- (long long)(rl + i)->length,
- (rl + i)->length ? "" :
- " (runlist end)");
- if (!(rl + i)->length)
- break;
- }
-}
-
-#endif
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
deleted file mode 100644
index 6fdef388f129..000000000000
--- a/fs/ntfs/debug.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_DEBUG_H
-#define _LINUX_NTFS_DEBUG_H
-
-#include <linux/fs.h>
-
-#include "runlist.h"
-
-#ifdef DEBUG
-
-extern int debug_msgs;
-
-extern __printf(4, 5)
-void __ntfs_debug(const char *file, int line, const char *function,
- const char *format, ...);
-/**
- * ntfs_debug - write a debug level message to syslog
- * @f: a printf format string containing the message
- * @...: the variables to substitute into @f
- *
- * ntfs_debug() writes a DEBUG level message to the syslog but only if the
- * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP.
- */
-#define ntfs_debug(f, a...) \
- __ntfs_debug(__FILE__, __LINE__, __func__, f, ##a)
-
-extern void ntfs_debug_dump_runlist(const runlist_element *rl);
-
-#else /* !DEBUG */
-
-#define ntfs_debug(fmt, ...) \
-do { \
- if (0) \
- no_printk(fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define ntfs_debug_dump_runlist(rl) do {} while (0)
-
-#endif /* !DEBUG */
-
-extern __printf(3, 4)
-void __ntfs_warning(const char *function, const struct super_block *sb,
- const char *fmt, ...);
-#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a)
-
-extern __printf(3, 4)
-void __ntfs_error(const char *function, const struct super_block *sb,
- const char *fmt, ...);
-#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a)
-
-#endif /* _LINUX_NTFS_DEBUG_H */
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
deleted file mode 100644
index 629723a8d712..000000000000
--- a/fs/ntfs/dir.c
+++ /dev/null
@@ -1,1540 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2007 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/buffer_head.h>
-#include <linux/slab.h>
-#include <linux/blkdev.h>
-
-#include "dir.h"
-#include "aops.h"
-#include "attrib.h"
-#include "mft.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/*
- * The little endian Unicode string $I30 as a global constant.
- */
-ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
- cpu_to_le16('3'), cpu_to_le16('0'), 0 };
-
-/**
- * ntfs_lookup_inode_by_name - find an inode in a directory given its name
- * @dir_ni: ntfs inode of the directory in which to search for the name
- * @uname: Unicode name for which to search in the directory
- * @uname_len: length of the name @uname in Unicode characters
- * @res: return the found file name if necessary (see below)
- *
- * Look for an inode with name @uname in the directory with inode @dir_ni.
- * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
- * the Unicode name. If the name is found in the directory, the corresponding
- * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
- * is a 64-bit number containing the sequence number.
- *
- * On error, a negative value is returned corresponding to the error code. In
- * particular if the inode is not found -ENOENT is returned. Note that you
- * can't just check the return value for being negative, you have to check the
- * inode number for being negative which you can extract using MREC(return
- * value).
- *
- * Note, @uname_len does not include the (optional) terminating NULL character.
- *
- * Note, we look for a case sensitive match first but we also look for a case
- * insensitive match at the same time. If we find a case insensitive match, we
- * save that for the case that we don't find an exact match, where we return
- * the case insensitive match and setup @res (which we allocate!) with the mft
- * reference, the file name type, length and with a copy of the little endian
- * Unicode file name itself. If we match a file name which is in the DOS name
- * space, we only return the mft reference and file name type in @res.
- * ntfs_lookup() then uses this to find the long file name in the inode itself.
- * This is to avoid polluting the dcache with short file names. We want them to
- * work but we don't care for how quickly one can access them. This also fixes
- * the dcache aliasing issues.
- *
- * Locking: - Caller must hold i_mutex on the directory.
- * - Each page cache page in the index allocation mapping must be
- * locked whilst being accessed otherwise we may find a corrupt
- * page due to it being under ->writepage at the moment which
- * applies the mst protection fixups before writing out and then
- * removes them again after the write is complete after which it
- * unlocks the page.
- */
-MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
- const int uname_len, ntfs_name **res)
-{
- ntfs_volume *vol = dir_ni->vol;
- struct super_block *sb = vol->sb;
- MFT_RECORD *m;
- INDEX_ROOT *ir;
- INDEX_ENTRY *ie;
- INDEX_ALLOCATION *ia;
- u8 *index_end;
- u64 mref;
- ntfs_attr_search_ctx *ctx;
- int err, rc;
- VCN vcn, old_vcn;
- struct address_space *ia_mapping;
- struct page *page;
- u8 *kaddr;
- ntfs_name *name = NULL;
-
- BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode));
- BUG_ON(NInoAttr(dir_ni));
- /* Get hold of the mft record for the directory. */
- m = map_mft_record(dir_ni);
- if (IS_ERR(m)) {
- ntfs_error(sb, "map_mft_record() failed with error code %ld.",
- -PTR_ERR(m));
- return ERR_MREF(PTR_ERR(m));
- }
- ctx = ntfs_attr_get_search_ctx(dir_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- /* Find the index root attribute in the mft record. */
- err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
- 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT) {
- ntfs_error(sb, "Index root attribute missing in "
- "directory inode 0x%lx.",
- dir_ni->mft_no);
- err = -EIO;
- }
- goto err_out;
- }
- /* Get to the index root value (it's been verified in read_inode). */
- ir = (INDEX_ROOT*)((u8*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset));
- index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
- /* The first index entry. */
- ie = (INDEX_ENTRY*)((u8*)&ir->index +
- le32_to_cpu(ir->index.entries_offset));
- /*
- * Loop until we exceed valid memory (corruption case) or until we
- * reach the last entry.
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- /* Bounds checks. */
- if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->key_length) >
- index_end)
- goto dir_err_out;
- /*
- * The last entry cannot contain a name. It can however contain
- * a pointer to a child node in the B+tree so we just break out.
- */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /*
- * We perform a case sensitive comparison and if that matches
- * we are done and return the mft reference of the inode (i.e.
- * the inode number together with the sequence number for
- * consistency checking). We convert it to cpu format before
- * returning.
- */
- if (ntfs_are_names_equal(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length,
- CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
-found_it:
- /*
- * We have a perfect match, so we don't need to care
- * about having matched imperfectly before, so we can
- * free name and set *res to NULL.
- * However, if the perfect match is a short file name,
- * we need to signal this through *res, so that
- * ntfs_lookup() can fix dcache aliasing issues.
- * As an optimization we just reuse an existing
- * allocation of *res.
- */
- if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
- if (!name) {
- name = kmalloc(sizeof(ntfs_name),
- GFP_NOFS);
- if (!name) {
- err = -ENOMEM;
- goto err_out;
- }
- }
- name->mref = le64_to_cpu(
- ie->data.dir.indexed_file);
- name->type = FILE_NAME_DOS;
- name->len = 0;
- *res = name;
- } else {
- kfree(name);
- *res = NULL;
- }
- mref = le64_to_cpu(ie->data.dir.indexed_file);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(dir_ni);
- return mref;
- }
- /*
- * For a case insensitive mount, we also perform a case
- * insensitive comparison (provided the file name is not in the
- * POSIX namespace). If the comparison matches, and the name is
- * in the WIN32 namespace, we cache the filename in *res so
- * that the caller, ntfs_lookup(), can work on it. If the
- * comparison matches, and the name is in the DOS namespace, we
- * only cache the mft reference and the file name type (we set
- * the name length to zero for simplicity).
- */
- if (!NVolCaseSensitive(vol) &&
- ie->key.file_name.file_name_type &&
- ntfs_are_names_equal(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length,
- IGNORE_CASE, vol->upcase, vol->upcase_len)) {
- int name_size = sizeof(ntfs_name);
- u8 type = ie->key.file_name.file_name_type;
- u8 len = ie->key.file_name.file_name_length;
-
- /* Only one case insensitive matching name allowed. */
- if (name) {
- ntfs_error(sb, "Found already allocated name "
- "in phase 1. Please run chkdsk "
- "and if that doesn't find any "
- "errors please report you saw "
- "this message to "
- "linux-ntfs-dev@lists."
- "sourceforge.net.");
- goto dir_err_out;
- }
-
- if (type != FILE_NAME_DOS)
- name_size += len * sizeof(ntfschar);
- name = kmalloc(name_size, GFP_NOFS);
- if (!name) {
- err = -ENOMEM;
- goto err_out;
- }
- name->mref = le64_to_cpu(ie->data.dir.indexed_file);
- name->type = type;
- if (type != FILE_NAME_DOS) {
- name->len = len;
- memcpy(name->name, ie->key.file_name.file_name,
- len * sizeof(ntfschar));
- } else
- name->len = 0;
- *res = name;
- }
- /*
- * Not a perfect match, need to do full blown collation so we
- * know which way in the B+tree we have to go.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- IGNORE_CASE, vol->upcase, vol->upcase_len);
- /*
- * If uname collates before the name of the current entry, there
- * is definitely no such name in this index but we might need to
- * descend into the B+tree so we just break out of the loop.
- */
- if (rc == -1)
- break;
- /* The names are not equal, continue the search. */
- if (rc)
- continue;
- /*
- * Names match with case insensitive comparison, now try the
- * case sensitive comparison, which is required for proper
- * collation.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- CASE_SENSITIVE, vol->upcase, vol->upcase_len);
- if (rc == -1)
- break;
- if (rc)
- continue;
- /*
- * Perfect match, this will never happen as the
- * ntfs_are_names_equal() call will have gotten a match but we
- * still treat it correctly.
- */
- goto found_it;
- }
- /*
- * We have finished with this index without success. Check for the
- * presence of a child node and if not present return -ENOENT, unless
- * we have got a matching name cached in name in which case return the
- * mft reference associated with it.
- */
- if (!(ie->flags & INDEX_ENTRY_NODE)) {
- if (name) {
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(dir_ni);
- return name->mref;
- }
- ntfs_debug("Entry not found.");
- err = -ENOENT;
- goto err_out;
- } /* Child node present, descend into it. */
- /* Consistency check: Verify that an index allocation exists. */
- if (!NInoIndexAllocPresent(dir_ni)) {
- ntfs_error(sb, "No index allocation attribute but index entry "
- "requires one. Directory inode 0x%lx is "
- "corrupt or driver bug.", dir_ni->mft_no);
- goto err_out;
- }
- /* Get the starting vcn of the index_block holding the child node. */
- vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
- ia_mapping = VFS_I(dir_ni)->i_mapping;
- /*
- * We are done with the index root and the mft record. Release them,
- * otherwise we deadlock with ntfs_map_page().
- */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(dir_ni);
- m = NULL;
- ctx = NULL;
-descend_into_child_node:
- /*
- * Convert vcn to index into the index allocation attribute in units
- * of PAGE_SIZE and map the page cache page, reading it from
- * disk if necessary.
- */
- page = ntfs_map_page(ia_mapping, vcn <<
- dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
- if (IS_ERR(page)) {
- ntfs_error(sb, "Failed to map directory index page, error %ld.",
- -PTR_ERR(page));
- err = PTR_ERR(page);
- goto err_out;
- }
- lock_page(page);
- kaddr = (u8*)page_address(page);
-fast_descend_into_child_node:
- /* Get to the index allocation block. */
- ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
- dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
- /* Bounds checks. */
- if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
- ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
- "inode 0x%lx or driver bug.", dir_ni->mft_no);
- goto unm_err_out;
- }
- /* Catch multi sector transfer fixup errors. */
- if (unlikely(!ntfs_is_indx_record(ia->magic))) {
- ntfs_error(sb, "Directory index record with vcn 0x%llx is "
- "corrupt. Corrupt inode 0x%lx. Run chkdsk.",
- (unsigned long long)vcn, dir_ni->mft_no);
- goto unm_err_out;
- }
- if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
- ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
- "different from expected VCN (0x%llx). "
- "Directory inode 0x%lx is corrupt or driver "
- "bug.", (unsigned long long)
- sle64_to_cpu(ia->index_block_vcn),
- (unsigned long long)vcn, dir_ni->mft_no);
- goto unm_err_out;
- }
- if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
- dir_ni->itype.index.block_size) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
- "0x%lx has a size (%u) differing from the "
- "directory specified size (%u). Directory "
- "inode is corrupt or driver bug.",
- (unsigned long long)vcn, dir_ni->mft_no,
- le32_to_cpu(ia->index.allocated_size) + 0x18,
- dir_ni->itype.index.block_size);
- goto unm_err_out;
- }
- index_end = (u8*)ia + dir_ni->itype.index.block_size;
- if (index_end > kaddr + PAGE_SIZE) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
- "0x%lx crosses page boundary. Impossible! "
- "Cannot access! This is probably a bug in the "
- "driver.", (unsigned long long)vcn,
- dir_ni->mft_no);
- goto unm_err_out;
- }
- index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
- if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
- ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
- "inode 0x%lx exceeds maximum size.",
- (unsigned long long)vcn, dir_ni->mft_no);
- goto unm_err_out;
- }
- /* The first index entry. */
- ie = (INDEX_ENTRY*)((u8*)&ia->index +
- le32_to_cpu(ia->index.entries_offset));
- /*
- * Iterate similar to above big loop but applied to index buffer, thus
- * loop until we exceed valid memory (corruption case) or until we
- * reach the last entry.
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- /* Bounds check. */
- if ((u8*)ie < (u8*)ia || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->key_length) >
- index_end) {
- ntfs_error(sb, "Index entry out of bounds in "
- "directory inode 0x%lx.",
- dir_ni->mft_no);
- goto unm_err_out;
- }
- /*
- * The last entry cannot contain a name. It can however contain
- * a pointer to a child node in the B+tree so we just break out.
- */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /*
- * We perform a case sensitive comparison and if that matches
- * we are done and return the mft reference of the inode (i.e.
- * the inode number together with the sequence number for
- * consistency checking). We convert it to cpu format before
- * returning.
- */
- if (ntfs_are_names_equal(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length,
- CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
-found_it2:
- /*
- * We have a perfect match, so we don't need to care
- * about having matched imperfectly before, so we can
- * free name and set *res to NULL.
- * However, if the perfect match is a short file name,
- * we need to signal this through *res, so that
- * ntfs_lookup() can fix dcache aliasing issues.
- * As an optimization we just reuse an existing
- * allocation of *res.
- */
- if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
- if (!name) {
- name = kmalloc(sizeof(ntfs_name),
- GFP_NOFS);
- if (!name) {
- err = -ENOMEM;
- goto unm_err_out;
- }
- }
- name->mref = le64_to_cpu(
- ie->data.dir.indexed_file);
- name->type = FILE_NAME_DOS;
- name->len = 0;
- *res = name;
- } else {
- kfree(name);
- *res = NULL;
- }
- mref = le64_to_cpu(ie->data.dir.indexed_file);
- unlock_page(page);
- ntfs_unmap_page(page);
- return mref;
- }
- /*
- * For a case insensitive mount, we also perform a case
- * insensitive comparison (provided the file name is not in the
- * POSIX namespace). If the comparison matches, and the name is
- * in the WIN32 namespace, we cache the filename in *res so
- * that the caller, ntfs_lookup(), can work on it. If the
- * comparison matches, and the name is in the DOS namespace, we
- * only cache the mft reference and the file name type (we set
- * the name length to zero for simplicity).
- */
- if (!NVolCaseSensitive(vol) &&
- ie->key.file_name.file_name_type &&
- ntfs_are_names_equal(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length,
- IGNORE_CASE, vol->upcase, vol->upcase_len)) {
- int name_size = sizeof(ntfs_name);
- u8 type = ie->key.file_name.file_name_type;
- u8 len = ie->key.file_name.file_name_length;
-
- /* Only one case insensitive matching name allowed. */
- if (name) {
- ntfs_error(sb, "Found already allocated name "
- "in phase 2. Please run chkdsk "
- "and if that doesn't find any "
- "errors please report you saw "
- "this message to "
- "linux-ntfs-dev@lists."
- "sourceforge.net.");
- unlock_page(page);
- ntfs_unmap_page(page);
- goto dir_err_out;
- }
-
- if (type != FILE_NAME_DOS)
- name_size += len * sizeof(ntfschar);
- name = kmalloc(name_size, GFP_NOFS);
- if (!name) {
- err = -ENOMEM;
- goto unm_err_out;
- }
- name->mref = le64_to_cpu(ie->data.dir.indexed_file);
- name->type = type;
- if (type != FILE_NAME_DOS) {
- name->len = len;
- memcpy(name->name, ie->key.file_name.file_name,
- len * sizeof(ntfschar));
- } else
- name->len = 0;
- *res = name;
- }
- /*
- * Not a perfect match, need to do full blown collation so we
- * know which way in the B+tree we have to go.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- IGNORE_CASE, vol->upcase, vol->upcase_len);
- /*
- * If uname collates before the name of the current entry, there
- * is definitely no such name in this index but we might need to
- * descend into the B+tree so we just break out of the loop.
- */
- if (rc == -1)
- break;
- /* The names are not equal, continue the search. */
- if (rc)
- continue;
- /*
- * Names match with case insensitive comparison, now try the
- * case sensitive comparison, which is required for proper
- * collation.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- CASE_SENSITIVE, vol->upcase, vol->upcase_len);
- if (rc == -1)
- break;
- if (rc)
- continue;
- /*
- * Perfect match, this will never happen as the
- * ntfs_are_names_equal() call will have gotten a match but we
- * still treat it correctly.
- */
- goto found_it2;
- }
- /*
- * We have finished with this index buffer without success. Check for
- * the presence of a child node.
- */
- if (ie->flags & INDEX_ENTRY_NODE) {
- if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
- ntfs_error(sb, "Index entry with child node found in "
- "a leaf node in directory inode 0x%lx.",
- dir_ni->mft_no);
- goto unm_err_out;
- }
- /* Child node present, descend into it. */
- old_vcn = vcn;
- vcn = sle64_to_cpup((sle64*)((u8*)ie +
- le16_to_cpu(ie->length) - 8));
- if (vcn >= 0) {
- /* If vcn is in the same page cache page as old_vcn we
- * recycle the mapped page. */
- if (old_vcn << vol->cluster_size_bits >>
- PAGE_SHIFT == vcn <<
- vol->cluster_size_bits >>
- PAGE_SHIFT)
- goto fast_descend_into_child_node;
- unlock_page(page);
- ntfs_unmap_page(page);
- goto descend_into_child_node;
- }
- ntfs_error(sb, "Negative child node vcn in directory inode "
- "0x%lx.", dir_ni->mft_no);
- goto unm_err_out;
- }
- /*
- * No child node present, return -ENOENT, unless we have got a matching
- * name cached in name in which case return the mft reference
- * associated with it.
- */
- if (name) {
- unlock_page(page);
- ntfs_unmap_page(page);
- return name->mref;
- }
- ntfs_debug("Entry not found.");
- err = -ENOENT;
-unm_err_out:
- unlock_page(page);
- ntfs_unmap_page(page);
-err_out:
- if (!err)
- err = -EIO;
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(dir_ni);
- if (name) {
- kfree(name);
- *res = NULL;
- }
- return ERR_MREF(err);
-dir_err_out:
- ntfs_error(sb, "Corrupt directory. Aborting lookup.");
- goto err_out;
-}
-
-#if 0
-
-// TODO: (AIA)
-// The algorithm embedded in this code will be required for the time when we
-// want to support adding of entries to directories, where we require correct
-// collation of file names in order not to cause corruption of the filesystem.
-
-/**
- * ntfs_lookup_inode_by_name - find an inode in a directory given its name
- * @dir_ni: ntfs inode of the directory in which to search for the name
- * @uname: Unicode name for which to search in the directory
- * @uname_len: length of the name @uname in Unicode characters
- *
- * Look for an inode with name @uname in the directory with inode @dir_ni.
- * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
- * the Unicode name. If the name is found in the directory, the corresponding
- * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
- * is a 64-bit number containing the sequence number.
- *
- * On error, a negative value is returned corresponding to the error code. In
- * particular if the inode is not found -ENOENT is returned. Note that you
- * can't just check the return value for being negative, you have to check the
- * inode number for being negative which you can extract using MREC(return
- * value).
- *
- * Note, @uname_len does not include the (optional) terminating NULL character.
- */
-u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
- const int uname_len)
-{
- ntfs_volume *vol = dir_ni->vol;
- struct super_block *sb = vol->sb;
- MFT_RECORD *m;
- INDEX_ROOT *ir;
- INDEX_ENTRY *ie;
- INDEX_ALLOCATION *ia;
- u8 *index_end;
- u64 mref;
- ntfs_attr_search_ctx *ctx;
- int err, rc;
- IGNORE_CASE_BOOL ic;
- VCN vcn, old_vcn;
- struct address_space *ia_mapping;
- struct page *page;
- u8 *kaddr;
-
- /* Get hold of the mft record for the directory. */
- m = map_mft_record(dir_ni);
- if (IS_ERR(m)) {
- ntfs_error(sb, "map_mft_record() failed with error code %ld.",
- -PTR_ERR(m));
- return ERR_MREF(PTR_ERR(m));
- }
- ctx = ntfs_attr_get_search_ctx(dir_ni, m);
- if (!ctx) {
- err = -ENOMEM;
- goto err_out;
- }
- /* Find the index root attribute in the mft record. */
- err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
- 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT) {
- ntfs_error(sb, "Index root attribute missing in "
- "directory inode 0x%lx.",
- dir_ni->mft_no);
- err = -EIO;
- }
- goto err_out;
- }
- /* Get to the index root value (it's been verified in read_inode). */
- ir = (INDEX_ROOT*)((u8*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset));
- index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
- /* The first index entry. */
- ie = (INDEX_ENTRY*)((u8*)&ir->index +
- le32_to_cpu(ir->index.entries_offset));
- /*
- * Loop until we exceed valid memory (corruption case) or until we
- * reach the last entry.
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- /* Bounds checks. */
- if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->key_length) >
- index_end)
- goto dir_err_out;
- /*
- * The last entry cannot contain a name. It can however contain
- * a pointer to a child node in the B+tree so we just break out.
- */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /*
- * If the current entry has a name type of POSIX, the name is
- * case sensitive and not otherwise. This has the effect of us
- * not being able to access any POSIX file names which collate
- * after the non-POSIX one when they only differ in case, but
- * anyone doing screwy stuff like that deserves to burn in
- * hell... Doing that kind of stuff on NT4 actually causes
- * corruption on the partition even when using SP6a and Linux
- * is not involved at all.
- */
- ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
- CASE_SENSITIVE;
- /*
- * If the names match perfectly, we are done and return the
- * mft reference of the inode (i.e. the inode number together
- * with the sequence number for consistency checking. We
- * convert it to cpu format before returning.
- */
- if (ntfs_are_names_equal(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, ic,
- vol->upcase, vol->upcase_len)) {
-found_it:
- mref = le64_to_cpu(ie->data.dir.indexed_file);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(dir_ni);
- return mref;
- }
- /*
- * Not a perfect match, need to do full blown collation so we
- * know which way in the B+tree we have to go.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- IGNORE_CASE, vol->upcase, vol->upcase_len);
- /*
- * If uname collates before the name of the current entry, there
- * is definitely no such name in this index but we might need to
- * descend into the B+tree so we just break out of the loop.
- */
- if (rc == -1)
- break;
- /* The names are not equal, continue the search. */
- if (rc)
- continue;
- /*
- * Names match with case insensitive comparison, now try the
- * case sensitive comparison, which is required for proper
- * collation.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- CASE_SENSITIVE, vol->upcase, vol->upcase_len);
- if (rc == -1)
- break;
- if (rc)
- continue;
- /*
- * Perfect match, this will never happen as the
- * ntfs_are_names_equal() call will have gotten a match but we
- * still treat it correctly.
- */
- goto found_it;
- }
- /*
- * We have finished with this index without success. Check for the
- * presence of a child node.
- */
- if (!(ie->flags & INDEX_ENTRY_NODE)) {
- /* No child node, return -ENOENT. */
- err = -ENOENT;
- goto err_out;
- } /* Child node present, descend into it. */
- /* Consistency check: Verify that an index allocation exists. */
- if (!NInoIndexAllocPresent(dir_ni)) {
- ntfs_error(sb, "No index allocation attribute but index entry "
- "requires one. Directory inode 0x%lx is "
- "corrupt or driver bug.", dir_ni->mft_no);
- goto err_out;
- }
- /* Get the starting vcn of the index_block holding the child node. */
- vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
- ia_mapping = VFS_I(dir_ni)->i_mapping;
- /*
- * We are done with the index root and the mft record. Release them,
- * otherwise we deadlock with ntfs_map_page().
- */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(dir_ni);
- m = NULL;
- ctx = NULL;
-descend_into_child_node:
- /*
- * Convert vcn to index into the index allocation attribute in units
- * of PAGE_SIZE and map the page cache page, reading it from
- * disk if necessary.
- */
- page = ntfs_map_page(ia_mapping, vcn <<
- dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
- if (IS_ERR(page)) {
- ntfs_error(sb, "Failed to map directory index page, error %ld.",
- -PTR_ERR(page));
- err = PTR_ERR(page);
- goto err_out;
- }
- lock_page(page);
- kaddr = (u8*)page_address(page);
-fast_descend_into_child_node:
- /* Get to the index allocation block. */
- ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
- dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
- /* Bounds checks. */
- if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
- ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
- "inode 0x%lx or driver bug.", dir_ni->mft_no);
- goto unm_err_out;
- }
- /* Catch multi sector transfer fixup errors. */
- if (unlikely(!ntfs_is_indx_record(ia->magic))) {
- ntfs_error(sb, "Directory index record with vcn 0x%llx is "
- "corrupt. Corrupt inode 0x%lx. Run chkdsk.",
- (unsigned long long)vcn, dir_ni->mft_no);
- goto unm_err_out;
- }
- if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
- ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
- "different from expected VCN (0x%llx). "
- "Directory inode 0x%lx is corrupt or driver "
- "bug.", (unsigned long long)
- sle64_to_cpu(ia->index_block_vcn),
- (unsigned long long)vcn, dir_ni->mft_no);
- goto unm_err_out;
- }
- if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
- dir_ni->itype.index.block_size) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
- "0x%lx has a size (%u) differing from the "
- "directory specified size (%u). Directory "
- "inode is corrupt or driver bug.",
- (unsigned long long)vcn, dir_ni->mft_no,
- le32_to_cpu(ia->index.allocated_size) + 0x18,
- dir_ni->itype.index.block_size);
- goto unm_err_out;
- }
- index_end = (u8*)ia + dir_ni->itype.index.block_size;
- if (index_end > kaddr + PAGE_SIZE) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
- "0x%lx crosses page boundary. Impossible! "
- "Cannot access! This is probably a bug in the "
- "driver.", (unsigned long long)vcn,
- dir_ni->mft_no);
- goto unm_err_out;
- }
- index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
- if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
- ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
- "inode 0x%lx exceeds maximum size.",
- (unsigned long long)vcn, dir_ni->mft_no);
- goto unm_err_out;
- }
- /* The first index entry. */
- ie = (INDEX_ENTRY*)((u8*)&ia->index +
- le32_to_cpu(ia->index.entries_offset));
- /*
- * Iterate similar to above big loop but applied to index buffer, thus
- * loop until we exceed valid memory (corruption case) or until we
- * reach the last entry.
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- /* Bounds check. */
- if ((u8*)ie < (u8*)ia || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->key_length) >
- index_end) {
- ntfs_error(sb, "Index entry out of bounds in "
- "directory inode 0x%lx.",
- dir_ni->mft_no);
- goto unm_err_out;
- }
- /*
- * The last entry cannot contain a name. It can however contain
- * a pointer to a child node in the B+tree so we just break out.
- */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /*
- * If the current entry has a name type of POSIX, the name is
- * case sensitive and not otherwise. This has the effect of us
- * not being able to access any POSIX file names which collate
- * after the non-POSIX one when they only differ in case, but
- * anyone doing screwy stuff like that deserves to burn in
- * hell... Doing that kind of stuff on NT4 actually causes
- * corruption on the partition even when using SP6a and Linux
- * is not involved at all.
- */
- ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
- CASE_SENSITIVE;
- /*
- * If the names match perfectly, we are done and return the
- * mft reference of the inode (i.e. the inode number together
- * with the sequence number for consistency checking. We
- * convert it to cpu format before returning.
- */
- if (ntfs_are_names_equal(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, ic,
- vol->upcase, vol->upcase_len)) {
-found_it2:
- mref = le64_to_cpu(ie->data.dir.indexed_file);
- unlock_page(page);
- ntfs_unmap_page(page);
- return mref;
- }
- /*
- * Not a perfect match, need to do full blown collation so we
- * know which way in the B+tree we have to go.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- IGNORE_CASE, vol->upcase, vol->upcase_len);
- /*
- * If uname collates before the name of the current entry, there
- * is definitely no such name in this index but we might need to
- * descend into the B+tree so we just break out of the loop.
- */
- if (rc == -1)
- break;
- /* The names are not equal, continue the search. */
- if (rc)
- continue;
- /*
- * Names match with case insensitive comparison, now try the
- * case sensitive comparison, which is required for proper
- * collation.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- CASE_SENSITIVE, vol->upcase, vol->upcase_len);
- if (rc == -1)
- break;
- if (rc)
- continue;
- /*
- * Perfect match, this will never happen as the
- * ntfs_are_names_equal() call will have gotten a match but we
- * still treat it correctly.
- */
- goto found_it2;
- }
- /*
- * We have finished with this index buffer without success. Check for
- * the presence of a child node.
- */
- if (ie->flags & INDEX_ENTRY_NODE) {
- if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
- ntfs_error(sb, "Index entry with child node found in "
- "a leaf node in directory inode 0x%lx.",
- dir_ni->mft_no);
- goto unm_err_out;
- }
- /* Child node present, descend into it. */
- old_vcn = vcn;
- vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
- if (vcn >= 0) {
- /* If vcn is in the same page cache page as old_vcn we
- * recycle the mapped page. */
- if (old_vcn << vol->cluster_size_bits >>
- PAGE_SHIFT == vcn <<
- vol->cluster_size_bits >>
- PAGE_SHIFT)
- goto fast_descend_into_child_node;
- unlock_page(page);
- ntfs_unmap_page(page);
- goto descend_into_child_node;
- }
- ntfs_error(sb, "Negative child node vcn in directory inode "
- "0x%lx.", dir_ni->mft_no);
- goto unm_err_out;
- }
- /* No child node, return -ENOENT. */
- ntfs_debug("Entry not found.");
- err = -ENOENT;
-unm_err_out:
- unlock_page(page);
- ntfs_unmap_page(page);
-err_out:
- if (!err)
- err = -EIO;
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(dir_ni);
- return ERR_MREF(err);
-dir_err_out:
- ntfs_error(sb, "Corrupt directory. Aborting lookup.");
- goto err_out;
-}
-
-#endif
-
-/**
- * ntfs_filldir - ntfs specific filldir method
- * @vol: current ntfs volume
- * @ndir: ntfs inode of current directory
- * @ia_page: page in which the index allocation buffer @ie is in resides
- * @ie: current index entry
- * @name: buffer to use for the converted name
- * @actor: what to feed the entries to
- *
- * Convert the Unicode @name to the loaded NLS and pass it to the @filldir
- * callback.
- *
- * If @ia_page is not NULL it is the locked page containing the index
- * allocation block containing the index entry @ie.
- *
- * Note, we drop (and then reacquire) the page lock on @ia_page across the
- * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup
- * since ntfs_lookup() will lock the same page. As an optimization, we do not
- * retake the lock if we are returning a non-zero value as ntfs_readdir()
- * would need to drop the lock immediately anyway.
- */
-static inline int ntfs_filldir(ntfs_volume *vol,
- ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
- u8 *name, struct dir_context *actor)
-{
- unsigned long mref;
- int name_len;
- unsigned dt_type;
- FILE_NAME_TYPE_FLAGS name_type;
-
- name_type = ie->key.file_name.file_name_type;
- if (name_type == FILE_NAME_DOS) {
- ntfs_debug("Skipping DOS name space entry.");
- return 0;
- }
- if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) {
- ntfs_debug("Skipping root directory self reference entry.");
- return 0;
- }
- if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user &&
- !NVolShowSystemFiles(vol)) {
- ntfs_debug("Skipping system file.");
- return 0;
- }
- name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, &name,
- NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1);
- if (name_len <= 0) {
- ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.",
- (long long)MREF_LE(ie->data.dir.indexed_file));
- return 0;
- }
- if (ie->key.file_name.file_attributes &
- FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT)
- dt_type = DT_DIR;
- else
- dt_type = DT_REG;
- mref = MREF_LE(ie->data.dir.indexed_file);
- /*
- * Drop the page lock otherwise we deadlock with NFS when it calls
- * ->lookup since ntfs_lookup() will lock the same page.
- */
- if (ia_page)
- unlock_page(ia_page);
- ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
- "0x%lx, DT_%s.", name, name_len, actor->pos, mref,
- dt_type == DT_DIR ? "DIR" : "REG");
- if (!dir_emit(actor, name, name_len, mref, dt_type))
- return 1;
- /* Relock the page but not if we are aborting ->readdir. */
- if (ia_page)
- lock_page(ia_page);
- return 0;
-}
-
-/*
- * We use the same basic approach as the old NTFS driver, i.e. we parse the
- * index root entries and then the index allocation entries that are marked
- * as in use in the index bitmap.
- *
- * While this will return the names in random order this doesn't matter for
- * ->readdir but OTOH results in a faster ->readdir.
- *
- * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS
- * parts (e.g. ->f_pos and ->i_size, and it also protects against directory
- * modifications).
- *
- * Locking: - Caller must hold i_mutex on the directory.
- * - Each page cache page in the index allocation mapping must be
- * locked whilst being accessed otherwise we may find a corrupt
- * page due to it being under ->writepage at the moment which
- * applies the mst protection fixups before writing out and then
- * removes them again after the write is complete after which it
- * unlocks the page.
- */
-static int ntfs_readdir(struct file *file, struct dir_context *actor)
-{
- s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
- loff_t i_size;
- struct inode *bmp_vi, *vdir = file_inode(file);
- struct super_block *sb = vdir->i_sb;
- ntfs_inode *ndir = NTFS_I(vdir);
- ntfs_volume *vol = NTFS_SB(sb);
- MFT_RECORD *m;
- INDEX_ROOT *ir = NULL;
- INDEX_ENTRY *ie;
- INDEX_ALLOCATION *ia;
- u8 *name = NULL;
- int rc, err, ir_pos, cur_bmp_pos;
- struct address_space *ia_mapping, *bmp_mapping;
- struct page *bmp_page = NULL, *ia_page = NULL;
- u8 *kaddr, *bmp, *index_end;
- ntfs_attr_search_ctx *ctx;
-
- ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
- vdir->i_ino, actor->pos);
- rc = err = 0;
- /* Are we at end of dir yet? */
- i_size = i_size_read(vdir);
- if (actor->pos >= i_size + vol->mft_record_size)
- return 0;
- /* Emulate . and .. for all directories. */
- if (!dir_emit_dots(file, actor))
- return 0;
- m = NULL;
- ctx = NULL;
- /*
- * Allocate a buffer to store the current name being processed
- * converted to format determined by current NLS.
- */
- name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS);
- if (unlikely(!name)) {
- err = -ENOMEM;
- goto err_out;
- }
- /* Are we jumping straight into the index allocation attribute? */
- if (actor->pos >= vol->mft_record_size)
- goto skip_index_root;
- /* Get hold of the mft record for the directory. */
- m = map_mft_record(ndir);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(ndir, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- /* Get the offset into the index root attribute. */
- ir_pos = (s64)actor->pos;
- /* Find the index root attribute in the mft record. */
- err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
- 0, ctx);
- if (unlikely(err)) {
- ntfs_error(sb, "Index root attribute missing in directory "
- "inode 0x%lx.", vdir->i_ino);
- goto err_out;
- }
- /*
- * Copy the index root attribute value to a buffer so that we can put
- * the search context and unmap the mft record before calling the
- * filldir() callback. We need to do this because of NFSd which calls
- * ->lookup() from its filldir callback() and this causes NTFS to
- * deadlock as ntfs_lookup() maps the mft record of the directory and
- * we have got it mapped here already. The only solution is for us to
- * unmap the mft record here so that a call to ntfs_lookup() is able to
- * map the mft record without deadlocking.
- */
- rc = le32_to_cpu(ctx->attr->data.resident.value_length);
- ir = kmalloc(rc, GFP_NOFS);
- if (unlikely(!ir)) {
- err = -ENOMEM;
- goto err_out;
- }
- /* Copy the index root value (it has been verified in read_inode). */
- memcpy(ir, (u8*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset), rc);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ndir);
- ctx = NULL;
- m = NULL;
- index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
- /* The first index entry. */
- ie = (INDEX_ENTRY*)((u8*)&ir->index +
- le32_to_cpu(ir->index.entries_offset));
- /*
- * Loop until we exceed valid memory (corruption case) or until we
- * reach the last entry or until filldir tells us it has had enough
- * or signals an error (both covered by the rc test).
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir);
- /* Bounds checks. */
- if (unlikely((u8*)ie < (u8*)ir || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->key_length) >
- index_end))
- goto err_out;
- /* The last entry cannot contain a name. */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /* Skip index root entry if continuing previous readdir. */
- if (ir_pos > (u8*)ie - (u8*)ir)
- continue;
- /* Advance the position even if going to skip the entry. */
- actor->pos = (u8*)ie - (u8*)ir;
- /* Submit the name to the filldir callback. */
- rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor);
- if (rc) {
- kfree(ir);
- goto abort;
- }
- }
- /* We are done with the index root and can free the buffer. */
- kfree(ir);
- ir = NULL;
- /* If there is no index allocation attribute we are finished. */
- if (!NInoIndexAllocPresent(ndir))
- goto EOD;
- /* Advance fpos to the beginning of the index allocation. */
- actor->pos = vol->mft_record_size;
-skip_index_root:
- kaddr = NULL;
- prev_ia_pos = -1LL;
- /* Get the offset into the index allocation attribute. */
- ia_pos = (s64)actor->pos - vol->mft_record_size;
- ia_mapping = vdir->i_mapping;
- ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino);
- bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
- if (IS_ERR(bmp_vi)) {
- ntfs_error(sb, "Failed to get bitmap attribute.");
- err = PTR_ERR(bmp_vi);
- goto err_out;
- }
- bmp_mapping = bmp_vi->i_mapping;
- /* Get the starting bitmap bit position and sanity check it. */
- bmp_pos = ia_pos >> ndir->itype.index.block_size_bits;
- if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) {
- ntfs_error(sb, "Current index allocation position exceeds "
- "index bitmap size.");
- goto iput_err_out;
- }
- /* Get the starting bit position in the current bitmap page. */
- cur_bmp_pos = bmp_pos & ((PAGE_SIZE * 8) - 1);
- bmp_pos &= ~(u64)((PAGE_SIZE * 8) - 1);
-get_next_bmp_page:
- ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx",
- (unsigned long long)bmp_pos >> (3 + PAGE_SHIFT),
- (unsigned long long)bmp_pos &
- (unsigned long long)((PAGE_SIZE * 8) - 1));
- bmp_page = ntfs_map_page(bmp_mapping,
- bmp_pos >> (3 + PAGE_SHIFT));
- if (IS_ERR(bmp_page)) {
- ntfs_error(sb, "Reading index bitmap failed.");
- err = PTR_ERR(bmp_page);
- bmp_page = NULL;
- goto iput_err_out;
- }
- bmp = (u8*)page_address(bmp_page);
- /* Find next index block in use. */
- while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) {
-find_next_index_buffer:
- cur_bmp_pos++;
- /*
- * If we have reached the end of the bitmap page, get the next
- * page, and put away the old one.
- */
- if (unlikely((cur_bmp_pos >> 3) >= PAGE_SIZE)) {
- ntfs_unmap_page(bmp_page);
- bmp_pos += PAGE_SIZE * 8;
- cur_bmp_pos = 0;
- goto get_next_bmp_page;
- }
- /* If we have reached the end of the bitmap, we are done. */
- if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size))
- goto unm_EOD;
- ia_pos = (bmp_pos + cur_bmp_pos) <<
- ndir->itype.index.block_size_bits;
- }
- ntfs_debug("Handling index buffer 0x%llx.",
- (unsigned long long)bmp_pos + cur_bmp_pos);
- /* If the current index buffer is in the same page we reuse the page. */
- if ((prev_ia_pos & (s64)PAGE_MASK) !=
- (ia_pos & (s64)PAGE_MASK)) {
- prev_ia_pos = ia_pos;
- if (likely(ia_page != NULL)) {
- unlock_page(ia_page);
- ntfs_unmap_page(ia_page);
- }
- /*
- * Map the page cache page containing the current ia_pos,
- * reading it from disk if necessary.
- */
- ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_SHIFT);
- if (IS_ERR(ia_page)) {
- ntfs_error(sb, "Reading index allocation data failed.");
- err = PTR_ERR(ia_page);
- ia_page = NULL;
- goto err_out;
- }
- lock_page(ia_page);
- kaddr = (u8*)page_address(ia_page);
- }
- /* Get the current index buffer. */
- ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_MASK &
- ~(s64)(ndir->itype.index.block_size - 1)));
- /* Bounds checks. */
- if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE)) {
- ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
- "inode 0x%lx or driver bug.", vdir->i_ino);
- goto err_out;
- }
- /* Catch multi sector transfer fixup errors. */
- if (unlikely(!ntfs_is_indx_record(ia->magic))) {
- ntfs_error(sb, "Directory index record with vcn 0x%llx is "
- "corrupt. Corrupt inode 0x%lx. Run chkdsk.",
- (unsigned long long)ia_pos >>
- ndir->itype.index.vcn_size_bits, vdir->i_ino);
- goto err_out;
- }
- if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos &
- ~(s64)(ndir->itype.index.block_size - 1)) >>
- ndir->itype.index.vcn_size_bits)) {
- ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
- "different from expected VCN (0x%llx). "
- "Directory inode 0x%lx is corrupt or driver "
- "bug. ", (unsigned long long)
- sle64_to_cpu(ia->index_block_vcn),
- (unsigned long long)ia_pos >>
- ndir->itype.index.vcn_size_bits, vdir->i_ino);
- goto err_out;
- }
- if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 !=
- ndir->itype.index.block_size)) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
- "0x%lx has a size (%u) differing from the "
- "directory specified size (%u). Directory "
- "inode is corrupt or driver bug.",
- (unsigned long long)ia_pos >>
- ndir->itype.index.vcn_size_bits, vdir->i_ino,
- le32_to_cpu(ia->index.allocated_size) + 0x18,
- ndir->itype.index.block_size);
- goto err_out;
- }
- index_end = (u8*)ia + ndir->itype.index.block_size;
- if (unlikely(index_end > kaddr + PAGE_SIZE)) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
- "0x%lx crosses page boundary. Impossible! "
- "Cannot access! This is probably a bug in the "
- "driver.", (unsigned long long)ia_pos >>
- ndir->itype.index.vcn_size_bits, vdir->i_ino);
- goto err_out;
- }
- ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1);
- index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
- if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) {
- ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
- "inode 0x%lx exceeds maximum size.",
- (unsigned long long)ia_pos >>
- ndir->itype.index.vcn_size_bits, vdir->i_ino);
- goto err_out;
- }
- /* The first index entry in this index buffer. */
- ie = (INDEX_ENTRY*)((u8*)&ia->index +
- le32_to_cpu(ia->index.entries_offset));
- /*
- * Loop until we exceed valid memory (corruption case) or until we
- * reach the last entry or until filldir tells us it has had enough
- * or signals an error (both covered by the rc test).
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- ntfs_debug("In index allocation, offset 0x%llx.",
- (unsigned long long)ia_start +
- (unsigned long long)((u8*)ie - (u8*)ia));
- /* Bounds checks. */
- if (unlikely((u8*)ie < (u8*)ia || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->key_length) >
- index_end))
- goto err_out;
- /* The last entry cannot contain a name. */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /* Skip index block entry if continuing previous readdir. */
- if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
- continue;
- /* Advance the position even if going to skip the entry. */
- actor->pos = (u8*)ie - (u8*)ia +
- (sle64_to_cpu(ia->index_block_vcn) <<
- ndir->itype.index.vcn_size_bits) +
- vol->mft_record_size;
- /*
- * Submit the name to the @filldir callback. Note,
- * ntfs_filldir() drops the lock on @ia_page but it retakes it
- * before returning, unless a non-zero value is returned in
- * which case the page is left unlocked.
- */
- rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor);
- if (rc) {
- /* @ia_page is already unlocked in this case. */
- ntfs_unmap_page(ia_page);
- ntfs_unmap_page(bmp_page);
- iput(bmp_vi);
- goto abort;
- }
- }
- goto find_next_index_buffer;
-unm_EOD:
- if (ia_page) {
- unlock_page(ia_page);
- ntfs_unmap_page(ia_page);
- }
- ntfs_unmap_page(bmp_page);
- iput(bmp_vi);
-EOD:
- /* We are finished, set fpos to EOD. */
- actor->pos = i_size + vol->mft_record_size;
-abort:
- kfree(name);
- return 0;
-err_out:
- if (bmp_page) {
- ntfs_unmap_page(bmp_page);
-iput_err_out:
- iput(bmp_vi);
- }
- if (ia_page) {
- unlock_page(ia_page);
- ntfs_unmap_page(ia_page);
- }
- kfree(ir);
- kfree(name);
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(ndir);
- if (!err)
- err = -EIO;
- ntfs_debug("Failed. Returning error code %i.", -err);
- return err;
-}
-
-/**
- * ntfs_dir_open - called when an inode is about to be opened
- * @vi: inode to be opened
- * @filp: file structure describing the inode
- *
- * Limit directory size to the page cache limit on architectures where unsigned
- * long is 32-bits. This is the most we can do for now without overflowing the
- * page cache page index. Doing it this way means we don't run into problems
- * because of existing too large directories. It would be better to allow the
- * user to read the accessible part of the directory but I doubt very much
- * anyone is going to hit this check on a 32-bit architecture, so there is no
- * point in adding the extra complexity required to support this.
- *
- * On 64-bit architectures, the check is hopefully optimized away by the
- * compiler.
- */
-static int ntfs_dir_open(struct inode *vi, struct file *filp)
-{
- if (sizeof(unsigned long) < 8) {
- if (i_size_read(vi) > MAX_LFS_FILESIZE)
- return -EFBIG;
- }
- return 0;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_dir_fsync - sync a directory to disk
- * @filp: directory to be synced
- * @start: offset in bytes of the beginning of data range to sync
- * @end: offset in bytes of the end of data range (inclusive)
- * @datasync: if non-zero only flush user data and not metadata
- *
- * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and
- * msync system calls. This function is based on file.c::ntfs_file_fsync().
- *
- * Write the mft record and all associated extent mft records as well as the
- * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device.
- *
- * If @datasync is true, we do not wait on the inode(s) to be written out
- * but we always wait on the page cache pages to be written out.
- *
- * Note: In the past @filp could be NULL so we ignore it as we don't need it
- * anyway.
- *
- * Locking: Caller must hold i_mutex on the inode.
- *
- * TODO: We should probably also write all attribute/index inodes associated
- * with this inode but since we have no simple way of getting to them we ignore
- * this problem for now. We do write the $BITMAP attribute if it is present
- * which is the important one for a directory so things are not too bad.
- */
-static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
- int datasync)
-{
- struct inode *bmp_vi, *vi = filp->f_mapping->host;
- int err, ret;
- ntfs_attr na;
-
- ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
-
- err = file_write_and_wait_range(filp, start, end);
- if (err)
- return err;
- inode_lock(vi);
-
- BUG_ON(!S_ISDIR(vi->i_mode));
- /* If the bitmap attribute inode is in memory sync it, too. */
- na.mft_no = vi->i_ino;
- na.type = AT_BITMAP;
- na.name = I30;
- na.name_len = 4;
- bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na);
- if (bmp_vi) {
- write_inode_now(bmp_vi, !datasync);
- iput(bmp_vi);
- }
- ret = __ntfs_write_inode(vi, 1);
- write_inode_now(vi, !datasync);
- err = sync_blockdev(vi->i_sb->s_bdev);
- if (unlikely(err && !ret))
- ret = err;
- if (likely(!ret))
- ntfs_debug("Done.");
- else
- ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
- "%u.", datasync ? "data" : "", vi->i_ino, -ret);
- inode_unlock(vi);
- return ret;
-}
-
-#endif /* NTFS_RW */
-
-WRAP_DIR_ITER(ntfs_readdir) // FIXME!
-const struct file_operations ntfs_dir_ops = {
- .llseek = generic_file_llseek, /* Seek inside directory. */
- .read = generic_read_dir, /* Return -EISDIR. */
- .iterate_shared = shared_ntfs_readdir, /* Read directory contents. */
-#ifdef NTFS_RW
- .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */
-#endif /* NTFS_RW */
- /*.ioctl = ,*/ /* Perform function on the
- mounted filesystem. */
- .open = ntfs_dir_open, /* Open directory. */
-};
diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h
deleted file mode 100644
index 0e326753df40..000000000000
--- a/fs/ntfs/dir.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of
- * the Linux-NTFS project.
- *
- * Copyright (c) 2002-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_DIR_H
-#define _LINUX_NTFS_DIR_H
-
-#include "layout.h"
-#include "inode.h"
-#include "types.h"
-
-/*
- * ntfs_name is used to return the file name to the caller of
- * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup())
- * to be able to deal with dcache aliasing issues.
- */
-typedef struct {
- MFT_REF mref;
- FILE_NAME_TYPE_FLAGS type;
- u8 len;
- ntfschar name[0];
-} __attribute__ ((__packed__)) ntfs_name;
-
-/* The little endian Unicode string $I30 as a global constant. */
-extern ntfschar I30[5];
-
-extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni,
- const ntfschar *uname, const int uname_len, ntfs_name **res);
-
-#endif /* _LINUX_NTFS_FS_DIR_H */
diff --git a/fs/ntfs/endian.h b/fs/ntfs/endian.h
deleted file mode 100644
index f30c139bf9ae..000000000000
--- a/fs/ntfs/endian.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * endian.h - Defines for endianness handling in NTFS Linux kernel driver.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_ENDIAN_H
-#define _LINUX_NTFS_ENDIAN_H
-
-#include <asm/byteorder.h>
-#include "types.h"
-
-/*
- * Signed endianness conversion functions.
- */
-
-static inline s16 sle16_to_cpu(sle16 x)
-{
- return le16_to_cpu((__force le16)x);
-}
-
-static inline s32 sle32_to_cpu(sle32 x)
-{
- return le32_to_cpu((__force le32)x);
-}
-
-static inline s64 sle64_to_cpu(sle64 x)
-{
- return le64_to_cpu((__force le64)x);
-}
-
-static inline s16 sle16_to_cpup(sle16 *x)
-{
- return le16_to_cpu(*(__force le16*)x);
-}
-
-static inline s32 sle32_to_cpup(sle32 *x)
-{
- return le32_to_cpu(*(__force le32*)x);
-}
-
-static inline s64 sle64_to_cpup(sle64 *x)
-{
- return le64_to_cpu(*(__force le64*)x);
-}
-
-static inline sle16 cpu_to_sle16(s16 x)
-{
- return (__force sle16)cpu_to_le16(x);
-}
-
-static inline sle32 cpu_to_sle32(s32 x)
-{
- return (__force sle32)cpu_to_le32(x);
-}
-
-static inline sle64 cpu_to_sle64(s64 x)
-{
- return (__force sle64)cpu_to_le64(x);
-}
-
-static inline sle16 cpu_to_sle16p(s16 *x)
-{
- return (__force sle16)cpu_to_le16(*x);
-}
-
-static inline sle32 cpu_to_sle32p(s32 *x)
-{
- return (__force sle32)cpu_to_le32(*x);
-}
-
-static inline sle64 cpu_to_sle64p(s64 *x)
-{
- return (__force sle64)cpu_to_le64(*x);
-}
-
-#endif /* _LINUX_NTFS_ENDIAN_H */
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
deleted file mode 100644
index 297c0b9db621..000000000000
--- a/fs/ntfs/file.c
+++ /dev/null
@@ -1,1997 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
- */
-
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/pagevec.h>
-#include <linux/sched/signal.h>
-#include <linux/swap.h>
-#include <linux/uio.h>
-#include <linux/writeback.h>
-
-#include <asm/page.h>
-#include <linux/uaccess.h>
-
-#include "attrib.h"
-#include "bitmap.h"
-#include "inode.h"
-#include "debug.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "ntfs.h"
-
-/**
- * ntfs_file_open - called when an inode is about to be opened
- * @vi: inode to be opened
- * @filp: file structure describing the inode
- *
- * Limit file size to the page cache limit on architectures where unsigned long
- * is 32-bits. This is the most we can do for now without overflowing the page
- * cache page index. Doing it this way means we don't run into problems because
- * of existing too large files. It would be better to allow the user to read
- * the beginning of the file but I doubt very much anyone is going to hit this
- * check on a 32-bit architecture, so there is no point in adding the extra
- * complexity required to support this.
- *
- * On 64-bit architectures, the check is hopefully optimized away by the
- * compiler.
- *
- * After the check passes, just call generic_file_open() to do its work.
- */
-static int ntfs_file_open(struct inode *vi, struct file *filp)
-{
- if (sizeof(unsigned long) < 8) {
- if (i_size_read(vi) > MAX_LFS_FILESIZE)
- return -EOVERFLOW;
- }
- return generic_file_open(vi, filp);
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_attr_extend_initialized - extend the initialized size of an attribute
- * @ni: ntfs inode of the attribute to extend
- * @new_init_size: requested new initialized size in bytes
- *
- * Extend the initialized size of an attribute described by the ntfs inode @ni
- * to @new_init_size bytes. This involves zeroing any non-sparse space between
- * the old initialized size and @new_init_size both in the page cache and on
- * disk (if relevant complete pages are already uptodate in the page cache then
- * these are simply marked dirty).
- *
- * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
- * in the resident attribute case, it is tied to the initialized size and, in
- * the non-resident attribute case, it may not fall below the initialized size.
- *
- * Note that if the attribute is resident, we do not need to touch the page
- * cache at all. This is because if the page cache page is not uptodate we
- * bring it uptodate later, when doing the write to the mft record since we
- * then already have the page mapped. And if the page is uptodate, the
- * non-initialized region will already have been zeroed when the page was
- * brought uptodate and the region may in fact already have been overwritten
- * with new data via mmap() based writes, so we cannot just zero it. And since
- * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
- * is unspecified, we choose not to do zeroing and thus we do not need to touch
- * the page at all. For a more detailed explanation see ntfs_truncate() in
- * fs/ntfs/inode.c.
- *
- * Return 0 on success and -errno on error. In the case that an error is
- * encountered it is possible that the initialized size will already have been
- * incremented some way towards @new_init_size but it is guaranteed that if
- * this is the case, the necessary zeroing will also have happened and that all
- * metadata is self-consistent.
- *
- * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
- * held by the caller.
- */
-static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
-{
- s64 old_init_size;
- loff_t old_i_size;
- pgoff_t index, end_index;
- unsigned long flags;
- struct inode *vi = VFS_I(ni);
- ntfs_inode *base_ni;
- MFT_RECORD *m = NULL;
- ATTR_RECORD *a;
- ntfs_attr_search_ctx *ctx = NULL;
- struct address_space *mapping;
- struct page *page = NULL;
- u8 *kattr;
- int err;
- u32 attr_len;
-
- read_lock_irqsave(&ni->size_lock, flags);
- old_init_size = ni->initialized_size;
- old_i_size = i_size_read(vi);
- BUG_ON(new_init_size > ni->allocated_size);
- read_unlock_irqrestore(&ni->size_lock, flags);
- ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
- "old_initialized_size 0x%llx, "
- "new_initialized_size 0x%llx, i_size 0x%llx.",
- vi->i_ino, (unsigned)le32_to_cpu(ni->type),
- (unsigned long long)old_init_size,
- (unsigned long long)new_init_size, old_i_size);
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- /* Use goto to reduce indentation and we need the label below anyway. */
- if (NInoNonResident(ni))
- goto do_non_resident_extend;
- BUG_ON(old_init_size != old_i_size);
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- m = ctx->mrec;
- a = ctx->attr;
- BUG_ON(a->non_resident);
- /* The total length of the attribute value. */
- attr_len = le32_to_cpu(a->data.resident.value_length);
- BUG_ON(old_i_size != (loff_t)attr_len);
- /*
- * Do the zeroing in the mft record and update the attribute size in
- * the mft record.
- */
- kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
- memset(kattr + attr_len, 0, new_init_size - attr_len);
- a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
- /* Finally, update the sizes in the vfs and ntfs inodes. */
- write_lock_irqsave(&ni->size_lock, flags);
- i_size_write(vi, new_init_size);
- ni->initialized_size = new_init_size;
- write_unlock_irqrestore(&ni->size_lock, flags);
- goto done;
-do_non_resident_extend:
- /*
- * If the new initialized size @new_init_size exceeds the current file
- * size (vfs inode->i_size), we need to extend the file size to the
- * new initialized size.
- */
- if (new_init_size > old_i_size) {
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- m = ctx->mrec;
- a = ctx->attr;
- BUG_ON(!a->non_resident);
- BUG_ON(old_i_size != (loff_t)
- sle64_to_cpu(a->data.non_resident.data_size));
- a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- /* Update the file size in the vfs inode. */
- i_size_write(vi, new_init_size);
- ntfs_attr_put_search_ctx(ctx);
- ctx = NULL;
- unmap_mft_record(base_ni);
- m = NULL;
- }
- mapping = vi->i_mapping;
- index = old_init_size >> PAGE_SHIFT;
- end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- do {
- /*
- * Read the page. If the page is not present, this will zero
- * the uninitialized regions for us.
- */
- page = read_mapping_page(mapping, index, NULL);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
- goto init_err_out;
- }
- /*
- * Update the initialized size in the ntfs inode. This is
- * enough to make ntfs_writepage() work.
- */
- write_lock_irqsave(&ni->size_lock, flags);
- ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT;
- if (ni->initialized_size > new_init_size)
- ni->initialized_size = new_init_size;
- write_unlock_irqrestore(&ni->size_lock, flags);
- /* Set the page dirty so it gets written out. */
- set_page_dirty(page);
- put_page(page);
- /*
- * Play nice with the vm and the rest of the system. This is
- * very much needed as we can potentially be modifying the
- * initialised size from a very small value to a really huge
- * value, e.g.
- * f = open(somefile, O_TRUNC);
- * truncate(f, 10GiB);
- * seek(f, 10GiB);
- * write(f, 1);
- * And this would mean we would be marking dirty hundreds of
- * thousands of pages or as in the above example more than
- * two and a half million pages!
- *
- * TODO: For sparse pages could optimize this workload by using
- * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This
- * would be set in read_folio for sparse pages and here we would
- * not need to mark dirty any pages which have this bit set.
- * The only caveat is that we have to clear the bit everywhere
- * where we allocate any clusters that lie in the page or that
- * contain the page.
- *
- * TODO: An even greater optimization would be for us to only
- * call read_folio() on pages which are not in sparse regions as
- * determined from the runlist. This would greatly reduce the
- * number of pages we read and make dirty in the case of sparse
- * files.
- */
- balance_dirty_pages_ratelimited(mapping);
- cond_resched();
- } while (++index < end_index);
- read_lock_irqsave(&ni->size_lock, flags);
- BUG_ON(ni->initialized_size != new_init_size);
- read_unlock_irqrestore(&ni->size_lock, flags);
- /* Now bring in sync the initialized_size in the mft record. */
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- goto init_err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto init_err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto init_err_out;
- }
- m = ctx->mrec;
- a = ctx->attr;
- BUG_ON(!a->non_resident);
- a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
-done:
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
- (unsigned long long)new_init_size, i_size_read(vi));
- return 0;
-init_err_out:
- write_lock_irqsave(&ni->size_lock, flags);
- ni->initialized_size = old_init_size;
- write_unlock_irqrestore(&ni->size_lock, flags);
-err_out:
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- ntfs_debug("Failed. Returning error code %i.", err);
- return err;
-}
-
-static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb,
- struct iov_iter *from)
-{
- loff_t pos;
- s64 end, ll;
- ssize_t err;
- unsigned long flags;
- struct file *file = iocb->ki_filp;
- struct inode *vi = file_inode(file);
- ntfs_inode *ni = NTFS_I(vi);
- ntfs_volume *vol = ni->vol;
-
- ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
- "0x%llx, count 0x%zx.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type),
- (unsigned long long)iocb->ki_pos,
- iov_iter_count(from));
- err = generic_write_checks(iocb, from);
- if (unlikely(err <= 0))
- goto out;
- /*
- * All checks have passed. Before we start doing any writing we want
- * to abort any totally illegal writes.
- */
- BUG_ON(NInoMstProtected(ni));
- BUG_ON(ni->type != AT_DATA);
- /* If file is encrypted, deny access, just like NT4. */
- if (NInoEncrypted(ni)) {
- /* Only $DATA attributes can be encrypted. */
- /*
- * Reminder for later: Encrypted files are _always_
- * non-resident so that the content can always be encrypted.
- */
- ntfs_debug("Denying write access to encrypted file.");
- err = -EACCES;
- goto out;
- }
- if (NInoCompressed(ni)) {
- /* Only unnamed $DATA attribute can be compressed. */
- BUG_ON(ni->name_len);
- /*
- * Reminder for later: If resident, the data is not actually
- * compressed. Only on the switch to non-resident does
- * compression kick in. This is in contrast to encrypted files
- * (see above).
- */
- ntfs_error(vi->i_sb, "Writing to compressed files is not "
- "implemented yet. Sorry.");
- err = -EOPNOTSUPP;
- goto out;
- }
- err = file_remove_privs(file);
- if (unlikely(err))
- goto out;
- /*
- * Our ->update_time method always succeeds thus file_update_time()
- * cannot fail either so there is no need to check the return code.
- */
- file_update_time(file);
- pos = iocb->ki_pos;
- /* The first byte after the last cluster being written to. */
- end = (pos + iov_iter_count(from) + vol->cluster_size_mask) &
- ~(u64)vol->cluster_size_mask;
- /*
- * If the write goes beyond the allocated size, extend the allocation
- * to cover the whole of the write, rounded up to the nearest cluster.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- ll = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (end > ll) {
- /*
- * Extend the allocation without changing the data size.
- *
- * Note we ensure the allocation is big enough to at least
- * write some data but we do not require the allocation to be
- * complete, i.e. it may be partial.
- */
- ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
- if (likely(ll >= 0)) {
- BUG_ON(pos >= ll);
- /* If the extension was partial truncate the write. */
- if (end > ll) {
- ntfs_debug("Truncating write to inode 0x%lx, "
- "attribute type 0x%x, because "
- "the allocation was only "
- "partially extended.",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type));
- iov_iter_truncate(from, ll - pos);
- }
- } else {
- err = ll;
- read_lock_irqsave(&ni->size_lock, flags);
- ll = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- /* Perform a partial write if possible or fail. */
- if (pos < ll) {
- ntfs_debug("Truncating write to inode 0x%lx "
- "attribute type 0x%x, because "
- "extending the allocation "
- "failed (error %d).",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type),
- (int)-err);
- iov_iter_truncate(from, ll - pos);
- } else {
- if (err != -ENOSPC)
- ntfs_error(vi->i_sb, "Cannot perform "
- "write to inode "
- "0x%lx, attribute "
- "type 0x%x, because "
- "extending the "
- "allocation failed "
- "(error %ld).",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type),
- (long)-err);
- else
- ntfs_debug("Cannot perform write to "
- "inode 0x%lx, "
- "attribute type 0x%x, "
- "because there is not "
- "space left.",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type));
- goto out;
- }
- }
- }
- /*
- * If the write starts beyond the initialized size, extend it up to the
- * beginning of the write and initialize all non-sparse space between
- * the old initialized size and the new one. This automatically also
- * increments the vfs inode->i_size to keep it above or equal to the
- * initialized_size.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- ll = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (pos > ll) {
- /*
- * Wait for ongoing direct i/o to complete before proceeding.
- * New direct i/o cannot start as we hold i_mutex.
- */
- inode_dio_wait(vi);
- err = ntfs_attr_extend_initialized(ni, pos);
- if (unlikely(err < 0))
- ntfs_error(vi->i_sb, "Cannot perform write to inode "
- "0x%lx, attribute type 0x%x, because "
- "extending the initialized size "
- "failed (error %d).", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type),
- (int)-err);
- }
-out:
- return err;
-}
-
-/**
- * __ntfs_grab_cache_pages - obtain a number of locked pages
- * @mapping: address space mapping from which to obtain page cache pages
- * @index: starting index in @mapping at which to begin obtaining pages
- * @nr_pages: number of page cache pages to obtain
- * @pages: array of pages in which to return the obtained page cache pages
- * @cached_page: allocated but as yet unused page
- *
- * Obtain @nr_pages locked page cache pages from the mapping @mapping and
- * starting at index @index.
- *
- * If a page is newly created, add it to lru list
- *
- * Note, the page locks are obtained in ascending page index order.
- */
-static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
- pgoff_t index, const unsigned nr_pages, struct page **pages,
- struct page **cached_page)
-{
- int err, nr;
-
- BUG_ON(!nr_pages);
- err = nr = 0;
- do {
- pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
- FGP_ACCESSED);
- if (!pages[nr]) {
- if (!*cached_page) {
- *cached_page = page_cache_alloc(mapping);
- if (unlikely(!*cached_page)) {
- err = -ENOMEM;
- goto err_out;
- }
- }
- err = add_to_page_cache_lru(*cached_page, mapping,
- index,
- mapping_gfp_constraint(mapping, GFP_KERNEL));
- if (unlikely(err)) {
- if (err == -EEXIST)
- continue;
- goto err_out;
- }
- pages[nr] = *cached_page;
- *cached_page = NULL;
- }
- index++;
- nr++;
- } while (nr < nr_pages);
-out:
- return err;
-err_out:
- while (nr > 0) {
- unlock_page(pages[--nr]);
- put_page(pages[nr]);
- }
- goto out;
-}
-
-static inline void ntfs_submit_bh_for_read(struct buffer_head *bh)
-{
- lock_buffer(bh);
- get_bh(bh);
- bh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, bh);
-}
-
-/**
- * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
- * @pages: array of destination pages
- * @nr_pages: number of pages in @pages
- * @pos: byte position in file at which the write begins
- * @bytes: number of bytes to be written
- *
- * This is called for non-resident attributes from ntfs_file_buffered_write()
- * with i_mutex held on the inode (@pages[0]->mapping->host). There are
- * @nr_pages pages in @pages which are locked but not kmap()ped. The source
- * data has not yet been copied into the @pages.
- *
- * Need to fill any holes with actual clusters, allocate buffers if necessary,
- * ensure all the buffers are mapped, and bring uptodate any buffers that are
- * only partially being written to.
- *
- * If @nr_pages is greater than one, we are guaranteed that the cluster size is
- * greater than PAGE_SIZE, that all pages in @pages are entirely inside
- * the same cluster and that they are the entirety of that cluster, and that
- * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
- *
- * i_size is not to be modified yet.
- *
- * Return 0 on success or -errno on error.
- */
-static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
- unsigned nr_pages, s64 pos, size_t bytes)
-{
- VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
- LCN lcn;
- s64 bh_pos, vcn_len, end, initialized_size;
- sector_t lcn_block;
- struct folio *folio;
- struct inode *vi;
- ntfs_inode *ni, *base_ni = NULL;
- ntfs_volume *vol;
- runlist_element *rl, *rl2;
- struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
- ntfs_attr_search_ctx *ctx = NULL;
- MFT_RECORD *m = NULL;
- ATTR_RECORD *a = NULL;
- unsigned long flags;
- u32 attr_rec_len = 0;
- unsigned blocksize, u;
- int err, mp_size;
- bool rl_write_locked, was_hole, is_retry;
- unsigned char blocksize_bits;
- struct {
- u8 runlist_merged:1;
- u8 mft_attr_mapped:1;
- u8 mp_rebuilt:1;
- u8 attr_switched:1;
- } status = { 0, 0, 0, 0 };
-
- BUG_ON(!nr_pages);
- BUG_ON(!pages);
- BUG_ON(!*pages);
- vi = pages[0]->mapping->host;
- ni = NTFS_I(vi);
- vol = ni->vol;
- ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
- "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
- vi->i_ino, ni->type, pages[0]->index, nr_pages,
- (long long)pos, bytes);
- blocksize = vol->sb->s_blocksize;
- blocksize_bits = vol->sb->s_blocksize_bits;
- rl_write_locked = false;
- rl = NULL;
- err = 0;
- vcn = lcn = -1;
- vcn_len = 0;
- lcn_block = -1;
- was_hole = false;
- cpos = pos >> vol->cluster_size_bits;
- end = pos + bytes;
- cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
- /*
- * Loop over each buffer in each folio. Use goto to
- * reduce indentation.
- */
- u = 0;
-do_next_folio:
- folio = page_folio(pages[u]);
- bh_pos = folio_pos(folio);
- head = folio_buffers(folio);
- if (!head)
- /*
- * create_empty_buffers() will create uptodate/dirty
- * buffers if the folio is uptodate/dirty.
- */
- head = create_empty_buffers(folio, blocksize, 0);
- bh = head;
- do {
- VCN cdelta;
- s64 bh_end;
- unsigned bh_cofs;
-
- /* Clear buffer_new on all buffers to reinitialise state. */
- if (buffer_new(bh))
- clear_buffer_new(bh);
- bh_end = bh_pos + blocksize;
- bh_cpos = bh_pos >> vol->cluster_size_bits;
- bh_cofs = bh_pos & vol->cluster_size_mask;
- if (buffer_mapped(bh)) {
- /*
- * The buffer is already mapped. If it is uptodate,
- * ignore it.
- */
- if (buffer_uptodate(bh))
- continue;
- /*
- * The buffer is not uptodate. If the folio is uptodate
- * set the buffer uptodate and otherwise ignore it.
- */
- if (folio_test_uptodate(folio)) {
- set_buffer_uptodate(bh);
- continue;
- }
- /*
- * Neither the folio nor the buffer are uptodate. If
- * the buffer is only partially being written to, we
- * need to read it in before the write, i.e. now.
- */
- if ((bh_pos < pos && bh_end > pos) ||
- (bh_pos < end && bh_end > end)) {
- /*
- * If the buffer is fully or partially within
- * the initialized size, do an actual read.
- * Otherwise, simply zero the buffer.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- initialized_size = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (bh_pos < initialized_size) {
- ntfs_submit_bh_for_read(bh);
- *wait_bh++ = bh;
- } else {
- folio_zero_range(folio, bh_offset(bh),
- blocksize);
- set_buffer_uptodate(bh);
- }
- }
- continue;
- }
- /* Unmapped buffer. Need to map it. */
- bh->b_bdev = vol->sb->s_bdev;
- /*
- * If the current buffer is in the same clusters as the map
- * cache, there is no need to check the runlist again. The
- * map cache is made up of @vcn, which is the first cached file
- * cluster, @vcn_len which is the number of cached file
- * clusters, @lcn is the device cluster corresponding to @vcn,
- * and @lcn_block is the block number corresponding to @lcn.
- */
- cdelta = bh_cpos - vcn;
- if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
-map_buffer_cached:
- BUG_ON(lcn < 0);
- bh->b_blocknr = lcn_block +
- (cdelta << (vol->cluster_size_bits -
- blocksize_bits)) +
- (bh_cofs >> blocksize_bits);
- set_buffer_mapped(bh);
- /*
- * If the folio is uptodate so is the buffer. If the
- * buffer is fully outside the write, we ignore it if
- * it was already allocated and we mark it dirty so it
- * gets written out if we allocated it. On the other
- * hand, if we allocated the buffer but we are not
- * marking it dirty we set buffer_new so we can do
- * error recovery.
- */
- if (folio_test_uptodate(folio)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
- if (unlikely(was_hole)) {
- /* We allocated the buffer. */
- clean_bdev_bh_alias(bh);
- if (bh_end <= pos || bh_pos >= end)
- mark_buffer_dirty(bh);
- else
- set_buffer_new(bh);
- }
- continue;
- }
- /* Page is _not_ uptodate. */
- if (likely(!was_hole)) {
- /*
- * Buffer was already allocated. If it is not
- * uptodate and is only partially being written
- * to, we need to read it in before the write,
- * i.e. now.
- */
- if (!buffer_uptodate(bh) && bh_pos < end &&
- bh_end > pos &&
- (bh_pos < pos ||
- bh_end > end)) {
- /*
- * If the buffer is fully or partially
- * within the initialized size, do an
- * actual read. Otherwise, simply zero
- * the buffer.
- */
- read_lock_irqsave(&ni->size_lock,
- flags);
- initialized_size = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock,
- flags);
- if (bh_pos < initialized_size) {
- ntfs_submit_bh_for_read(bh);
- *wait_bh++ = bh;
- } else {
- folio_zero_range(folio,
- bh_offset(bh),
- blocksize);
- set_buffer_uptodate(bh);
- }
- }
- continue;
- }
- /* We allocated the buffer. */
- clean_bdev_bh_alias(bh);
- /*
- * If the buffer is fully outside the write, zero it,
- * set it uptodate, and mark it dirty so it gets
- * written out. If it is partially being written to,
- * zero region surrounding the write but leave it to
- * commit write to do anything else. Finally, if the
- * buffer is fully being overwritten, do nothing.
- */
- if (bh_end <= pos || bh_pos >= end) {
- if (!buffer_uptodate(bh)) {
- folio_zero_range(folio, bh_offset(bh),
- blocksize);
- set_buffer_uptodate(bh);
- }
- mark_buffer_dirty(bh);
- continue;
- }
- set_buffer_new(bh);
- if (!buffer_uptodate(bh) &&
- (bh_pos < pos || bh_end > end)) {
- u8 *kaddr;
- unsigned pofs;
-
- kaddr = kmap_local_folio(folio, 0);
- if (bh_pos < pos) {
- pofs = bh_pos & ~PAGE_MASK;
- memset(kaddr + pofs, 0, pos - bh_pos);
- }
- if (bh_end > end) {
- pofs = end & ~PAGE_MASK;
- memset(kaddr + pofs, 0, bh_end - end);
- }
- kunmap_local(kaddr);
- flush_dcache_folio(folio);
- }
- continue;
- }
- /*
- * Slow path: this is the first buffer in the cluster. If it
- * is outside allocated size and is not uptodate, zero it and
- * set it uptodate.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- initialized_size = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (bh_pos > initialized_size) {
- if (folio_test_uptodate(folio)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
- } else if (!buffer_uptodate(bh)) {
- folio_zero_range(folio, bh_offset(bh),
- blocksize);
- set_buffer_uptodate(bh);
- }
- continue;
- }
- is_retry = false;
- if (!rl) {
- down_read(&ni->runlist.lock);
-retry_remap:
- rl = ni->runlist.rl;
- }
- if (likely(rl != NULL)) {
- /* Seek to element containing target cluster. */
- while (rl->length && rl[1].vcn <= bh_cpos)
- rl++;
- lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
- if (likely(lcn >= 0)) {
- /*
- * Successful remap, setup the map cache and
- * use that to deal with the buffer.
- */
- was_hole = false;
- vcn = bh_cpos;
- vcn_len = rl[1].vcn - vcn;
- lcn_block = lcn << (vol->cluster_size_bits -
- blocksize_bits);
- cdelta = 0;
- /*
- * If the number of remaining clusters touched
- * by the write is smaller or equal to the
- * number of cached clusters, unlock the
- * runlist as the map cache will be used from
- * now on.
- */
- if (likely(vcn + vcn_len >= cend)) {
- if (rl_write_locked) {
- up_write(&ni->runlist.lock);
- rl_write_locked = false;
- } else
- up_read(&ni->runlist.lock);
- rl = NULL;
- }
- goto map_buffer_cached;
- }
- } else
- lcn = LCN_RL_NOT_MAPPED;
- /*
- * If it is not a hole and not out of bounds, the runlist is
- * probably unmapped so try to map it now.
- */
- if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
- if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
- /* Attempt to map runlist. */
- if (!rl_write_locked) {
- /*
- * We need the runlist locked for
- * writing, so if it is locked for
- * reading relock it now and retry in
- * case it changed whilst we dropped
- * the lock.
- */
- up_read(&ni->runlist.lock);
- down_write(&ni->runlist.lock);
- rl_write_locked = true;
- goto retry_remap;
- }
- err = ntfs_map_runlist_nolock(ni, bh_cpos,
- NULL);
- if (likely(!err)) {
- is_retry = true;
- goto retry_remap;
- }
- /*
- * If @vcn is out of bounds, pretend @lcn is
- * LCN_ENOENT. As long as the buffer is out
- * of bounds this will work fine.
- */
- if (err == -ENOENT) {
- lcn = LCN_ENOENT;
- err = 0;
- goto rl_not_mapped_enoent;
- }
- } else
- err = -EIO;
- /* Failed to map the buffer, even after retrying. */
- bh->b_blocknr = -1;
- ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
- "attribute type 0x%x, vcn 0x%llx, "
- "vcn offset 0x%x, because its "
- "location on disk could not be "
- "determined%s (error code %i).",
- ni->mft_no, ni->type,
- (unsigned long long)bh_cpos,
- (unsigned)bh_pos &
- vol->cluster_size_mask,
- is_retry ? " even after retrying" : "",
- err);
- break;
- }
-rl_not_mapped_enoent:
- /*
- * The buffer is in a hole or out of bounds. We need to fill
- * the hole, unless the buffer is in a cluster which is not
- * touched by the write, in which case we just leave the buffer
- * unmapped. This can only happen when the cluster size is
- * less than the page cache size.
- */
- if (unlikely(vol->cluster_size < PAGE_SIZE)) {
- bh_cend = (bh_end + vol->cluster_size - 1) >>
- vol->cluster_size_bits;
- if ((bh_cend <= cpos || bh_cpos >= cend)) {
- bh->b_blocknr = -1;
- /*
- * If the buffer is uptodate we skip it. If it
- * is not but the folio is uptodate, we can set
- * the buffer uptodate. If the folio is not
- * uptodate, we can clear the buffer and set it
- * uptodate. Whether this is worthwhile is
- * debatable and this could be removed.
- */
- if (folio_test_uptodate(folio)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
- } else if (!buffer_uptodate(bh)) {
- folio_zero_range(folio, bh_offset(bh),
- blocksize);
- set_buffer_uptodate(bh);
- }
- continue;
- }
- }
- /*
- * Out of bounds buffer is invalid if it was not really out of
- * bounds.
- */
- BUG_ON(lcn != LCN_HOLE);
- /*
- * We need the runlist locked for writing, so if it is locked
- * for reading relock it now and retry in case it changed
- * whilst we dropped the lock.
- */
- BUG_ON(!rl);
- if (!rl_write_locked) {
- up_read(&ni->runlist.lock);
- down_write(&ni->runlist.lock);
- rl_write_locked = true;
- goto retry_remap;
- }
- /* Find the previous last allocated cluster. */
- BUG_ON(rl->lcn != LCN_HOLE);
- lcn = -1;
- rl2 = rl;
- while (--rl2 >= ni->runlist.rl) {
- if (rl2->lcn >= 0) {
- lcn = rl2->lcn + rl2->length;
- break;
- }
- }
- rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
- false);
- if (IS_ERR(rl2)) {
- err = PTR_ERR(rl2);
- ntfs_debug("Failed to allocate cluster, error code %i.",
- err);
- break;
- }
- lcn = rl2->lcn;
- rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
- if (IS_ERR(rl)) {
- err = PTR_ERR(rl);
- if (err != -ENOMEM)
- err = -EIO;
- if (ntfs_cluster_free_from_rl(vol, rl2)) {
- ntfs_error(vol->sb, "Failed to release "
- "allocated cluster in error "
- "code path. Run chkdsk to "
- "recover the lost cluster.");
- NVolSetErrors(vol);
- }
- ntfs_free(rl2);
- break;
- }
- ni->runlist.rl = rl;
- status.runlist_merged = 1;
- ntfs_debug("Allocated cluster, lcn 0x%llx.",
- (unsigned long long)lcn);
- /* Map and lock the mft record and get the attribute record. */
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- break;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- unmap_mft_record(base_ni);
- break;
- }
- status.mft_attr_mapped = 1;
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- break;
- }
- m = ctx->mrec;
- a = ctx->attr;
- /*
- * Find the runlist element with which the attribute extent
- * starts. Note, we cannot use the _attr_ version because we
- * have mapped the mft record. That is ok because we know the
- * runlist fragment must be mapped already to have ever gotten
- * here, so we can just use the _rl_ version.
- */
- vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
- rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
- BUG_ON(!rl2);
- BUG_ON(!rl2->length);
- BUG_ON(rl2->lcn < LCN_HOLE);
- highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
- /*
- * If @highest_vcn is zero, calculate the real highest_vcn
- * (which can really be zero).
- */
- if (!highest_vcn)
- highest_vcn = (sle64_to_cpu(
- a->data.non_resident.allocated_size) >>
- vol->cluster_size_bits) - 1;
- /*
- * Determine the size of the mapping pairs array for the new
- * extent, i.e. the old extent with the hole filled.
- */
- mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
- highest_vcn);
- if (unlikely(mp_size <= 0)) {
- if (!(err = mp_size))
- err = -EIO;
- ntfs_debug("Failed to get size for mapping pairs "
- "array, error code %i.", err);
- break;
- }
- /*
- * Resize the attribute record to fit the new mapping pairs
- * array.
- */
- attr_rec_len = le32_to_cpu(a->length);
- err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset));
- if (unlikely(err)) {
- BUG_ON(err != -ENOSPC);
- // TODO: Deal with this by using the current attribute
- // and fill it with as much of the mapping pairs
- // array as possible. Then loop over each attribute
- // extent rewriting the mapping pairs arrays as we go
- // along and if when we reach the end we have not
- // enough space, try to resize the last attribute
- // extent and if even that fails, add a new attribute
- // extent.
- // We could also try to resize at each step in the hope
- // that we will not need to rewrite every single extent.
- // Note, we may need to decompress some extents to fill
- // the runlist as we are walking the extents...
- ntfs_error(vol->sb, "Not enough space in the mft "
- "record for the extended attribute "
- "record. This case is not "
- "implemented yet.");
- err = -EOPNOTSUPP;
- break ;
- }
- status.mp_rebuilt = 1;
- /*
- * Generate the mapping pairs array directly into the attribute
- * record.
- */
- err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset),
- mp_size, rl2, vcn, highest_vcn, NULL);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
- "attribute type 0x%x, because building "
- "the mapping pairs failed with error "
- "code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- err = -EIO;
- break;
- }
- /* Update the highest_vcn but only if it was not set. */
- if (unlikely(!a->data.non_resident.highest_vcn))
- a->data.non_resident.highest_vcn =
- cpu_to_sle64(highest_vcn);
- /*
- * If the attribute is sparse/compressed, update the compressed
- * size in the ntfs_inode structure and the attribute record.
- */
- if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
- /*
- * If we are not in the first attribute extent, switch
- * to it, but first ensure the changes will make it to
- * disk later.
- */
- if (a->data.non_resident.lowest_vcn) {
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_reinit_search_ctx(ctx);
- err = ntfs_attr_lookup(ni->type, ni->name,
- ni->name_len, CASE_SENSITIVE,
- 0, NULL, 0, ctx);
- if (unlikely(err)) {
- status.attr_switched = 1;
- break;
- }
- /* @m is not used any more so do not set it. */
- a = ctx->attr;
- }
- write_lock_irqsave(&ni->size_lock, flags);
- ni->itype.compressed.size += vol->cluster_size;
- a->data.non_resident.compressed_size =
- cpu_to_sle64(ni->itype.compressed.size);
- write_unlock_irqrestore(&ni->size_lock, flags);
- }
- /* Ensure the changes make it to disk. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- /* Successfully filled the hole. */
- status.runlist_merged = 0;
- status.mft_attr_mapped = 0;
- status.mp_rebuilt = 0;
- /* Setup the map cache and use that to deal with the buffer. */
- was_hole = true;
- vcn = bh_cpos;
- vcn_len = 1;
- lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
- cdelta = 0;
- /*
- * If the number of remaining clusters in the @pages is smaller
- * or equal to the number of cached clusters, unlock the
- * runlist as the map cache will be used from now on.
- */
- if (likely(vcn + vcn_len >= cend)) {
- up_write(&ni->runlist.lock);
- rl_write_locked = false;
- rl = NULL;
- }
- goto map_buffer_cached;
- } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
- /* If there are no errors, do the next page. */
- if (likely(!err && ++u < nr_pages))
- goto do_next_folio;
- /* If there are no errors, release the runlist lock if we took it. */
- if (likely(!err)) {
- if (unlikely(rl_write_locked)) {
- up_write(&ni->runlist.lock);
- rl_write_locked = false;
- } else if (unlikely(rl))
- up_read(&ni->runlist.lock);
- rl = NULL;
- }
- /* If we issued read requests, let them complete. */
- read_lock_irqsave(&ni->size_lock, flags);
- initialized_size = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- while (wait_bh > wait) {
- bh = *--wait_bh;
- wait_on_buffer(bh);
- if (likely(buffer_uptodate(bh))) {
- folio = bh->b_folio;
- bh_pos = folio_pos(folio) + bh_offset(bh);
- /*
- * If the buffer overflows the initialized size, need
- * to zero the overflowing region.
- */
- if (unlikely(bh_pos + blocksize > initialized_size)) {
- int ofs = 0;
-
- if (likely(bh_pos < initialized_size))
- ofs = initialized_size - bh_pos;
- folio_zero_segment(folio, bh_offset(bh) + ofs,
- blocksize);
- }
- } else /* if (unlikely(!buffer_uptodate(bh))) */
- err = -EIO;
- }
- if (likely(!err)) {
- /* Clear buffer_new on all buffers. */
- u = 0;
- do {
- bh = head = page_buffers(pages[u]);
- do {
- if (buffer_new(bh))
- clear_buffer_new(bh);
- } while ((bh = bh->b_this_page) != head);
- } while (++u < nr_pages);
- ntfs_debug("Done.");
- return err;
- }
- if (status.attr_switched) {
- /* Get back to the attribute extent we modified. */
- ntfs_attr_reinit_search_ctx(ctx);
- if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
- ntfs_error(vol->sb, "Failed to find required "
- "attribute extent of attribute in "
- "error code path. Run chkdsk to "
- "recover.");
- write_lock_irqsave(&ni->size_lock, flags);
- ni->itype.compressed.size += vol->cluster_size;
- write_unlock_irqrestore(&ni->size_lock, flags);
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- /*
- * The only thing that is now wrong is the compressed
- * size of the base attribute extent which chkdsk
- * should be able to fix.
- */
- NVolSetErrors(vol);
- } else {
- m = ctx->mrec;
- a = ctx->attr;
- status.attr_switched = 0;
- }
- }
- /*
- * If the runlist has been modified, need to restore it by punching a
- * hole into it and we then need to deallocate the on-disk cluster as
- * well. Note, we only modify the runlist if we are able to generate a
- * new mapping pairs array, i.e. only when the mapped attribute extent
- * is not switched.
- */
- if (status.runlist_merged && !status.attr_switched) {
- BUG_ON(!rl_write_locked);
- /* Make the file cluster we allocated sparse in the runlist. */
- if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
- ntfs_error(vol->sb, "Failed to punch hole into "
- "attribute runlist in error code "
- "path. Run chkdsk to recover the "
- "lost cluster.");
- NVolSetErrors(vol);
- } else /* if (success) */ {
- status.runlist_merged = 0;
- /*
- * Deallocate the on-disk cluster we allocated but only
- * if we succeeded in punching its vcn out of the
- * runlist.
- */
- down_write(&vol->lcnbmp_lock);
- if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
- ntfs_error(vol->sb, "Failed to release "
- "allocated cluster in error "
- "code path. Run chkdsk to "
- "recover the lost cluster.");
- NVolSetErrors(vol);
- }
- up_write(&vol->lcnbmp_lock);
- }
- }
- /*
- * Resize the attribute record to its old size and rebuild the mapping
- * pairs array. Note, we only can do this if the runlist has been
- * restored to its old state which also implies that the mapped
- * attribute extent is not switched.
- */
- if (status.mp_rebuilt && !status.runlist_merged) {
- if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
- ntfs_error(vol->sb, "Failed to restore attribute "
- "record in error code path. Run "
- "chkdsk to recover.");
- NVolSetErrors(vol);
- } else /* if (success) */ {
- if (ntfs_mapping_pairs_build(vol, (u8*)a +
- le16_to_cpu(a->data.non_resident.
- mapping_pairs_offset), attr_rec_len -
- le16_to_cpu(a->data.non_resident.
- mapping_pairs_offset), ni->runlist.rl,
- vcn, highest_vcn, NULL)) {
- ntfs_error(vol->sb, "Failed to restore "
- "mapping pairs array in error "
- "code path. Run chkdsk to "
- "recover.");
- NVolSetErrors(vol);
- }
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- }
- }
- /* Release the mft record and the attribute. */
- if (status.mft_attr_mapped) {
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- }
- /* Release the runlist lock. */
- if (rl_write_locked)
- up_write(&ni->runlist.lock);
- else if (rl)
- up_read(&ni->runlist.lock);
- /*
- * Zero out any newly allocated blocks to avoid exposing stale data.
- * If BH_New is set, we know that the block was newly allocated above
- * and that it has not been fully zeroed and marked dirty yet.
- */
- nr_pages = u;
- u = 0;
- end = bh_cpos << vol->cluster_size_bits;
- do {
- folio = page_folio(pages[u]);
- bh = head = folio_buffers(folio);
- do {
- if (u == nr_pages &&
- folio_pos(folio) + bh_offset(bh) >= end)
- break;
- if (!buffer_new(bh))
- continue;
- clear_buffer_new(bh);
- if (!buffer_uptodate(bh)) {
- if (folio_test_uptodate(folio))
- set_buffer_uptodate(bh);
- else {
- folio_zero_range(folio, bh_offset(bh),
- blocksize);
- set_buffer_uptodate(bh);
- }
- }
- mark_buffer_dirty(bh);
- } while ((bh = bh->b_this_page) != head);
- } while (++u <= nr_pages);
- ntfs_error(vol->sb, "Failed. Returning error code %i.", err);
- return err;
-}
-
-static inline void ntfs_flush_dcache_pages(struct page **pages,
- unsigned nr_pages)
-{
- BUG_ON(!nr_pages);
- /*
- * Warning: Do not do the decrement at the same time as the call to
- * flush_dcache_page() because it is a NULL macro on i386 and hence the
- * decrement never happens so the loop never terminates.
- */
- do {
- --nr_pages;
- flush_dcache_page(pages[nr_pages]);
- } while (nr_pages > 0);
-}
-
-/**
- * ntfs_commit_pages_after_non_resident_write - commit the received data
- * @pages: array of destination pages
- * @nr_pages: number of pages in @pages
- * @pos: byte position in file at which the write begins
- * @bytes: number of bytes to be written
- *
- * See description of ntfs_commit_pages_after_write(), below.
- */
-static inline int ntfs_commit_pages_after_non_resident_write(
- struct page **pages, const unsigned nr_pages,
- s64 pos, size_t bytes)
-{
- s64 end, initialized_size;
- struct inode *vi;
- ntfs_inode *ni, *base_ni;
- struct buffer_head *bh, *head;
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- unsigned long flags;
- unsigned blocksize, u;
- int err;
-
- vi = pages[0]->mapping->host;
- ni = NTFS_I(vi);
- blocksize = vi->i_sb->s_blocksize;
- end = pos + bytes;
- u = 0;
- do {
- s64 bh_pos;
- struct page *page;
- bool partial;
-
- page = pages[u];
- bh_pos = (s64)page->index << PAGE_SHIFT;
- bh = head = page_buffers(page);
- partial = false;
- do {
- s64 bh_end;
-
- bh_end = bh_pos + blocksize;
- if (bh_end <= pos || bh_pos >= end) {
- if (!buffer_uptodate(bh))
- partial = true;
- } else {
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- }
- } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
- /*
- * If all buffers are now uptodate but the page is not, set the
- * page uptodate.
- */
- if (!partial && !PageUptodate(page))
- SetPageUptodate(page);
- } while (++u < nr_pages);
- /*
- * Finally, if we do not need to update initialized_size or i_size we
- * are finished.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- initialized_size = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (end <= initialized_size) {
- ntfs_debug("Done.");
- return 0;
- }
- /*
- * Update initialized_size/i_size as appropriate, both in the inode and
- * the mft record.
- */
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- /* Map, pin, and lock the mft record. */
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- ctx = NULL;
- goto err_out;
- }
- BUG_ON(!NInoNonResident(ni));
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- a = ctx->attr;
- BUG_ON(!a->non_resident);
- write_lock_irqsave(&ni->size_lock, flags);
- BUG_ON(end > ni->allocated_size);
- ni->initialized_size = end;
- a->data.non_resident.initialized_size = cpu_to_sle64(end);
- if (end > i_size_read(vi)) {
- i_size_write(vi, end);
- a->data.non_resident.data_size =
- a->data.non_resident.initialized_size;
- }
- write_unlock_irqrestore(&ni->size_lock, flags);
- /* Mark the mft record dirty, so it gets written back. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- ntfs_debug("Done.");
- return 0;
-err_out:
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
- "code %i).", err);
- if (err != -ENOMEM)
- NVolSetErrors(ni->vol);
- return err;
-}
-
-/**
- * ntfs_commit_pages_after_write - commit the received data
- * @pages: array of destination pages
- * @nr_pages: number of pages in @pages
- * @pos: byte position in file at which the write begins
- * @bytes: number of bytes to be written
- *
- * This is called from ntfs_file_buffered_write() with i_mutex held on the inode
- * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are
- * locked but not kmap()ped. The source data has already been copied into the
- * @page. ntfs_prepare_pages_for_non_resident_write() has been called before
- * the data was copied (for non-resident attributes only) and it returned
- * success.
- *
- * Need to set uptodate and mark dirty all buffers within the boundary of the
- * write. If all buffers in a page are uptodate we set the page uptodate, too.
- *
- * Setting the buffers dirty ensures that they get written out later when
- * ntfs_writepage() is invoked by the VM.
- *
- * Finally, we need to update i_size and initialized_size as appropriate both
- * in the inode and the mft record.
- *
- * This is modelled after fs/buffer.c::generic_commit_write(), which marks
- * buffers uptodate and dirty, sets the page uptodate if all buffers in the
- * page are uptodate, and updates i_size if the end of io is beyond i_size. In
- * that case, it also marks the inode dirty.
- *
- * If things have gone as outlined in
- * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
- * content modifications here for non-resident attributes. For resident
- * attributes we need to do the uptodate bringing here which we combine with
- * the copying into the mft record which means we save one atomic kmap.
- *
- * Return 0 on success or -errno on error.
- */
-static int ntfs_commit_pages_after_write(struct page **pages,
- const unsigned nr_pages, s64 pos, size_t bytes)
-{
- s64 end, initialized_size;
- loff_t i_size;
- struct inode *vi;
- ntfs_inode *ni, *base_ni;
- struct page *page;
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- char *kattr, *kaddr;
- unsigned long flags;
- u32 attr_len;
- int err;
-
- BUG_ON(!nr_pages);
- BUG_ON(!pages);
- page = pages[0];
- BUG_ON(!page);
- vi = page->mapping->host;
- ni = NTFS_I(vi);
- ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
- "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
- vi->i_ino, ni->type, page->index, nr_pages,
- (long long)pos, bytes);
- if (NInoNonResident(ni))
- return ntfs_commit_pages_after_non_resident_write(pages,
- nr_pages, pos, bytes);
- BUG_ON(nr_pages > 1);
- /*
- * Attribute is resident, implying it is not compressed, encrypted, or
- * sparse.
- */
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- BUG_ON(NInoNonResident(ni));
- /* Map, pin, and lock the mft record. */
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- ctx = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- a = ctx->attr;
- BUG_ON(a->non_resident);
- /* The total length of the attribute value. */
- attr_len = le32_to_cpu(a->data.resident.value_length);
- i_size = i_size_read(vi);
- BUG_ON(attr_len != i_size);
- BUG_ON(pos > attr_len);
- end = pos + bytes;
- BUG_ON(end > le32_to_cpu(a->length) -
- le16_to_cpu(a->data.resident.value_offset));
- kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
- kaddr = kmap_atomic(page);
- /* Copy the received data from the page to the mft record. */
- memcpy(kattr + pos, kaddr + pos, bytes);
- /* Update the attribute length if necessary. */
- if (end > attr_len) {
- attr_len = end;
- a->data.resident.value_length = cpu_to_le32(attr_len);
- }
- /*
- * If the page is not uptodate, bring the out of bounds area(s)
- * uptodate by copying data from the mft record to the page.
- */
- if (!PageUptodate(page)) {
- if (pos > 0)
- memcpy(kaddr, kattr, pos);
- if (end < attr_len)
- memcpy(kaddr + end, kattr + end, attr_len - end);
- /* Zero the region outside the end of the attribute value. */
- memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len);
- flush_dcache_page(page);
- SetPageUptodate(page);
- }
- kunmap_atomic(kaddr);
- /* Update initialized_size/i_size if necessary. */
- read_lock_irqsave(&ni->size_lock, flags);
- initialized_size = ni->initialized_size;
- BUG_ON(end > ni->allocated_size);
- read_unlock_irqrestore(&ni->size_lock, flags);
- BUG_ON(initialized_size != i_size);
- if (end > initialized_size) {
- write_lock_irqsave(&ni->size_lock, flags);
- ni->initialized_size = end;
- i_size_write(vi, end);
- write_unlock_irqrestore(&ni->size_lock, flags);
- }
- /* Mark the mft record dirty, so it gets written back. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- ntfs_debug("Done.");
- return 0;
-err_out:
- if (err == -ENOMEM) {
- ntfs_warning(vi->i_sb, "Error allocating memory required to "
- "commit the write.");
- if (PageUptodate(page)) {
- ntfs_warning(vi->i_sb, "Page is uptodate, setting "
- "dirty so the write will be retried "
- "later on by the VM.");
- /*
- * Put the page on mapping->dirty_pages, but leave its
- * buffers' dirty state as-is.
- */
- __set_page_dirty_nobuffers(page);
- err = 0;
- } else
- ntfs_error(vi->i_sb, "Page is not uptodate. Written "
- "data has been lost.");
- } else {
- ntfs_error(vi->i_sb, "Resident attribute commit write failed "
- "with error %i.", err);
- NVolSetErrors(ni->vol);
- }
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- return err;
-}
-
-/*
- * Copy as much as we can into the pages and return the number of bytes which
- * were successfully copied. If a fault is encountered then clear the pages
- * out to (ofs + bytes) and return the number of bytes which were copied.
- */
-static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
- unsigned ofs, struct iov_iter *i, size_t bytes)
-{
- struct page **last_page = pages + nr_pages;
- size_t total = 0;
- unsigned len, copied;
-
- do {
- len = PAGE_SIZE - ofs;
- if (len > bytes)
- len = bytes;
- copied = copy_page_from_iter_atomic(*pages, ofs, len, i);
- total += copied;
- bytes -= copied;
- if (!bytes)
- break;
- if (copied < len)
- goto err;
- ofs = 0;
- } while (++pages < last_page);
-out:
- return total;
-err:
- /* Zero the rest of the target like __copy_from_user(). */
- len = PAGE_SIZE - copied;
- do {
- if (len > bytes)
- len = bytes;
- zero_user(*pages, copied, len);
- bytes -= len;
- copied = 0;
- len = PAGE_SIZE;
- } while (++pages < last_page);
- goto out;
-}
-
-/**
- * ntfs_perform_write - perform buffered write to a file
- * @file: file to write to
- * @i: iov_iter with data to write
- * @pos: byte offset in file at which to begin writing to
- */
-static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
- loff_t pos)
-{
- struct address_space *mapping = file->f_mapping;
- struct inode *vi = mapping->host;
- ntfs_inode *ni = NTFS_I(vi);
- ntfs_volume *vol = ni->vol;
- struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
- struct page *cached_page = NULL;
- VCN last_vcn;
- LCN lcn;
- size_t bytes;
- ssize_t status, written = 0;
- unsigned nr_pages;
-
- ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
- "0x%llx, count 0x%lx.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type),
- (unsigned long long)pos,
- (unsigned long)iov_iter_count(i));
- /*
- * If a previous ntfs_truncate() failed, repeat it and abort if it
- * fails again.
- */
- if (unlikely(NInoTruncateFailed(ni))) {
- int err;
-
- inode_dio_wait(vi);
- err = ntfs_truncate(vi);
- if (err || NInoTruncateFailed(ni)) {
- if (!err)
- err = -EIO;
- ntfs_error(vol->sb, "Cannot perform write to inode "
- "0x%lx, attribute type 0x%x, because "
- "ntfs_truncate() failed (error code "
- "%i).", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- return err;
- }
- }
- /*
- * Determine the number of pages per cluster for non-resident
- * attributes.
- */
- nr_pages = 1;
- if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni))
- nr_pages = vol->cluster_size >> PAGE_SHIFT;
- last_vcn = -1;
- do {
- VCN vcn;
- pgoff_t start_idx;
- unsigned ofs, do_pages, u;
- size_t copied;
-
- start_idx = pos >> PAGE_SHIFT;
- ofs = pos & ~PAGE_MASK;
- bytes = PAGE_SIZE - ofs;
- do_pages = 1;
- if (nr_pages > 1) {
- vcn = pos >> vol->cluster_size_bits;
- if (vcn != last_vcn) {
- last_vcn = vcn;
- /*
- * Get the lcn of the vcn the write is in. If
- * it is a hole, need to lock down all pages in
- * the cluster.
- */
- down_read(&ni->runlist.lock);
- lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
- vol->cluster_size_bits, false);
- up_read(&ni->runlist.lock);
- if (unlikely(lcn < LCN_HOLE)) {
- if (lcn == LCN_ENOMEM)
- status = -ENOMEM;
- else {
- status = -EIO;
- ntfs_error(vol->sb, "Cannot "
- "perform write to "
- "inode 0x%lx, "
- "attribute type 0x%x, "
- "because the attribute "
- "is corrupt.",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type));
- }
- break;
- }
- if (lcn == LCN_HOLE) {
- start_idx = (pos & ~(s64)
- vol->cluster_size_mask)
- >> PAGE_SHIFT;
- bytes = vol->cluster_size - (pos &
- vol->cluster_size_mask);
- do_pages = nr_pages;
- }
- }
- }
- if (bytes > iov_iter_count(i))
- bytes = iov_iter_count(i);
-again:
- /*
- * Bring in the user page(s) that we will copy from _first_.
- * Otherwise there is a nasty deadlock on copying from the same
- * page(s) as we are writing to, without it/them being marked
- * up-to-date. Note, at present there is nothing to stop the
- * pages being swapped out between us bringing them into memory
- * and doing the actual copying.
- */
- if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
- status = -EFAULT;
- break;
- }
- /* Get and lock @do_pages starting at index @start_idx. */
- status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
- pages, &cached_page);
- if (unlikely(status))
- break;
- /*
- * For non-resident attributes, we need to fill any holes with
- * actual clusters and ensure all bufferes are mapped. We also
- * need to bring uptodate any buffers that are only partially
- * being written to.
- */
- if (NInoNonResident(ni)) {
- status = ntfs_prepare_pages_for_non_resident_write(
- pages, do_pages, pos, bytes);
- if (unlikely(status)) {
- do {
- unlock_page(pages[--do_pages]);
- put_page(pages[do_pages]);
- } while (do_pages);
- break;
- }
- }
- u = (pos >> PAGE_SHIFT) - pages[0]->index;
- copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
- i, bytes);
- ntfs_flush_dcache_pages(pages + u, do_pages - u);
- status = 0;
- if (likely(copied == bytes)) {
- status = ntfs_commit_pages_after_write(pages, do_pages,
- pos, bytes);
- }
- do {
- unlock_page(pages[--do_pages]);
- put_page(pages[do_pages]);
- } while (do_pages);
- if (unlikely(status < 0)) {
- iov_iter_revert(i, copied);
- break;
- }
- cond_resched();
- if (unlikely(copied < bytes)) {
- iov_iter_revert(i, copied);
- if (copied)
- bytes = copied;
- else if (bytes > PAGE_SIZE - ofs)
- bytes = PAGE_SIZE - ofs;
- goto again;
- }
- pos += copied;
- written += copied;
- balance_dirty_pages_ratelimited(mapping);
- if (fatal_signal_pending(current)) {
- status = -EINTR;
- break;
- }
- } while (iov_iter_count(i));
- if (cached_page)
- put_page(cached_page);
- ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
- written ? "written" : "status", (unsigned long)written,
- (long)status);
- return written ? written : status;
-}
-
-/**
- * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock()
- * @iocb: IO state structure
- * @from: iov_iter with data to write
- *
- * Basically the same as generic_file_write_iter() except that it ends up
- * up calling ntfs_perform_write() instead of generic_perform_write() and that
- * O_DIRECT is not implemented.
- */
-static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct inode *vi = file_inode(file);
- ssize_t written = 0;
- ssize_t err;
-
- inode_lock(vi);
- /* We can write back this queue in page reclaim. */
- err = ntfs_prepare_file_for_write(iocb, from);
- if (iov_iter_count(from) && !err)
- written = ntfs_perform_write(file, from, iocb->ki_pos);
- inode_unlock(vi);
- iocb->ki_pos += written;
- if (likely(written > 0))
- written = generic_write_sync(iocb, written);
- return written ? written : err;
-}
-
-/**
- * ntfs_file_fsync - sync a file to disk
- * @filp: file to be synced
- * @datasync: if non-zero only flush user data and not metadata
- *
- * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
- * system calls. This function is inspired by fs/buffer.c::file_fsync().
- *
- * If @datasync is false, write the mft record and all associated extent mft
- * records as well as the $DATA attribute and then sync the block device.
- *
- * If @datasync is true and the attribute is non-resident, we skip the writing
- * of the mft record and all associated extent mft records (this might still
- * happen due to the write_inode_now() call).
- *
- * Also, if @datasync is true, we do not wait on the inode to be written out
- * but we always wait on the page cache pages to be written out.
- *
- * Locking: Caller must hold i_mutex on the inode.
- *
- * TODO: We should probably also write all attribute/index inodes associated
- * with this inode but since we have no simple way of getting to them we ignore
- * this problem for now.
- */
-static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
- int datasync)
-{
- struct inode *vi = filp->f_mapping->host;
- int err, ret = 0;
-
- ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
-
- err = file_write_and_wait_range(filp, start, end);
- if (err)
- return err;
- inode_lock(vi);
-
- BUG_ON(S_ISDIR(vi->i_mode));
- if (!datasync || !NInoNonResident(NTFS_I(vi)))
- ret = __ntfs_write_inode(vi, 1);
- write_inode_now(vi, !datasync);
- /*
- * NOTE: If we were to use mapping->private_list (see ext2 and
- * fs/buffer.c) for dirty blocks then we could optimize the below to be
- * sync_mapping_buffers(vi->i_mapping).
- */
- err = sync_blockdev(vi->i_sb->s_bdev);
- if (unlikely(err && !ret))
- ret = err;
- if (likely(!ret))
- ntfs_debug("Done.");
- else
- ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
- "%u.", datasync ? "data" : "", vi->i_ino, -ret);
- inode_unlock(vi);
- return ret;
-}
-
-#endif /* NTFS_RW */
-
-const struct file_operations ntfs_file_ops = {
- .llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
-#ifdef NTFS_RW
- .write_iter = ntfs_file_write_iter,
- .fsync = ntfs_file_fsync,
-#endif /* NTFS_RW */
- .mmap = generic_file_mmap,
- .open = ntfs_file_open,
- .splice_read = filemap_splice_read,
-};
-
-const struct inode_operations ntfs_file_inode_ops = {
-#ifdef NTFS_RW
- .setattr = ntfs_setattr,
-#endif /* NTFS_RW */
-};
-
-const struct file_operations ntfs_empty_file_ops = {};
-
-const struct inode_operations ntfs_empty_inode_ops = {};
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
deleted file mode 100644
index d46c2c03a032..000000000000
--- a/fs/ntfs/index.c
+++ /dev/null
@@ -1,440 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * index.c - NTFS kernel index handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#include <linux/slab.h>
-
-#include "aops.h"
-#include "collate.h"
-#include "debug.h"
-#include "index.h"
-#include "ntfs.h"
-
-/**
- * ntfs_index_ctx_get - allocate and initialize a new index context
- * @idx_ni: ntfs index inode with which to initialize the context
- *
- * Allocate a new index context, initialize it with @idx_ni and return it.
- * Return NULL if allocation failed.
- *
- * Locking: Caller must hold i_mutex on the index inode.
- */
-ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
-{
- ntfs_index_context *ictx;
-
- ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS);
- if (ictx)
- *ictx = (ntfs_index_context){ .idx_ni = idx_ni };
- return ictx;
-}
-
-/**
- * ntfs_index_ctx_put - release an index context
- * @ictx: index context to free
- *
- * Release the index context @ictx, releasing all associated resources.
- *
- * Locking: Caller must hold i_mutex on the index inode.
- */
-void ntfs_index_ctx_put(ntfs_index_context *ictx)
-{
- if (ictx->entry) {
- if (ictx->is_in_root) {
- if (ictx->actx)
- ntfs_attr_put_search_ctx(ictx->actx);
- if (ictx->base_ni)
- unmap_mft_record(ictx->base_ni);
- } else {
- struct page *page = ictx->page;
- if (page) {
- BUG_ON(!PageLocked(page));
- unlock_page(page);
- ntfs_unmap_page(page);
- }
- }
- }
- kmem_cache_free(ntfs_index_ctx_cache, ictx);
- return;
-}
-
-/**
- * ntfs_index_lookup - find a key in an index and return its index entry
- * @key: [IN] key for which to search in the index
- * @key_len: [IN] length of @key in bytes
- * @ictx: [IN/OUT] context describing the index and the returned entry
- *
- * Before calling ntfs_index_lookup(), @ictx must have been obtained from a
- * call to ntfs_index_ctx_get().
- *
- * Look for the @key in the index specified by the index lookup context @ictx.
- * ntfs_index_lookup() walks the contents of the index looking for the @key.
- *
- * If the @key is found in the index, 0 is returned and @ictx is setup to
- * describe the index entry containing the matching @key. @ictx->entry is the
- * index entry and @ictx->data and @ictx->data_len are the index entry data and
- * its length in bytes, respectively.
- *
- * If the @key is not found in the index, -ENOENT is returned and @ictx is
- * setup to describe the index entry whose key collates immediately after the
- * search @key, i.e. this is the position in the index at which an index entry
- * with a key of @key would need to be inserted.
- *
- * If an error occurs return the negative error code and @ictx is left
- * untouched.
- *
- * When finished with the entry and its data, call ntfs_index_ctx_put() to free
- * the context and other associated resources.
- *
- * If the index entry was modified, call flush_dcache_index_entry_page()
- * immediately after the modification and either ntfs_index_entry_mark_dirty()
- * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
- * ensure that the changes are written to disk.
- *
- * Locking: - Caller must hold i_mutex on the index inode.
- * - Each page cache page in the index allocation mapping must be
- * locked whilst being accessed otherwise we may find a corrupt
- * page due to it being under ->writepage at the moment which
- * applies the mst protection fixups before writing out and then
- * removes them again after the write is complete after which it
- * unlocks the page.
- */
-int ntfs_index_lookup(const void *key, const int key_len,
- ntfs_index_context *ictx)
-{
- VCN vcn, old_vcn;
- ntfs_inode *idx_ni = ictx->idx_ni;
- ntfs_volume *vol = idx_ni->vol;
- struct super_block *sb = vol->sb;
- ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino;
- MFT_RECORD *m;
- INDEX_ROOT *ir;
- INDEX_ENTRY *ie;
- INDEX_ALLOCATION *ia;
- u8 *index_end, *kaddr;
- ntfs_attr_search_ctx *actx;
- struct address_space *ia_mapping;
- struct page *page;
- int rc, err = 0;
-
- ntfs_debug("Entering.");
- BUG_ON(!NInoAttr(idx_ni));
- BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION);
- BUG_ON(idx_ni->nr_extents != -1);
- BUG_ON(!base_ni);
- BUG_ON(!key);
- BUG_ON(key_len <= 0);
- if (!ntfs_is_collation_rule_supported(
- idx_ni->itype.index.collation_rule)) {
- ntfs_error(sb, "Index uses unsupported collation rule 0x%x. "
- "Aborting lookup.", le32_to_cpu(
- idx_ni->itype.index.collation_rule));
- return -EOPNOTSUPP;
- }
- /* Get hold of the mft record for the index inode. */
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- ntfs_error(sb, "map_mft_record() failed with error code %ld.",
- -PTR_ERR(m));
- return PTR_ERR(m);
- }
- actx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!actx)) {
- err = -ENOMEM;
- goto err_out;
- }
- /* Find the index root attribute in the mft record. */
- err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, actx);
- if (unlikely(err)) {
- if (err == -ENOENT) {
- ntfs_error(sb, "Index root attribute missing in inode "
- "0x%lx.", idx_ni->mft_no);
- err = -EIO;
- }
- goto err_out;
- }
- /* Get to the index root value (it has been verified in read_inode). */
- ir = (INDEX_ROOT*)((u8*)actx->attr +
- le16_to_cpu(actx->attr->data.resident.value_offset));
- index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
- /* The first index entry. */
- ie = (INDEX_ENTRY*)((u8*)&ir->index +
- le32_to_cpu(ir->index.entries_offset));
- /*
- * Loop until we exceed valid memory (corruption case) or until we
- * reach the last entry.
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- /* Bounds checks. */
- if ((u8*)ie < (u8*)actx->mrec || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->length) > index_end)
- goto idx_err_out;
- /*
- * The last entry cannot contain a key. It can however contain
- * a pointer to a child node in the B+tree so we just break out.
- */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /* Further bounds checks. */
- if ((u32)sizeof(INDEX_ENTRY_HEADER) +
- le16_to_cpu(ie->key_length) >
- le16_to_cpu(ie->data.vi.data_offset) ||
- (u32)le16_to_cpu(ie->data.vi.data_offset) +
- le16_to_cpu(ie->data.vi.data_length) >
- le16_to_cpu(ie->length))
- goto idx_err_out;
- /* If the keys match perfectly, we setup @ictx and return 0. */
- if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
- &ie->key, key_len)) {
-ir_done:
- ictx->is_in_root = true;
- ictx->ir = ir;
- ictx->actx = actx;
- ictx->base_ni = base_ni;
- ictx->ia = NULL;
- ictx->page = NULL;
-done:
- ictx->entry = ie;
- ictx->data = (u8*)ie +
- le16_to_cpu(ie->data.vi.data_offset);
- ictx->data_len = le16_to_cpu(ie->data.vi.data_length);
- ntfs_debug("Done.");
- return err;
- }
- /*
- * Not a perfect match, need to do full blown collation so we
- * know which way in the B+tree we have to go.
- */
- rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
- key_len, &ie->key, le16_to_cpu(ie->key_length));
- /*
- * If @key collates before the key of the current entry, there
- * is definitely no such key in this index but we might need to
- * descend into the B+tree so we just break out of the loop.
- */
- if (rc == -1)
- break;
- /*
- * A match should never happen as the memcmp() call should have
- * cought it, but we still treat it correctly.
- */
- if (!rc)
- goto ir_done;
- /* The keys are not equal, continue the search. */
- }
- /*
- * We have finished with this index without success. Check for the
- * presence of a child node and if not present setup @ictx and return
- * -ENOENT.
- */
- if (!(ie->flags & INDEX_ENTRY_NODE)) {
- ntfs_debug("Entry not found.");
- err = -ENOENT;
- goto ir_done;
- } /* Child node present, descend into it. */
- /* Consistency check: Verify that an index allocation exists. */
- if (!NInoIndexAllocPresent(idx_ni)) {
- ntfs_error(sb, "No index allocation attribute but index entry "
- "requires one. Inode 0x%lx is corrupt or "
- "driver bug.", idx_ni->mft_no);
- goto err_out;
- }
- /* Get the starting vcn of the index_block holding the child node. */
- vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
- ia_mapping = VFS_I(idx_ni)->i_mapping;
- /*
- * We are done with the index root and the mft record. Release them,
- * otherwise we deadlock with ntfs_map_page().
- */
- ntfs_attr_put_search_ctx(actx);
- unmap_mft_record(base_ni);
- m = NULL;
- actx = NULL;
-descend_into_child_node:
- /*
- * Convert vcn to index into the index allocation attribute in units
- * of PAGE_SIZE and map the page cache page, reading it from
- * disk if necessary.
- */
- page = ntfs_map_page(ia_mapping, vcn <<
- idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
- if (IS_ERR(page)) {
- ntfs_error(sb, "Failed to map index page, error %ld.",
- -PTR_ERR(page));
- err = PTR_ERR(page);
- goto err_out;
- }
- lock_page(page);
- kaddr = (u8*)page_address(page);
-fast_descend_into_child_node:
- /* Get to the index allocation block. */
- ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
- idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
- /* Bounds checks. */
- if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
- ntfs_error(sb, "Out of bounds check failed. Corrupt inode "
- "0x%lx or driver bug.", idx_ni->mft_no);
- goto unm_err_out;
- }
- /* Catch multi sector transfer fixup errors. */
- if (unlikely(!ntfs_is_indx_record(ia->magic))) {
- ntfs_error(sb, "Index record with vcn 0x%llx is corrupt. "
- "Corrupt inode 0x%lx. Run chkdsk.",
- (long long)vcn, idx_ni->mft_no);
- goto unm_err_out;
- }
- if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
- ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
- "different from expected VCN (0x%llx). Inode "
- "0x%lx is corrupt or driver bug.",
- (unsigned long long)
- sle64_to_cpu(ia->index_block_vcn),
- (unsigned long long)vcn, idx_ni->mft_no);
- goto unm_err_out;
- }
- if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
- idx_ni->itype.index.block_size) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has "
- "a size (%u) differing from the index "
- "specified size (%u). Inode is corrupt or "
- "driver bug.", (unsigned long long)vcn,
- idx_ni->mft_no,
- le32_to_cpu(ia->index.allocated_size) + 0x18,
- idx_ni->itype.index.block_size);
- goto unm_err_out;
- }
- index_end = (u8*)ia + idx_ni->itype.index.block_size;
- if (index_end > kaddr + PAGE_SIZE) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx "
- "crosses page boundary. Impossible! Cannot "
- "access! This is probably a bug in the "
- "driver.", (unsigned long long)vcn,
- idx_ni->mft_no);
- goto unm_err_out;
- }
- index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
- if (index_end > (u8*)ia + idx_ni->itype.index.block_size) {
- ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode "
- "0x%lx exceeds maximum size.",
- (unsigned long long)vcn, idx_ni->mft_no);
- goto unm_err_out;
- }
- /* The first index entry. */
- ie = (INDEX_ENTRY*)((u8*)&ia->index +
- le32_to_cpu(ia->index.entries_offset));
- /*
- * Iterate similar to above big loop but applied to index buffer, thus
- * loop until we exceed valid memory (corruption case) or until we
- * reach the last entry.
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- /* Bounds checks. */
- if ((u8*)ie < (u8*)ia || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->length) > index_end) {
- ntfs_error(sb, "Index entry out of bounds in inode "
- "0x%lx.", idx_ni->mft_no);
- goto unm_err_out;
- }
- /*
- * The last entry cannot contain a key. It can however contain
- * a pointer to a child node in the B+tree so we just break out.
- */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /* Further bounds checks. */
- if ((u32)sizeof(INDEX_ENTRY_HEADER) +
- le16_to_cpu(ie->key_length) >
- le16_to_cpu(ie->data.vi.data_offset) ||
- (u32)le16_to_cpu(ie->data.vi.data_offset) +
- le16_to_cpu(ie->data.vi.data_length) >
- le16_to_cpu(ie->length)) {
- ntfs_error(sb, "Index entry out of bounds in inode "
- "0x%lx.", idx_ni->mft_no);
- goto unm_err_out;
- }
- /* If the keys match perfectly, we setup @ictx and return 0. */
- if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
- &ie->key, key_len)) {
-ia_done:
- ictx->is_in_root = false;
- ictx->actx = NULL;
- ictx->base_ni = NULL;
- ictx->ia = ia;
- ictx->page = page;
- goto done;
- }
- /*
- * Not a perfect match, need to do full blown collation so we
- * know which way in the B+tree we have to go.
- */
- rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
- key_len, &ie->key, le16_to_cpu(ie->key_length));
- /*
- * If @key collates before the key of the current entry, there
- * is definitely no such key in this index but we might need to
- * descend into the B+tree so we just break out of the loop.
- */
- if (rc == -1)
- break;
- /*
- * A match should never happen as the memcmp() call should have
- * cought it, but we still treat it correctly.
- */
- if (!rc)
- goto ia_done;
- /* The keys are not equal, continue the search. */
- }
- /*
- * We have finished with this index buffer without success. Check for
- * the presence of a child node and if not present return -ENOENT.
- */
- if (!(ie->flags & INDEX_ENTRY_NODE)) {
- ntfs_debug("Entry not found.");
- err = -ENOENT;
- goto ia_done;
- }
- if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
- ntfs_error(sb, "Index entry with child node found in a leaf "
- "node in inode 0x%lx.", idx_ni->mft_no);
- goto unm_err_out;
- }
- /* Child node present, descend into it. */
- old_vcn = vcn;
- vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
- if (vcn >= 0) {
- /*
- * If vcn is in the same page cache page as old_vcn we recycle
- * the mapped page.
- */
- if (old_vcn << vol->cluster_size_bits >>
- PAGE_SHIFT == vcn <<
- vol->cluster_size_bits >>
- PAGE_SHIFT)
- goto fast_descend_into_child_node;
- unlock_page(page);
- ntfs_unmap_page(page);
- goto descend_into_child_node;
- }
- ntfs_error(sb, "Negative child node vcn in inode 0x%lx.",
- idx_ni->mft_no);
-unm_err_out:
- unlock_page(page);
- ntfs_unmap_page(page);
-err_out:
- if (!err)
- err = -EIO;
- if (actx)
- ntfs_attr_put_search_ctx(actx);
- if (m)
- unmap_mft_record(base_ni);
- return err;
-idx_err_out:
- ntfs_error(sb, "Corrupt index. Aborting lookup.");
- goto err_out;
-}
diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h
deleted file mode 100644
index bb3c3ae55138..000000000000
--- a/fs/ntfs/index.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * index.h - Defines for NTFS kernel index handling. Part of the Linux-NTFS
- * project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_INDEX_H
-#define _LINUX_NTFS_INDEX_H
-
-#include <linux/fs.h>
-
-#include "types.h"
-#include "layout.h"
-#include "inode.h"
-#include "attrib.h"
-#include "mft.h"
-#include "aops.h"
-
-/**
- * @idx_ni: index inode containing the @entry described by this context
- * @entry: index entry (points into @ir or @ia)
- * @data: index entry data (points into @entry)
- * @data_len: length in bytes of @data
- * @is_in_root: 'true' if @entry is in @ir and 'false' if it is in @ia
- * @ir: index root if @is_in_root and NULL otherwise
- * @actx: attribute search context if @is_in_root and NULL otherwise
- * @base_ni: base inode if @is_in_root and NULL otherwise
- * @ia: index block if @is_in_root is 'false' and NULL otherwise
- * @page: page if @is_in_root is 'false' and NULL otherwise
- *
- * @idx_ni is the index inode this context belongs to.
- *
- * @entry is the index entry described by this context. @data and @data_len
- * are the index entry data and its length in bytes, respectively. @data
- * simply points into @entry. This is probably what the user is interested in.
- *
- * If @is_in_root is 'true', @entry is in the index root attribute @ir described
- * by the attribute search context @actx and the base inode @base_ni. @ia and
- * @page are NULL in this case.
- *
- * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia
- * and @page point to the index allocation block and the mapped, locked page it
- * is in, respectively. @ir, @actx and @base_ni are NULL in this case.
- *
- * To obtain a context call ntfs_index_ctx_get().
- *
- * We use this context to allow ntfs_index_lookup() to return the found index
- * @entry and its @data without having to allocate a buffer and copy the @entry
- * and/or its @data into it.
- *
- * When finished with the @entry and its @data, call ntfs_index_ctx_put() to
- * free the context and other associated resources.
- *
- * If the index entry was modified, call flush_dcache_index_entry_page()
- * immediately after the modification and either ntfs_index_entry_mark_dirty()
- * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
- * ensure that the changes are written to disk.
- */
-typedef struct {
- ntfs_inode *idx_ni;
- INDEX_ENTRY *entry;
- void *data;
- u16 data_len;
- bool is_in_root;
- INDEX_ROOT *ir;
- ntfs_attr_search_ctx *actx;
- ntfs_inode *base_ni;
- INDEX_ALLOCATION *ia;
- struct page *page;
-} ntfs_index_context;
-
-extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni);
-extern void ntfs_index_ctx_put(ntfs_index_context *ictx);
-
-extern int ntfs_index_lookup(const void *key, const int key_len,
- ntfs_index_context *ictx);
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries
- * @ictx: ntfs index context describing the index entry
- *
- * Call flush_dcache_page() for the page in which an index entry resides.
- *
- * This must be called every time an index entry is modified, just after the
- * modification.
- *
- * If the index entry is in the index root attribute, simply flush the page
- * containing the mft record containing the index root attribute.
- *
- * If the index entry is in an index block belonging to the index allocation
- * attribute, simply flush the page cache page containing the index block.
- */
-static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx)
-{
- if (ictx->is_in_root)
- flush_dcache_mft_record_page(ictx->actx->ntfs_ino);
- else
- flush_dcache_page(ictx->page);
-}
-
-/**
- * ntfs_index_entry_mark_dirty - mark an index entry dirty
- * @ictx: ntfs index context describing the index entry
- *
- * Mark the index entry described by the index entry context @ictx dirty.
- *
- * If the index entry is in the index root attribute, simply mark the mft
- * record containing the index root attribute dirty. This ensures the mft
- * record, and hence the index root attribute, will be written out to disk
- * later.
- *
- * If the index entry is in an index block belonging to the index allocation
- * attribute, mark the buffers belonging to the index record as well as the
- * page cache page the index block is in dirty. This automatically marks the
- * VFS inode of the ntfs index inode to which the index entry belongs dirty,
- * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the
- * dirty index block, will be written out to disk later.
- */
-static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx)
-{
- if (ictx->is_in_root)
- mark_mft_record_dirty(ictx->actx->ntfs_ino);
- else
- mark_ntfs_record_dirty(ictx->page,
- (u8*)ictx->ia - (u8*)page_address(ictx->page));
-}
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_INDEX_H */
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
deleted file mode 100644
index aba1e22db4e9..000000000000
--- a/fs/ntfs/inode.c
+++ /dev/null
@@ -1,3102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * inode.c - NTFS kernel inode handling.
- *
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
- */
-
-#include <linux/buffer_head.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/mount.h>
-#include <linux/mutex.h>
-#include <linux/pagemap.h>
-#include <linux/quotaops.h>
-#include <linux/slab.h>
-#include <linux/log2.h>
-
-#include "aops.h"
-#include "attrib.h"
-#include "bitmap.h"
-#include "dir.h"
-#include "debug.h"
-#include "inode.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "time.h"
-#include "ntfs.h"
-
-/**
- * ntfs_test_inode - compare two (possibly fake) inodes for equality
- * @vi: vfs inode which to test
- * @data: data which is being tested with
- *
- * Compare the ntfs attribute embedded in the ntfs specific part of the vfs
- * inode @vi for equality with the ntfs attribute @data.
- *
- * If searching for the normal file/directory inode, set @na->type to AT_UNUSED.
- * @na->name and @na->name_len are then ignored.
- *
- * Return 1 if the attributes match and 0 if not.
- *
- * NOTE: This function runs with the inode_hash_lock spin lock held so it is not
- * allowed to sleep.
- */
-int ntfs_test_inode(struct inode *vi, void *data)
-{
- ntfs_attr *na = (ntfs_attr *)data;
- ntfs_inode *ni;
-
- if (vi->i_ino != na->mft_no)
- return 0;
- ni = NTFS_I(vi);
- /* If !NInoAttr(ni), @vi is a normal file or directory inode. */
- if (likely(!NInoAttr(ni))) {
- /* If not looking for a normal inode this is a mismatch. */
- if (unlikely(na->type != AT_UNUSED))
- return 0;
- } else {
- /* A fake inode describing an attribute. */
- if (ni->type != na->type)
- return 0;
- if (ni->name_len != na->name_len)
- return 0;
- if (na->name_len && memcmp(ni->name, na->name,
- na->name_len * sizeof(ntfschar)))
- return 0;
- }
- /* Match! */
- return 1;
-}
-
-/**
- * ntfs_init_locked_inode - initialize an inode
- * @vi: vfs inode to initialize
- * @data: data which to initialize @vi to
- *
- * Initialize the vfs inode @vi with the values from the ntfs attribute @data in
- * order to enable ntfs_test_inode() to do its work.
- *
- * If initializing the normal file/directory inode, set @na->type to AT_UNUSED.
- * In that case, @na->name and @na->name_len should be set to NULL and 0,
- * respectively. Although that is not strictly necessary as
- * ntfs_read_locked_inode() will fill them in later.
- *
- * Return 0 on success and -errno on error.
- *
- * NOTE: This function runs with the inode->i_lock spin lock held so it is not
- * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
- */
-static int ntfs_init_locked_inode(struct inode *vi, void *data)
-{
- ntfs_attr *na = (ntfs_attr *)data;
- ntfs_inode *ni = NTFS_I(vi);
-
- vi->i_ino = na->mft_no;
-
- ni->type = na->type;
- if (na->type == AT_INDEX_ALLOCATION)
- NInoSetMstProtected(ni);
-
- ni->name = na->name;
- ni->name_len = na->name_len;
-
- /* If initializing a normal inode, we are done. */
- if (likely(na->type == AT_UNUSED)) {
- BUG_ON(na->name);
- BUG_ON(na->name_len);
- return 0;
- }
-
- /* It is a fake inode. */
- NInoSetAttr(ni);
-
- /*
- * We have I30 global constant as an optimization as it is the name
- * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC
- * allocation but that is ok. And most attributes are unnamed anyway,
- * thus the fraction of named attributes with name != I30 is actually
- * absolutely tiny.
- */
- if (na->name_len && na->name != I30) {
- unsigned int i;
-
- BUG_ON(!na->name);
- i = na->name_len * sizeof(ntfschar);
- ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC);
- if (!ni->name)
- return -ENOMEM;
- memcpy(ni->name, na->name, i);
- ni->name[na->name_len] = 0;
- }
- return 0;
-}
-
-static int ntfs_read_locked_inode(struct inode *vi);
-static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi);
-static int ntfs_read_locked_index_inode(struct inode *base_vi,
- struct inode *vi);
-
-/**
- * ntfs_iget - obtain a struct inode corresponding to a specific normal inode
- * @sb: super block of mounted volume
- * @mft_no: mft record number / inode number to obtain
- *
- * Obtain the struct inode corresponding to a specific normal inode (i.e. a
- * file or directory).
- *
- * If the inode is in the cache, it is just returned with an increased
- * reference count. Otherwise, a new struct inode is allocated and initialized,
- * and finally ntfs_read_locked_inode() is called to read in the inode and
- * fill in the remainder of the inode structure.
- *
- * Return the struct inode on success. Check the return value with IS_ERR() and
- * if true, the function failed and the error code is obtained from PTR_ERR().
- */
-struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no)
-{
- struct inode *vi;
- int err;
- ntfs_attr na;
-
- na.mft_no = mft_no;
- na.type = AT_UNUSED;
- na.name = NULL;
- na.name_len = 0;
-
- vi = iget5_locked(sb, mft_no, ntfs_test_inode,
- ntfs_init_locked_inode, &na);
- if (unlikely(!vi))
- return ERR_PTR(-ENOMEM);
-
- err = 0;
-
- /* If this is a freshly allocated inode, need to read it now. */
- if (vi->i_state & I_NEW) {
- err = ntfs_read_locked_inode(vi);
- unlock_new_inode(vi);
- }
- /*
- * There is no point in keeping bad inodes around if the failure was
- * due to ENOMEM. We want to be able to retry again later.
- */
- if (unlikely(err == -ENOMEM)) {
- iput(vi);
- vi = ERR_PTR(err);
- }
- return vi;
-}
-
-/**
- * ntfs_attr_iget - obtain a struct inode corresponding to an attribute
- * @base_vi: vfs base inode containing the attribute
- * @type: attribute type
- * @name: Unicode name of the attribute (NULL if unnamed)
- * @name_len: length of @name in Unicode characters (0 if unnamed)
- *
- * Obtain the (fake) struct inode corresponding to the attribute specified by
- * @type, @name, and @name_len, which is present in the base mft record
- * specified by the vfs inode @base_vi.
- *
- * If the attribute inode is in the cache, it is just returned with an
- * increased reference count. Otherwise, a new struct inode is allocated and
- * initialized, and finally ntfs_read_locked_attr_inode() is called to read the
- * attribute and fill in the inode structure.
- *
- * Note, for index allocation attributes, you need to use ntfs_index_iget()
- * instead of ntfs_attr_iget() as working with indices is a lot more complex.
- *
- * Return the struct inode of the attribute inode on success. Check the return
- * value with IS_ERR() and if true, the function failed and the error code is
- * obtained from PTR_ERR().
- */
-struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
- ntfschar *name, u32 name_len)
-{
- struct inode *vi;
- int err;
- ntfs_attr na;
-
- /* Make sure no one calls ntfs_attr_iget() for indices. */
- BUG_ON(type == AT_INDEX_ALLOCATION);
-
- na.mft_no = base_vi->i_ino;
- na.type = type;
- na.name = name;
- na.name_len = name_len;
-
- vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
- ntfs_init_locked_inode, &na);
- if (unlikely(!vi))
- return ERR_PTR(-ENOMEM);
-
- err = 0;
-
- /* If this is a freshly allocated inode, need to read it now. */
- if (vi->i_state & I_NEW) {
- err = ntfs_read_locked_attr_inode(base_vi, vi);
- unlock_new_inode(vi);
- }
- /*
- * There is no point in keeping bad attribute inodes around. This also
- * simplifies things in that we never need to check for bad attribute
- * inodes elsewhere.
- */
- if (unlikely(err)) {
- iput(vi);
- vi = ERR_PTR(err);
- }
- return vi;
-}
-
-/**
- * ntfs_index_iget - obtain a struct inode corresponding to an index
- * @base_vi: vfs base inode containing the index related attributes
- * @name: Unicode name of the index
- * @name_len: length of @name in Unicode characters
- *
- * Obtain the (fake) struct inode corresponding to the index specified by @name
- * and @name_len, which is present in the base mft record specified by the vfs
- * inode @base_vi.
- *
- * If the index inode is in the cache, it is just returned with an increased
- * reference count. Otherwise, a new struct inode is allocated and
- * initialized, and finally ntfs_read_locked_index_inode() is called to read
- * the index related attributes and fill in the inode structure.
- *
- * Return the struct inode of the index inode on success. Check the return
- * value with IS_ERR() and if true, the function failed and the error code is
- * obtained from PTR_ERR().
- */
-struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
- u32 name_len)
-{
- struct inode *vi;
- int err;
- ntfs_attr na;
-
- na.mft_no = base_vi->i_ino;
- na.type = AT_INDEX_ALLOCATION;
- na.name = name;
- na.name_len = name_len;
-
- vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
- ntfs_init_locked_inode, &na);
- if (unlikely(!vi))
- return ERR_PTR(-ENOMEM);
-
- err = 0;
-
- /* If this is a freshly allocated inode, need to read it now. */
- if (vi->i_state & I_NEW) {
- err = ntfs_read_locked_index_inode(base_vi, vi);
- unlock_new_inode(vi);
- }
- /*
- * There is no point in keeping bad index inodes around. This also
- * simplifies things in that we never need to check for bad index
- * inodes elsewhere.
- */
- if (unlikely(err)) {
- iput(vi);
- vi = ERR_PTR(err);
- }
- return vi;
-}
-
-struct inode *ntfs_alloc_big_inode(struct super_block *sb)
-{
- ntfs_inode *ni;
-
- ntfs_debug("Entering.");
- ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS);
- if (likely(ni != NULL)) {
- ni->state = 0;
- return VFS_I(ni);
- }
- ntfs_error(sb, "Allocation of NTFS big inode structure failed.");
- return NULL;
-}
-
-void ntfs_free_big_inode(struct inode *inode)
-{
- kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
-}
-
-static inline ntfs_inode *ntfs_alloc_extent_inode(void)
-{
- ntfs_inode *ni;
-
- ntfs_debug("Entering.");
- ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS);
- if (likely(ni != NULL)) {
- ni->state = 0;
- return ni;
- }
- ntfs_error(NULL, "Allocation of NTFS inode structure failed.");
- return NULL;
-}
-
-static void ntfs_destroy_extent_inode(ntfs_inode *ni)
-{
- ntfs_debug("Entering.");
- BUG_ON(ni->page);
- if (!atomic_dec_and_test(&ni->count))
- BUG();
- kmem_cache_free(ntfs_inode_cache, ni);
-}
-
-/*
- * The attribute runlist lock has separate locking rules from the
- * normal runlist lock, so split the two lock-classes:
- */
-static struct lock_class_key attr_list_rl_lock_class;
-
-/**
- * __ntfs_init_inode - initialize ntfs specific part of an inode
- * @sb: super block of mounted volume
- * @ni: freshly allocated ntfs inode which to initialize
- *
- * Initialize an ntfs inode to defaults.
- *
- * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left
- * untouched. Make sure to initialize them elsewhere.
- *
- * Return zero on success and -ENOMEM on error.
- */
-void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni)
-{
- ntfs_debug("Entering.");
- rwlock_init(&ni->size_lock);
- ni->initialized_size = ni->allocated_size = 0;
- ni->seq_no = 0;
- atomic_set(&ni->count, 1);
- ni->vol = NTFS_SB(sb);
- ntfs_init_runlist(&ni->runlist);
- mutex_init(&ni->mrec_lock);
- ni->page = NULL;
- ni->page_ofs = 0;
- ni->attr_list_size = 0;
- ni->attr_list = NULL;
- ntfs_init_runlist(&ni->attr_list_rl);
- lockdep_set_class(&ni->attr_list_rl.lock,
- &attr_list_rl_lock_class);
- ni->itype.index.block_size = 0;
- ni->itype.index.vcn_size = 0;
- ni->itype.index.collation_rule = 0;
- ni->itype.index.block_size_bits = 0;
- ni->itype.index.vcn_size_bits = 0;
- mutex_init(&ni->extent_lock);
- ni->nr_extents = 0;
- ni->ext.base_ntfs_ino = NULL;
-}
-
-/*
- * Extent inodes get MFT-mapped in a nested way, while the base inode
- * is still mapped. Teach this nesting to the lock validator by creating
- * a separate class for nested inode's mrec_lock's:
- */
-static struct lock_class_key extent_inode_mrec_lock_key;
-
-inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
- unsigned long mft_no)
-{
- ntfs_inode *ni = ntfs_alloc_extent_inode();
-
- ntfs_debug("Entering.");
- if (likely(ni != NULL)) {
- __ntfs_init_inode(sb, ni);
- lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key);
- ni->mft_no = mft_no;
- ni->type = AT_UNUSED;
- ni->name = NULL;
- ni->name_len = 0;
- }
- return ni;
-}
-
-/**
- * ntfs_is_extended_system_file - check if a file is in the $Extend directory
- * @ctx: initialized attribute search context
- *
- * Search all file name attributes in the inode described by the attribute
- * search context @ctx and check if any of the names are in the $Extend system
- * directory.
- *
- * Return values:
- * 1: file is in $Extend directory
- * 0: file is not in $Extend directory
- * -errno: failed to determine if the file is in the $Extend directory
- */
-static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx)
-{
- int nr_links, err;
-
- /* Restart search. */
- ntfs_attr_reinit_search_ctx(ctx);
-
- /* Get number of hard links. */
- nr_links = le16_to_cpu(ctx->mrec->link_count);
-
- /* Loop through all hard links. */
- while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0,
- ctx))) {
- FILE_NAME_ATTR *file_name_attr;
- ATTR_RECORD *attr = ctx->attr;
- u8 *p, *p2;
-
- nr_links--;
- /*
- * Maximum sanity checking as we are called on an inode that
- * we suspect might be corrupt.
- */
- p = (u8*)attr + le32_to_cpu(attr->length);
- if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec +
- le32_to_cpu(ctx->mrec->bytes_in_use)) {
-err_corrupt_attr:
- ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name "
- "attribute. You should run chkdsk.");
- return -EIO;
- }
- if (attr->non_resident) {
- ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file "
- "name. You should run chkdsk.");
- return -EIO;
- }
- if (attr->flags) {
- ntfs_error(ctx->ntfs_ino->vol->sb, "File name with "
- "invalid flags. You should run "
- "chkdsk.");
- return -EIO;
- }
- if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) {
- ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file "
- "name. You should run chkdsk.");
- return -EIO;
- }
- file_name_attr = (FILE_NAME_ATTR*)((u8*)attr +
- le16_to_cpu(attr->data.resident.value_offset));
- p2 = (u8 *)file_name_attr + le32_to_cpu(attr->data.resident.value_length);
- if (p2 < (u8*)attr || p2 > p)
- goto err_corrupt_attr;
- /* This attribute is ok, but is it in the $Extend directory? */
- if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend)
- return 1; /* YES, it's an extended system file. */
- }
- if (unlikely(err != -ENOENT))
- return err;
- if (unlikely(nr_links)) {
- ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count "
- "doesn't match number of name attributes. You "
- "should run chkdsk.");
- return -EIO;
- }
- return 0; /* NO, it is not an extended system file. */
-}
-
-/**
- * ntfs_read_locked_inode - read an inode from its device
- * @vi: inode to read
- *
- * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode
- * described by @vi into memory from the device.
- *
- * The only fields in @vi that we need to/can look at when the function is
- * called are i_sb, pointing to the mounted device's super block, and i_ino,
- * the number of the inode to load.
- *
- * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino
- * for reading and sets up the necessary @vi fields as well as initializing
- * the ntfs inode.
- *
- * Q: What locks are held when the function is called?
- * A: i_state has I_NEW set, hence the inode is locked, also
- * i_count is set to 1, so it is not going to go away
- * i_flags is set to 0 and we have no business touching it. Only an ioctl()
- * is allowed to write to them. We should of course be honouring them but
- * we need to do that using the IS_* macros defined in include/linux/fs.h.
- * In any case ntfs_read_locked_inode() has nothing to do with i_flags.
- *
- * Return 0 on success and -errno on error. In the error case, the inode will
- * have had make_bad_inode() executed on it.
- */
-static int ntfs_read_locked_inode(struct inode *vi)
-{
- ntfs_volume *vol = NTFS_SB(vi->i_sb);
- ntfs_inode *ni;
- struct inode *bvi;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- STANDARD_INFORMATION *si;
- ntfs_attr_search_ctx *ctx;
- int err = 0;
-
- ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
-
- /* Setup the generic vfs inode parts now. */
- vi->i_uid = vol->uid;
- vi->i_gid = vol->gid;
- vi->i_mode = 0;
-
- /*
- * Initialize the ntfs specific part of @vi special casing
- * FILE_MFT which we need to do at mount time.
- */
- if (vi->i_ino != FILE_MFT)
- ntfs_init_big_inode(vi);
- ni = NTFS_I(vi);
-
- m = map_mft_record(ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(ni, m);
- if (!ctx) {
- err = -ENOMEM;
- goto unm_err_out;
- }
-
- if (!(m->flags & MFT_RECORD_IN_USE)) {
- ntfs_error(vi->i_sb, "Inode is not in use!");
- goto unm_err_out;
- }
- if (m->base_mft_record) {
- ntfs_error(vi->i_sb, "Inode is an extent inode!");
- goto unm_err_out;
- }
-
- /* Transfer information from mft record into vfs and ntfs inodes. */
- vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
-
- /*
- * FIXME: Keep in mind that link_count is two for files which have both
- * a long file name and a short file name as separate entries, so if
- * we are hiding short file names this will be too high. Either we need
- * to account for the short file names by subtracting them or we need
- * to make sure we delete files even though i_nlink is not zero which
- * might be tricky due to vfs interactions. Need to think about this
- * some more when implementing the unlink command.
- */
- set_nlink(vi, le16_to_cpu(m->link_count));
- /*
- * FIXME: Reparse points can have the directory bit set even though
- * they would be S_IFLNK. Need to deal with this further below when we
- * implement reparse points / symbolic links but it will do for now.
- * Also if not a directory, it could be something else, rather than
- * a regular file. But again, will do for now.
- */
- /* Everyone gets all permissions. */
- vi->i_mode |= S_IRWXUGO;
- /* If read-only, no one gets write permissions. */
- if (IS_RDONLY(vi))
- vi->i_mode &= ~S_IWUGO;
- if (m->flags & MFT_RECORD_IS_DIRECTORY) {
- vi->i_mode |= S_IFDIR;
- /*
- * Apply the directory permissions mask set in the mount
- * options.
- */
- vi->i_mode &= ~vol->dmask;
- /* Things break without this kludge! */
- if (vi->i_nlink > 1)
- set_nlink(vi, 1);
- } else {
- vi->i_mode |= S_IFREG;
- /* Apply the file permissions mask set in the mount options. */
- vi->i_mode &= ~vol->fmask;
- }
- /*
- * Find the standard information attribute in the mft record. At this
- * stage we haven't setup the attribute list stuff yet, so this could
- * in fact fail if the standard information is in an extent record, but
- * I don't think this actually ever happens.
- */
- err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0,
- ctx);
- if (unlikely(err)) {
- if (err == -ENOENT) {
- /*
- * TODO: We should be performing a hot fix here (if the
- * recover mount option is set) by creating a new
- * attribute.
- */
- ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute "
- "is missing.");
- }
- goto unm_err_out;
- }
- a = ctx->attr;
- /* Get the standard information attribute value. */
- if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset)
- + le32_to_cpu(a->data.resident.value_length) >
- (u8 *)ctx->mrec + vol->mft_record_size) {
- ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode.");
- goto unm_err_out;
- }
- si = (STANDARD_INFORMATION*)((u8*)a +
- le16_to_cpu(a->data.resident.value_offset));
-
- /* Transfer information from the standard information into vi. */
- /*
- * Note: The i_?times do not quite map perfectly onto the NTFS times,
- * but they are close enough, and in the end it doesn't really matter
- * that much...
- */
- /*
- * mtime is the last change of the data within the file. Not changed
- * when only metadata is changed, e.g. a rename doesn't affect mtime.
- */
- inode_set_mtime_to_ts(vi, ntfs2utc(si->last_data_change_time));
- /*
- * ctime is the last change of the metadata of the file. This obviously
- * always changes, when mtime is changed. ctime can be changed on its
- * own, mtime is then not changed, e.g. when a file is renamed.
- */
- inode_set_ctime_to_ts(vi, ntfs2utc(si->last_mft_change_time));
- /*
- * Last access to the data within the file. Not changed during a rename
- * for example but changed whenever the file is written to.
- */
- inode_set_atime_to_ts(vi, ntfs2utc(si->last_access_time));
-
- /* Find the attribute list attribute if present. */
- ntfs_attr_reinit_search_ctx(ctx);
- err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
- if (err) {
- if (unlikely(err != -ENOENT)) {
- ntfs_error(vi->i_sb, "Failed to lookup attribute list "
- "attribute.");
- goto unm_err_out;
- }
- } else /* if (!err) */ {
- if (vi->i_ino == FILE_MFT)
- goto skip_attr_list_load;
- ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino);
- NInoSetAttrList(ni);
- a = ctx->attr;
- if (a->flags & ATTR_COMPRESSION_MASK) {
- ntfs_error(vi->i_sb, "Attribute list attribute is "
- "compressed.");
- goto unm_err_out;
- }
- if (a->flags & ATTR_IS_ENCRYPTED ||
- a->flags & ATTR_IS_SPARSE) {
- if (a->non_resident) {
- ntfs_error(vi->i_sb, "Non-resident attribute "
- "list attribute is encrypted/"
- "sparse.");
- goto unm_err_out;
- }
- ntfs_warning(vi->i_sb, "Resident attribute list "
- "attribute in inode 0x%lx is marked "
- "encrypted/sparse which is not true. "
- "However, Windows allows this and "
- "chkdsk does not detect or correct it "
- "so we will just ignore the invalid "
- "flags and pretend they are not set.",
- vi->i_ino);
- }
- /* Now allocate memory for the attribute list. */
- ni->attr_list_size = (u32)ntfs_attr_size(a);
- ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
- if (!ni->attr_list) {
- ntfs_error(vi->i_sb, "Not enough memory to allocate "
- "buffer for attribute list.");
- err = -ENOMEM;
- goto unm_err_out;
- }
- if (a->non_resident) {
- NInoSetAttrListNonResident(ni);
- if (a->data.non_resident.lowest_vcn) {
- ntfs_error(vi->i_sb, "Attribute list has non "
- "zero lowest_vcn.");
- goto unm_err_out;
- }
- /*
- * Setup the runlist. No need for locking as we have
- * exclusive access to the inode at this time.
- */
- ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
- a, NULL);
- if (IS_ERR(ni->attr_list_rl.rl)) {
- err = PTR_ERR(ni->attr_list_rl.rl);
- ni->attr_list_rl.rl = NULL;
- ntfs_error(vi->i_sb, "Mapping pairs "
- "decompression failed.");
- goto unm_err_out;
- }
- /* Now load the attribute list. */
- if ((err = load_attribute_list(vol, &ni->attr_list_rl,
- ni->attr_list, ni->attr_list_size,
- sle64_to_cpu(a->data.non_resident.
- initialized_size)))) {
- ntfs_error(vi->i_sb, "Failed to load "
- "attribute list attribute.");
- goto unm_err_out;
- }
- } else /* if (!a->non_resident) */ {
- if ((u8*)a + le16_to_cpu(a->data.resident.value_offset)
- + le32_to_cpu(
- a->data.resident.value_length) >
- (u8*)ctx->mrec + vol->mft_record_size) {
- ntfs_error(vi->i_sb, "Corrupt attribute list "
- "in inode.");
- goto unm_err_out;
- }
- /* Now copy the attribute list. */
- memcpy(ni->attr_list, (u8*)a + le16_to_cpu(
- a->data.resident.value_offset),
- le32_to_cpu(
- a->data.resident.value_length));
- }
- }
-skip_attr_list_load:
- /*
- * If an attribute list is present we now have the attribute list value
- * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes.
- */
- if (S_ISDIR(vi->i_mode)) {
- loff_t bvi_size;
- ntfs_inode *bni;
- INDEX_ROOT *ir;
- u8 *ir_end, *index_end;
-
- /* It is a directory, find index root attribute. */
- ntfs_attr_reinit_search_ctx(ctx);
- err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE,
- 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT) {
- // FIXME: File is corrupt! Hot-fix with empty
- // index root attribute if recovery option is
- // set.
- ntfs_error(vi->i_sb, "$INDEX_ROOT attribute "
- "is missing.");
- }
- goto unm_err_out;
- }
- a = ctx->attr;
- /* Set up the state. */
- if (unlikely(a->non_resident)) {
- ntfs_error(vol->sb, "$INDEX_ROOT attribute is not "
- "resident.");
- goto unm_err_out;
- }
- /* Ensure the attribute name is placed before the value. */
- if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
- le16_to_cpu(a->data.resident.value_offset)))) {
- ntfs_error(vol->sb, "$INDEX_ROOT attribute name is "
- "placed after the attribute value.");
- goto unm_err_out;
- }
- /*
- * Compressed/encrypted index root just means that the newly
- * created files in that directory should be created compressed/
- * encrypted. However index root cannot be both compressed and
- * encrypted.
- */
- if (a->flags & ATTR_COMPRESSION_MASK)
- NInoSetCompressed(ni);
- if (a->flags & ATTR_IS_ENCRYPTED) {
- if (a->flags & ATTR_COMPRESSION_MASK) {
- ntfs_error(vi->i_sb, "Found encrypted and "
- "compressed attribute.");
- goto unm_err_out;
- }
- NInoSetEncrypted(ni);
- }
- if (a->flags & ATTR_IS_SPARSE)
- NInoSetSparse(ni);
- ir = (INDEX_ROOT*)((u8*)a +
- le16_to_cpu(a->data.resident.value_offset));
- ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length);
- if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
- ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
- "corrupt.");
- goto unm_err_out;
- }
- index_end = (u8*)&ir->index +
- le32_to_cpu(ir->index.index_length);
- if (index_end > ir_end) {
- ntfs_error(vi->i_sb, "Directory index is corrupt.");
- goto unm_err_out;
- }
- if (ir->type != AT_FILE_NAME) {
- ntfs_error(vi->i_sb, "Indexed attribute is not "
- "$FILE_NAME.");
- goto unm_err_out;
- }
- if (ir->collation_rule != COLLATION_FILE_NAME) {
- ntfs_error(vi->i_sb, "Index collation rule is not "
- "COLLATION_FILE_NAME.");
- goto unm_err_out;
- }
- ni->itype.index.collation_rule = ir->collation_rule;
- ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
- if (ni->itype.index.block_size &
- (ni->itype.index.block_size - 1)) {
- ntfs_error(vi->i_sb, "Index block size (%u) is not a "
- "power of two.",
- ni->itype.index.block_size);
- goto unm_err_out;
- }
- if (ni->itype.index.block_size > PAGE_SIZE) {
- ntfs_error(vi->i_sb, "Index block size (%u) > "
- "PAGE_SIZE (%ld) is not "
- "supported. Sorry.",
- ni->itype.index.block_size,
- PAGE_SIZE);
- err = -EOPNOTSUPP;
- goto unm_err_out;
- }
- if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
- ntfs_error(vi->i_sb, "Index block size (%u) < "
- "NTFS_BLOCK_SIZE (%i) is not "
- "supported. Sorry.",
- ni->itype.index.block_size,
- NTFS_BLOCK_SIZE);
- err = -EOPNOTSUPP;
- goto unm_err_out;
- }
- ni->itype.index.block_size_bits =
- ffs(ni->itype.index.block_size) - 1;
- /* Determine the size of a vcn in the directory index. */
- if (vol->cluster_size <= ni->itype.index.block_size) {
- ni->itype.index.vcn_size = vol->cluster_size;
- ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
- } else {
- ni->itype.index.vcn_size = vol->sector_size;
- ni->itype.index.vcn_size_bits = vol->sector_size_bits;
- }
-
- /* Setup the index allocation attribute, even if not present. */
- NInoSetMstProtected(ni);
- ni->type = AT_INDEX_ALLOCATION;
- ni->name = I30;
- ni->name_len = 4;
-
- if (!(ir->index.flags & LARGE_INDEX)) {
- /* No index allocation. */
- vi->i_size = ni->initialized_size =
- ni->allocated_size = 0;
- /* We are done with the mft record, so we release it. */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
- m = NULL;
- ctx = NULL;
- goto skip_large_dir_stuff;
- } /* LARGE_INDEX: Index allocation present. Setup state. */
- NInoSetIndexAllocPresent(ni);
- /* Find index allocation attribute. */
- ntfs_attr_reinit_search_ctx(ctx);
- err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION "
- "attribute is not present but "
- "$INDEX_ROOT indicated it is.");
- else
- ntfs_error(vi->i_sb, "Failed to lookup "
- "$INDEX_ALLOCATION "
- "attribute.");
- goto unm_err_out;
- }
- a = ctx->attr;
- if (!a->non_resident) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
- "is resident.");
- goto unm_err_out;
- }
- /*
- * Ensure the attribute name is placed before the mapping pairs
- * array.
- */
- if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
- le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset)))) {
- ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name "
- "is placed after the mapping pairs "
- "array.");
- goto unm_err_out;
- }
- if (a->flags & ATTR_IS_ENCRYPTED) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
- "is encrypted.");
- goto unm_err_out;
- }
- if (a->flags & ATTR_IS_SPARSE) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
- "is sparse.");
- goto unm_err_out;
- }
- if (a->flags & ATTR_COMPRESSION_MASK) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
- "is compressed.");
- goto unm_err_out;
- }
- if (a->data.non_resident.lowest_vcn) {
- ntfs_error(vi->i_sb, "First extent of "
- "$INDEX_ALLOCATION attribute has non "
- "zero lowest_vcn.");
- goto unm_err_out;
- }
- vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
- ni->initialized_size = sle64_to_cpu(
- a->data.non_resident.initialized_size);
- ni->allocated_size = sle64_to_cpu(
- a->data.non_resident.allocated_size);
- /*
- * We are done with the mft record, so we release it. Otherwise
- * we would deadlock in ntfs_attr_iget().
- */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
- m = NULL;
- ctx = NULL;
- /* Get the index bitmap attribute inode. */
- bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4);
- if (IS_ERR(bvi)) {
- ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
- err = PTR_ERR(bvi);
- goto unm_err_out;
- }
- bni = NTFS_I(bvi);
- if (NInoCompressed(bni) || NInoEncrypted(bni) ||
- NInoSparse(bni)) {
- ntfs_error(vi->i_sb, "$BITMAP attribute is compressed "
- "and/or encrypted and/or sparse.");
- goto iput_unm_err_out;
- }
- /* Consistency check bitmap size vs. index allocation size. */
- bvi_size = i_size_read(bvi);
- if ((bvi_size << 3) < (vi->i_size >>
- ni->itype.index.block_size_bits)) {
- ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) "
- "for index allocation (0x%llx).",
- bvi_size << 3, vi->i_size);
- goto iput_unm_err_out;
- }
- /* No longer need the bitmap attribute inode. */
- iput(bvi);
-skip_large_dir_stuff:
- /* Setup the operations for this inode. */
- vi->i_op = &ntfs_dir_inode_ops;
- vi->i_fop = &ntfs_dir_ops;
- vi->i_mapping->a_ops = &ntfs_mst_aops;
- } else {
- /* It is a file. */
- ntfs_attr_reinit_search_ctx(ctx);
-
- /* Setup the data attribute, even if not present. */
- ni->type = AT_DATA;
- ni->name = NULL;
- ni->name_len = 0;
-
- /* Find first extent of the unnamed data attribute. */
- err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- vi->i_size = ni->initialized_size =
- ni->allocated_size = 0;
- if (err != -ENOENT) {
- ntfs_error(vi->i_sb, "Failed to lookup $DATA "
- "attribute.");
- goto unm_err_out;
- }
- /*
- * FILE_Secure does not have an unnamed $DATA
- * attribute, so we special case it here.
- */
- if (vi->i_ino == FILE_Secure)
- goto no_data_attr_special_case;
- /*
- * Most if not all the system files in the $Extend
- * system directory do not have unnamed data
- * attributes so we need to check if the parent
- * directory of the file is FILE_Extend and if it is
- * ignore this error. To do this we need to get the
- * name of this inode from the mft record as the name
- * contains the back reference to the parent directory.
- */
- if (ntfs_is_extended_system_file(ctx) > 0)
- goto no_data_attr_special_case;
- // FIXME: File is corrupt! Hot-fix with empty data
- // attribute if recovery option is set.
- ntfs_error(vi->i_sb, "$DATA attribute is missing.");
- goto unm_err_out;
- }
- a = ctx->attr;
- /* Setup the state. */
- if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
- if (a->flags & ATTR_COMPRESSION_MASK) {
- NInoSetCompressed(ni);
- if (vol->cluster_size > 4096) {
- ntfs_error(vi->i_sb, "Found "
- "compressed data but "
- "compression is "
- "disabled due to "
- "cluster size (%i) > "
- "4kiB.",
- vol->cluster_size);
- goto unm_err_out;
- }
- if ((a->flags & ATTR_COMPRESSION_MASK)
- != ATTR_IS_COMPRESSED) {
- ntfs_error(vi->i_sb, "Found unknown "
- "compression method "
- "or corrupt file.");
- goto unm_err_out;
- }
- }
- if (a->flags & ATTR_IS_SPARSE)
- NInoSetSparse(ni);
- }
- if (a->flags & ATTR_IS_ENCRYPTED) {
- if (NInoCompressed(ni)) {
- ntfs_error(vi->i_sb, "Found encrypted and "
- "compressed data.");
- goto unm_err_out;
- }
- NInoSetEncrypted(ni);
- }
- if (a->non_resident) {
- NInoSetNonResident(ni);
- if (NInoCompressed(ni) || NInoSparse(ni)) {
- if (NInoCompressed(ni) && a->data.non_resident.
- compression_unit != 4) {
- ntfs_error(vi->i_sb, "Found "
- "non-standard "
- "compression unit (%u "
- "instead of 4). "
- "Cannot handle this.",
- a->data.non_resident.
- compression_unit);
- err = -EOPNOTSUPP;
- goto unm_err_out;
- }
- if (a->data.non_resident.compression_unit) {
- ni->itype.compressed.block_size = 1U <<
- (a->data.non_resident.
- compression_unit +
- vol->cluster_size_bits);
- ni->itype.compressed.block_size_bits =
- ffs(ni->itype.
- compressed.
- block_size) - 1;
- ni->itype.compressed.block_clusters =
- 1U << a->data.
- non_resident.
- compression_unit;
- } else {
- ni->itype.compressed.block_size = 0;
- ni->itype.compressed.block_size_bits =
- 0;
- ni->itype.compressed.block_clusters =
- 0;
- }
- ni->itype.compressed.size = sle64_to_cpu(
- a->data.non_resident.
- compressed_size);
- }
- if (a->data.non_resident.lowest_vcn) {
- ntfs_error(vi->i_sb, "First extent of $DATA "
- "attribute has non zero "
- "lowest_vcn.");
- goto unm_err_out;
- }
- vi->i_size = sle64_to_cpu(
- a->data.non_resident.data_size);
- ni->initialized_size = sle64_to_cpu(
- a->data.non_resident.initialized_size);
- ni->allocated_size = sle64_to_cpu(
- a->data.non_resident.allocated_size);
- } else { /* Resident attribute. */
- vi->i_size = ni->initialized_size = le32_to_cpu(
- a->data.resident.value_length);
- ni->allocated_size = le32_to_cpu(a->length) -
- le16_to_cpu(
- a->data.resident.value_offset);
- if (vi->i_size > ni->allocated_size) {
- ntfs_error(vi->i_sb, "Resident data attribute "
- "is corrupt (size exceeds "
- "allocation).");
- goto unm_err_out;
- }
- }
-no_data_attr_special_case:
- /* We are done with the mft record, so we release it. */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
- m = NULL;
- ctx = NULL;
- /* Setup the operations for this inode. */
- vi->i_op = &ntfs_file_inode_ops;
- vi->i_fop = &ntfs_file_ops;
- vi->i_mapping->a_ops = &ntfs_normal_aops;
- if (NInoMstProtected(ni))
- vi->i_mapping->a_ops = &ntfs_mst_aops;
- else if (NInoCompressed(ni))
- vi->i_mapping->a_ops = &ntfs_compressed_aops;
- }
- /*
- * The number of 512-byte blocks used on disk (for stat). This is in so
- * far inaccurate as it doesn't account for any named streams or other
- * special non-resident attributes, but that is how Windows works, too,
- * so we are at least consistent with Windows, if not entirely
- * consistent with the Linux Way. Doing it the Linux Way would cause a
- * significant slowdown as it would involve iterating over all
- * attributes in the mft record and adding the allocated/compressed
- * sizes of all non-resident attributes present to give us the Linux
- * correct size that should go into i_blocks (after division by 512).
- */
- if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni)))
- vi->i_blocks = ni->itype.compressed.size >> 9;
- else
- vi->i_blocks = ni->allocated_size >> 9;
- ntfs_debug("Done.");
- return 0;
-iput_unm_err_out:
- iput(bvi);
-unm_err_out:
- if (!err)
- err = -EIO;
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(ni);
-err_out:
- ntfs_error(vol->sb, "Failed with error code %i. Marking corrupt "
- "inode 0x%lx as bad. Run chkdsk.", err, vi->i_ino);
- make_bad_inode(vi);
- if (err != -EOPNOTSUPP && err != -ENOMEM)
- NVolSetErrors(vol);
- return err;
-}
-
-/**
- * ntfs_read_locked_attr_inode - read an attribute inode from its base inode
- * @base_vi: base inode
- * @vi: attribute inode to read
- *
- * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the
- * attribute inode described by @vi into memory from the base mft record
- * described by @base_ni.
- *
- * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for
- * reading and looks up the attribute described by @vi before setting up the
- * necessary fields in @vi as well as initializing the ntfs inode.
- *
- * Q: What locks are held when the function is called?
- * A: i_state has I_NEW set, hence the inode is locked, also
- * i_count is set to 1, so it is not going to go away
- *
- * Return 0 on success and -errno on error. In the error case, the inode will
- * have had make_bad_inode() executed on it.
- *
- * Note this cannot be called for AT_INDEX_ALLOCATION.
- */
-static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
-{
- ntfs_volume *vol = NTFS_SB(vi->i_sb);
- ntfs_inode *ni, *base_ni;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- ntfs_attr_search_ctx *ctx;
- int err = 0;
-
- ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
-
- ntfs_init_big_inode(vi);
-
- ni = NTFS_I(vi);
- base_ni = NTFS_I(base_vi);
-
- /* Just mirror the values from the base inode. */
- vi->i_uid = base_vi->i_uid;
- vi->i_gid = base_vi->i_gid;
- set_nlink(vi, base_vi->i_nlink);
- inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
- inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
- inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
- vi->i_generation = ni->seq_no = base_ni->seq_no;
-
- /* Set inode type to zero but preserve permissions. */
- vi->i_mode = base_vi->i_mode & ~S_IFMT;
-
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (!ctx) {
- err = -ENOMEM;
- goto unm_err_out;
- }
- /* Find the attribute. */
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err))
- goto unm_err_out;
- a = ctx->attr;
- if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
- if (a->flags & ATTR_COMPRESSION_MASK) {
- NInoSetCompressed(ni);
- if ((ni->type != AT_DATA) || (ni->type == AT_DATA &&
- ni->name_len)) {
- ntfs_error(vi->i_sb, "Found compressed "
- "non-data or named data "
- "attribute. Please report "
- "you saw this message to "
- "linux-ntfs-dev@lists."
- "sourceforge.net");
- goto unm_err_out;
- }
- if (vol->cluster_size > 4096) {
- ntfs_error(vi->i_sb, "Found compressed "
- "attribute but compression is "
- "disabled due to cluster size "
- "(%i) > 4kiB.",
- vol->cluster_size);
- goto unm_err_out;
- }
- if ((a->flags & ATTR_COMPRESSION_MASK) !=
- ATTR_IS_COMPRESSED) {
- ntfs_error(vi->i_sb, "Found unknown "
- "compression method.");
- goto unm_err_out;
- }
- }
- /*
- * The compressed/sparse flag set in an index root just means
- * to compress all files.
- */
- if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
- ntfs_error(vi->i_sb, "Found mst protected attribute "
- "but the attribute is %s. Please "
- "report you saw this message to "
- "linux-ntfs-dev@lists.sourceforge.net",
- NInoCompressed(ni) ? "compressed" :
- "sparse");
- goto unm_err_out;
- }
- if (a->flags & ATTR_IS_SPARSE)
- NInoSetSparse(ni);
- }
- if (a->flags & ATTR_IS_ENCRYPTED) {
- if (NInoCompressed(ni)) {
- ntfs_error(vi->i_sb, "Found encrypted and compressed "
- "data.");
- goto unm_err_out;
- }
- /*
- * The encryption flag set in an index root just means to
- * encrypt all files.
- */
- if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
- ntfs_error(vi->i_sb, "Found mst protected attribute "
- "but the attribute is encrypted. "
- "Please report you saw this message "
- "to linux-ntfs-dev@lists.sourceforge."
- "net");
- goto unm_err_out;
- }
- if (ni->type != AT_DATA) {
- ntfs_error(vi->i_sb, "Found encrypted non-data "
- "attribute.");
- goto unm_err_out;
- }
- NInoSetEncrypted(ni);
- }
- if (!a->non_resident) {
- /* Ensure the attribute name is placed before the value. */
- if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
- le16_to_cpu(a->data.resident.value_offset)))) {
- ntfs_error(vol->sb, "Attribute name is placed after "
- "the attribute value.");
- goto unm_err_out;
- }
- if (NInoMstProtected(ni)) {
- ntfs_error(vi->i_sb, "Found mst protected attribute "
- "but the attribute is resident. "
- "Please report you saw this message to "
- "linux-ntfs-dev@lists.sourceforge.net");
- goto unm_err_out;
- }
- vi->i_size = ni->initialized_size = le32_to_cpu(
- a->data.resident.value_length);
- ni->allocated_size = le32_to_cpu(a->length) -
- le16_to_cpu(a->data.resident.value_offset);
- if (vi->i_size > ni->allocated_size) {
- ntfs_error(vi->i_sb, "Resident attribute is corrupt "
- "(size exceeds allocation).");
- goto unm_err_out;
- }
- } else {
- NInoSetNonResident(ni);
- /*
- * Ensure the attribute name is placed before the mapping pairs
- * array.
- */
- if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
- le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset)))) {
- ntfs_error(vol->sb, "Attribute name is placed after "
- "the mapping pairs array.");
- goto unm_err_out;
- }
- if (NInoCompressed(ni) || NInoSparse(ni)) {
- if (NInoCompressed(ni) && a->data.non_resident.
- compression_unit != 4) {
- ntfs_error(vi->i_sb, "Found non-standard "
- "compression unit (%u instead "
- "of 4). Cannot handle this.",
- a->data.non_resident.
- compression_unit);
- err = -EOPNOTSUPP;
- goto unm_err_out;
- }
- if (a->data.non_resident.compression_unit) {
- ni->itype.compressed.block_size = 1U <<
- (a->data.non_resident.
- compression_unit +
- vol->cluster_size_bits);
- ni->itype.compressed.block_size_bits =
- ffs(ni->itype.compressed.
- block_size) - 1;
- ni->itype.compressed.block_clusters = 1U <<
- a->data.non_resident.
- compression_unit;
- } else {
- ni->itype.compressed.block_size = 0;
- ni->itype.compressed.block_size_bits = 0;
- ni->itype.compressed.block_clusters = 0;
- }
- ni->itype.compressed.size = sle64_to_cpu(
- a->data.non_resident.compressed_size);
- }
- if (a->data.non_resident.lowest_vcn) {
- ntfs_error(vi->i_sb, "First extent of attribute has "
- "non-zero lowest_vcn.");
- goto unm_err_out;
- }
- vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
- ni->initialized_size = sle64_to_cpu(
- a->data.non_resident.initialized_size);
- ni->allocated_size = sle64_to_cpu(
- a->data.non_resident.allocated_size);
- }
- vi->i_mapping->a_ops = &ntfs_normal_aops;
- if (NInoMstProtected(ni))
- vi->i_mapping->a_ops = &ntfs_mst_aops;
- else if (NInoCompressed(ni))
- vi->i_mapping->a_ops = &ntfs_compressed_aops;
- if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT)
- vi->i_blocks = ni->itype.compressed.size >> 9;
- else
- vi->i_blocks = ni->allocated_size >> 9;
- /*
- * Make sure the base inode does not go away and attach it to the
- * attribute inode.
- */
- igrab(base_vi);
- ni->ext.base_ntfs_ino = base_ni;
- ni->nr_extents = -1;
-
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
-
- ntfs_debug("Done.");
- return 0;
-
-unm_err_out:
- if (!err)
- err = -EIO;
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
-err_out:
- ntfs_error(vol->sb, "Failed with error code %i while reading attribute "
- "inode (mft_no 0x%lx, type 0x%x, name_len %i). "
- "Marking corrupt inode and base inode 0x%lx as bad. "
- "Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len,
- base_vi->i_ino);
- make_bad_inode(vi);
- if (err != -ENOMEM)
- NVolSetErrors(vol);
- return err;
-}
-
-/**
- * ntfs_read_locked_index_inode - read an index inode from its base inode
- * @base_vi: base inode
- * @vi: index inode to read
- *
- * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the
- * index inode described by @vi into memory from the base mft record described
- * by @base_ni.
- *
- * ntfs_read_locked_index_inode() maps, pins and locks the base inode for
- * reading and looks up the attributes relating to the index described by @vi
- * before setting up the necessary fields in @vi as well as initializing the
- * ntfs inode.
- *
- * Note, index inodes are essentially attribute inodes (NInoAttr() is true)
- * with the attribute type set to AT_INDEX_ALLOCATION. Apart from that, they
- * are setup like directory inodes since directories are a special case of
- * indices ao they need to be treated in much the same way. Most importantly,
- * for small indices the index allocation attribute might not actually exist.
- * However, the index root attribute always exists but this does not need to
- * have an inode associated with it and this is why we define a new inode type
- * index. Also, like for directories, we need to have an attribute inode for
- * the bitmap attribute corresponding to the index allocation attribute and we
- * can store this in the appropriate field of the inode, just like we do for
- * normal directory inodes.
- *
- * Q: What locks are held when the function is called?
- * A: i_state has I_NEW set, hence the inode is locked, also
- * i_count is set to 1, so it is not going to go away
- *
- * Return 0 on success and -errno on error. In the error case, the inode will
- * have had make_bad_inode() executed on it.
- */
-static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
-{
- loff_t bvi_size;
- ntfs_volume *vol = NTFS_SB(vi->i_sb);
- ntfs_inode *ni, *base_ni, *bni;
- struct inode *bvi;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- ntfs_attr_search_ctx *ctx;
- INDEX_ROOT *ir;
- u8 *ir_end, *index_end;
- int err = 0;
-
- ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
- ntfs_init_big_inode(vi);
- ni = NTFS_I(vi);
- base_ni = NTFS_I(base_vi);
- /* Just mirror the values from the base inode. */
- vi->i_uid = base_vi->i_uid;
- vi->i_gid = base_vi->i_gid;
- set_nlink(vi, base_vi->i_nlink);
- inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
- inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
- inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
- vi->i_generation = ni->seq_no = base_ni->seq_no;
- /* Set inode type to zero but preserve permissions. */
- vi->i_mode = base_vi->i_mode & ~S_IFMT;
- /* Map the mft record for the base inode. */
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (!ctx) {
- err = -ENOMEM;
- goto unm_err_out;
- }
- /* Find the index root attribute. */
- err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
- "missing.");
- goto unm_err_out;
- }
- a = ctx->attr;
- /* Set up the state. */
- if (unlikely(a->non_resident)) {
- ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident.");
- goto unm_err_out;
- }
- /* Ensure the attribute name is placed before the value. */
- if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
- le16_to_cpu(a->data.resident.value_offset)))) {
- ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed "
- "after the attribute value.");
- goto unm_err_out;
- }
- /*
- * Compressed/encrypted/sparse index root is not allowed, except for
- * directories of course but those are not dealt with here.
- */
- if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED |
- ATTR_IS_SPARSE)) {
- ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index "
- "root attribute.");
- goto unm_err_out;
- }
- ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset));
- ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length);
- if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
- ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt.");
- goto unm_err_out;
- }
- index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
- if (index_end > ir_end) {
- ntfs_error(vi->i_sb, "Index is corrupt.");
- goto unm_err_out;
- }
- if (ir->type) {
- ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).",
- le32_to_cpu(ir->type));
- goto unm_err_out;
- }
- ni->itype.index.collation_rule = ir->collation_rule;
- ntfs_debug("Index collation rule is 0x%x.",
- le32_to_cpu(ir->collation_rule));
- ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
- if (!is_power_of_2(ni->itype.index.block_size)) {
- ntfs_error(vi->i_sb, "Index block size (%u) is not a power of "
- "two.", ni->itype.index.block_size);
- goto unm_err_out;
- }
- if (ni->itype.index.block_size > PAGE_SIZE) {
- ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_SIZE "
- "(%ld) is not supported. Sorry.",
- ni->itype.index.block_size, PAGE_SIZE);
- err = -EOPNOTSUPP;
- goto unm_err_out;
- }
- if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
- ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE "
- "(%i) is not supported. Sorry.",
- ni->itype.index.block_size, NTFS_BLOCK_SIZE);
- err = -EOPNOTSUPP;
- goto unm_err_out;
- }
- ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1;
- /* Determine the size of a vcn in the index. */
- if (vol->cluster_size <= ni->itype.index.block_size) {
- ni->itype.index.vcn_size = vol->cluster_size;
- ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
- } else {
- ni->itype.index.vcn_size = vol->sector_size;
- ni->itype.index.vcn_size_bits = vol->sector_size_bits;
- }
- /* Check for presence of index allocation attribute. */
- if (!(ir->index.flags & LARGE_INDEX)) {
- /* No index allocation. */
- vi->i_size = ni->initialized_size = ni->allocated_size = 0;
- /* We are done with the mft record, so we release it. */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- m = NULL;
- ctx = NULL;
- goto skip_large_index_stuff;
- } /* LARGE_INDEX: Index allocation present. Setup state. */
- NInoSetIndexAllocPresent(ni);
- /* Find index allocation attribute. */
- ntfs_attr_reinit_search_ctx(ctx);
- err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
- "not present but $INDEX_ROOT "
- "indicated it is.");
- else
- ntfs_error(vi->i_sb, "Failed to lookup "
- "$INDEX_ALLOCATION attribute.");
- goto unm_err_out;
- }
- a = ctx->attr;
- if (!a->non_resident) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
- "resident.");
- goto unm_err_out;
- }
- /*
- * Ensure the attribute name is placed before the mapping pairs array.
- */
- if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
- le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset)))) {
- ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is "
- "placed after the mapping pairs array.");
- goto unm_err_out;
- }
- if (a->flags & ATTR_IS_ENCRYPTED) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
- "encrypted.");
- goto unm_err_out;
- }
- if (a->flags & ATTR_IS_SPARSE) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse.");
- goto unm_err_out;
- }
- if (a->flags & ATTR_COMPRESSION_MASK) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
- "compressed.");
- goto unm_err_out;
- }
- if (a->data.non_resident.lowest_vcn) {
- ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION "
- "attribute has non zero lowest_vcn.");
- goto unm_err_out;
- }
- vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
- ni->initialized_size = sle64_to_cpu(
- a->data.non_resident.initialized_size);
- ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size);
- /*
- * We are done with the mft record, so we release it. Otherwise
- * we would deadlock in ntfs_attr_iget().
- */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- m = NULL;
- ctx = NULL;
- /* Get the index bitmap attribute inode. */
- bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len);
- if (IS_ERR(bvi)) {
- ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
- err = PTR_ERR(bvi);
- goto unm_err_out;
- }
- bni = NTFS_I(bvi);
- if (NInoCompressed(bni) || NInoEncrypted(bni) ||
- NInoSparse(bni)) {
- ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or "
- "encrypted and/or sparse.");
- goto iput_unm_err_out;
- }
- /* Consistency check bitmap size vs. index allocation size. */
- bvi_size = i_size_read(bvi);
- if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) {
- ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for "
- "index allocation (0x%llx).", bvi_size << 3,
- vi->i_size);
- goto iput_unm_err_out;
- }
- iput(bvi);
-skip_large_index_stuff:
- /* Setup the operations for this index inode. */
- vi->i_mapping->a_ops = &ntfs_mst_aops;
- vi->i_blocks = ni->allocated_size >> 9;
- /*
- * Make sure the base inode doesn't go away and attach it to the
- * index inode.
- */
- igrab(base_vi);
- ni->ext.base_ntfs_ino = base_ni;
- ni->nr_extents = -1;
-
- ntfs_debug("Done.");
- return 0;
-iput_unm_err_out:
- iput(bvi);
-unm_err_out:
- if (!err)
- err = -EIO;
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
-err_out:
- ntfs_error(vi->i_sb, "Failed with error code %i while reading index "
- "inode (mft_no 0x%lx, name_len %i.", err, vi->i_ino,
- ni->name_len);
- make_bad_inode(vi);
- if (err != -EOPNOTSUPP && err != -ENOMEM)
- NVolSetErrors(vol);
- return err;
-}
-
-/*
- * The MFT inode has special locking, so teach the lock validator
- * about this by splitting off the locking rules of the MFT from
- * the locking rules of other inodes. The MFT inode can never be
- * accessed from the VFS side (or even internally), only by the
- * map_mft functions.
- */
-static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key;
-
-/**
- * ntfs_read_inode_mount - special read_inode for mount time use only
- * @vi: inode to read
- *
- * Read inode FILE_MFT at mount time, only called with super_block lock
- * held from within the read_super() code path.
- *
- * This function exists because when it is called the page cache for $MFT/$DATA
- * is not initialized and hence we cannot get at the contents of mft records
- * by calling map_mft_record*().
- *
- * Further it needs to cope with the circular references problem, i.e. cannot
- * load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded, because
- * we do not know where the other extent mft records are yet and again, because
- * we cannot call map_mft_record*() yet. Obviously this applies only when an
- * attribute list is actually present in $MFT inode.
- *
- * We solve these problems by starting with the $DATA attribute before anything
- * else and iterating using ntfs_attr_lookup($DATA) over all extents. As each
- * extent is found, we ntfs_mapping_pairs_decompress() including the implied
- * ntfs_runlists_merge(). Each step of the iteration necessarily provides
- * sufficient information for the next step to complete.
- *
- * This should work but there are two possible pit falls (see inline comments
- * below), but only time will tell if they are real pits or just smoke...
- */
-int ntfs_read_inode_mount(struct inode *vi)
-{
- VCN next_vcn, last_vcn, highest_vcn;
- s64 block;
- struct super_block *sb = vi->i_sb;
- ntfs_volume *vol = NTFS_SB(sb);
- struct buffer_head *bh;
- ntfs_inode *ni;
- MFT_RECORD *m = NULL;
- ATTR_RECORD *a;
- ntfs_attr_search_ctx *ctx;
- unsigned int i, nr_blocks;
- int err;
-
- ntfs_debug("Entering.");
-
- /* Initialize the ntfs specific part of @vi. */
- ntfs_init_big_inode(vi);
-
- ni = NTFS_I(vi);
-
- /* Setup the data attribute. It is special as it is mst protected. */
- NInoSetNonResident(ni);
- NInoSetMstProtected(ni);
- NInoSetSparseDisabled(ni);
- ni->type = AT_DATA;
- ni->name = NULL;
- ni->name_len = 0;
- /*
- * This sets up our little cheat allowing us to reuse the async read io
- * completion handler for directories.
- */
- ni->itype.index.block_size = vol->mft_record_size;
- ni->itype.index.block_size_bits = vol->mft_record_size_bits;
-
- /* Very important! Needed to be able to call map_mft_record*(). */
- vol->mft_ino = vi;
-
- /* Allocate enough memory to read the first mft record. */
- if (vol->mft_record_size > 64 * 1024) {
- ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).",
- vol->mft_record_size);
- goto err_out;
- }
- i = vol->mft_record_size;
- if (i < sb->s_blocksize)
- i = sb->s_blocksize;
- m = (MFT_RECORD*)ntfs_malloc_nofs(i);
- if (!m) {
- ntfs_error(sb, "Failed to allocate buffer for $MFT record 0.");
- goto err_out;
- }
-
- /* Determine the first block of the $MFT/$DATA attribute. */
- block = vol->mft_lcn << vol->cluster_size_bits >>
- sb->s_blocksize_bits;
- nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits;
- if (!nr_blocks)
- nr_blocks = 1;
-
- /* Load $MFT/$DATA's first mft record. */
- for (i = 0; i < nr_blocks; i++) {
- bh = sb_bread(sb, block++);
- if (!bh) {
- ntfs_error(sb, "Device read failed.");
- goto err_out;
- }
- memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data,
- sb->s_blocksize);
- brelse(bh);
- }
-
- if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) {
- ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.",
- le32_to_cpu(m->bytes_allocated), vol->mft_record_size);
- goto err_out;
- }
-
- /* Apply the mst fixups. */
- if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) {
- /* FIXME: Try to use the $MFTMirr now. */
- ntfs_error(sb, "MST fixup failed. $MFT is corrupt.");
- goto err_out;
- }
-
- /* Sanity check offset to the first attribute */
- if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) {
- ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.",
- le16_to_cpu(m->attrs_offset));
- goto err_out;
- }
-
- /* Need this to sanity check attribute list references to $MFT. */
- vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
-
- /* Provides read_folio() for map_mft_record(). */
- vi->i_mapping->a_ops = &ntfs_mst_aops;
-
- ctx = ntfs_attr_get_search_ctx(ni, m);
- if (!ctx) {
- err = -ENOMEM;
- goto err_out;
- }
-
- /* Find the attribute list attribute if present. */
- err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
- if (err) {
- if (unlikely(err != -ENOENT)) {
- ntfs_error(sb, "Failed to lookup attribute list "
- "attribute. You should run chkdsk.");
- goto put_err_out;
- }
- } else /* if (!err) */ {
- ATTR_LIST_ENTRY *al_entry, *next_al_entry;
- u8 *al_end;
- static const char *es = " Not allowed. $MFT is corrupt. "
- "You should run chkdsk.";
-
- ntfs_debug("Attribute list attribute found in $MFT.");
- NInoSetAttrList(ni);
- a = ctx->attr;
- if (a->flags & ATTR_COMPRESSION_MASK) {
- ntfs_error(sb, "Attribute list attribute is "
- "compressed.%s", es);
- goto put_err_out;
- }
- if (a->flags & ATTR_IS_ENCRYPTED ||
- a->flags & ATTR_IS_SPARSE) {
- if (a->non_resident) {
- ntfs_error(sb, "Non-resident attribute list "
- "attribute is encrypted/"
- "sparse.%s", es);
- goto put_err_out;
- }
- ntfs_warning(sb, "Resident attribute list attribute "
- "in $MFT system file is marked "
- "encrypted/sparse which is not true. "
- "However, Windows allows this and "
- "chkdsk does not detect or correct it "
- "so we will just ignore the invalid "
- "flags and pretend they are not set.");
- }
- /* Now allocate memory for the attribute list. */
- ni->attr_list_size = (u32)ntfs_attr_size(a);
- if (!ni->attr_list_size) {
- ntfs_error(sb, "Attr_list_size is zero");
- goto put_err_out;
- }
- ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
- if (!ni->attr_list) {
- ntfs_error(sb, "Not enough memory to allocate buffer "
- "for attribute list.");
- goto put_err_out;
- }
- if (a->non_resident) {
- NInoSetAttrListNonResident(ni);
- if (a->data.non_resident.lowest_vcn) {
- ntfs_error(sb, "Attribute list has non zero "
- "lowest_vcn. $MFT is corrupt. "
- "You should run chkdsk.");
- goto put_err_out;
- }
- /* Setup the runlist. */
- ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
- a, NULL);
- if (IS_ERR(ni->attr_list_rl.rl)) {
- err = PTR_ERR(ni->attr_list_rl.rl);
- ni->attr_list_rl.rl = NULL;
- ntfs_error(sb, "Mapping pairs decompression "
- "failed with error code %i.",
- -err);
- goto put_err_out;
- }
- /* Now load the attribute list. */
- if ((err = load_attribute_list(vol, &ni->attr_list_rl,
- ni->attr_list, ni->attr_list_size,
- sle64_to_cpu(a->data.
- non_resident.initialized_size)))) {
- ntfs_error(sb, "Failed to load attribute list "
- "attribute with error code %i.",
- -err);
- goto put_err_out;
- }
- } else /* if (!ctx.attr->non_resident) */ {
- if ((u8*)a + le16_to_cpu(
- a->data.resident.value_offset) +
- le32_to_cpu(
- a->data.resident.value_length) >
- (u8*)ctx->mrec + vol->mft_record_size) {
- ntfs_error(sb, "Corrupt attribute list "
- "attribute.");
- goto put_err_out;
- }
- /* Now copy the attribute list. */
- memcpy(ni->attr_list, (u8*)a + le16_to_cpu(
- a->data.resident.value_offset),
- le32_to_cpu(
- a->data.resident.value_length));
- }
- /* The attribute list is now setup in memory. */
- /*
- * FIXME: I don't know if this case is actually possible.
- * According to logic it is not possible but I have seen too
- * many weird things in MS software to rely on logic... Thus we
- * perform a manual search and make sure the first $MFT/$DATA
- * extent is in the base inode. If it is not we abort with an
- * error and if we ever see a report of this error we will need
- * to do some magic in order to have the necessary mft record
- * loaded and in the right place in the page cache. But
- * hopefully logic will prevail and this never happens...
- */
- al_entry = (ATTR_LIST_ENTRY*)ni->attr_list;
- al_end = (u8*)al_entry + ni->attr_list_size;
- for (;; al_entry = next_al_entry) {
- /* Out of bounds check. */
- if ((u8*)al_entry < ni->attr_list ||
- (u8*)al_entry > al_end)
- goto em_put_err_out;
- /* Catch the end of the attribute list. */
- if ((u8*)al_entry == al_end)
- goto em_put_err_out;
- if (!al_entry->length)
- goto em_put_err_out;
- if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
- le16_to_cpu(al_entry->length) > al_end)
- goto em_put_err_out;
- next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
- le16_to_cpu(al_entry->length));
- if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA))
- goto em_put_err_out;
- if (AT_DATA != al_entry->type)
- continue;
- /* We want an unnamed attribute. */
- if (al_entry->name_length)
- goto em_put_err_out;
- /* Want the first entry, i.e. lowest_vcn == 0. */
- if (al_entry->lowest_vcn)
- goto em_put_err_out;
- /* First entry has to be in the base mft record. */
- if (MREF_LE(al_entry->mft_reference) != vi->i_ino) {
- /* MFT references do not match, logic fails. */
- ntfs_error(sb, "BUG: The first $DATA extent "
- "of $MFT is not in the base "
- "mft record. Please report "
- "you saw this message to "
- "linux-ntfs-dev@lists."
- "sourceforge.net");
- goto put_err_out;
- } else {
- /* Sequence numbers must match. */
- if (MSEQNO_LE(al_entry->mft_reference) !=
- ni->seq_no)
- goto em_put_err_out;
- /* Got it. All is ok. We can stop now. */
- break;
- }
- }
- }
-
- ntfs_attr_reinit_search_ctx(ctx);
-
- /* Now load all attribute extents. */
- a = NULL;
- next_vcn = last_vcn = highest_vcn = 0;
- while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0,
- ctx))) {
- runlist_element *nrl;
-
- /* Cache the current attribute. */
- a = ctx->attr;
- /* $MFT must be non-resident. */
- if (!a->non_resident) {
- ntfs_error(sb, "$MFT must be non-resident but a "
- "resident extent was found. $MFT is "
- "corrupt. Run chkdsk.");
- goto put_err_out;
- }
- /* $MFT must be uncompressed and unencrypted. */
- if (a->flags & ATTR_COMPRESSION_MASK ||
- a->flags & ATTR_IS_ENCRYPTED ||
- a->flags & ATTR_IS_SPARSE) {
- ntfs_error(sb, "$MFT must be uncompressed, "
- "non-sparse, and unencrypted but a "
- "compressed/sparse/encrypted extent "
- "was found. $MFT is corrupt. Run "
- "chkdsk.");
- goto put_err_out;
- }
- /*
- * Decompress the mapping pairs array of this extent and merge
- * the result into the existing runlist. No need for locking
- * as we have exclusive access to the inode at this time and we
- * are a mount in progress task, too.
- */
- nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
- if (IS_ERR(nrl)) {
- ntfs_error(sb, "ntfs_mapping_pairs_decompress() "
- "failed with error code %ld. $MFT is "
- "corrupt.", PTR_ERR(nrl));
- goto put_err_out;
- }
- ni->runlist.rl = nrl;
-
- /* Are we in the first extent? */
- if (!next_vcn) {
- if (a->data.non_resident.lowest_vcn) {
- ntfs_error(sb, "First extent of $DATA "
- "attribute has non zero "
- "lowest_vcn. $MFT is corrupt. "
- "You should run chkdsk.");
- goto put_err_out;
- }
- /* Get the last vcn in the $DATA attribute. */
- last_vcn = sle64_to_cpu(
- a->data.non_resident.allocated_size)
- >> vol->cluster_size_bits;
- /* Fill in the inode size. */
- vi->i_size = sle64_to_cpu(
- a->data.non_resident.data_size);
- ni->initialized_size = sle64_to_cpu(
- a->data.non_resident.initialized_size);
- ni->allocated_size = sle64_to_cpu(
- a->data.non_resident.allocated_size);
- /*
- * Verify the number of mft records does not exceed
- * 2^32 - 1.
- */
- if ((vi->i_size >> vol->mft_record_size_bits) >=
- (1ULL << 32)) {
- ntfs_error(sb, "$MFT is too big! Aborting.");
- goto put_err_out;
- }
- /*
- * We have got the first extent of the runlist for
- * $MFT which means it is now relatively safe to call
- * the normal ntfs_read_inode() function.
- * Complete reading the inode, this will actually
- * re-read the mft record for $MFT, this time entering
- * it into the page cache with which we complete the
- * kick start of the volume. It should be safe to do
- * this now as the first extent of $MFT/$DATA is
- * already known and we would hope that we don't need
- * further extents in order to find the other
- * attributes belonging to $MFT. Only time will tell if
- * this is really the case. If not we will have to play
- * magic at this point, possibly duplicating a lot of
- * ntfs_read_inode() at this point. We will need to
- * ensure we do enough of its work to be able to call
- * ntfs_read_inode() on extents of $MFT/$DATA. But lets
- * hope this never happens...
- */
- ntfs_read_locked_inode(vi);
- if (is_bad_inode(vi)) {
- ntfs_error(sb, "ntfs_read_inode() of $MFT "
- "failed. BUG or corrupt $MFT. "
- "Run chkdsk and if no errors "
- "are found, please report you "
- "saw this message to "
- "linux-ntfs-dev@lists."
- "sourceforge.net");
- ntfs_attr_put_search_ctx(ctx);
- /* Revert to the safe super operations. */
- ntfs_free(m);
- return -1;
- }
- /*
- * Re-initialize some specifics about $MFT's inode as
- * ntfs_read_inode() will have set up the default ones.
- */
- /* Set uid and gid to root. */
- vi->i_uid = GLOBAL_ROOT_UID;
- vi->i_gid = GLOBAL_ROOT_GID;
- /* Regular file. No access for anyone. */
- vi->i_mode = S_IFREG;
- /* No VFS initiated operations allowed for $MFT. */
- vi->i_op = &ntfs_empty_inode_ops;
- vi->i_fop = &ntfs_empty_file_ops;
- }
-
- /* Get the lowest vcn for the next extent. */
- highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
- next_vcn = highest_vcn + 1;
-
- /* Only one extent or error, which we catch below. */
- if (next_vcn <= 0)
- break;
-
- /* Avoid endless loops due to corruption. */
- if (next_vcn < sle64_to_cpu(
- a->data.non_resident.lowest_vcn)) {
- ntfs_error(sb, "$MFT has corrupt attribute list "
- "attribute. Run chkdsk.");
- goto put_err_out;
- }
- }
- if (err != -ENOENT) {
- ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. "
- "$MFT is corrupt. Run chkdsk.");
- goto put_err_out;
- }
- if (!a) {
- ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is "
- "corrupt. Run chkdsk.");
- goto put_err_out;
- }
- if (highest_vcn && highest_vcn != last_vcn - 1) {
- ntfs_error(sb, "Failed to load the complete runlist for "
- "$MFT/$DATA. Driver bug or corrupt $MFT. "
- "Run chkdsk.");
- ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx",
- (unsigned long long)highest_vcn,
- (unsigned long long)last_vcn - 1);
- goto put_err_out;
- }
- ntfs_attr_put_search_ctx(ctx);
- ntfs_debug("Done.");
- ntfs_free(m);
-
- /*
- * Split the locking rules of the MFT inode from the
- * locking rules of other inodes:
- */
- lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key);
- lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key);
-
- return 0;
-
-em_put_err_out:
- ntfs_error(sb, "Couldn't find first extent of $DATA attribute in "
- "attribute list. $MFT is corrupt. Run chkdsk.");
-put_err_out:
- ntfs_attr_put_search_ctx(ctx);
-err_out:
- ntfs_error(sb, "Failed. Marking inode as bad.");
- make_bad_inode(vi);
- ntfs_free(m);
- return -1;
-}
-
-static void __ntfs_clear_inode(ntfs_inode *ni)
-{
- /* Free all alocated memory. */
- down_write(&ni->runlist.lock);
- if (ni->runlist.rl) {
- ntfs_free(ni->runlist.rl);
- ni->runlist.rl = NULL;
- }
- up_write(&ni->runlist.lock);
-
- if (ni->attr_list) {
- ntfs_free(ni->attr_list);
- ni->attr_list = NULL;
- }
-
- down_write(&ni->attr_list_rl.lock);
- if (ni->attr_list_rl.rl) {
- ntfs_free(ni->attr_list_rl.rl);
- ni->attr_list_rl.rl = NULL;
- }
- up_write(&ni->attr_list_rl.lock);
-
- if (ni->name_len && ni->name != I30) {
- /* Catch bugs... */
- BUG_ON(!ni->name);
- kfree(ni->name);
- }
-}
-
-void ntfs_clear_extent_inode(ntfs_inode *ni)
-{
- ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
-
- BUG_ON(NInoAttr(ni));
- BUG_ON(ni->nr_extents != -1);
-
-#ifdef NTFS_RW
- if (NInoDirty(ni)) {
- if (!is_bad_inode(VFS_I(ni->ext.base_ntfs_ino)))
- ntfs_error(ni->vol->sb, "Clearing dirty extent inode! "
- "Losing data! This is a BUG!!!");
- // FIXME: Do something!!!
- }
-#endif /* NTFS_RW */
-
- __ntfs_clear_inode(ni);
-
- /* Bye, bye... */
- ntfs_destroy_extent_inode(ni);
-}
-
-/**
- * ntfs_evict_big_inode - clean up the ntfs specific part of an inode
- * @vi: vfs inode pending annihilation
- *
- * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode()
- * is called, which deallocates all memory belonging to the NTFS specific part
- * of the inode and returns.
- *
- * If the MFT record is dirty, we commit it before doing anything else.
- */
-void ntfs_evict_big_inode(struct inode *vi)
-{
- ntfs_inode *ni = NTFS_I(vi);
-
- truncate_inode_pages_final(&vi->i_data);
- clear_inode(vi);
-
-#ifdef NTFS_RW
- if (NInoDirty(ni)) {
- bool was_bad = (is_bad_inode(vi));
-
- /* Committing the inode also commits all extent inodes. */
- ntfs_commit_inode(vi);
-
- if (!was_bad && (is_bad_inode(vi) || NInoDirty(ni))) {
- ntfs_error(vi->i_sb, "Failed to commit dirty inode "
- "0x%lx. Losing data!", vi->i_ino);
- // FIXME: Do something!!!
- }
- }
-#endif /* NTFS_RW */
-
- /* No need to lock at this stage as no one else has a reference. */
- if (ni->nr_extents > 0) {
- int i;
-
- for (i = 0; i < ni->nr_extents; i++)
- ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[i]);
- kfree(ni->ext.extent_ntfs_inos);
- }
-
- __ntfs_clear_inode(ni);
-
- if (NInoAttr(ni)) {
- /* Release the base inode if we are holding it. */
- if (ni->nr_extents == -1) {
- iput(VFS_I(ni->ext.base_ntfs_ino));
- ni->nr_extents = 0;
- ni->ext.base_ntfs_ino = NULL;
- }
- }
- BUG_ON(ni->page);
- if (!atomic_dec_and_test(&ni->count))
- BUG();
- return;
-}
-
-/**
- * ntfs_show_options - show mount options in /proc/mounts
- * @sf: seq_file in which to write our mount options
- * @root: root of the mounted tree whose mount options to display
- *
- * Called by the VFS once for each mounted ntfs volume when someone reads
- * /proc/mounts in order to display the NTFS specific mount options of each
- * mount. The mount options of fs specified by @root are written to the seq file
- * @sf and success is returned.
- */
-int ntfs_show_options(struct seq_file *sf, struct dentry *root)
-{
- ntfs_volume *vol = NTFS_SB(root->d_sb);
- int i;
-
- seq_printf(sf, ",uid=%i", from_kuid_munged(&init_user_ns, vol->uid));
- seq_printf(sf, ",gid=%i", from_kgid_munged(&init_user_ns, vol->gid));
- if (vol->fmask == vol->dmask)
- seq_printf(sf, ",umask=0%o", vol->fmask);
- else {
- seq_printf(sf, ",fmask=0%o", vol->fmask);
- seq_printf(sf, ",dmask=0%o", vol->dmask);
- }
- seq_printf(sf, ",nls=%s", vol->nls_map->charset);
- if (NVolCaseSensitive(vol))
- seq_printf(sf, ",case_sensitive");
- if (NVolShowSystemFiles(vol))
- seq_printf(sf, ",show_sys_files");
- if (!NVolSparseEnabled(vol))
- seq_printf(sf, ",disable_sparse");
- for (i = 0; on_errors_arr[i].val; i++) {
- if (on_errors_arr[i].val & vol->on_errors)
- seq_printf(sf, ",errors=%s", on_errors_arr[i].str);
- }
- seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier);
- return 0;
-}
-
-#ifdef NTFS_RW
-
-static const char *es = " Leaving inconsistent metadata. Unmount and run "
- "chkdsk.";
-
-/**
- * ntfs_truncate - called when the i_size of an ntfs inode is changed
- * @vi: inode for which the i_size was changed
- *
- * We only support i_size changes for normal files at present, i.e. not
- * compressed and not encrypted. This is enforced in ntfs_setattr(), see
- * below.
- *
- * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and
- * that the change is allowed.
- *
- * This implies for us that @vi is a file inode rather than a directory, index,
- * or attribute inode as well as that @vi is a base inode.
- *
- * Returns 0 on success or -errno on error.
- *
- * Called with ->i_mutex held.
- */
-int ntfs_truncate(struct inode *vi)
-{
- s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size;
- VCN highest_vcn;
- unsigned long flags;
- ntfs_inode *base_ni, *ni = NTFS_I(vi);
- ntfs_volume *vol = ni->vol;
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- const char *te = " Leaving file length out of sync with i_size.";
- int err, mp_size, size_change, alloc_change;
-
- ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
- BUG_ON(NInoAttr(ni));
- BUG_ON(S_ISDIR(vi->i_mode));
- BUG_ON(NInoMstProtected(ni));
- BUG_ON(ni->nr_extents < 0);
-retry_truncate:
- /*
- * Lock the runlist for writing and map the mft record to ensure it is
- * safe to mess with the attribute runlist and sizes.
- */
- down_write(&ni->runlist.lock);
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx "
- "(error code %d).%s", vi->i_ino, err, te);
- ctx = NULL;
- m = NULL;
- goto old_bad_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- ntfs_error(vi->i_sb, "Failed to allocate a search context for "
- "inode 0x%lx (not enough memory).%s",
- vi->i_ino, te);
- err = -ENOMEM;
- goto old_bad_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT) {
- ntfs_error(vi->i_sb, "Open attribute is missing from "
- "mft record. Inode 0x%lx is corrupt. "
- "Run chkdsk.%s", vi->i_ino, te);
- err = -EIO;
- } else
- ntfs_error(vi->i_sb, "Failed to lookup attribute in "
- "inode 0x%lx (error code %d).%s",
- vi->i_ino, err, te);
- goto old_bad_out;
- }
- m = ctx->mrec;
- a = ctx->attr;
- /*
- * The i_size of the vfs inode is the new size for the attribute value.
- */
- new_size = i_size_read(vi);
- /* The current size of the attribute value is the old size. */
- old_size = ntfs_attr_size(a);
- /* Calculate the new allocated size. */
- if (NInoNonResident(ni))
- new_alloc_size = (new_size + vol->cluster_size - 1) &
- ~(s64)vol->cluster_size_mask;
- else
- new_alloc_size = (new_size + 7) & ~7;
- /* The current allocated size is the old allocated size. */
- read_lock_irqsave(&ni->size_lock, flags);
- old_alloc_size = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- /*
- * The change in the file size. This will be 0 if no change, >0 if the
- * size is growing, and <0 if the size is shrinking.
- */
- size_change = -1;
- if (new_size - old_size >= 0) {
- size_change = 1;
- if (new_size == old_size)
- size_change = 0;
- }
- /* As above for the allocated size. */
- alloc_change = -1;
- if (new_alloc_size - old_alloc_size >= 0) {
- alloc_change = 1;
- if (new_alloc_size == old_alloc_size)
- alloc_change = 0;
- }
- /*
- * If neither the size nor the allocation are being changed there is
- * nothing to do.
- */
- if (!size_change && !alloc_change)
- goto unm_done;
- /* If the size is changing, check if new size is allowed in $AttrDef. */
- if (size_change) {
- err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
- if (unlikely(err)) {
- if (err == -ERANGE) {
- ntfs_error(vol->sb, "Truncate would cause the "
- "inode 0x%lx to %simum size "
- "for its attribute type "
- "(0x%x). Aborting truncate.",
- vi->i_ino,
- new_size > old_size ? "exceed "
- "the max" : "go under the min",
- le32_to_cpu(ni->type));
- err = -EFBIG;
- } else {
- ntfs_error(vol->sb, "Inode 0x%lx has unknown "
- "attribute type 0x%x. "
- "Aborting truncate.",
- vi->i_ino,
- le32_to_cpu(ni->type));
- err = -EIO;
- }
- /* Reset the vfs inode size to the old size. */
- i_size_write(vi, old_size);
- goto err_out;
- }
- }
- if (NInoCompressed(ni) || NInoEncrypted(ni)) {
- ntfs_warning(vi->i_sb, "Changes in inode size are not "
- "supported yet for %s files, ignoring.",
- NInoCompressed(ni) ? "compressed" :
- "encrypted");
- err = -EOPNOTSUPP;
- goto bad_out;
- }
- if (a->non_resident)
- goto do_non_resident_truncate;
- BUG_ON(NInoNonResident(ni));
- /* Resize the attribute record to best fit the new attribute size. */
- if (new_size < vol->mft_record_size &&
- !ntfs_resident_attr_value_resize(m, a, new_size)) {
- /* The resize succeeded! */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- write_lock_irqsave(&ni->size_lock, flags);
- /* Update the sizes in the ntfs inode and all is done. */
- ni->allocated_size = le32_to_cpu(a->length) -
- le16_to_cpu(a->data.resident.value_offset);
- /*
- * Note ntfs_resident_attr_value_resize() has already done any
- * necessary data clearing in the attribute record. When the
- * file is being shrunk vmtruncate() will already have cleared
- * the top part of the last partial page, i.e. since this is
- * the resident case this is the page with index 0. However,
- * when the file is being expanded, the page cache page data
- * between the old data_size, i.e. old_size, and the new_size
- * has not been zeroed. Fortunately, we do not need to zero it
- * either since on one hand it will either already be zero due
- * to both read_folio and writepage clearing partial page data
- * beyond i_size in which case there is nothing to do or in the
- * case of the file being mmap()ped at the same time, POSIX
- * specifies that the behaviour is unspecified thus we do not
- * have to do anything. This means that in our implementation
- * in the rare case that the file is mmap()ped and a write
- * occurred into the mmap()ped region just beyond the file size
- * and writepage has not yet been called to write out the page
- * (which would clear the area beyond the file size) and we now
- * extend the file size to incorporate this dirty region
- * outside the file size, a write of the page would result in
- * this data being written to disk instead of being cleared.
- * Given both POSIX and the Linux mmap(2) man page specify that
- * this corner case is undefined, we choose to leave it like
- * that as this is much simpler for us as we cannot lock the
- * relevant page now since we are holding too many ntfs locks
- * which would result in a lock reversal deadlock.
- */
- ni->initialized_size = new_size;
- write_unlock_irqrestore(&ni->size_lock, flags);
- goto unm_done;
- }
- /* If the above resize failed, this must be an attribute extension. */
- BUG_ON(size_change < 0);
- /*
- * We have to drop all the locks so we can call
- * ntfs_attr_make_non_resident(). This could be optimised by try-
- * locking the first page cache page and only if that fails dropping
- * the locks, locking the page, and redoing all the locking and
- * lookups. While this would be a huge optimisation, it is not worth
- * it as this is definitely a slow code path as it only ever can happen
- * once for any given file.
- */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
- /*
- * Not enough space in the mft record, try to make the attribute
- * non-resident and if successful restart the truncation process.
- */
- err = ntfs_attr_make_non_resident(ni, old_size);
- if (likely(!err))
- goto retry_truncate;
- /*
- * Could not make non-resident. If this is due to this not being
- * permitted for this attribute type or there not being enough space,
- * try to make other attributes non-resident. Otherwise fail.
- */
- if (unlikely(err != -EPERM && err != -ENOSPC)) {
- ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute "
- "type 0x%x, because the conversion from "
- "resident to non-resident attribute failed "
- "with error code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- if (err != -ENOMEM)
- err = -EIO;
- goto conv_err_out;
- }
- /* TODO: Not implemented from here, abort. */
- if (err == -ENOSPC)
- ntfs_error(vol->sb, "Not enough space in the mft record/on "
- "disk for the non-resident attribute value. "
- "This case is not implemented yet.");
- else /* if (err == -EPERM) */
- ntfs_error(vol->sb, "This attribute type may not be "
- "non-resident. This case is not implemented "
- "yet.");
- err = -EOPNOTSUPP;
- goto conv_err_out;
-#if 0
- // TODO: Attempt to make other attributes non-resident.
- if (!err)
- goto do_resident_extend;
- /*
- * Both the attribute list attribute and the standard information
- * attribute must remain in the base inode. Thus, if this is one of
- * these attributes, we have to try to move other attributes out into
- * extent mft records instead.
- */
- if (ni->type == AT_ATTRIBUTE_LIST ||
- ni->type == AT_STANDARD_INFORMATION) {
- // TODO: Attempt to move other attributes into extent mft
- // records.
- err = -EOPNOTSUPP;
- if (!err)
- goto do_resident_extend;
- goto err_out;
- }
- // TODO: Attempt to move this attribute to an extent mft record, but
- // only if it is not already the only attribute in an mft record in
- // which case there would be nothing to gain.
- err = -EOPNOTSUPP;
- if (!err)
- goto do_resident_extend;
- /* There is nothing we can do to make enough space. )-: */
- goto err_out;
-#endif
-do_non_resident_truncate:
- BUG_ON(!NInoNonResident(ni));
- if (alloc_change < 0) {
- highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
- if (highest_vcn > 0 &&
- old_alloc_size >> vol->cluster_size_bits >
- highest_vcn + 1) {
- /*
- * This attribute has multiple extents. Not yet
- * supported.
- */
- ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, "
- "attribute type 0x%x, because the "
- "attribute is highly fragmented (it "
- "consists of multiple extents) and "
- "this case is not implemented yet.",
- vi->i_ino,
- (unsigned)le32_to_cpu(ni->type));
- err = -EOPNOTSUPP;
- goto bad_out;
- }
- }
- /*
- * If the size is shrinking, need to reduce the initialized_size and
- * the data_size before reducing the allocation.
- */
- if (size_change < 0) {
- /*
- * Make the valid size smaller (i_size is already up-to-date).
- */
- write_lock_irqsave(&ni->size_lock, flags);
- if (new_size < ni->initialized_size) {
- ni->initialized_size = new_size;
- a->data.non_resident.initialized_size =
- cpu_to_sle64(new_size);
- }
- a->data.non_resident.data_size = cpu_to_sle64(new_size);
- write_unlock_irqrestore(&ni->size_lock, flags);
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- /* If the allocated size is not changing, we are done. */
- if (!alloc_change)
- goto unm_done;
- /*
- * If the size is shrinking it makes no sense for the
- * allocation to be growing.
- */
- BUG_ON(alloc_change > 0);
- } else /* if (size_change >= 0) */ {
- /*
- * The file size is growing or staying the same but the
- * allocation can be shrinking, growing or staying the same.
- */
- if (alloc_change > 0) {
- /*
- * We need to extend the allocation and possibly update
- * the data size. If we are updating the data size,
- * since we are not touching the initialized_size we do
- * not need to worry about the actual data on disk.
- * And as far as the page cache is concerned, there
- * will be no pages beyond the old data size and any
- * partial region in the last page between the old and
- * new data size (or the end of the page if the new
- * data size is outside the page) does not need to be
- * modified as explained above for the resident
- * attribute truncate case. To do this, we simply drop
- * the locks we hold and leave all the work to our
- * friendly helper ntfs_attr_extend_allocation().
- */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
- err = ntfs_attr_extend_allocation(ni, new_size,
- size_change > 0 ? new_size : -1, -1);
- /*
- * ntfs_attr_extend_allocation() will have done error
- * output already.
- */
- goto done;
- }
- if (!alloc_change)
- goto alloc_done;
- }
- /* alloc_change < 0 */
- /* Free the clusters. */
- nr_freed = ntfs_cluster_free(ni, new_alloc_size >>
- vol->cluster_size_bits, -1, ctx);
- m = ctx->mrec;
- a = ctx->attr;
- if (unlikely(nr_freed < 0)) {
- ntfs_error(vol->sb, "Failed to release cluster(s) (error code "
- "%lli). Unmount and run chkdsk to recover "
- "the lost cluster(s).", (long long)nr_freed);
- NVolSetErrors(vol);
- nr_freed = 0;
- }
- /* Truncate the runlist. */
- err = ntfs_rl_truncate_nolock(vol, &ni->runlist,
- new_alloc_size >> vol->cluster_size_bits);
- /*
- * If the runlist truncation failed and/or the search context is no
- * longer valid, we cannot resize the attribute record or build the
- * mapping pairs array thus we mark the inode bad so that no access to
- * the freed clusters can happen.
- */
- if (unlikely(err || IS_ERR(m))) {
- ntfs_error(vol->sb, "Failed to %s (error code %li).%s",
- IS_ERR(m) ?
- "restore attribute search context" :
- "truncate attribute runlist",
- IS_ERR(m) ? PTR_ERR(m) : err, es);
- err = -EIO;
- goto bad_out;
- }
- /* Get the size for the shrunk mapping pairs array for the runlist. */
- mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1);
- if (unlikely(mp_size <= 0)) {
- ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
- "attribute type 0x%x, because determining the "
- "size for the mapping pairs failed with error "
- "code %i.%s", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), mp_size, es);
- err = -EIO;
- goto bad_out;
- }
- /*
- * Shrink the attribute record for the new mapping pairs array. Note,
- * this cannot fail since we are making the attribute smaller thus by
- * definition there is enough space to do so.
- */
- err = ntfs_attr_record_resize(m, a, mp_size +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
- BUG_ON(err);
- /*
- * Generate the mapping pairs array directly into the attribute record.
- */
- err = ntfs_mapping_pairs_build(vol, (u8*)a +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
- mp_size, ni->runlist.rl, 0, -1, NULL);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
- "attribute type 0x%x, because building the "
- "mapping pairs failed with error code %i.%s",
- vi->i_ino, (unsigned)le32_to_cpu(ni->type),
- err, es);
- err = -EIO;
- goto bad_out;
- }
- /* Update the allocated/compressed size as well as the highest vcn. */
- a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
- vol->cluster_size_bits) - 1);
- write_lock_irqsave(&ni->size_lock, flags);
- ni->allocated_size = new_alloc_size;
- a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
- if (NInoSparse(ni) || NInoCompressed(ni)) {
- if (nr_freed) {
- ni->itype.compressed.size -= nr_freed <<
- vol->cluster_size_bits;
- BUG_ON(ni->itype.compressed.size < 0);
- a->data.non_resident.compressed_size = cpu_to_sle64(
- ni->itype.compressed.size);
- vi->i_blocks = ni->itype.compressed.size >> 9;
- }
- } else
- vi->i_blocks = new_alloc_size >> 9;
- write_unlock_irqrestore(&ni->size_lock, flags);
- /*
- * We have shrunk the allocation. If this is a shrinking truncate we
- * have already dealt with the initialized_size and the data_size above
- * and we are done. If the truncate is only changing the allocation
- * and not the data_size, we are also done. If this is an extending
- * truncate, need to extend the data_size now which is ensured by the
- * fact that @size_change is positive.
- */
-alloc_done:
- /*
- * If the size is growing, need to update it now. If it is shrinking,
- * we have already updated it above (before the allocation change).
- */
- if (size_change > 0)
- a->data.non_resident.data_size = cpu_to_sle64(new_size);
- /* Ensure the modified mft record is written out. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
-unm_done:
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
-done:
- /* Update the mtime and ctime on the base inode. */
- /* normally ->truncate shouldn't update ctime or mtime,
- * but ntfs did before so it got a copy & paste version
- * of file_update_time. one day someone should fix this
- * for real.
- */
- if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
- struct timespec64 now = current_time(VFS_I(base_ni));
- struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni));
- struct timespec64 mtime = inode_get_mtime(VFS_I(base_ni));
- int sync_it = 0;
-
- if (!timespec64_equal(&mtime, &now) ||
- !timespec64_equal(&ctime, &now))
- sync_it = 1;
- inode_set_ctime_to_ts(VFS_I(base_ni), now);
- inode_set_mtime_to_ts(VFS_I(base_ni), now);
-
- if (sync_it)
- mark_inode_dirty_sync(VFS_I(base_ni));
- }
-
- if (likely(!err)) {
- NInoClearTruncateFailed(ni);
- ntfs_debug("Done.");
- }
- return err;
-old_bad_out:
- old_size = -1;
-bad_out:
- if (err != -ENOMEM && err != -EOPNOTSUPP)
- NVolSetErrors(vol);
- if (err != -EOPNOTSUPP)
- NInoSetTruncateFailed(ni);
- else if (old_size >= 0)
- i_size_write(vi, old_size);
-err_out:
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
-out:
- ntfs_debug("Failed. Returning error code %i.", err);
- return err;
-conv_err_out:
- if (err != -ENOMEM && err != -EOPNOTSUPP)
- NVolSetErrors(vol);
- if (err != -EOPNOTSUPP)
- NInoSetTruncateFailed(ni);
- else
- i_size_write(vi, old_size);
- goto out;
-}
-
-/**
- * ntfs_truncate_vfs - wrapper for ntfs_truncate() that has no return value
- * @vi: inode for which the i_size was changed
- *
- * Wrapper for ntfs_truncate() that has no return value.
- *
- * See ntfs_truncate() description above for details.
- */
-#ifdef NTFS_RW
-void ntfs_truncate_vfs(struct inode *vi) {
- ntfs_truncate(vi);
-}
-#endif
-
-/**
- * ntfs_setattr - called from notify_change() when an attribute is being changed
- * @idmap: idmap of the mount the inode was found from
- * @dentry: dentry whose attributes to change
- * @attr: structure describing the attributes and the changes
- *
- * We have to trap VFS attempts to truncate the file described by @dentry as
- * soon as possible, because we do not implement changes in i_size yet. So we
- * abort all i_size changes here.
- *
- * We also abort all changes of user, group, and mode as we do not implement
- * the NTFS ACLs yet.
- *
- * Called with ->i_mutex held.
- */
-int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
- struct iattr *attr)
-{
- struct inode *vi = d_inode(dentry);
- int err;
- unsigned int ia_valid = attr->ia_valid;
-
- err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
- if (err)
- goto out;
- /* We do not support NTFS ACLs yet. */
- if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) {
- ntfs_warning(vi->i_sb, "Changes in user/group/mode are not "
- "supported yet, ignoring.");
- err = -EOPNOTSUPP;
- goto out;
- }
- if (ia_valid & ATTR_SIZE) {
- if (attr->ia_size != i_size_read(vi)) {
- ntfs_inode *ni = NTFS_I(vi);
- /*
- * FIXME: For now we do not support resizing of
- * compressed or encrypted files yet.
- */
- if (NInoCompressed(ni) || NInoEncrypted(ni)) {
- ntfs_warning(vi->i_sb, "Changes in inode size "
- "are not supported yet for "
- "%s files, ignoring.",
- NInoCompressed(ni) ?
- "compressed" : "encrypted");
- err = -EOPNOTSUPP;
- } else {
- truncate_setsize(vi, attr->ia_size);
- ntfs_truncate_vfs(vi);
- }
- if (err || ia_valid == ATTR_SIZE)
- goto out;
- } else {
- /*
- * We skipped the truncate but must still update
- * timestamps.
- */
- ia_valid |= ATTR_MTIME | ATTR_CTIME;
- }
- }
- if (ia_valid & ATTR_ATIME)
- inode_set_atime_to_ts(vi, attr->ia_atime);
- if (ia_valid & ATTR_MTIME)
- inode_set_mtime_to_ts(vi, attr->ia_mtime);
- if (ia_valid & ATTR_CTIME)
- inode_set_ctime_to_ts(vi, attr->ia_ctime);
- mark_inode_dirty(vi);
-out:
- return err;
-}
-
-/**
- * __ntfs_write_inode - write out a dirty inode
- * @vi: inode to write out
- * @sync: if true, write out synchronously
- *
- * Write out a dirty inode to disk including any extent inodes if present.
- *
- * If @sync is true, commit the inode to disk and wait for io completion. This
- * is done using write_mft_record().
- *
- * If @sync is false, just schedule the write to happen but do not wait for i/o
- * completion. In 2.6 kernels, scheduling usually happens just by virtue of
- * marking the page (and in this case mft record) dirty but we do not implement
- * this yet as write_mft_record() largely ignores the @sync parameter and
- * always performs synchronous writes.
- *
- * Return 0 on success and -errno on error.
- */
-int __ntfs_write_inode(struct inode *vi, int sync)
-{
- sle64 nt;
- ntfs_inode *ni = NTFS_I(vi);
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *m;
- STANDARD_INFORMATION *si;
- int err = 0;
- bool modified = false;
-
- ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "",
- vi->i_ino);
- /*
- * Dirty attribute inodes are written via their real inodes so just
- * clean them here. Access time updates are taken care off when the
- * real inode is written.
- */
- if (NInoAttr(ni)) {
- NInoClearDirty(ni);
- ntfs_debug("Done.");
- return 0;
- }
- /* Map, pin, and lock the mft record belonging to the inode. */
- m = map_mft_record(ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- goto err_out;
- }
- /* Update the access times in the standard information attribute. */
- ctx = ntfs_attr_get_search_ctx(ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto unm_err_out;
- }
- err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- ntfs_attr_put_search_ctx(ctx);
- goto unm_err_out;
- }
- si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset));
- /* Update the access times if they have changed. */
- nt = utc2ntfs(inode_get_mtime(vi));
- if (si->last_data_change_time != nt) {
- ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
- "new = 0x%llx", vi->i_ino, (long long)
- sle64_to_cpu(si->last_data_change_time),
- (long long)sle64_to_cpu(nt));
- si->last_data_change_time = nt;
- modified = true;
- }
- nt = utc2ntfs(inode_get_ctime(vi));
- if (si->last_mft_change_time != nt) {
- ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, "
- "new = 0x%llx", vi->i_ino, (long long)
- sle64_to_cpu(si->last_mft_change_time),
- (long long)sle64_to_cpu(nt));
- si->last_mft_change_time = nt;
- modified = true;
- }
- nt = utc2ntfs(inode_get_atime(vi));
- if (si->last_access_time != nt) {
- ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
- "new = 0x%llx", vi->i_ino,
- (long long)sle64_to_cpu(si->last_access_time),
- (long long)sle64_to_cpu(nt));
- si->last_access_time = nt;
- modified = true;
- }
- /*
- * If we just modified the standard information attribute we need to
- * mark the mft record it is in dirty. We do this manually so that
- * mark_inode_dirty() is not called which would redirty the inode and
- * hence result in an infinite loop of trying to write the inode.
- * There is no need to mark the base inode nor the base mft record
- * dirty, since we are going to write this mft record below in any case
- * and the base mft record may actually not have been modified so it
- * might not need to be written out.
- * NOTE: It is not a problem when the inode for $MFT itself is being
- * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES
- * on the $MFT inode and hence __ntfs_write_inode() will not be
- * re-invoked because of it which in turn is ok since the dirtied mft
- * record will be cleaned and written out to disk below, i.e. before
- * this function returns.
- */
- if (modified) {
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- if (!NInoTestSetDirty(ctx->ntfs_ino))
- mark_ntfs_record_dirty(ctx->ntfs_ino->page,
- ctx->ntfs_ino->page_ofs);
- }
- ntfs_attr_put_search_ctx(ctx);
- /* Now the access times are updated, write the base mft record. */
- if (NInoDirty(ni))
- err = write_mft_record(ni, m, sync);
- /* Write all attached extent mft records. */
- mutex_lock(&ni->extent_lock);
- if (ni->nr_extents > 0) {
- ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos;
- int i;
-
- ntfs_debug("Writing %i extent inodes.", ni->nr_extents);
- for (i = 0; i < ni->nr_extents; i++) {
- ntfs_inode *tni = extent_nis[i];
-
- if (NInoDirty(tni)) {
- MFT_RECORD *tm = map_mft_record(tni);
- int ret;
-
- if (IS_ERR(tm)) {
- if (!err || err == -ENOMEM)
- err = PTR_ERR(tm);
- continue;
- }
- ret = write_mft_record(tni, tm, sync);
- unmap_mft_record(tni);
- if (unlikely(ret)) {
- if (!err || err == -ENOMEM)
- err = ret;
- }
- }
- }
- }
- mutex_unlock(&ni->extent_lock);
- unmap_mft_record(ni);
- if (unlikely(err))
- goto err_out;
- ntfs_debug("Done.");
- return 0;
-unm_err_out:
- unmap_mft_record(ni);
-err_out:
- if (err == -ENOMEM) {
- ntfs_warning(vi->i_sb, "Not enough memory to write inode. "
- "Marking the inode dirty again, so the VFS "
- "retries later.");
- mark_inode_dirty(vi);
- } else {
- ntfs_error(vi->i_sb, "Failed (error %i): Run chkdsk.", -err);
- NVolSetErrors(ni->vol);
- }
- return err;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
deleted file mode 100644
index 147ef4ddb691..000000000000
--- a/fs/ntfs/inode.h
+++ /dev/null
@@ -1,310 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of
- * the Linux-NTFS project.
- *
- * Copyright (c) 2001-2007 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_INODE_H
-#define _LINUX_NTFS_INODE_H
-
-#include <linux/atomic.h>
-
-#include <linux/fs.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/mutex.h>
-#include <linux/seq_file.h>
-
-#include "layout.h"
-#include "volume.h"
-#include "types.h"
-#include "runlist.h"
-#include "debug.h"
-
-typedef struct _ntfs_inode ntfs_inode;
-
-/*
- * The NTFS in-memory inode structure. It is just used as an extension to the
- * fields already provided in the VFS inode.
- */
-struct _ntfs_inode {
- rwlock_t size_lock; /* Lock serializing access to inode sizes. */
- s64 initialized_size; /* Copy from the attribute record. */
- s64 allocated_size; /* Copy from the attribute record. */
- unsigned long state; /* NTFS specific flags describing this inode.
- See ntfs_inode_state_bits below. */
- unsigned long mft_no; /* Number of the mft record / inode. */
- u16 seq_no; /* Sequence number of the mft record. */
- atomic_t count; /* Inode reference count for book keeping. */
- ntfs_volume *vol; /* Pointer to the ntfs volume of this inode. */
- /*
- * If NInoAttr() is true, the below fields describe the attribute which
- * this fake inode belongs to. The actual inode of this attribute is
- * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see
- * below). For real inodes, we also set the type (AT_DATA for files and
- * AT_INDEX_ALLOCATION for directories), with the name = NULL and
- * name_len = 0 for files and name = I30 (global constant) and
- * name_len = 4 for directories.
- */
- ATTR_TYPE type; /* Attribute type of this fake inode. */
- ntfschar *name; /* Attribute name of this fake inode. */
- u32 name_len; /* Attribute name length of this fake inode. */
- runlist runlist; /* If state has the NI_NonResident bit set,
- the runlist of the unnamed data attribute
- (if a file) or of the index allocation
- attribute (directory) or of the attribute
- described by the fake inode (if NInoAttr()).
- If runlist.rl is NULL, the runlist has not
- been read in yet or has been unmapped. If
- NI_NonResident is clear, the attribute is
- resident (file and fake inode) or there is
- no $I30 index allocation attribute
- (small directory). In the latter case
- runlist.rl is always NULL.*/
- /*
- * The following fields are only valid for real inodes and extent
- * inodes.
- */
- struct mutex mrec_lock; /* Lock for serializing access to the
- mft record belonging to this inode. */
- struct page *page; /* The page containing the mft record of the
- inode. This should only be touched by the
- (un)map_mft_record*() functions. */
- int page_ofs; /* Offset into the page at which the mft record
- begins. This should only be touched by the
- (un)map_mft_record*() functions. */
- /*
- * Attribute list support (only for use by the attribute lookup
- * functions). Setup during read_inode for all inodes with attribute
- * lists. Only valid if NI_AttrList is set in state, and attr_list_rl is
- * further only valid if NI_AttrListNonResident is set.
- */
- u32 attr_list_size; /* Length of attribute list value in bytes. */
- u8 *attr_list; /* Attribute list value itself. */
- runlist attr_list_rl; /* Run list for the attribute list value. */
- union {
- struct { /* It is a directory, $MFT, or an index inode. */
- u32 block_size; /* Size of an index block. */
- u32 vcn_size; /* Size of a vcn in this
- index. */
- COLLATION_RULE collation_rule; /* The collation rule
- for the index. */
- u8 block_size_bits; /* Log2 of the above. */
- u8 vcn_size_bits; /* Log2 of the above. */
- } index;
- struct { /* It is a compressed/sparse file/attribute inode. */
- s64 size; /* Copy of compressed_size from
- $DATA. */
- u32 block_size; /* Size of a compression block
- (cb). */
- u8 block_size_bits; /* Log2 of the size of a cb. */
- u8 block_clusters; /* Number of clusters per cb. */
- } compressed;
- } itype;
- struct mutex extent_lock; /* Lock for accessing/modifying the
- below . */
- s32 nr_extents; /* For a base mft record, the number of attached extent
- inodes (0 if none), for extent records and for fake
- inodes describing an attribute this is -1. */
- union { /* This union is only used if nr_extents != 0. */
- ntfs_inode **extent_ntfs_inos; /* For nr_extents > 0, array of
- the ntfs inodes of the extent
- mft records belonging to
- this base inode which have
- been loaded. */
- ntfs_inode *base_ntfs_ino; /* For nr_extents == -1, the
- ntfs inode of the base mft
- record. For fake inodes, the
- real (base) inode to which
- the attribute belongs. */
- } ext;
-};
-
-/*
- * Defined bits for the state field in the ntfs_inode structure.
- * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only
- */
-typedef enum {
- NI_Dirty, /* 1: Mft record needs to be written to disk. */
- NI_AttrList, /* 1: Mft record contains an attribute list. */
- NI_AttrListNonResident, /* 1: Attribute list is non-resident. Implies
- NI_AttrList is set. */
-
- NI_Attr, /* 1: Fake inode for attribute i/o.
- 0: Real inode or extent inode. */
-
- NI_MstProtected, /* 1: Attribute is protected by MST fixups.
- 0: Attribute is not protected by fixups. */
- NI_NonResident, /* 1: Unnamed data attr is non-resident (f).
- 1: Attribute is non-resident (a). */
- NI_IndexAllocPresent = NI_NonResident, /* 1: $I30 index alloc attr is
- present (d). */
- NI_Compressed, /* 1: Unnamed data attr is compressed (f).
- 1: Create compressed files by default (d).
- 1: Attribute is compressed (a). */
- NI_Encrypted, /* 1: Unnamed data attr is encrypted (f).
- 1: Create encrypted files by default (d).
- 1: Attribute is encrypted (a). */
- NI_Sparse, /* 1: Unnamed data attr is sparse (f).
- 1: Create sparse files by default (d).
- 1: Attribute is sparse (a). */
- NI_SparseDisabled, /* 1: May not create sparse regions. */
- NI_TruncateFailed, /* 1: Last ntfs_truncate() call failed. */
-} ntfs_inode_state_bits;
-
-/*
- * NOTE: We should be adding dirty mft records to a list somewhere and they
- * should be independent of the (ntfs/vfs) inode structure so that an inode can
- * be removed but the record can be left dirty for syncing later.
- */
-
-/*
- * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo()
- * functions.
- */
-#define NINO_FNS(flag) \
-static inline int NIno##flag(ntfs_inode *ni) \
-{ \
- return test_bit(NI_##flag, &(ni)->state); \
-} \
-static inline void NInoSet##flag(ntfs_inode *ni) \
-{ \
- set_bit(NI_##flag, &(ni)->state); \
-} \
-static inline void NInoClear##flag(ntfs_inode *ni) \
-{ \
- clear_bit(NI_##flag, &(ni)->state); \
-}
-
-/*
- * As above for NInoTestSetFoo() and NInoTestClearFoo().
- */
-#define TAS_NINO_FNS(flag) \
-static inline int NInoTestSet##flag(ntfs_inode *ni) \
-{ \
- return test_and_set_bit(NI_##flag, &(ni)->state); \
-} \
-static inline int NInoTestClear##flag(ntfs_inode *ni) \
-{ \
- return test_and_clear_bit(NI_##flag, &(ni)->state); \
-}
-
-/* Emit the ntfs inode bitops functions. */
-NINO_FNS(Dirty)
-TAS_NINO_FNS(Dirty)
-NINO_FNS(AttrList)
-NINO_FNS(AttrListNonResident)
-NINO_FNS(Attr)
-NINO_FNS(MstProtected)
-NINO_FNS(NonResident)
-NINO_FNS(IndexAllocPresent)
-NINO_FNS(Compressed)
-NINO_FNS(Encrypted)
-NINO_FNS(Sparse)
-NINO_FNS(SparseDisabled)
-NINO_FNS(TruncateFailed)
-
-/*
- * The full structure containing a ntfs_inode and a vfs struct inode. Used for
- * all real and fake inodes but not for extent inodes which lack the vfs struct
- * inode.
- */
-typedef struct {
- ntfs_inode ntfs_inode;
- struct inode vfs_inode; /* The vfs inode structure. */
-} big_ntfs_inode;
-
-/**
- * NTFS_I - return the ntfs inode given a vfs inode
- * @inode: VFS inode
- *
- * NTFS_I() returns the ntfs inode associated with the VFS @inode.
- */
-static inline ntfs_inode *NTFS_I(struct inode *inode)
-{
- return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode);
-}
-
-static inline struct inode *VFS_I(ntfs_inode *ni)
-{
- return &((big_ntfs_inode *)ni)->vfs_inode;
-}
-
-/**
- * ntfs_attr - ntfs in memory attribute structure
- * @mft_no: mft record number of the base mft record of this attribute
- * @name: Unicode name of the attribute (NULL if unnamed)
- * @name_len: length of @name in Unicode characters (0 if unnamed)
- * @type: attribute type (see layout.h)
- *
- * This structure exists only to provide a small structure for the
- * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism.
- *
- * NOTE: Elements are ordered by size to make the structure as compact as
- * possible on all architectures.
- */
-typedef struct {
- unsigned long mft_no;
- ntfschar *name;
- u32 name_len;
- ATTR_TYPE type;
-} ntfs_attr;
-
-extern int ntfs_test_inode(struct inode *vi, void *data);
-
-extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no);
-extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
- ntfschar *name, u32 name_len);
-extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
- u32 name_len);
-
-extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
-extern void ntfs_free_big_inode(struct inode *inode);
-extern void ntfs_evict_big_inode(struct inode *vi);
-
-extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
-
-static inline void ntfs_init_big_inode(struct inode *vi)
-{
- ntfs_inode *ni = NTFS_I(vi);
-
- ntfs_debug("Entering.");
- __ntfs_init_inode(vi->i_sb, ni);
- ni->mft_no = vi->i_ino;
-}
-
-extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
- unsigned long mft_no);
-extern void ntfs_clear_extent_inode(ntfs_inode *ni);
-
-extern int ntfs_read_inode_mount(struct inode *vi);
-
-extern int ntfs_show_options(struct seq_file *sf, struct dentry *root);
-
-#ifdef NTFS_RW
-
-extern int ntfs_truncate(struct inode *vi);
-extern void ntfs_truncate_vfs(struct inode *vi);
-
-extern int ntfs_setattr(struct mnt_idmap *idmap,
- struct dentry *dentry, struct iattr *attr);
-
-extern int __ntfs_write_inode(struct inode *vi, int sync);
-
-static inline void ntfs_commit_inode(struct inode *vi)
-{
- if (!is_bad_inode(vi))
- __ntfs_write_inode(vi, 1);
- return;
-}
-
-#else
-
-static inline void ntfs_truncate_vfs(struct inode *vi) {}
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_INODE_H */
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
deleted file mode 100644
index 5d4bf7a3259f..000000000000
--- a/fs/ntfs/layout.h
+++ /dev/null
@@ -1,2421 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS
- * project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_LAYOUT_H
-#define _LINUX_NTFS_LAYOUT_H
-
-#include <linux/types.h>
-#include <linux/bitops.h>
-#include <linux/list.h>
-#include <asm/byteorder.h>
-
-#include "types.h"
-
-/* The NTFS oem_id "NTFS " */
-#define magicNTFS cpu_to_le64(0x202020205346544eULL)
-
-/*
- * Location of bootsector on partition:
- * The standard NTFS_BOOT_SECTOR is on sector 0 of the partition.
- * On NT4 and above there is one backup copy of the boot sector to
- * be found on the last sector of the partition (not normally accessible
- * from within Windows as the bootsector contained number of sectors
- * value is one less than the actual value!).
- * On versions of NT 3.51 and earlier, the backup copy was located at
- * number of sectors/2 (integer divide), i.e. in the middle of the volume.
- */
-
-/*
- * BIOS parameter block (bpb) structure.
- */
-typedef struct {
- le16 bytes_per_sector; /* Size of a sector in bytes. */
- u8 sectors_per_cluster; /* Size of a cluster in sectors. */
- le16 reserved_sectors; /* zero */
- u8 fats; /* zero */
- le16 root_entries; /* zero */
- le16 sectors; /* zero */
- u8 media_type; /* 0xf8 = hard disk */
- le16 sectors_per_fat; /* zero */
- le16 sectors_per_track; /* irrelevant */
- le16 heads; /* irrelevant */
- le32 hidden_sectors; /* zero */
- le32 large_sectors; /* zero */
-} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK;
-
-/*
- * NTFS boot sector structure.
- */
-typedef struct {
- u8 jump[3]; /* Irrelevant (jump to boot up code).*/
- le64 oem_id; /* Magic "NTFS ". */
- BIOS_PARAMETER_BLOCK bpb; /* See BIOS_PARAMETER_BLOCK. */
- u8 unused[4]; /* zero, NTFS diskedit.exe states that
- this is actually:
- __u8 physical_drive; // 0x80
- __u8 current_head; // zero
- __u8 extended_boot_signature;
- // 0x80
- __u8 unused; // zero
- */
-/*0x28*/sle64 number_of_sectors; /* Number of sectors in volume. Gives
- maximum volume size of 2^63 sectors.
- Assuming standard sector size of 512
- bytes, the maximum byte size is
- approx. 4.7x10^21 bytes. (-; */
- sle64 mft_lcn; /* Cluster location of mft data. */
- sle64 mftmirr_lcn; /* Cluster location of copy of mft. */
- s8 clusters_per_mft_record; /* Mft record size in clusters. */
- u8 reserved0[3]; /* zero */
- s8 clusters_per_index_record; /* Index block size in clusters. */
- u8 reserved1[3]; /* zero */
- le64 volume_serial_number; /* Irrelevant (serial number). */
- le32 checksum; /* Boot sector checksum. */
-/*0x54*/u8 bootstrap[426]; /* Irrelevant (boot up code). */
- le16 end_of_sector_marker; /* End of bootsector magic. Always is
- 0xaa55 in little endian. */
-/* sizeof() = 512 (0x200) bytes */
-} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR;
-
-/*
- * Magic identifiers present at the beginning of all ntfs record containing
- * records (like mft records for example).
- */
-enum {
- /* Found in $MFT/$DATA. */
- magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */
- magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */
- magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
-
- /* Found in $LogFile/$DATA. */
- magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */
- magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */
-
- /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */
- magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
-
- /* Found in all ntfs record containing records. */
- magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector
- transfer was detected. */
- /*
- * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
- * thus not initialized. Page must be initialized before using it.
- */
- magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */
-};
-
-typedef le32 NTFS_RECORD_TYPE;
-
-/*
- * Generic magic comparison macros. Finally found a use for the ## preprocessor
- * operator! (-8
- */
-
-static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r)
-{
- return (x == r);
-}
-#define ntfs_is_magic(x, m) __ntfs_is_magic(x, magic_##m)
-
-static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r)
-{
- return (*p == r);
-}
-#define ntfs_is_magicp(p, m) __ntfs_is_magicp(p, magic_##m)
-
-/*
- * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above.
- */
-#define ntfs_is_file_record(x) ( ntfs_is_magic (x, FILE) )
-#define ntfs_is_file_recordp(p) ( ntfs_is_magicp(p, FILE) )
-#define ntfs_is_mft_record(x) ( ntfs_is_file_record (x) )
-#define ntfs_is_mft_recordp(p) ( ntfs_is_file_recordp(p) )
-#define ntfs_is_indx_record(x) ( ntfs_is_magic (x, INDX) )
-#define ntfs_is_indx_recordp(p) ( ntfs_is_magicp(p, INDX) )
-#define ntfs_is_hole_record(x) ( ntfs_is_magic (x, HOLE) )
-#define ntfs_is_hole_recordp(p) ( ntfs_is_magicp(p, HOLE) )
-
-#define ntfs_is_rstr_record(x) ( ntfs_is_magic (x, RSTR) )
-#define ntfs_is_rstr_recordp(p) ( ntfs_is_magicp(p, RSTR) )
-#define ntfs_is_rcrd_record(x) ( ntfs_is_magic (x, RCRD) )
-#define ntfs_is_rcrd_recordp(p) ( ntfs_is_magicp(p, RCRD) )
-
-#define ntfs_is_chkd_record(x) ( ntfs_is_magic (x, CHKD) )
-#define ntfs_is_chkd_recordp(p) ( ntfs_is_magicp(p, CHKD) )
-
-#define ntfs_is_baad_record(x) ( ntfs_is_magic (x, BAAD) )
-#define ntfs_is_baad_recordp(p) ( ntfs_is_magicp(p, BAAD) )
-
-#define ntfs_is_empty_record(x) ( ntfs_is_magic (x, empty) )
-#define ntfs_is_empty_recordp(p) ( ntfs_is_magicp(p, empty) )
-
-/*
- * The Update Sequence Array (usa) is an array of the le16 values which belong
- * to the end of each sector protected by the update sequence record in which
- * this array is contained. Note that the first entry is the Update Sequence
- * Number (usn), a cyclic counter of how many times the protected record has
- * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All
- * last le16's of each sector have to be equal to the usn (during reading) or
- * are set to it (during writing). If they are not, an incomplete multi sector
- * transfer has occurred when the data was written.
- * The maximum size for the update sequence array is fixed to:
- * maximum size = usa_ofs + (usa_count * 2) = 510 bytes
- * The 510 bytes comes from the fact that the last le16 in the array has to
- * (obviously) finish before the last le16 of the first 512-byte sector.
- * This formula can be used as a consistency check in that usa_ofs +
- * (usa_count * 2) has to be less than or equal to 510.
- */
-typedef struct {
- NTFS_RECORD_TYPE magic; /* A four-byte magic identifying the record
- type and/or status. */
- le16 usa_ofs; /* Offset to the Update Sequence Array (usa)
- from the start of the ntfs record. */
- le16 usa_count; /* Number of le16 sized entries in the usa
- including the Update Sequence Number (usn),
- thus the number of fixups is the usa_count
- minus 1. */
-} __attribute__ ((__packed__)) NTFS_RECORD;
-
-/*
- * System files mft record numbers. All these files are always marked as used
- * in the bitmap attribute of the mft; presumably in order to avoid accidental
- * allocation for random other mft records. Also, the sequence number for each
- * of the system files is always equal to their mft record number and it is
- * never modified.
- */
-typedef enum {
- FILE_MFT = 0, /* Master file table (mft). Data attribute
- contains the entries and bitmap attribute
- records which ones are in use (bit==1). */
- FILE_MFTMirr = 1, /* Mft mirror: copy of first four mft records
- in data attribute. If cluster size > 4kiB,
- copy of first N mft records, with
- N = cluster_size / mft_record_size. */
- FILE_LogFile = 2, /* Journalling log in data attribute. */
- FILE_Volume = 3, /* Volume name attribute and volume information
- attribute (flags and ntfs version). Windows
- refers to this file as volume DASD (Direct
- Access Storage Device). */
- FILE_AttrDef = 4, /* Array of attribute definitions in data
- attribute. */
- FILE_root = 5, /* Root directory. */
- FILE_Bitmap = 6, /* Allocation bitmap of all clusters (lcns) in
- data attribute. */
- FILE_Boot = 7, /* Boot sector (always at cluster 0) in data
- attribute. */
- FILE_BadClus = 8, /* Contains all bad clusters in the non-resident
- data attribute. */
- FILE_Secure = 9, /* Shared security descriptors in data attribute
- and two indexes into the descriptors.
- Appeared in Windows 2000. Before that, this
- file was named $Quota but was unused. */
- FILE_UpCase = 10, /* Uppercase equivalents of all 65536 Unicode
- characters in data attribute. */
- FILE_Extend = 11, /* Directory containing other system files (eg.
- $ObjId, $Quota, $Reparse and $UsnJrnl). This
- is new to NTFS3.0. */
- FILE_reserved12 = 12, /* Reserved for future use (records 12-15). */
- FILE_reserved13 = 13,
- FILE_reserved14 = 14,
- FILE_reserved15 = 15,
- FILE_first_user = 16, /* First user file, used as test limit for
- whether to allow opening a file or not. */
-} NTFS_SYSTEM_FILES;
-
-/*
- * These are the so far known MFT_RECORD_* flags (16-bit) which contain
- * information about the mft record in which they are present.
- */
-enum {
- MFT_RECORD_IN_USE = cpu_to_le16(0x0001),
- MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002),
-} __attribute__ ((__packed__));
-
-typedef le16 MFT_RECORD_FLAGS;
-
-/*
- * mft references (aka file references or file record segment references) are
- * used whenever a structure needs to refer to a record in the mft.
- *
- * A reference consists of a 48-bit index into the mft and a 16-bit sequence
- * number used to detect stale references.
- *
- * For error reporting purposes we treat the 48-bit index as a signed quantity.
- *
- * The sequence number is a circular counter (skipping 0) describing how many
- * times the referenced mft record has been (re)used. This has to match the
- * sequence number of the mft record being referenced, otherwise the reference
- * is considered stale and removed (FIXME: only ntfsck or the driver itself?).
- *
- * If the sequence number is zero it is assumed that no sequence number
- * consistency checking should be performed.
- *
- * FIXME: Since inodes are 32-bit as of now, the driver needs to always check
- * for high_part being 0 and if not either BUG(), cause a panic() or handle
- * the situation in some other way. This shouldn't be a problem as a volume has
- * to become HUGE in order to need more than 32-bits worth of mft records.
- * Assuming the standard mft record size of 1kb only the records (never mind
- * the non-resident attributes, etc.) would require 4Tb of space on their own
- * for the first 32 bits worth of records. This is only if some strange person
- * doesn't decide to foul play and make the mft sparse which would be a really
- * horrible thing to do as it would trash our current driver implementation. )-:
- * Do I hear screams "we want 64-bit inodes!" ?!? (-;
- *
- * FIXME: The mft zone is defined as the first 12% of the volume. This space is
- * reserved so that the mft can grow contiguously and hence doesn't become
- * fragmented. Volume free space includes the empty part of the mft zone and
- * when the volume's free 88% are used up, the mft zone is shrunk by a factor
- * of 2, thus making more space available for more files/data. This process is
- * repeated every time there is no more free space except for the mft zone until
- * there really is no more free space.
- */
-
-/*
- * Typedef the MFT_REF as a 64-bit value for easier handling.
- * Also define two unpacking macros to get to the reference (MREF) and
- * sequence number (MSEQNO) respectively.
- * The _LE versions are to be applied on little endian MFT_REFs.
- * Note: The _LE versions will return a CPU endian formatted value!
- */
-#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
-#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU)
-
-typedef u64 MFT_REF;
-typedef le64 leMFT_REF;
-
-#define MK_MREF(m, s) ((MFT_REF)(((MFT_REF)(s) << 48) | \
- ((MFT_REF)(m) & MFT_REF_MASK_CPU)))
-#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s))
-
-#define MREF(x) ((unsigned long)((x) & MFT_REF_MASK_CPU))
-#define MSEQNO(x) ((u16)(((x) >> 48) & 0xffff))
-#define MREF_LE(x) ((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU))
-#define MSEQNO_LE(x) ((u16)((le64_to_cpu(x) >> 48) & 0xffff))
-
-#define IS_ERR_MREF(x) (((x) & 0x0000800000000000ULL) ? true : false)
-#define ERR_MREF(x) ((u64)((s64)(x)))
-#define MREF_ERR(x) ((int)((s64)(x)))
-
-/*
- * The mft record header present at the beginning of every record in the mft.
- * This is followed by a sequence of variable length attribute records which
- * is terminated by an attribute of type AT_END which is a truncated attribute
- * in that it only consists of the attribute type code AT_END and none of the
- * other members of the attribute structure are present.
- */
-typedef struct {
-/*Ofs*/
-/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
- NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */
- le16 usa_ofs; /* See NTFS_RECORD definition above. */
- le16 usa_count; /* See NTFS_RECORD definition above. */
-
-/* 8*/ le64 lsn; /* $LogFile sequence number for this record.
- Changed every time the record is modified. */
-/* 16*/ le16 sequence_number; /* Number of times this mft record has been
- reused. (See description for MFT_REF
- above.) NOTE: The increment (skipping zero)
- is done when the file is deleted. NOTE: If
- this is zero it is left zero. */
-/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of
- directory entries referencing this record.
- NOTE: Only used in mft base records.
- NOTE: When deleting a directory entry we
- check the link_count and if it is 1 we
- delete the file. Otherwise we delete the
- FILE_NAME_ATTR being referenced by the
- directory entry from the mft record and
- decrement the link_count.
- FIXME: Careful with Win32 + DOS names! */
-/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this
- mft record from the start of the mft record.
- NOTE: Must be aligned to 8-byte boundary. */
-/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file
- is deleted, the MFT_RECORD_IN_USE flag is
- set to zero. */
-/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record.
- NOTE: Must be aligned to 8-byte boundary. */
-/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft
- record. This should be equal to the mft
- record size. */
-/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records.
- When it is not zero it is a mft reference
- pointing to the base mft record to which
- this record belongs (this is then used to
- locate the attribute list attribute present
- in the base record which describes this
- extension record and hence might need
- modification when the extension record
- itself is modified, also locating the
- attribute list also means finding the other
- potential extents, belonging to the non-base
- mft record). */
-/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to
- the next attribute added to this mft record.
- NOTE: Incremented each time after it is used.
- NOTE: Every time the mft record is reused
- this number is set to zero. NOTE: The first
- instance number is always 0. */
-/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */
-/* 42*/ le16 reserved; /* Reserved/alignment. */
-/* 44*/ le32 mft_record_number; /* Number of this mft record. */
-/* sizeof() = 48 bytes */
-/*
- * When (re)using the mft record, we place the update sequence array at this
- * offset, i.e. before we start with the attributes. This also makes sense,
- * otherwise we could run into problems with the update sequence array
- * containing in itself the last two bytes of a sector which would mean that
- * multi sector transfer protection wouldn't work. As you can't protect data
- * by overwriting it since you then can't get it back...
- * When reading we obviously use the data from the ntfs record header.
- */
-} __attribute__ ((__packed__)) MFT_RECORD;
-
-/* This is the version without the NTFS 3.1+ specific fields. */
-typedef struct {
-/*Ofs*/
-/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
- NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */
- le16 usa_ofs; /* See NTFS_RECORD definition above. */
- le16 usa_count; /* See NTFS_RECORD definition above. */
-
-/* 8*/ le64 lsn; /* $LogFile sequence number for this record.
- Changed every time the record is modified. */
-/* 16*/ le16 sequence_number; /* Number of times this mft record has been
- reused. (See description for MFT_REF
- above.) NOTE: The increment (skipping zero)
- is done when the file is deleted. NOTE: If
- this is zero it is left zero. */
-/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of
- directory entries referencing this record.
- NOTE: Only used in mft base records.
- NOTE: When deleting a directory entry we
- check the link_count and if it is 1 we
- delete the file. Otherwise we delete the
- FILE_NAME_ATTR being referenced by the
- directory entry from the mft record and
- decrement the link_count.
- FIXME: Careful with Win32 + DOS names! */
-/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this
- mft record from the start of the mft record.
- NOTE: Must be aligned to 8-byte boundary. */
-/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file
- is deleted, the MFT_RECORD_IN_USE flag is
- set to zero. */
-/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record.
- NOTE: Must be aligned to 8-byte boundary. */
-/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft
- record. This should be equal to the mft
- record size. */
-/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records.
- When it is not zero it is a mft reference
- pointing to the base mft record to which
- this record belongs (this is then used to
- locate the attribute list attribute present
- in the base record which describes this
- extension record and hence might need
- modification when the extension record
- itself is modified, also locating the
- attribute list also means finding the other
- potential extents, belonging to the non-base
- mft record). */
-/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to
- the next attribute added to this mft record.
- NOTE: Incremented each time after it is used.
- NOTE: Every time the mft record is reused
- this number is set to zero. NOTE: The first
- instance number is always 0. */
-/* sizeof() = 42 bytes */
-/*
- * When (re)using the mft record, we place the update sequence array at this
- * offset, i.e. before we start with the attributes. This also makes sense,
- * otherwise we could run into problems with the update sequence array
- * containing in itself the last two bytes of a sector which would mean that
- * multi sector transfer protection wouldn't work. As you can't protect data
- * by overwriting it since you then can't get it back...
- * When reading we obviously use the data from the ntfs record header.
- */
-} __attribute__ ((__packed__)) MFT_RECORD_OLD;
-
-/*
- * System defined attributes (32-bit). Each attribute type has a corresponding
- * attribute name (Unicode string of maximum 64 character length) as described
- * by the attribute definitions present in the data attribute of the $AttrDef
- * system file. On NTFS 3.0 volumes the names are just as the types are named
- * in the below defines exchanging AT_ for the dollar sign ($). If that is not
- * a revealing choice of symbol I do not know what is... (-;
- */
-enum {
- AT_UNUSED = cpu_to_le32( 0),
- AT_STANDARD_INFORMATION = cpu_to_le32( 0x10),
- AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20),
- AT_FILE_NAME = cpu_to_le32( 0x30),
- AT_OBJECT_ID = cpu_to_le32( 0x40),
- AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50),
- AT_VOLUME_NAME = cpu_to_le32( 0x60),
- AT_VOLUME_INFORMATION = cpu_to_le32( 0x70),
- AT_DATA = cpu_to_le32( 0x80),
- AT_INDEX_ROOT = cpu_to_le32( 0x90),
- AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0),
- AT_BITMAP = cpu_to_le32( 0xb0),
- AT_REPARSE_POINT = cpu_to_le32( 0xc0),
- AT_EA_INFORMATION = cpu_to_le32( 0xd0),
- AT_EA = cpu_to_le32( 0xe0),
- AT_PROPERTY_SET = cpu_to_le32( 0xf0),
- AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100),
- AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000),
- AT_END = cpu_to_le32(0xffffffff)
-};
-
-typedef le32 ATTR_TYPE;
-
-/*
- * The collation rules for sorting views/indexes/etc (32-bit).
- *
- * COLLATION_BINARY - Collate by binary compare where the first byte is most
- * significant.
- * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary
- * Unicode values, except that when a character can be uppercased, the
- * upper case value collates before the lower case one.
- * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation
- * is done very much like COLLATION_UNICODE_STRING. In fact I have no idea
- * what the difference is. Perhaps the difference is that file names
- * would treat some special characters in an odd way (see
- * unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[]
- * for what I mean but COLLATION_UNICODE_STRING would not give any special
- * treatment to any characters at all, but this is speculation.
- * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key
- * values. E.g. used for $SII index in FILE_Secure, which sorts by
- * security_id (le32).
- * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values.
- * E.g. used for $O index in FILE_Extend/$Quota.
- * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash
- * values and second by ascending security_id values. E.g. used for $SDH
- * index in FILE_Secure.
- * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending
- * le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which
- * sorts by object_id (16-byte), by splitting up the object_id in four
- * le32 values and using them as individual keys. E.g. take the following
- * two security_ids, stored as follows on disk:
- * 1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59
- * 2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45
- * To compare them, they are split into four le32 values each, like so:
- * 1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081
- * 2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179
- * Now, it is apparent why the 2nd object_id collates after the 1st: the
- * first le32 value of the 1st object_id is less than the first le32 of
- * the 2nd object_id. If the first le32 values of both object_ids were
- * equal then the second le32 values would be compared, etc.
- */
-enum {
- COLLATION_BINARY = cpu_to_le32(0x00),
- COLLATION_FILE_NAME = cpu_to_le32(0x01),
- COLLATION_UNICODE_STRING = cpu_to_le32(0x02),
- COLLATION_NTOFS_ULONG = cpu_to_le32(0x10),
- COLLATION_NTOFS_SID = cpu_to_le32(0x11),
- COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12),
- COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13),
-};
-
-typedef le32 COLLATION_RULE;
-
-/*
- * The flags (32-bit) describing attribute properties in the attribute
- * definition structure. FIXME: This information is based on Regis's
- * information and, according to him, it is not certain and probably
- * incomplete. The INDEXABLE flag is fairly certainly correct as only the file
- * name attribute has this flag set and this is the only attribute indexed in
- * NT4.
- */
-enum {
- ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be
- indexed. */
- ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type
- can be present multiple times in the
- mft records of an inode. */
- ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value
- must contain at least one non-zero
- byte. */
- ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be
- indexed and the attribute value must be
- unique for the attribute type in all of
- the mft records of an inode. */
- ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be
- named and the name must be unique for
- the attribute type in all of the mft
- records of an inode. */
- ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be
- resident. */
- ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log
- modifications to this attribute,
- regardless of whether it is resident or
- non-resident. Without this, only log
- modifications if the attribute is
- resident. */
-};
-
-typedef le32 ATTR_DEF_FLAGS;
-
-/*
- * The data attribute of FILE_AttrDef contains a sequence of attribute
- * definitions for the NTFS volume. With this, it is supposed to be safe for an
- * older NTFS driver to mount a volume containing a newer NTFS version without
- * damaging it (that's the theory. In practice it's: not damaging it too much).
- * Entries are sorted by attribute type. The flags describe whether the
- * attribute can be resident/non-resident and possibly other things, but the
- * actual bits are unknown.
- */
-typedef struct {
-/*hex ofs*/
-/* 0*/ ntfschar name[0x40]; /* Unicode name of the attribute. Zero
- terminated. */
-/* 80*/ ATTR_TYPE type; /* Type of the attribute. */
-/* 84*/ le32 display_rule; /* Default display rule.
- FIXME: What does it mean? (AIA) */
-/* 88*/ COLLATION_RULE collation_rule; /* Default collation rule. */
-/* 8c*/ ATTR_DEF_FLAGS flags; /* Flags describing the attribute. */
-/* 90*/ sle64 min_size; /* Optional minimum attribute size. */
-/* 98*/ sle64 max_size; /* Maximum size of attribute. */
-/* sizeof() = 0xa0 or 160 bytes */
-} __attribute__ ((__packed__)) ATTR_DEF;
-
-/*
- * Attribute flags (16-bit).
- */
-enum {
- ATTR_IS_COMPRESSED = cpu_to_le16(0x0001),
- ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method
- mask. Also, first
- illegal value. */
- ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000),
- ATTR_IS_SPARSE = cpu_to_le16(0x8000),
-} __attribute__ ((__packed__));
-
-typedef le16 ATTR_FLAGS;
-
-/*
- * Attribute compression.
- *
- * Only the data attribute is ever compressed in the current ntfs driver in
- * Windows. Further, compression is only applied when the data attribute is
- * non-resident. Finally, to use compression, the maximum allowed cluster size
- * on a volume is 4kib.
- *
- * The compression method is based on independently compressing blocks of X
- * clusters, where X is determined from the compression_unit value found in the
- * non-resident attribute record header (more precisely: X = 2^compression_unit
- * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4).
- *
- * There are three different cases of how a compression block of X clusters
- * can be stored:
- *
- * 1) The data in the block is all zero (a sparse block):
- * This is stored as a sparse block in the runlist, i.e. the runlist
- * entry has length = X and lcn = -1. The mapping pairs array actually
- * uses a delta_lcn value length of 0, i.e. delta_lcn is not present at
- * all, which is then interpreted by the driver as lcn = -1.
- * NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then
- * the same principles apply as above, except that the length is not
- * restricted to being any particular value.
- *
- * 2) The data in the block is not compressed:
- * This happens when compression doesn't reduce the size of the block
- * in clusters. I.e. if compression has a small effect so that the
- * compressed data still occupies X clusters, then the uncompressed data
- * is stored in the block.
- * This case is recognised by the fact that the runlist entry has
- * length = X and lcn >= 0. The mapping pairs array stores this as
- * normal with a run length of X and some specific delta_lcn, i.e.
- * delta_lcn has to be present.
- *
- * 3) The data in the block is compressed:
- * The common case. This case is recognised by the fact that the run
- * list entry has length L < X and lcn >= 0. The mapping pairs array
- * stores this as normal with a run length of X and some specific
- * delta_lcn, i.e. delta_lcn has to be present. This runlist entry is
- * immediately followed by a sparse entry with length = X - L and
- * lcn = -1. The latter entry is to make up the vcn counting to the
- * full compression block size X.
- *
- * In fact, life is more complicated because adjacent entries of the same type
- * can be coalesced. This means that one has to keep track of the number of
- * clusters handled and work on a basis of X clusters at a time being one
- * block. An example: if length L > X this means that this particular runlist
- * entry contains a block of length X and part of one or more blocks of length
- * L - X. Another example: if length L < X, this does not necessarily mean that
- * the block is compressed as it might be that the lcn changes inside the block
- * and hence the following runlist entry describes the continuation of the
- * potentially compressed block. The block would be compressed if the
- * following runlist entry describes at least X - L sparse clusters, thus
- * making up the compression block length as described in point 3 above. (Of
- * course, there can be several runlist entries with small lengths so that the
- * sparse entry does not follow the first data containing entry with
- * length < X.)
- *
- * NOTE: At the end of the compressed attribute value, there most likely is not
- * just the right amount of data to make up a compression block, thus this data
- * is not even attempted to be compressed. It is just stored as is, unless
- * the number of clusters it occupies is reduced when compressed in which case
- * it is stored as a compressed compression block, complete with sparse
- * clusters at the end.
- */
-
-/*
- * Flags of resident attributes (8-bit).
- */
-enum {
- RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index
- (has implications for deleting and
- modifying the attribute). */
-} __attribute__ ((__packed__));
-
-typedef u8 RESIDENT_ATTR_FLAGS;
-
-/*
- * Attribute record header. Always aligned to 8-byte boundary.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/ ATTR_TYPE type; /* The (32-bit) type of the attribute. */
-/* 4*/ le32 length; /* Byte size of the resident part of the
- attribute (aligned to 8-byte boundary).
- Used to get to the next attribute. */
-/* 8*/ u8 non_resident; /* If 0, attribute is resident.
- If 1, attribute is non-resident. */
-/* 9*/ u8 name_length; /* Unicode character size of name of attribute.
- 0 if unnamed. */
-/* 10*/ le16 name_offset; /* If name_length != 0, the byte offset to the
- beginning of the name from the attribute
- record. Note that the name is stored as a
- Unicode string. When creating, place offset
- just at the end of the record header. Then,
- follow with attribute value or mapping pairs
- array, resident and non-resident attributes
- respectively, aligning to an 8-byte
- boundary. */
-/* 12*/ ATTR_FLAGS flags; /* Flags describing the attribute. */
-/* 14*/ le16 instance; /* The instance of this attribute record. This
- number is unique within this mft record (see
- MFT_RECORD/next_attribute_instance notes in
- mft.h for more details). */
-/* 16*/ union {
- /* Resident attributes. */
- struct {
-/* 16 */ le32 value_length;/* Byte size of attribute value. */
-/* 20 */ le16 value_offset;/* Byte offset of the attribute
- value from the start of the
- attribute record. When creating,
- align to 8-byte boundary if we
- have a name present as this might
- not have a length of a multiple
- of 8-bytes. */
-/* 22 */ RESIDENT_ATTR_FLAGS flags; /* See above. */
-/* 23 */ s8 reserved; /* Reserved/alignment to 8-byte
- boundary. */
- } __attribute__ ((__packed__)) resident;
- /* Non-resident attributes. */
- struct {
-/* 16*/ leVCN lowest_vcn;/* Lowest valid virtual cluster number
- for this portion of the attribute value or
- 0 if this is the only extent (usually the
- case). - Only when an attribute list is used
- does lowest_vcn != 0 ever occur. */
-/* 24*/ leVCN highest_vcn;/* Highest valid vcn of this extent of
- the attribute value. - Usually there is only one
- portion, so this usually equals the attribute
- value size in clusters minus 1. Can be -1 for
- zero length files. Can be 0 for "single extent"
- attributes. */
-/* 32*/ le16 mapping_pairs_offset; /* Byte offset from the
- beginning of the structure to the mapping pairs
- array which contains the mappings between the
- vcns and the logical cluster numbers (lcns).
- When creating, place this at the end of this
- record header aligned to 8-byte boundary. */
-/* 34*/ u8 compression_unit; /* The compression unit expressed
- as the log to the base 2 of the number of
- clusters in a compression unit. 0 means not
- compressed. (This effectively limits the
- compression unit size to be a power of two
- clusters.) WinNT4 only uses a value of 4.
- Sparse files have this set to 0 on XPSP2. */
-/* 35*/ u8 reserved[5]; /* Align to 8-byte boundary. */
-/* The sizes below are only used when lowest_vcn is zero, as otherwise it would
- be difficult to keep them up-to-date.*/
-/* 40*/ sle64 allocated_size; /* Byte size of disk space
- allocated to hold the attribute value. Always
- is a multiple of the cluster size. When a file
- is compressed, this field is a multiple of the
- compression block size (2^compression_unit) and
- it represents the logically allocated space
- rather than the actual on disk usage. For this
- use the compressed_size (see below). */
-/* 48*/ sle64 data_size; /* Byte size of the attribute
- value. Can be larger than allocated_size if
- attribute value is compressed or sparse. */
-/* 56*/ sle64 initialized_size; /* Byte size of initialized
- portion of the attribute value. Usually equals
- data_size. */
-/* sizeof(uncompressed attr) = 64*/
-/* 64*/ sle64 compressed_size; /* Byte size of the attribute
- value after compression. Only present when
- compressed or sparse. Always is a multiple of
- the cluster size. Represents the actual amount
- of disk space being used on the disk. */
-/* sizeof(compressed attr) = 72*/
- } __attribute__ ((__packed__)) non_resident;
- } __attribute__ ((__packed__)) data;
-} __attribute__ ((__packed__)) ATTR_RECORD;
-
-typedef ATTR_RECORD ATTR_REC;
-
-/*
- * File attribute flags (32-bit) appearing in the file_attributes fields of the
- * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR
- * attributes of MFT_RECORDs and directory index entries.
- *
- * All of the below flags appear in the directory index entries but only some
- * appear in the STANDARD_INFORMATION attribute whilst only some others appear
- * in the FILENAME_ATTR attribute of MFT_RECORDs. Unless otherwise stated the
- * flags appear in all of the above.
- */
-enum {
- FILE_ATTR_READONLY = cpu_to_le32(0x00000001),
- FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002),
- FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004),
- /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */
-
- FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010),
- /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is
- reserved for the DOS SUBDIRECTORY flag. */
- FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020),
- FILE_ATTR_DEVICE = cpu_to_le32(0x00000040),
- FILE_ATTR_NORMAL = cpu_to_le32(0x00000080),
-
- FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100),
- FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200),
- FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400),
- FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800),
-
- FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000),
- FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000),
- FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000),
-
- FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7),
- /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
- FILE_ATTR_DEVICE and preserves everything else. This mask is used
- to obtain all flags that are valid for reading. */
- FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7),
- /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
- F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
- F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask
- is used to obtain all flags that are valid for setting. */
- /*
- * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all
- * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
- * attribute of an mft record.
- */
- FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000),
- /* Note, this is a copy of the corresponding bit from the mft record,
- telling us whether this is a directory or not, i.e. whether it has
- an index root attribute or not. */
- FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000),
- /* Note, this is a copy of the corresponding bit from the mft record,
- telling us whether this file has a view index present (eg. object id
- index, quota index, one of the security indexes or the encrypting
- filesystem related indexes). */
-};
-
-typedef le32 FILE_ATTR_FLAGS;
-
-/*
- * NOTE on times in NTFS: All times are in MS standard time format, i.e. they
- * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00
- * universal coordinated time (UTC). (In Linux time starts 1st January 1970,
- * 00:00:00 UTC and is stored as the number of 1-second intervals since then.)
- */
-
-/*
- * Attribute: Standard information (0x10).
- *
- * NOTE: Always resident.
- * NOTE: Present in all base file records on a volume.
- * NOTE: There is conflicting information about the meaning of each of the time
- * fields but the meaning as defined below has been verified to be
- * correct by practical experimentation on Windows NT4 SP6a and is hence
- * assumed to be the one and only correct interpretation.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/ sle64 creation_time; /* Time file was created. Updated when
- a filename is changed(?). */
-/* 8*/ sle64 last_data_change_time; /* Time the data attribute was last
- modified. */
-/* 16*/ sle64 last_mft_change_time; /* Time this mft record was last
- modified. */
-/* 24*/ sle64 last_access_time; /* Approximate time when the file was
- last accessed (obviously this is not
- updated on read-only volumes). In
- Windows this is only updated when
- accessed if some time delta has
- passed since the last update. Also,
- last access time updates can be
- disabled altogether for speed. */
-/* 32*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */
-/* 36*/ union {
- /* NTFS 1.2 */
- struct {
- /* 36*/ u8 reserved12[12]; /* Reserved/alignment to 8-byte
- boundary. */
- } __attribute__ ((__packed__)) v1;
- /* sizeof() = 48 bytes */
- /* NTFS 3.x */
- struct {
-/*
- * If a volume has been upgraded from a previous NTFS version, then these
- * fields are present only if the file has been accessed since the upgrade.
- * Recognize the difference by comparing the length of the resident attribute
- * value. If it is 48, then the following fields are missing. If it is 72 then
- * the fields are present. Maybe just check like this:
- * if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) {
- * Assume NTFS 1.2- format.
- * If (volume version is 3.x)
- * Upgrade attribute to NTFS 3.x format.
- * else
- * Use NTFS 1.2- format for access.
- * } else
- * Use NTFS 3.x format for access.
- * Only problem is that it might be legal to set the length of the value to
- * arbitrarily large values thus spoiling this check. - But chkdsk probably
- * views that as a corruption, assuming that it behaves like this for all
- * attributes.
- */
- /* 36*/ le32 maximum_versions; /* Maximum allowed versions for
- file. Zero if version numbering is disabled. */
- /* 40*/ le32 version_number; /* This file's version (if any).
- Set to zero if maximum_versions is zero. */
- /* 44*/ le32 class_id; /* Class id from bidirectional
- class id index (?). */
- /* 48*/ le32 owner_id; /* Owner_id of the user owning
- the file. Translate via $Q index in FILE_Extend
- /$Quota to the quota control entry for the user
- owning the file. Zero if quotas are disabled. */
- /* 52*/ le32 security_id; /* Security_id for the file.
- Translate via $SII index and $SDS data stream
- in FILE_Secure to the security descriptor. */
- /* 56*/ le64 quota_charged; /* Byte size of the charge to
- the quota for all streams of the file. Note: Is
- zero if quotas are disabled. */
- /* 64*/ leUSN usn; /* Last update sequence number
- of the file. This is a direct index into the
- transaction log file ($UsnJrnl). It is zero if
- the usn journal is disabled or this file has
- not been subject to logging yet. See usnjrnl.h
- for details. */
- } __attribute__ ((__packed__)) v3;
- /* sizeof() = 72 bytes (NTFS 3.x) */
- } __attribute__ ((__packed__)) ver;
-} __attribute__ ((__packed__)) STANDARD_INFORMATION;
-
-/*
- * Attribute: Attribute list (0x20).
- *
- * - Can be either resident or non-resident.
- * - Value consists of a sequence of variable length, 8-byte aligned,
- * ATTR_LIST_ENTRY records.
- * - The list is not terminated by anything at all! The only way to know when
- * the end is reached is to keep track of the current offset and compare it to
- * the attribute value size.
- * - The attribute list attribute contains one entry for each attribute of
- * the file in which the list is located, except for the list attribute
- * itself. The list is sorted: first by attribute type, second by attribute
- * name (if present), third by instance number. The extents of one
- * non-resident attribute (if present) immediately follow after the initial
- * extent. They are ordered by lowest_vcn and have their instace set to zero.
- * It is not allowed to have two attributes with all sorting keys equal.
- * - Further restrictions:
- * - If not resident, the vcn to lcn mapping array has to fit inside the
- * base mft record.
- * - The attribute list attribute value has a maximum size of 256kb. This
- * is imposed by the Windows cache manager.
- * - Attribute lists are only used when the attributes of mft record do not
- * fit inside the mft record despite all attributes (that can be made
- * non-resident) having been made non-resident. This can happen e.g. when:
- * - File has a large number of hard links (lots of file name
- * attributes present).
- * - The mapping pairs array of some non-resident attribute becomes so
- * large due to fragmentation that it overflows the mft record.
- * - The security descriptor is very complex (not applicable to
- * NTFS 3.0 volumes).
- * - There are many named streams.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/ ATTR_TYPE type; /* Type of referenced attribute. */
-/* 4*/ le16 length; /* Byte size of this entry (8-byte aligned). */
-/* 6*/ u8 name_length; /* Size in Unicode chars of the name of the
- attribute or 0 if unnamed. */
-/* 7*/ u8 name_offset; /* Byte offset to beginning of attribute name
- (always set this to where the name would
- start even if unnamed). */
-/* 8*/ leVCN lowest_vcn; /* Lowest virtual cluster number of this portion
- of the attribute value. This is usually 0. It
- is non-zero for the case where one attribute
- does not fit into one mft record and thus
- several mft records are allocated to hold
- this attribute. In the latter case, each mft
- record holds one extent of the attribute and
- there is one attribute list entry for each
- extent. NOTE: This is DEFINITELY a signed
- value! The windows driver uses cmp, followed
- by jg when comparing this, thus it treats it
- as signed. */
-/* 16*/ leMFT_REF mft_reference;/* The reference of the mft record holding
- the ATTR_RECORD for this portion of the
- attribute value. */
-/* 24*/ le16 instance; /* If lowest_vcn = 0, the instance of the
- attribute being referenced; otherwise 0. */
-/* 26*/ ntfschar name[0]; /* Use when creating only. When reading use
- name_offset to determine the location of the
- name. */
-/* sizeof() = 26 + (attribute_name_length * 2) bytes */
-} __attribute__ ((__packed__)) ATTR_LIST_ENTRY;
-
-/*
- * The maximum allowed length for a file name.
- */
-#define MAXIMUM_FILE_NAME_LENGTH 255
-
-/*
- * Possible namespaces for filenames in ntfs (8-bit).
- */
-enum {
- FILE_NAME_POSIX = 0x00,
- /* This is the largest namespace. It is case sensitive and allows all
- Unicode characters except for: '\0' and '/'. Beware that in
- WinNT/2k/2003 by default files which eg have the same name except
- for their case will not be distinguished by the standard utilities
- and thus a "del filename" will delete both "filename" and "fileName"
- without warning. However if for example Services For Unix (SFU) are
- installed and the case sensitive option was enabled at installation
- time, then you can create/access/delete such files.
- Note that even SFU places restrictions on the filenames beyond the
- '\0' and '/' and in particular the following set of characters is
- not allowed: '"', '/', '<', '>', '\'. All other characters,
- including the ones no allowed in WIN32 namespace are allowed.
- Tested with SFU 3.5 (this is now free) running on Windows XP. */
- FILE_NAME_WIN32 = 0x01,
- /* The standard WinNT/2k NTFS long filenames. Case insensitive. All
- Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\',
- and '|'. Further, names cannot end with a '.' or a space. */
- FILE_NAME_DOS = 0x02,
- /* The standard DOS filenames (8.3 format). Uppercase only. All 8-bit
- characters greater space, except: '"', '*', '+', ',', '/', ':', ';',
- '<', '=', '>', '?', and '\'. */
- FILE_NAME_WIN32_AND_DOS = 0x03,
- /* 3 means that both the Win32 and the DOS filenames are identical and
- hence have been saved in this single filename record. */
-} __attribute__ ((__packed__));
-
-typedef u8 FILE_NAME_TYPE_FLAGS;
-
-/*
- * Attribute: Filename (0x30).
- *
- * NOTE: Always resident.
- * NOTE: All fields, except the parent_directory, are only updated when the
- * filename is changed. Until then, they just become out of sync with
- * reality and the more up to date values are present in the standard
- * information attribute.
- * NOTE: There is conflicting information about the meaning of each of the time
- * fields but the meaning as defined below has been verified to be
- * correct by practical experimentation on Windows NT4 SP6a and is hence
- * assumed to be the one and only correct interpretation.
- */
-typedef struct {
-/*hex ofs*/
-/* 0*/ leMFT_REF parent_directory; /* Directory this filename is
- referenced from. */
-/* 8*/ sle64 creation_time; /* Time file was created. */
-/* 10*/ sle64 last_data_change_time; /* Time the data attribute was last
- modified. */
-/* 18*/ sle64 last_mft_change_time; /* Time this mft record was last
- modified. */
-/* 20*/ sle64 last_access_time; /* Time this mft record was last
- accessed. */
-/* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space
- for the unnamed data attribute. So
- for normal $DATA, this is the
- allocated_size from the unnamed
- $DATA attribute and for compressed
- and/or sparse $DATA, this is the
- compressed_size from the unnamed
- $DATA attribute. For a directory or
- other inode without an unnamed $DATA
- attribute, this is always 0. NOTE:
- This is a multiple of the cluster
- size. */
-/* 30*/ sle64 data_size; /* Byte size of actual data in unnamed
- data attribute. For a directory or
- other inode without an unnamed $DATA
- attribute, this is always 0. */
-/* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */
-/* 3c*/ union {
- /* 3c*/ struct {
- /* 3c*/ le16 packed_ea_size; /* Size of the buffer needed to
- pack the extended attributes
- (EAs), if such are present.*/
- /* 3e*/ le16 reserved; /* Reserved for alignment. */
- } __attribute__ ((__packed__)) ea;
- /* 3c*/ struct {
- /* 3c*/ le32 reparse_point_tag; /* Type of reparse point,
- present only in reparse
- points and only if there are
- no EAs. */
- } __attribute__ ((__packed__)) rp;
- } __attribute__ ((__packed__)) type;
-/* 40*/ u8 file_name_length; /* Length of file name in
- (Unicode) characters. */
-/* 41*/ FILE_NAME_TYPE_FLAGS file_name_type; /* Namespace of the file name.*/
-/* 42*/ ntfschar file_name[0]; /* File name in Unicode. */
-} __attribute__ ((__packed__)) FILE_NAME_ATTR;
-
-/*
- * GUID structures store globally unique identifiers (GUID). A GUID is a
- * 128-bit value consisting of one group of eight hexadecimal digits, followed
- * by three groups of four hexadecimal digits each, followed by one group of
- * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the
- * distributed computing environment (DCE) universally unique identifier (UUID).
- * Example of a GUID:
- * 1F010768-5A73-BC91-0010A52216A7
- */
-typedef struct {
- le32 data1; /* The first eight hexadecimal digits of the GUID. */
- le16 data2; /* The first group of four hexadecimal digits. */
- le16 data3; /* The second group of four hexadecimal digits. */
- u8 data4[8]; /* The first two bytes are the third group of four
- hexadecimal digits. The remaining six bytes are the
- final 12 hexadecimal digits. */
-} __attribute__ ((__packed__)) GUID;
-
-/*
- * FILE_Extend/$ObjId contains an index named $O. This index contains all
- * object_ids present on the volume as the index keys and the corresponding
- * mft_record numbers as the index entry data parts. The data part (defined
- * below) also contains three other object_ids:
- * birth_volume_id - object_id of FILE_Volume on which the file was first
- * created. Optional (i.e. can be zero).
- * birth_object_id - object_id of file when it was first created. Usually
- * equals the object_id. Optional (i.e. can be zero).
- * domain_id - Reserved (always zero).
- */
-typedef struct {
- leMFT_REF mft_reference;/* Mft record containing the object_id in
- the index entry key. */
- union {
- struct {
- GUID birth_volume_id;
- GUID birth_object_id;
- GUID domain_id;
- } __attribute__ ((__packed__)) origin;
- u8 extended_info[48];
- } __attribute__ ((__packed__)) opt;
-} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA;
-
-/*
- * Attribute: Object id (NTFS 3.0+) (0x40).
- *
- * NOTE: Always resident.
- */
-typedef struct {
- GUID object_id; /* Unique id assigned to the
- file.*/
- /* The following fields are optional. The attribute value size is 16
- bytes, i.e. sizeof(GUID), if these are not present at all. Note,
- the entries can be present but one or more (or all) can be zero
- meaning that that particular value(s) is(are) not defined. */
- union {
- struct {
- GUID birth_volume_id; /* Unique id of volume on which
- the file was first created.*/
- GUID birth_object_id; /* Unique id of file when it was
- first created. */
- GUID domain_id; /* Reserved, zero. */
- } __attribute__ ((__packed__)) origin;
- u8 extended_info[48];
- } __attribute__ ((__packed__)) opt;
-} __attribute__ ((__packed__)) OBJECT_ID_ATTR;
-
-/*
- * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in
- * the SID structure (see below).
- */
-//typedef enum { /* SID string prefix. */
-// SECURITY_NULL_SID_AUTHORITY = {0, 0, 0, 0, 0, 0}, /* S-1-0 */
-// SECURITY_WORLD_SID_AUTHORITY = {0, 0, 0, 0, 0, 1}, /* S-1-1 */
-// SECURITY_LOCAL_SID_AUTHORITY = {0, 0, 0, 0, 0, 2}, /* S-1-2 */
-// SECURITY_CREATOR_SID_AUTHORITY = {0, 0, 0, 0, 0, 3}, /* S-1-3 */
-// SECURITY_NON_UNIQUE_AUTHORITY = {0, 0, 0, 0, 0, 4}, /* S-1-4 */
-// SECURITY_NT_SID_AUTHORITY = {0, 0, 0, 0, 0, 5}, /* S-1-5 */
-//} IDENTIFIER_AUTHORITIES;
-
-/*
- * These relative identifiers (RIDs) are used with the above identifier
- * authorities to make up universal well-known SIDs.
- *
- * Note: The relative identifier (RID) refers to the portion of a SID, which
- * identifies a user or group in relation to the authority that issued the SID.
- * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is
- * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and
- * the relative identifier SECURITY_CREATOR_OWNER_RID (0).
- */
-typedef enum { /* Identifier authority. */
- SECURITY_NULL_RID = 0, /* S-1-0 */
- SECURITY_WORLD_RID = 0, /* S-1-1 */
- SECURITY_LOCAL_RID = 0, /* S-1-2 */
-
- SECURITY_CREATOR_OWNER_RID = 0, /* S-1-3 */
- SECURITY_CREATOR_GROUP_RID = 1, /* S-1-3 */
-
- SECURITY_CREATOR_OWNER_SERVER_RID = 2, /* S-1-3 */
- SECURITY_CREATOR_GROUP_SERVER_RID = 3, /* S-1-3 */
-
- SECURITY_DIALUP_RID = 1,
- SECURITY_NETWORK_RID = 2,
- SECURITY_BATCH_RID = 3,
- SECURITY_INTERACTIVE_RID = 4,
- SECURITY_SERVICE_RID = 6,
- SECURITY_ANONYMOUS_LOGON_RID = 7,
- SECURITY_PROXY_RID = 8,
- SECURITY_ENTERPRISE_CONTROLLERS_RID=9,
- SECURITY_SERVER_LOGON_RID = 9,
- SECURITY_PRINCIPAL_SELF_RID = 0xa,
- SECURITY_AUTHENTICATED_USER_RID = 0xb,
- SECURITY_RESTRICTED_CODE_RID = 0xc,
- SECURITY_TERMINAL_SERVER_RID = 0xd,
-
- SECURITY_LOGON_IDS_RID = 5,
- SECURITY_LOGON_IDS_RID_COUNT = 3,
-
- SECURITY_LOCAL_SYSTEM_RID = 0x12,
-
- SECURITY_NT_NON_UNIQUE = 0x15,
-
- SECURITY_BUILTIN_DOMAIN_RID = 0x20,
-
- /*
- * Well-known domain relative sub-authority values (RIDs).
- */
-
- /* Users. */
- DOMAIN_USER_RID_ADMIN = 0x1f4,
- DOMAIN_USER_RID_GUEST = 0x1f5,
- DOMAIN_USER_RID_KRBTGT = 0x1f6,
-
- /* Groups. */
- DOMAIN_GROUP_RID_ADMINS = 0x200,
- DOMAIN_GROUP_RID_USERS = 0x201,
- DOMAIN_GROUP_RID_GUESTS = 0x202,
- DOMAIN_GROUP_RID_COMPUTERS = 0x203,
- DOMAIN_GROUP_RID_CONTROLLERS = 0x204,
- DOMAIN_GROUP_RID_CERT_ADMINS = 0x205,
- DOMAIN_GROUP_RID_SCHEMA_ADMINS = 0x206,
- DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207,
- DOMAIN_GROUP_RID_POLICY_ADMINS = 0x208,
-
- /* Aliases. */
- DOMAIN_ALIAS_RID_ADMINS = 0x220,
- DOMAIN_ALIAS_RID_USERS = 0x221,
- DOMAIN_ALIAS_RID_GUESTS = 0x222,
- DOMAIN_ALIAS_RID_POWER_USERS = 0x223,
-
- DOMAIN_ALIAS_RID_ACCOUNT_OPS = 0x224,
- DOMAIN_ALIAS_RID_SYSTEM_OPS = 0x225,
- DOMAIN_ALIAS_RID_PRINT_OPS = 0x226,
- DOMAIN_ALIAS_RID_BACKUP_OPS = 0x227,
-
- DOMAIN_ALIAS_RID_REPLICATOR = 0x228,
- DOMAIN_ALIAS_RID_RAS_SERVERS = 0x229,
- DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a,
-} RELATIVE_IDENTIFIERS;
-
-/*
- * The universal well-known SIDs:
- *
- * NULL_SID S-1-0-0
- * WORLD_SID S-1-1-0
- * LOCAL_SID S-1-2-0
- * CREATOR_OWNER_SID S-1-3-0
- * CREATOR_GROUP_SID S-1-3-1
- * CREATOR_OWNER_SERVER_SID S-1-3-2
- * CREATOR_GROUP_SERVER_SID S-1-3-3
- *
- * (Non-unique IDs) S-1-4
- *
- * NT well-known SIDs:
- *
- * NT_AUTHORITY_SID S-1-5
- * DIALUP_SID S-1-5-1
- *
- * NETWORD_SID S-1-5-2
- * BATCH_SID S-1-5-3
- * INTERACTIVE_SID S-1-5-4
- * SERVICE_SID S-1-5-6
- * ANONYMOUS_LOGON_SID S-1-5-7 (aka null logon session)
- * PROXY_SID S-1-5-8
- * SERVER_LOGON_SID S-1-5-9 (aka domain controller account)
- * SELF_SID S-1-5-10 (self RID)
- * AUTHENTICATED_USER_SID S-1-5-11
- * RESTRICTED_CODE_SID S-1-5-12 (running restricted code)
- * TERMINAL_SERVER_SID S-1-5-13 (running on terminal server)
- *
- * (Logon IDs) S-1-5-5-X-Y
- *
- * (NT non-unique IDs) S-1-5-0x15-...
- *
- * (Built-in domain) S-1-5-0x20
- */
-
-/*
- * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure.
- *
- * NOTE: This is stored as a big endian number, hence the high_part comes
- * before the low_part.
- */
-typedef union {
- struct {
- u16 high_part; /* High 16-bits. */
- u32 low_part; /* Low 32-bits. */
- } __attribute__ ((__packed__)) parts;
- u8 value[6]; /* Value as individual bytes. */
-} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY;
-
-/*
- * The SID structure is a variable-length structure used to uniquely identify
- * users or groups. SID stands for security identifier.
- *
- * The standard textual representation of the SID is of the form:
- * S-R-I-S-S...
- * Where:
- * - The first "S" is the literal character 'S' identifying the following
- * digits as a SID.
- * - R is the revision level of the SID expressed as a sequence of digits
- * either in decimal or hexadecimal (if the later, prefixed by "0x").
- * - I is the 48-bit identifier_authority, expressed as digits as R above.
- * - S... is one or more sub_authority values, expressed as digits as above.
- *
- * Example SID; the domain-relative SID of the local Administrators group on
- * Windows NT/2k:
- * S-1-5-32-544
- * This translates to a SID with:
- * revision = 1,
- * sub_authority_count = 2,
- * identifier_authority = {0,0,0,0,0,5}, // SECURITY_NT_AUTHORITY
- * sub_authority[0] = 32, // SECURITY_BUILTIN_DOMAIN_RID
- * sub_authority[1] = 544 // DOMAIN_ALIAS_RID_ADMINS
- */
-typedef struct {
- u8 revision;
- u8 sub_authority_count;
- SID_IDENTIFIER_AUTHORITY identifier_authority;
- le32 sub_authority[1]; /* At least one sub_authority. */
-} __attribute__ ((__packed__)) SID;
-
-/*
- * Current constants for SIDs.
- */
-typedef enum {
- SID_REVISION = 1, /* Current revision level. */
- SID_MAX_SUB_AUTHORITIES = 15, /* Maximum number of those. */
- SID_RECOMMENDED_SUB_AUTHORITIES = 1, /* Will change to around 6 in
- a future revision. */
-} SID_CONSTANTS;
-
-/*
- * The predefined ACE types (8-bit, see below).
- */
-enum {
- ACCESS_MIN_MS_ACE_TYPE = 0,
- ACCESS_ALLOWED_ACE_TYPE = 0,
- ACCESS_DENIED_ACE_TYPE = 1,
- SYSTEM_AUDIT_ACE_TYPE = 2,
- SYSTEM_ALARM_ACE_TYPE = 3, /* Not implemented as of Win2k. */
- ACCESS_MAX_MS_V2_ACE_TYPE = 3,
-
- ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4,
- ACCESS_MAX_MS_V3_ACE_TYPE = 4,
-
- /* The following are Win2k only. */
- ACCESS_MIN_MS_OBJECT_ACE_TYPE = 5,
- ACCESS_ALLOWED_OBJECT_ACE_TYPE = 5,
- ACCESS_DENIED_OBJECT_ACE_TYPE = 6,
- SYSTEM_AUDIT_OBJECT_ACE_TYPE = 7,
- SYSTEM_ALARM_OBJECT_ACE_TYPE = 8,
- ACCESS_MAX_MS_OBJECT_ACE_TYPE = 8,
-
- ACCESS_MAX_MS_V4_ACE_TYPE = 8,
-
- /* This one is for WinNT/2k. */
- ACCESS_MAX_MS_ACE_TYPE = 8,
-} __attribute__ ((__packed__));
-
-typedef u8 ACE_TYPES;
-
-/*
- * The ACE flags (8-bit) for audit and inheritance (see below).
- *
- * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE
- * types to indicate that a message is generated (in Windows!) for successful
- * accesses.
- *
- * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types
- * to indicate that a message is generated (in Windows!) for failed accesses.
- */
-enum {
- /* The inheritance flags. */
- OBJECT_INHERIT_ACE = 0x01,
- CONTAINER_INHERIT_ACE = 0x02,
- NO_PROPAGATE_INHERIT_ACE = 0x04,
- INHERIT_ONLY_ACE = 0x08,
- INHERITED_ACE = 0x10, /* Win2k only. */
- VALID_INHERIT_FLAGS = 0x1f,
-
- /* The audit flags. */
- SUCCESSFUL_ACCESS_ACE_FLAG = 0x40,
- FAILED_ACCESS_ACE_FLAG = 0x80,
-} __attribute__ ((__packed__));
-
-typedef u8 ACE_FLAGS;
-
-/*
- * An ACE is an access-control entry in an access-control list (ACL).
- * An ACE defines access to an object for a specific user or group or defines
- * the types of access that generate system-administration messages or alarms
- * for a specific user or group. The user or group is identified by a security
- * identifier (SID).
- *
- * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary),
- * which specifies the type and size of the ACE. The format of the subsequent
- * data depends on the ACE type.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/ ACE_TYPES type; /* Type of the ACE. */
-/* 1*/ ACE_FLAGS flags; /* Flags describing the ACE. */
-/* 2*/ le16 size; /* Size in bytes of the ACE. */
-} __attribute__ ((__packed__)) ACE_HEADER;
-
-/*
- * The access mask (32-bit). Defines the access rights.
- *
- * The specific rights (bits 0 to 15). These depend on the type of the object
- * being secured by the ACE.
- */
-enum {
- /* Specific rights for files and directories are as follows: */
-
- /* Right to read data from the file. (FILE) */
- FILE_READ_DATA = cpu_to_le32(0x00000001),
- /* Right to list contents of a directory. (DIRECTORY) */
- FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001),
-
- /* Right to write data to the file. (FILE) */
- FILE_WRITE_DATA = cpu_to_le32(0x00000002),
- /* Right to create a file in the directory. (DIRECTORY) */
- FILE_ADD_FILE = cpu_to_le32(0x00000002),
-
- /* Right to append data to the file. (FILE) */
- FILE_APPEND_DATA = cpu_to_le32(0x00000004),
- /* Right to create a subdirectory. (DIRECTORY) */
- FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004),
-
- /* Right to read extended attributes. (FILE/DIRECTORY) */
- FILE_READ_EA = cpu_to_le32(0x00000008),
-
- /* Right to write extended attributes. (FILE/DIRECTORY) */
- FILE_WRITE_EA = cpu_to_le32(0x00000010),
-
- /* Right to execute a file. (FILE) */
- FILE_EXECUTE = cpu_to_le32(0x00000020),
- /* Right to traverse the directory. (DIRECTORY) */
- FILE_TRAVERSE = cpu_to_le32(0x00000020),
-
- /*
- * Right to delete a directory and all the files it contains (its
- * children), even if the files are read-only. (DIRECTORY)
- */
- FILE_DELETE_CHILD = cpu_to_le32(0x00000040),
-
- /* Right to read file attributes. (FILE/DIRECTORY) */
- FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080),
-
- /* Right to change file attributes. (FILE/DIRECTORY) */
- FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100),
-
- /*
- * The standard rights (bits 16 to 23). These are independent of the
- * type of object being secured.
- */
-
- /* Right to delete the object. */
- DELETE = cpu_to_le32(0x00010000),
-
- /*
- * Right to read the information in the object's security descriptor,
- * not including the information in the SACL, i.e. right to read the
- * security descriptor and owner.
- */
- READ_CONTROL = cpu_to_le32(0x00020000),
-
- /* Right to modify the DACL in the object's security descriptor. */
- WRITE_DAC = cpu_to_le32(0x00040000),
-
- /* Right to change the owner in the object's security descriptor. */
- WRITE_OWNER = cpu_to_le32(0x00080000),
-
- /*
- * Right to use the object for synchronization. Enables a process to
- * wait until the object is in the signalled state. Some object types
- * do not support this access right.
- */
- SYNCHRONIZE = cpu_to_le32(0x00100000),
-
- /*
- * The following STANDARD_RIGHTS_* are combinations of the above for
- * convenience and are defined by the Win32 API.
- */
-
- /* These are currently defined to READ_CONTROL. */
- STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000),
- STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000),
- STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000),
-
- /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
- STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000),
-
- /*
- * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
- * SYNCHRONIZE access.
- */
- STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000),
-
- /*
- * The access system ACL and maximum allowed access types (bits 24 to
- * 25, bits 26 to 27 are reserved).
- */
- ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000),
- MAXIMUM_ALLOWED = cpu_to_le32(0x02000000),
-
- /*
- * The generic rights (bits 28 to 31). These map onto the standard and
- * specific rights.
- */
-
- /* Read, write, and execute access. */
- GENERIC_ALL = cpu_to_le32(0x10000000),
-
- /* Execute access. */
- GENERIC_EXECUTE = cpu_to_le32(0x20000000),
-
- /*
- * Write access. For files, this maps onto:
- * FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA |
- * FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE
- * For directories, the mapping has the same numerical value. See
- * above for the descriptions of the rights granted.
- */
- GENERIC_WRITE = cpu_to_le32(0x40000000),
-
- /*
- * Read access. For files, this maps onto:
- * FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA |
- * STANDARD_RIGHTS_READ | SYNCHRONIZE
- * For directories, the mapping has the same numberical value. See
- * above for the descriptions of the rights granted.
- */
- GENERIC_READ = cpu_to_le32(0x80000000),
-};
-
-typedef le32 ACCESS_MASK;
-
-/*
- * The generic mapping array. Used to denote the mapping of each generic
- * access right to a specific access mask.
- *
- * FIXME: What exactly is this and what is it for? (AIA)
- */
-typedef struct {
- ACCESS_MASK generic_read;
- ACCESS_MASK generic_write;
- ACCESS_MASK generic_execute;
- ACCESS_MASK generic_all;
-} __attribute__ ((__packed__)) GENERIC_MAPPING;
-
-/*
- * The predefined ACE type structures are as defined below.
- */
-
-/*
- * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE
- */
-typedef struct {
-/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
- ACE_TYPES type; /* Type of the ACE. */
- ACE_FLAGS flags; /* Flags describing the ACE. */
- le16 size; /* Size in bytes of the ACE. */
-/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */
-
-/* 8*/ SID sid; /* The SID associated with the ACE. */
-} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE,
- SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE;
-
-/*
- * The object ACE flags (32-bit).
- */
-enum {
- ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1),
- ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2),
-};
-
-typedef le32 OBJECT_ACE_FLAGS;
-
-typedef struct {
-/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
- ACE_TYPES type; /* Type of the ACE. */
- ACE_FLAGS flags; /* Flags describing the ACE. */
- le16 size; /* Size in bytes of the ACE. */
-/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */
-
-/* 8*/ OBJECT_ACE_FLAGS object_flags; /* Flags describing the object ACE. */
-/* 12*/ GUID object_type;
-/* 28*/ GUID inherited_object_type;
-
-/* 44*/ SID sid; /* The SID associated with the ACE. */
-} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE,
- ACCESS_DENIED_OBJECT_ACE,
- SYSTEM_AUDIT_OBJECT_ACE,
- SYSTEM_ALARM_OBJECT_ACE;
-
-/*
- * An ACL is an access-control list (ACL).
- * An ACL starts with an ACL header structure, which specifies the size of
- * the ACL and the number of ACEs it contains. The ACL header is followed by
- * zero or more access control entries (ACEs). The ACL as well as each ACE
- * are aligned on 4-byte boundaries.
- */
-typedef struct {
- u8 revision; /* Revision of this ACL. */
- u8 alignment1;
- le16 size; /* Allocated space in bytes for ACL. Includes this
- header, the ACEs and the remaining free space. */
- le16 ace_count; /* Number of ACEs in the ACL. */
- le16 alignment2;
-/* sizeof() = 8 bytes */
-} __attribute__ ((__packed__)) ACL;
-
-/*
- * Current constants for ACLs.
- */
-typedef enum {
- /* Current revision. */
- ACL_REVISION = 2,
- ACL_REVISION_DS = 4,
-
- /* History of revisions. */
- ACL_REVISION1 = 1,
- MIN_ACL_REVISION = 2,
- ACL_REVISION2 = 2,
- ACL_REVISION3 = 3,
- ACL_REVISION4 = 4,
- MAX_ACL_REVISION = 4,
-} ACL_CONSTANTS;
-
-/*
- * The security descriptor control flags (16-bit).
- *
- * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID
- * pointed to by the Owner field was provided by a defaulting mechanism
- * rather than explicitly provided by the original provider of the
- * security descriptor. This may affect the treatment of the SID with
- * respect to inheritance of an owner.
- *
- * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in
- * the Group field was provided by a defaulting mechanism rather than
- * explicitly provided by the original provider of the security
- * descriptor. This may affect the treatment of the SID with respect to
- * inheritance of a primary group.
- *
- * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security
- * descriptor contains a discretionary ACL. If this flag is set and the
- * Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is
- * explicitly being specified.
- *
- * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
- * pointed to by the Dacl field was provided by a defaulting mechanism
- * rather than explicitly provided by the original provider of the
- * security descriptor. This may affect the treatment of the ACL with
- * respect to inheritance of an ACL. This flag is ignored if the
- * DaclPresent flag is not set.
- *
- * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security
- * descriptor contains a system ACL pointed to by the Sacl field. If this
- * flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then
- * an empty (but present) ACL is being specified.
- *
- * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
- * pointed to by the Sacl field was provided by a defaulting mechanism
- * rather than explicitly provided by the original provider of the
- * security descriptor. This may affect the treatment of the ACL with
- * respect to inheritance of an ACL. This flag is ignored if the
- * SaclPresent flag is not set.
- *
- * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security
- * descriptor is in self-relative form. In this form, all fields of the
- * security descriptor are contiguous in memory and all pointer fields are
- * expressed as offsets from the beginning of the security descriptor.
- */
-enum {
- SE_OWNER_DEFAULTED = cpu_to_le16(0x0001),
- SE_GROUP_DEFAULTED = cpu_to_le16(0x0002),
- SE_DACL_PRESENT = cpu_to_le16(0x0004),
- SE_DACL_DEFAULTED = cpu_to_le16(0x0008),
-
- SE_SACL_PRESENT = cpu_to_le16(0x0010),
- SE_SACL_DEFAULTED = cpu_to_le16(0x0020),
-
- SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100),
- SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200),
- SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400),
- SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800),
-
- SE_DACL_PROTECTED = cpu_to_le16(0x1000),
- SE_SACL_PROTECTED = cpu_to_le16(0x2000),
- SE_RM_CONTROL_VALID = cpu_to_le16(0x4000),
- SE_SELF_RELATIVE = cpu_to_le16(0x8000)
-} __attribute__ ((__packed__));
-
-typedef le16 SECURITY_DESCRIPTOR_CONTROL;
-
-/*
- * Self-relative security descriptor. Contains the owner and group SIDs as well
- * as the sacl and dacl ACLs inside the security descriptor itself.
- */
-typedef struct {
- u8 revision; /* Revision level of the security descriptor. */
- u8 alignment;
- SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of
- the descriptor as well as the following fields. */
- le32 owner; /* Byte offset to a SID representing an object's
- owner. If this is NULL, no owner SID is present in
- the descriptor. */
- le32 group; /* Byte offset to a SID representing an object's
- primary group. If this is NULL, no primary group
- SID is present in the descriptor. */
- le32 sacl; /* Byte offset to a system ACL. Only valid, if
- SE_SACL_PRESENT is set in the control field. If
- SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
- is specified. */
- le32 dacl; /* Byte offset to a discretionary ACL. Only valid, if
- SE_DACL_PRESENT is set in the control field. If
- SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
- (unconditionally granting access) is specified. */
-/* sizeof() = 0x14 bytes */
-} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE;
-
-/*
- * Absolute security descriptor. Does not contain the owner and group SIDs, nor
- * the sacl and dacl ACLs inside the security descriptor. Instead, it contains
- * pointers to these structures in memory. Obviously, absolute security
- * descriptors are only useful for in memory representations of security
- * descriptors. On disk, a self-relative security descriptor is used.
- */
-typedef struct {
- u8 revision; /* Revision level of the security descriptor. */
- u8 alignment;
- SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of
- the descriptor as well as the following fields. */
- SID *owner; /* Points to a SID representing an object's owner. If
- this is NULL, no owner SID is present in the
- descriptor. */
- SID *group; /* Points to a SID representing an object's primary
- group. If this is NULL, no primary group SID is
- present in the descriptor. */
- ACL *sacl; /* Points to a system ACL. Only valid, if
- SE_SACL_PRESENT is set in the control field. If
- SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
- is specified. */
- ACL *dacl; /* Points to a discretionary ACL. Only valid, if
- SE_DACL_PRESENT is set in the control field. If
- SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
- (unconditionally granting access) is specified. */
-} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR;
-
-/*
- * Current constants for security descriptors.
- */
-typedef enum {
- /* Current revision. */
- SECURITY_DESCRIPTOR_REVISION = 1,
- SECURITY_DESCRIPTOR_REVISION1 = 1,
-
- /* The sizes of both the absolute and relative security descriptors is
- the same as pointers, at least on ia32 architecture are 32-bit. */
- SECURITY_DESCRIPTOR_MIN_LENGTH = sizeof(SECURITY_DESCRIPTOR),
-} SECURITY_DESCRIPTOR_CONSTANTS;
-
-/*
- * Attribute: Security descriptor (0x50). A standard self-relative security
- * descriptor.
- *
- * NOTE: Can be resident or non-resident.
- * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally
- * in FILE_Secure and the correct descriptor is found using the security_id
- * from the standard information attribute.
- */
-typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR;
-
-/*
- * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one
- * referenced instance of each unique security descriptor is stored.
- *
- * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It
- * does, however, contain two indexes ($SDH and $SII) as well as a named data
- * stream ($SDS).
- *
- * Every unique security descriptor is assigned a unique security identifier
- * (security_id, not to be confused with a SID). The security_id is unique for
- * the NTFS volume and is used as an index into the $SII index, which maps
- * security_ids to the security descriptor's storage location within the $SDS
- * data attribute. The $SII index is sorted by ascending security_id.
- *
- * A simple hash is computed from each security descriptor. This hash is used
- * as an index into the $SDH index, which maps security descriptor hashes to
- * the security descriptor's storage location within the $SDS data attribute.
- * The $SDH index is sorted by security descriptor hash and is stored in a B+
- * tree. When searching $SDH (with the intent of determining whether or not a
- * new security descriptor is already present in the $SDS data stream), if a
- * matching hash is found, but the security descriptors do not match, the
- * search in the $SDH index is continued, searching for a next matching hash.
- *
- * When a precise match is found, the security_id coresponding to the security
- * descriptor in the $SDS attribute is read from the found $SDH index entry and
- * is stored in the $STANDARD_INFORMATION attribute of the file/directory to
- * which the security descriptor is being applied. The $STANDARD_INFORMATION
- * attribute is present in all base mft records (i.e. in all files and
- * directories).
- *
- * If a match is not found, the security descriptor is assigned a new unique
- * security_id and is added to the $SDS data attribute. Then, entries
- * referencing the this security descriptor in the $SDS data attribute are
- * added to the $SDH and $SII indexes.
- *
- * Note: Entries are never deleted from FILE_Secure, even if nothing
- * references an entry any more.
- */
-
-/*
- * This header precedes each security descriptor in the $SDS data stream.
- * This is also the index entry data part of both the $SII and $SDH indexes.
- */
-typedef struct {
- le32 hash; /* Hash of the security descriptor. */
- le32 security_id; /* The security_id assigned to the descriptor. */
- le64 offset; /* Byte offset of this entry in the $SDS stream. */
- le32 length; /* Size in bytes of this entry in $SDS stream. */
-} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER;
-
-/*
- * The $SDS data stream contains the security descriptors, aligned on 16-byte
- * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot
- * cross 256kib boundaries (this restriction is imposed by the Windows cache
- * manager). Each security descriptor is contained in a SDS_ENTRY structure.
- * Also, each security descriptor is stored twice in the $SDS stream with a
- * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size)
- * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the
- * first copy of the security descriptor will be at offset 0x51d0 in the
- * $SDS data stream and the second copy will be at offset 0x451d0.
- */
-typedef struct {
-/*Ofs*/
-/* 0 SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like
- unnamed structs. */
- le32 hash; /* Hash of the security descriptor. */
- le32 security_id; /* The security_id assigned to the descriptor. */
- le64 offset; /* Byte offset of this entry in the $SDS stream. */
- le32 length; /* Size in bytes of this entry in $SDS stream. */
-/* 20*/ SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security
- descriptor. */
-} __attribute__ ((__packed__)) SDS_ENTRY;
-
-/*
- * The index entry key used in the $SII index. The collation type is
- * COLLATION_NTOFS_ULONG.
- */
-typedef struct {
- le32 security_id; /* The security_id assigned to the descriptor. */
-} __attribute__ ((__packed__)) SII_INDEX_KEY;
-
-/*
- * The index entry key used in the $SDH index. The keys are sorted first by
- * hash and then by security_id. The collation rule is
- * COLLATION_NTOFS_SECURITY_HASH.
- */
-typedef struct {
- le32 hash; /* Hash of the security descriptor. */
- le32 security_id; /* The security_id assigned to the descriptor. */
-} __attribute__ ((__packed__)) SDH_INDEX_KEY;
-
-/*
- * Attribute: Volume name (0x60).
- *
- * NOTE: Always resident.
- * NOTE: Present only in FILE_Volume.
- */
-typedef struct {
- ntfschar name[0]; /* The name of the volume in Unicode. */
-} __attribute__ ((__packed__)) VOLUME_NAME;
-
-/*
- * Possible flags for the volume (16-bit).
- */
-enum {
- VOLUME_IS_DIRTY = cpu_to_le16(0x0001),
- VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002),
- VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004),
- VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008),
-
- VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010),
- VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020),
-
- VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000),
- VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000),
-
- VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f),
-
- /* To make our life easier when checking if we must mount read-only. */
- VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027),
-} __attribute__ ((__packed__));
-
-typedef le16 VOLUME_FLAGS;
-
-/*
- * Attribute: Volume information (0x70).
- *
- * NOTE: Always resident.
- * NOTE: Present only in FILE_Volume.
- * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses
- * NTFS 1.2. I haven't personally seen other values yet.
- */
-typedef struct {
- le64 reserved; /* Not used (yet?). */
- u8 major_ver; /* Major version of the ntfs format. */
- u8 minor_ver; /* Minor version of the ntfs format. */
- VOLUME_FLAGS flags; /* Bit array of VOLUME_* flags. */
-} __attribute__ ((__packed__)) VOLUME_INFORMATION;
-
-/*
- * Attribute: Data attribute (0x80).
- *
- * NOTE: Can be resident or non-resident.
- *
- * Data contents of a file (i.e. the unnamed stream) or of a named stream.
- */
-typedef struct {
- u8 data[0]; /* The file's data contents. */
-} __attribute__ ((__packed__)) DATA_ATTR;
-
-/*
- * Index header flags (8-bit).
- */
-enum {
- /*
- * When index header is in an index root attribute:
- */
- SMALL_INDEX = 0, /* The index is small enough to fit inside the index
- root attribute and there is no index allocation
- attribute present. */
- LARGE_INDEX = 1, /* The index is too large to fit in the index root
- attribute and/or an index allocation attribute is
- present. */
- /*
- * When index header is in an index block, i.e. is part of index
- * allocation attribute:
- */
- LEAF_NODE = 0, /* This is a leaf node, i.e. there are no more nodes
- branching off it. */
- INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf
- node. */
- NODE_MASK = 1, /* Mask for accessing the *_NODE bits. */
-} __attribute__ ((__packed__));
-
-typedef u8 INDEX_HEADER_FLAGS;
-
-/*
- * This is the header for indexes, describing the INDEX_ENTRY records, which
- * follow the INDEX_HEADER. Together the index header and the index entries
- * make up a complete index.
- *
- * IMPORTANT NOTE: The offset, length and size structure members are counted
- * relative to the start of the index header structure and not relative to the
- * start of the index root or index allocation structures themselves.
- */
-typedef struct {
- le32 entries_offset; /* Byte offset to first INDEX_ENTRY
- aligned to 8-byte boundary. */
- le32 index_length; /* Data size of the index in bytes,
- i.e. bytes used from allocated
- size, aligned to 8-byte boundary. */
- le32 allocated_size; /* Byte size of this index (block),
- multiple of 8 bytes. */
- /* NOTE: For the index root attribute, the above two numbers are always
- equal, as the attribute is resident and it is resized as needed. In
- the case of the index allocation attribute the attribute is not
- resident and hence the allocated_size is a fixed value and must
- equal the index_block_size specified by the INDEX_ROOT attribute
- corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK
- belongs to. */
- INDEX_HEADER_FLAGS flags; /* Bit field of INDEX_HEADER_FLAGS. */
- u8 reserved[3]; /* Reserved/align to 8-byte boundary. */
-} __attribute__ ((__packed__)) INDEX_HEADER;
-
-/*
- * Attribute: Index root (0x90).
- *
- * NOTE: Always resident.
- *
- * This is followed by a sequence of index entries (INDEX_ENTRY structures)
- * as described by the index header.
- *
- * When a directory is small enough to fit inside the index root then this
- * is the only attribute describing the directory. When the directory is too
- * large to fit in the index root, on the other hand, two additional attributes
- * are present: an index allocation attribute, containing sub-nodes of the B+
- * directory tree (see below), and a bitmap attribute, describing which virtual
- * cluster numbers (vcns) in the index allocation attribute are in use by an
- * index block.
- *
- * NOTE: The root directory (FILE_root) contains an entry for itself. Other
- * directories do not contain entries for themselves, though.
- */
-typedef struct {
- ATTR_TYPE type; /* Type of the indexed attribute. Is
- $FILE_NAME for directories, zero
- for view indexes. No other values
- allowed. */
- COLLATION_RULE collation_rule; /* Collation rule used to sort the
- index entries. If type is $FILE_NAME,
- this must be COLLATION_FILE_NAME. */
- le32 index_block_size; /* Size of each index block in bytes (in
- the index allocation attribute). */
- u8 clusters_per_index_block; /* Cluster size of each index block (in
- the index allocation attribute), when
- an index block is >= than a cluster,
- otherwise this will be the log of
- the size (like how the encoding of
- the mft record size and the index
- record size found in the boot sector
- work). Has to be a power of 2. */
- u8 reserved[3]; /* Reserved/align to 8-byte boundary. */
- INDEX_HEADER index; /* Index header describing the
- following index entries. */
-} __attribute__ ((__packed__)) INDEX_ROOT;
-
-/*
- * Attribute: Index allocation (0xa0).
- *
- * NOTE: Always non-resident (doesn't make sense to be resident anyway!).
- *
- * This is an array of index blocks. Each index block starts with an
- * INDEX_BLOCK structure containing an index header, followed by a sequence of
- * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER.
- */
-typedef struct {
-/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
- NTFS_RECORD_TYPE magic; /* Magic is "INDX". */
- le16 usa_ofs; /* See NTFS_RECORD definition. */
- le16 usa_count; /* See NTFS_RECORD definition. */
-
-/* 8*/ sle64 lsn; /* $LogFile sequence number of the last
- modification of this index block. */
-/* 16*/ leVCN index_block_vcn; /* Virtual cluster number of the index block.
- If the cluster_size on the volume is <= the
- index_block_size of the directory,
- index_block_vcn counts in units of clusters,
- and in units of sectors otherwise. */
-/* 24*/ INDEX_HEADER index; /* Describes the following index entries. */
-/* sizeof()= 40 (0x28) bytes */
-/*
- * When creating the index block, we place the update sequence array at this
- * offset, i.e. before we start with the index entries. This also makes sense,
- * otherwise we could run into problems with the update sequence array
- * containing in itself the last two bytes of a sector which would mean that
- * multi sector transfer protection wouldn't work. As you can't protect data
- * by overwriting it since you then can't get it back...
- * When reading use the data from the ntfs record header.
- */
-} __attribute__ ((__packed__)) INDEX_BLOCK;
-
-typedef INDEX_BLOCK INDEX_ALLOCATION;
-
-/*
- * The system file FILE_Extend/$Reparse contains an index named $R listing
- * all reparse points on the volume. The index entry keys are as defined
- * below. Note, that there is no index data associated with the index entries.
- *
- * The index entries are sorted by the index key file_id. The collation rule is
- * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the
- * primary key / is not a key at all. (AIA)
- */
-typedef struct {
- le32 reparse_tag; /* Reparse point type (inc. flags). */
- leMFT_REF file_id; /* Mft record of the file containing the
- reparse point attribute. */
-} __attribute__ ((__packed__)) REPARSE_INDEX_KEY;
-
-/*
- * Quota flags (32-bit).
- *
- * The user quota flags. Names explain meaning.
- */
-enum {
- QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001),
- QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002),
- QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004),
-
- QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007),
- /* This is a bit mask for the user quota flags. */
-
- /*
- * These flags are only present in the quota defaults index entry, i.e.
- * in the entry where owner_id = QUOTA_DEFAULTS_ID.
- */
- QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010),
- QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020),
- QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040),
- QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080),
-
- QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100),
- QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200),
- QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400),
- QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800),
-};
-
-typedef le32 QUOTA_FLAGS;
-
-/*
- * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas
- * are on a per volume and per user basis.
- *
- * The $Q index contains one entry for each existing user_id on the volume. The
- * index key is the user_id of the user/group owning this quota control entry,
- * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the
- * owner_id, is found in the standard information attribute. The collation rule
- * for $Q is COLLATION_NTOFS_ULONG.
- *
- * The $O index contains one entry for each user/group who has been assigned
- * a quota on that volume. The index key holds the SID of the user_id the
- * entry belongs to, i.e. the owner_id. The collation rule for $O is
- * COLLATION_NTOFS_SID.
- *
- * The $O index entry data is the user_id of the user corresponding to the SID.
- * This user_id is used as an index into $Q to find the quota control entry
- * associated with the SID.
- *
- * The $Q index entry data is the quota control entry and is defined below.
- */
-typedef struct {
- le32 version; /* Currently equals 2. */
- QUOTA_FLAGS flags; /* Flags describing this quota entry. */
- le64 bytes_used; /* How many bytes of the quota are in use. */
- sle64 change_time; /* Last time this quota entry was changed. */
- sle64 threshold; /* Soft quota (-1 if not limited). */
- sle64 limit; /* Hard quota (-1 if not limited). */
- sle64 exceeded_time; /* How long the soft quota has been exceeded. */
- SID sid; /* The SID of the user/object associated with
- this quota entry. Equals zero for the quota
- defaults entry (and in fact on a WinXP
- volume, it is not present at all). */
-} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY;
-
-/*
- * Predefined owner_id values (32-bit).
- */
-enum {
- QUOTA_INVALID_ID = cpu_to_le32(0x00000000),
- QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001),
- QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100),
-};
-
-/*
- * Current constants for quota control entries.
- */
-typedef enum {
- /* Current version. */
- QUOTA_VERSION = 2,
-} QUOTA_CONTROL_ENTRY_CONSTANTS;
-
-/*
- * Index entry flags (16-bit).
- */
-enum {
- INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
- sub-node, i.e. a reference to an index block in form of
- a virtual cluster number (see below). */
- INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last
- entry in an index block. The index entry does not
- represent a file but it can point to a sub-node. */
-
- INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
- enum bit width to 16-bit. */
-} __attribute__ ((__packed__));
-
-typedef le16 INDEX_ENTRY_FLAGS;
-
-/*
- * This the index entry header (see below).
- */
-typedef struct {
-/* 0*/ union {
- struct { /* Only valid when INDEX_ENTRY_END is not set. */
- leMFT_REF indexed_file; /* The mft reference of the file
- described by this index
- entry. Used for directory
- indexes. */
- } __attribute__ ((__packed__)) dir;
- struct { /* Used for views/indexes to find the entry's data. */
- le16 data_offset; /* Data byte offset from this
- INDEX_ENTRY. Follows the
- index key. */
- le16 data_length; /* Data length in bytes. */
- le32 reservedV; /* Reserved (zero). */
- } __attribute__ ((__packed__)) vi;
- } __attribute__ ((__packed__)) data;
-/* 8*/ le16 length; /* Byte size of this index entry, multiple of
- 8-bytes. */
-/* 10*/ le16 key_length; /* Byte size of the key value, which is in the
- index entry. It follows field reserved. Not
- multiple of 8-bytes. */
-/* 12*/ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
-/* 14*/ le16 reserved; /* Reserved/align to 8-byte boundary. */
-/* sizeof() = 16 bytes */
-} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER;
-
-/*
- * This is an index entry. A sequence of such entries follows each INDEX_HEADER
- * structure. Together they make up a complete index. The index follows either
- * an index root attribute or an index allocation attribute.
- *
- * NOTE: Before NTFS 3.0 only filename attributes were indexed.
- */
-typedef struct {
-/*Ofs*/
-/* 0 INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */
- union {
- struct { /* Only valid when INDEX_ENTRY_END is not set. */
- leMFT_REF indexed_file; /* The mft reference of the file
- described by this index
- entry. Used for directory
- indexes. */
- } __attribute__ ((__packed__)) dir;
- struct { /* Used for views/indexes to find the entry's data. */
- le16 data_offset; /* Data byte offset from this
- INDEX_ENTRY. Follows the
- index key. */
- le16 data_length; /* Data length in bytes. */
- le32 reservedV; /* Reserved (zero). */
- } __attribute__ ((__packed__)) vi;
- } __attribute__ ((__packed__)) data;
- le16 length; /* Byte size of this index entry, multiple of
- 8-bytes. */
- le16 key_length; /* Byte size of the key value, which is in the
- index entry. It follows field reserved. Not
- multiple of 8-bytes. */
- INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
- le16 reserved; /* Reserved/align to 8-byte boundary. */
-
-/* 16*/ union { /* The key of the indexed attribute. NOTE: Only present
- if INDEX_ENTRY_END bit in flags is not set. NOTE: On
- NTFS versions before 3.0 the only valid key is the
- FILE_NAME_ATTR. On NTFS 3.0+ the following
- additional index keys are defined: */
- FILE_NAME_ATTR file_name;/* $I30 index in directories. */
- SII_INDEX_KEY sii; /* $SII index in $Secure. */
- SDH_INDEX_KEY sdh; /* $SDH index in $Secure. */
- GUID object_id; /* $O index in FILE_Extend/$ObjId: The
- object_id of the mft record found in
- the data part of the index. */
- REPARSE_INDEX_KEY reparse; /* $R index in
- FILE_Extend/$Reparse. */
- SID sid; /* $O index in FILE_Extend/$Quota:
- SID of the owner of the user_id. */
- le32 owner_id; /* $Q index in FILE_Extend/$Quota:
- user_id of the owner of the quota
- control entry in the data part of
- the index. */
- } __attribute__ ((__packed__)) key;
- /* The (optional) index data is inserted here when creating. */
- // leVCN vcn; /* If INDEX_ENTRY_NODE bit in flags is set, the last
- // eight bytes of this index entry contain the virtual
- // cluster number of the index block that holds the
- // entries immediately preceding the current entry (the
- // vcn references the corresponding cluster in the data
- // of the non-resident index allocation attribute). If
- // the key_length is zero, then the vcn immediately
- // follows the INDEX_ENTRY_HEADER. Regardless of
- // key_length, the address of the 8-byte boundary
- // aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by
- // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN),
- // where sizeof(VCN) can be hardcoded as 8 if wanted. */
-} __attribute__ ((__packed__)) INDEX_ENTRY;
-
-/*
- * Attribute: Bitmap (0xb0).
- *
- * Contains an array of bits (aka a bitfield).
- *
- * When used in conjunction with the index allocation attribute, each bit
- * corresponds to one index block within the index allocation attribute. Thus
- * the number of bits in the bitmap * index block size / cluster size is the
- * number of clusters in the index allocation attribute.
- */
-typedef struct {
- u8 bitmap[0]; /* Array of bits. */
-} __attribute__ ((__packed__)) BITMAP_ATTR;
-
-/*
- * The reparse point tag defines the type of the reparse point. It also
- * includes several flags, which further describe the reparse point.
- *
- * The reparse point tag is an unsigned 32-bit value divided in three parts:
- *
- * 1. The least significant 16 bits (i.e. bits 0 to 15) specifiy the type of
- * the reparse point.
- * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use.
- * 3. The most significant three bits are flags describing the reparse point.
- * They are defined as follows:
- * bit 29: Name surrogate bit. If set, the filename is an alias for
- * another object in the system.
- * bit 30: High-latency bit. If set, accessing the first byte of data will
- * be slow. (E.g. the data is stored on a tape drive.)
- * bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User
- * defined tags have to use zero here.
- *
- * These are the predefined reparse point tags:
- */
-enum {
- IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000),
- IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000),
- IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000),
-
- IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000),
- IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001),
- IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001),
-
- IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005),
- IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006),
- IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007),
- IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008),
-
- IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003),
-
- IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004),
-
- IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000),
-
- IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff),
-};
-
-/*
- * Attribute: Reparse point (0xc0).
- *
- * NOTE: Can be resident or non-resident.
- */
-typedef struct {
- le32 reparse_tag; /* Reparse point type (inc. flags). */
- le16 reparse_data_length; /* Byte size of reparse data. */
- le16 reserved; /* Align to 8-byte boundary. */
- u8 reparse_data[0]; /* Meaning depends on reparse_tag. */
-} __attribute__ ((__packed__)) REPARSE_POINT;
-
-/*
- * Attribute: Extended attribute (EA) information (0xd0).
- *
- * NOTE: Always resident. (Is this true???)
- */
-typedef struct {
- le16 ea_length; /* Byte size of the packed extended
- attributes. */
- le16 need_ea_count; /* The number of extended attributes which have
- the NEED_EA bit set. */
- le32 ea_query_length; /* Byte size of the buffer required to query
- the extended attributes when calling
- ZwQueryEaFile() in Windows NT/2k. I.e. the
- byte size of the unpacked extended
- attributes. */
-} __attribute__ ((__packed__)) EA_INFORMATION;
-
-/*
- * Extended attribute flags (8-bit).
- */
-enum {
- NEED_EA = 0x80 /* If set the file to which the EA belongs
- cannot be interpreted without understanding
- the associates extended attributes. */
-} __attribute__ ((__packed__));
-
-typedef u8 EA_FLAGS;
-
-/*
- * Attribute: Extended attribute (EA) (0xe0).
- *
- * NOTE: Can be resident or non-resident.
- *
- * Like the attribute list and the index buffer list, the EA attribute value is
- * a sequence of EA_ATTR variable length records.
- */
-typedef struct {
- le32 next_entry_offset; /* Offset to the next EA_ATTR. */
- EA_FLAGS flags; /* Flags describing the EA. */
- u8 ea_name_length; /* Length of the name of the EA in bytes
- excluding the '\0' byte terminator. */
- le16 ea_value_length; /* Byte size of the EA's value. */
- u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not
- Unicode and it is zero terminated. */
- u8 ea_value[0]; /* The value of the EA. Immediately follows
- the name. */
-} __attribute__ ((__packed__)) EA_ATTR;
-
-/*
- * Attribute: Property set (0xf0).
- *
- * Intended to support Native Structure Storage (NSS) - a feature removed from
- * NTFS 3.0 during beta testing.
- */
-typedef struct {
- /* Irrelevant as feature unused. */
-} __attribute__ ((__packed__)) PROPERTY_SET;
-
-/*
- * Attribute: Logged utility stream (0x100).
- *
- * NOTE: Can be resident or non-resident.
- *
- * Operations on this attribute are logged to the journal ($LogFile) like
- * normal metadata changes.
- *
- * Used by the Encrypting File System (EFS). All encrypted files have this
- * attribute with the name $EFS.
- */
-typedef struct {
- /* Can be anything the creator chooses. */
- /* EFS uses it as follows: */
- // FIXME: Type this info, verifying it along the way. (AIA)
-} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR;
-
-#endif /* _LINUX_NTFS_LAYOUT_H */
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
deleted file mode 100644
index eda9972e6159..000000000000
--- a/fs/ntfs/lcnalloc.c
+++ /dev/null
@@ -1,1000 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * lcnalloc.c - Cluster (de)allocation code. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/pagemap.h>
-
-#include "lcnalloc.h"
-#include "debug.h"
-#include "bitmap.h"
-#include "inode.h"
-#include "volume.h"
-#include "attrib.h"
-#include "malloc.h"
-#include "aops.h"
-#include "ntfs.h"
-
-/**
- * ntfs_cluster_free_from_rl_nolock - free clusters from runlist
- * @vol: mounted ntfs volume on which to free the clusters
- * @rl: runlist describing the clusters to free
- *
- * Free all the clusters described by the runlist @rl on the volume @vol. In
- * the case of an error being returned, at least some of the clusters were not
- * freed.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - The volume lcn bitmap must be locked for writing on entry and is
- * left locked on return.
- */
-int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
- const runlist_element *rl)
-{
- struct inode *lcnbmp_vi = vol->lcnbmp_ino;
- int ret = 0;
-
- ntfs_debug("Entering.");
- if (!rl)
- return 0;
- for (; rl->length; rl++) {
- int err;
-
- if (rl->lcn < 0)
- continue;
- err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length);
- if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err))
- ret = err;
- }
- ntfs_debug("Done.");
- return ret;
-}
-
-/**
- * ntfs_cluster_alloc - allocate clusters on an ntfs volume
- * @vol: mounted ntfs volume on which to allocate the clusters
- * @start_vcn: vcn to use for the first allocated cluster
- * @count: number of clusters to allocate
- * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none)
- * @zone: zone from which to allocate the clusters
- * @is_extension: if 'true', this is an attribute extension
- *
- * Allocate @count clusters preferably starting at cluster @start_lcn or at the
- * current allocator position if @start_lcn is -1, on the mounted ntfs volume
- * @vol. @zone is either DATA_ZONE for allocation of normal clusters or
- * MFT_ZONE for allocation of clusters for the master file table, i.e. the
- * $MFT/$DATA attribute.
- *
- * @start_vcn specifies the vcn of the first allocated cluster. This makes
- * merging the resulting runlist with the old runlist easier.
- *
- * If @is_extension is 'true', the caller is allocating clusters to extend an
- * attribute and if it is 'false', the caller is allocating clusters to fill a
- * hole in an attribute. Practically the difference is that if @is_extension
- * is 'true' the returned runlist will be terminated with LCN_ENOENT and if
- * @is_extension is 'false' the runlist will be terminated with
- * LCN_RL_NOT_MAPPED.
- *
- * You need to check the return value with IS_ERR(). If this is false, the
- * function was successful and the return value is a runlist describing the
- * allocated cluster(s). If IS_ERR() is true, the function failed and
- * PTR_ERR() gives you the error code.
- *
- * Notes on the allocation algorithm
- * =================================
- *
- * There are two data zones. First is the area between the end of the mft zone
- * and the end of the volume, and second is the area between the start of the
- * volume and the start of the mft zone. On unmodified/standard NTFS 1.x
- * volumes, the second data zone does not exist due to the mft zone being
- * expanded to cover the start of the volume in order to reserve space for the
- * mft bitmap attribute.
- *
- * This is not the prettiest function but the complexity stems from the need of
- * implementing the mft vs data zoned approach and from the fact that we have
- * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we
- * need to cope with crossing over boundaries of two buffers. Further, the
- * fact that the allocator allows for caller supplied hints as to the location
- * of where allocation should begin and the fact that the allocator keeps track
- * of where in the data zones the next natural allocation should occur,
- * contribute to the complexity of the function. But it should all be
- * worthwhile, because this allocator should: 1) be a full implementation of
- * the MFT zone approach used by Windows NT, 2) cause reduction in
- * fragmentation, and 3) be speedy in allocations (the code is not optimized
- * for speed, but the algorithm is, so further speed improvements are probably
- * possible).
- *
- * FIXME: We should be monitoring cluster allocation and increment the MFT zone
- * size dynamically but this is something for the future. We will just cause
- * heavier fragmentation by not doing it and I am not even sure Windows would
- * grow the MFT zone dynamically, so it might even be correct not to do this.
- * The overhead in doing dynamic MFT zone expansion would be very large and
- * unlikely worth the effort. (AIA)
- *
- * TODO: I have added in double the required zone position pointer wrap around
- * logic which can be optimized to having only one of the two logic sets.
- * However, having the double logic will work fine, but if we have only one of
- * the sets and we get it wrong somewhere, then we get into trouble, so
- * removing the duplicate logic requires _very_ careful consideration of _all_
- * possible code paths. So at least for now, I am leaving the double logic -
- * better safe than sorry... (AIA)
- *
- * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked
- * on return.
- * - This function takes the volume lcn bitmap lock for writing and
- * modifies the bitmap contents.
- */
-runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
- const s64 count, const LCN start_lcn,
- const NTFS_CLUSTER_ALLOCATION_ZONES zone,
- const bool is_extension)
-{
- LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn;
- LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size;
- s64 clusters;
- loff_t i_size;
- struct inode *lcnbmp_vi;
- runlist_element *rl = NULL;
- struct address_space *mapping;
- struct page *page = NULL;
- u8 *buf, *byte;
- int err = 0, rlpos, rlsize, buf_size;
- u8 pass, done_zones, search_zone, need_writeback = 0, bit;
-
- ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn "
- "0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn,
- (unsigned long long)count,
- (unsigned long long)start_lcn,
- zone == MFT_ZONE ? "MFT" : "DATA");
- BUG_ON(!vol);
- lcnbmp_vi = vol->lcnbmp_ino;
- BUG_ON(!lcnbmp_vi);
- BUG_ON(start_vcn < 0);
- BUG_ON(count < 0);
- BUG_ON(start_lcn < -1);
- BUG_ON(zone < FIRST_ZONE);
- BUG_ON(zone > LAST_ZONE);
-
- /* Return NULL if @count is zero. */
- if (!count)
- return NULL;
- /* Take the lcnbmp lock for writing. */
- down_write(&vol->lcnbmp_lock);
- /*
- * If no specific @start_lcn was requested, use the current data zone
- * position, otherwise use the requested @start_lcn but make sure it
- * lies outside the mft zone. Also set done_zones to 0 (no zones done)
- * and pass depending on whether we are starting inside a zone (1) or
- * at the beginning of a zone (2). If requesting from the MFT_ZONE,
- * we either start at the current position within the mft zone or at
- * the specified position. If the latter is out of bounds then we start
- * at the beginning of the MFT_ZONE.
- */
- done_zones = 0;
- pass = 1;
- /*
- * zone_start and zone_end are the current search range. search_zone
- * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of
- * volume) and 4 for data zone 2 (start of volume till start of mft
- * zone).
- */
- zone_start = start_lcn;
- if (zone_start < 0) {
- if (zone == DATA_ZONE)
- zone_start = vol->data1_zone_pos;
- else
- zone_start = vol->mft_zone_pos;
- if (!zone_start) {
- /*
- * Zone starts at beginning of volume which means a
- * single pass is sufficient.
- */
- pass = 2;
- }
- } else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start &&
- zone_start < vol->mft_zone_end) {
- zone_start = vol->mft_zone_end;
- /*
- * Starting at beginning of data1_zone which means a single
- * pass in this zone is sufficient.
- */
- pass = 2;
- } else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start ||
- zone_start >= vol->mft_zone_end)) {
- zone_start = vol->mft_lcn;
- if (!vol->mft_zone_end)
- zone_start = 0;
- /*
- * Starting at beginning of volume which means a single pass
- * is sufficient.
- */
- pass = 2;
- }
- if (zone == MFT_ZONE) {
- zone_end = vol->mft_zone_end;
- search_zone = 1;
- } else /* if (zone == DATA_ZONE) */ {
- /* Skip searching the mft zone. */
- done_zones |= 1;
- if (zone_start >= vol->mft_zone_end) {
- zone_end = vol->nr_clusters;
- search_zone = 2;
- } else {
- zone_end = vol->mft_zone_start;
- search_zone = 4;
- }
- }
- /*
- * bmp_pos is the current bit position inside the bitmap. We use
- * bmp_initial_pos to determine whether or not to do a zone switch.
- */
- bmp_pos = bmp_initial_pos = zone_start;
-
- /* Loop until all clusters are allocated, i.e. clusters == 0. */
- clusters = count;
- rlpos = rlsize = 0;
- mapping = lcnbmp_vi->i_mapping;
- i_size = i_size_read(lcnbmp_vi);
- while (1) {
- ntfs_debug("Start of outer while loop: done_zones 0x%x, "
- "search_zone %i, pass %i, zone_start 0x%llx, "
- "zone_end 0x%llx, bmp_initial_pos 0x%llx, "
- "bmp_pos 0x%llx, rlpos %i, rlsize %i.",
- done_zones, search_zone, pass,
- (unsigned long long)zone_start,
- (unsigned long long)zone_end,
- (unsigned long long)bmp_initial_pos,
- (unsigned long long)bmp_pos, rlpos, rlsize);
- /* Loop until we run out of free clusters. */
- last_read_pos = bmp_pos >> 3;
- ntfs_debug("last_read_pos 0x%llx.",
- (unsigned long long)last_read_pos);
- if (last_read_pos > i_size) {
- ntfs_debug("End of attribute reached. "
- "Skipping to zone_pass_done.");
- goto zone_pass_done;
- }
- if (likely(page)) {
- if (need_writeback) {
- ntfs_debug("Marking page dirty.");
- flush_dcache_page(page);
- set_page_dirty(page);
- need_writeback = 0;
- }
- ntfs_unmap_page(page);
- }
- page = ntfs_map_page(mapping, last_read_pos >>
- PAGE_SHIFT);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
- ntfs_error(vol->sb, "Failed to map page.");
- goto out;
- }
- buf_size = last_read_pos & ~PAGE_MASK;
- buf = page_address(page) + buf_size;
- buf_size = PAGE_SIZE - buf_size;
- if (unlikely(last_read_pos + buf_size > i_size))
- buf_size = i_size - last_read_pos;
- buf_size <<= 3;
- lcn = bmp_pos & 7;
- bmp_pos &= ~(LCN)7;
- ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, "
- "bmp_pos 0x%llx, need_writeback %i.", buf_size,
- (unsigned long long)lcn,
- (unsigned long long)bmp_pos, need_writeback);
- while (lcn < buf_size && lcn + bmp_pos < zone_end) {
- byte = buf + (lcn >> 3);
- ntfs_debug("In inner while loop: buf_size %i, "
- "lcn 0x%llx, bmp_pos 0x%llx, "
- "need_writeback %i, byte ofs 0x%x, "
- "*byte 0x%x.", buf_size,
- (unsigned long long)lcn,
- (unsigned long long)bmp_pos,
- need_writeback,
- (unsigned int)(lcn >> 3),
- (unsigned int)*byte);
- /* Skip full bytes. */
- if (*byte == 0xff) {
- lcn = (lcn + 8) & ~(LCN)7;
- ntfs_debug("Continuing while loop 1.");
- continue;
- }
- bit = 1 << (lcn & 7);
- ntfs_debug("bit 0x%x.", bit);
- /* If the bit is already set, go onto the next one. */
- if (*byte & bit) {
- lcn++;
- ntfs_debug("Continuing while loop 2.");
- continue;
- }
- /*
- * Allocate more memory if needed, including space for
- * the terminator element.
- * ntfs_malloc_nofs() operates on whole pages only.
- */
- if ((rlpos + 2) * sizeof(*rl) > rlsize) {
- runlist_element *rl2;
-
- ntfs_debug("Reallocating memory.");
- if (!rl)
- ntfs_debug("First free bit is at LCN "
- "0x%llx.",
- (unsigned long long)
- (lcn + bmp_pos));
- rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
- if (unlikely(!rl2)) {
- err = -ENOMEM;
- ntfs_error(vol->sb, "Failed to "
- "allocate memory.");
- goto out;
- }
- memcpy(rl2, rl, rlsize);
- ntfs_free(rl);
- rl = rl2;
- rlsize += PAGE_SIZE;
- ntfs_debug("Reallocated memory, rlsize 0x%x.",
- rlsize);
- }
- /* Allocate the bitmap bit. */
- *byte |= bit;
- /* We need to write this bitmap page to disk. */
- need_writeback = 1;
- ntfs_debug("*byte 0x%x, need_writeback is set.",
- (unsigned int)*byte);
- /*
- * Coalesce with previous run if adjacent LCNs.
- * Otherwise, append a new run.
- */
- ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), "
- "prev_lcn 0x%llx, lcn 0x%llx, "
- "bmp_pos 0x%llx, prev_run_len 0x%llx, "
- "rlpos %i.",
- (unsigned long long)(lcn + bmp_pos),
- 1ULL, (unsigned long long)prev_lcn,
- (unsigned long long)lcn,
- (unsigned long long)bmp_pos,
- (unsigned long long)prev_run_len,
- rlpos);
- if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) {
- ntfs_debug("Coalescing to run (lcn 0x%llx, "
- "len 0x%llx).",
- (unsigned long long)
- rl[rlpos - 1].lcn,
- (unsigned long long)
- rl[rlpos - 1].length);
- rl[rlpos - 1].length = ++prev_run_len;
- ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), "
- "prev_run_len 0x%llx.",
- (unsigned long long)
- rl[rlpos - 1].lcn,
- (unsigned long long)
- rl[rlpos - 1].length,
- (unsigned long long)
- prev_run_len);
- } else {
- if (likely(rlpos)) {
- ntfs_debug("Adding new run, (previous "
- "run lcn 0x%llx, "
- "len 0x%llx).",
- (unsigned long long)
- rl[rlpos - 1].lcn,
- (unsigned long long)
- rl[rlpos - 1].length);
- rl[rlpos].vcn = rl[rlpos - 1].vcn +
- prev_run_len;
- } else {
- ntfs_debug("Adding new run, is first "
- "run.");
- rl[rlpos].vcn = start_vcn;
- }
- rl[rlpos].lcn = prev_lcn = lcn + bmp_pos;
- rl[rlpos].length = prev_run_len = 1;
- rlpos++;
- }
- /* Done? */
- if (!--clusters) {
- LCN tc;
- /*
- * Update the current zone position. Positions
- * of already scanned zones have been updated
- * during the respective zone switches.
- */
- tc = lcn + bmp_pos + 1;
- ntfs_debug("Done. Updating current zone "
- "position, tc 0x%llx, "
- "search_zone %i.",
- (unsigned long long)tc,
- search_zone);
- switch (search_zone) {
- case 1:
- ntfs_debug("Before checks, "
- "vol->mft_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->mft_zone_pos);
- if (tc >= vol->mft_zone_end) {
- vol->mft_zone_pos =
- vol->mft_lcn;
- if (!vol->mft_zone_end)
- vol->mft_zone_pos = 0;
- } else if ((bmp_initial_pos >=
- vol->mft_zone_pos ||
- tc > vol->mft_zone_pos)
- && tc >= vol->mft_lcn)
- vol->mft_zone_pos = tc;
- ntfs_debug("After checks, "
- "vol->mft_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->mft_zone_pos);
- break;
- case 2:
- ntfs_debug("Before checks, "
- "vol->data1_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data1_zone_pos);
- if (tc >= vol->nr_clusters)
- vol->data1_zone_pos =
- vol->mft_zone_end;
- else if ((bmp_initial_pos >=
- vol->data1_zone_pos ||
- tc > vol->data1_zone_pos)
- && tc >= vol->mft_zone_end)
- vol->data1_zone_pos = tc;
- ntfs_debug("After checks, "
- "vol->data1_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data1_zone_pos);
- break;
- case 4:
- ntfs_debug("Before checks, "
- "vol->data2_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data2_zone_pos);
- if (tc >= vol->mft_zone_start)
- vol->data2_zone_pos = 0;
- else if (bmp_initial_pos >=
- vol->data2_zone_pos ||
- tc > vol->data2_zone_pos)
- vol->data2_zone_pos = tc;
- ntfs_debug("After checks, "
- "vol->data2_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data2_zone_pos);
- break;
- default:
- BUG();
- }
- ntfs_debug("Finished. Going to out.");
- goto out;
- }
- lcn++;
- }
- bmp_pos += buf_size;
- ntfs_debug("After inner while loop: buf_size 0x%x, lcn "
- "0x%llx, bmp_pos 0x%llx, need_writeback %i.",
- buf_size, (unsigned long long)lcn,
- (unsigned long long)bmp_pos, need_writeback);
- if (bmp_pos < zone_end) {
- ntfs_debug("Continuing outer while loop, "
- "bmp_pos 0x%llx, zone_end 0x%llx.",
- (unsigned long long)bmp_pos,
- (unsigned long long)zone_end);
- continue;
- }
-zone_pass_done: /* Finished with the current zone pass. */
- ntfs_debug("At zone_pass_done, pass %i.", pass);
- if (pass == 1) {
- /*
- * Now do pass 2, scanning the first part of the zone
- * we omitted in pass 1.
- */
- pass = 2;
- zone_end = zone_start;
- switch (search_zone) {
- case 1: /* mft_zone */
- zone_start = vol->mft_zone_start;
- break;
- case 2: /* data1_zone */
- zone_start = vol->mft_zone_end;
- break;
- case 4: /* data2_zone */
- zone_start = 0;
- break;
- default:
- BUG();
- }
- /* Sanity check. */
- if (zone_end < zone_start)
- zone_end = zone_start;
- bmp_pos = zone_start;
- ntfs_debug("Continuing outer while loop, pass 2, "
- "zone_start 0x%llx, zone_end 0x%llx, "
- "bmp_pos 0x%llx.",
- (unsigned long long)zone_start,
- (unsigned long long)zone_end,
- (unsigned long long)bmp_pos);
- continue;
- } /* pass == 2 */
-done_zones_check:
- ntfs_debug("At done_zones_check, search_zone %i, done_zones "
- "before 0x%x, done_zones after 0x%x.",
- search_zone, done_zones,
- done_zones | search_zone);
- done_zones |= search_zone;
- if (done_zones < 7) {
- ntfs_debug("Switching zone.");
- /* Now switch to the next zone we haven't done yet. */
- pass = 1;
- switch (search_zone) {
- case 1:
- ntfs_debug("Switching from mft zone to data1 "
- "zone.");
- /* Update mft zone position. */
- if (rlpos) {
- LCN tc;
-
- ntfs_debug("Before checks, "
- "vol->mft_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->mft_zone_pos);
- tc = rl[rlpos - 1].lcn +
- rl[rlpos - 1].length;
- if (tc >= vol->mft_zone_end) {
- vol->mft_zone_pos =
- vol->mft_lcn;
- if (!vol->mft_zone_end)
- vol->mft_zone_pos = 0;
- } else if ((bmp_initial_pos >=
- vol->mft_zone_pos ||
- tc > vol->mft_zone_pos)
- && tc >= vol->mft_lcn)
- vol->mft_zone_pos = tc;
- ntfs_debug("After checks, "
- "vol->mft_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->mft_zone_pos);
- }
- /* Switch from mft zone to data1 zone. */
-switch_to_data1_zone: search_zone = 2;
- zone_start = bmp_initial_pos =
- vol->data1_zone_pos;
- zone_end = vol->nr_clusters;
- if (zone_start == vol->mft_zone_end)
- pass = 2;
- if (zone_start >= zone_end) {
- vol->data1_zone_pos = zone_start =
- vol->mft_zone_end;
- pass = 2;
- }
- break;
- case 2:
- ntfs_debug("Switching from data1 zone to "
- "data2 zone.");
- /* Update data1 zone position. */
- if (rlpos) {
- LCN tc;
-
- ntfs_debug("Before checks, "
- "vol->data1_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data1_zone_pos);
- tc = rl[rlpos - 1].lcn +
- rl[rlpos - 1].length;
- if (tc >= vol->nr_clusters)
- vol->data1_zone_pos =
- vol->mft_zone_end;
- else if ((bmp_initial_pos >=
- vol->data1_zone_pos ||
- tc > vol->data1_zone_pos)
- && tc >= vol->mft_zone_end)
- vol->data1_zone_pos = tc;
- ntfs_debug("After checks, "
- "vol->data1_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data1_zone_pos);
- }
- /* Switch from data1 zone to data2 zone. */
- search_zone = 4;
- zone_start = bmp_initial_pos =
- vol->data2_zone_pos;
- zone_end = vol->mft_zone_start;
- if (!zone_start)
- pass = 2;
- if (zone_start >= zone_end) {
- vol->data2_zone_pos = zone_start =
- bmp_initial_pos = 0;
- pass = 2;
- }
- break;
- case 4:
- ntfs_debug("Switching from data2 zone to "
- "data1 zone.");
- /* Update data2 zone position. */
- if (rlpos) {
- LCN tc;
-
- ntfs_debug("Before checks, "
- "vol->data2_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data2_zone_pos);
- tc = rl[rlpos - 1].lcn +
- rl[rlpos - 1].length;
- if (tc >= vol->mft_zone_start)
- vol->data2_zone_pos = 0;
- else if (bmp_initial_pos >=
- vol->data2_zone_pos ||
- tc > vol->data2_zone_pos)
- vol->data2_zone_pos = tc;
- ntfs_debug("After checks, "
- "vol->data2_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data2_zone_pos);
- }
- /* Switch from data2 zone to data1 zone. */
- goto switch_to_data1_zone;
- default:
- BUG();
- }
- ntfs_debug("After zone switch, search_zone %i, "
- "pass %i, bmp_initial_pos 0x%llx, "
- "zone_start 0x%llx, zone_end 0x%llx.",
- search_zone, pass,
- (unsigned long long)bmp_initial_pos,
- (unsigned long long)zone_start,
- (unsigned long long)zone_end);
- bmp_pos = zone_start;
- if (zone_start == zone_end) {
- ntfs_debug("Empty zone, going to "
- "done_zones_check.");
- /* Empty zone. Don't bother searching it. */
- goto done_zones_check;
- }
- ntfs_debug("Continuing outer while loop.");
- continue;
- } /* done_zones == 7 */
- ntfs_debug("All zones are finished.");
- /*
- * All zones are finished! If DATA_ZONE, shrink mft zone. If
- * MFT_ZONE, we have really run out of space.
- */
- mft_zone_size = vol->mft_zone_end - vol->mft_zone_start;
- ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end "
- "0x%llx, mft_zone_size 0x%llx.",
- (unsigned long long)vol->mft_zone_start,
- (unsigned long long)vol->mft_zone_end,
- (unsigned long long)mft_zone_size);
- if (zone == MFT_ZONE || mft_zone_size <= 0) {
- ntfs_debug("No free clusters left, going to out.");
- /* Really no more space left on device. */
- err = -ENOSPC;
- goto out;
- } /* zone == DATA_ZONE && mft_zone_size > 0 */
- ntfs_debug("Shrinking mft zone.");
- zone_end = vol->mft_zone_end;
- mft_zone_size >>= 1;
- if (mft_zone_size > 0)
- vol->mft_zone_end = vol->mft_zone_start + mft_zone_size;
- else /* mft zone and data2 zone no longer exist. */
- vol->data2_zone_pos = vol->mft_zone_start =
- vol->mft_zone_end = 0;
- if (vol->mft_zone_pos >= vol->mft_zone_end) {
- vol->mft_zone_pos = vol->mft_lcn;
- if (!vol->mft_zone_end)
- vol->mft_zone_pos = 0;
- }
- bmp_pos = zone_start = bmp_initial_pos =
- vol->data1_zone_pos = vol->mft_zone_end;
- search_zone = 2;
- pass = 2;
- done_zones &= ~2;
- ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, "
- "vol->mft_zone_start 0x%llx, "
- "vol->mft_zone_end 0x%llx, "
- "vol->mft_zone_pos 0x%llx, search_zone 2, "
- "pass 2, dones_zones 0x%x, zone_start 0x%llx, "
- "zone_end 0x%llx, vol->data1_zone_pos 0x%llx, "
- "continuing outer while loop.",
- (unsigned long long)mft_zone_size,
- (unsigned long long)vol->mft_zone_start,
- (unsigned long long)vol->mft_zone_end,
- (unsigned long long)vol->mft_zone_pos,
- done_zones, (unsigned long long)zone_start,
- (unsigned long long)zone_end,
- (unsigned long long)vol->data1_zone_pos);
- }
- ntfs_debug("After outer while loop.");
-out:
- ntfs_debug("At out.");
- /* Add runlist terminator element. */
- if (likely(rl)) {
- rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length;
- rl[rlpos].lcn = is_extension ? LCN_ENOENT : LCN_RL_NOT_MAPPED;
- rl[rlpos].length = 0;
- }
- if (likely(page && !IS_ERR(page))) {
- if (need_writeback) {
- ntfs_debug("Marking page dirty.");
- flush_dcache_page(page);
- set_page_dirty(page);
- need_writeback = 0;
- }
- ntfs_unmap_page(page);
- }
- if (likely(!err)) {
- up_write(&vol->lcnbmp_lock);
- ntfs_debug("Done.");
- return rl;
- }
- ntfs_error(vol->sb, "Failed to allocate clusters, aborting "
- "(error %i).", err);
- if (rl) {
- int err2;
-
- if (err == -ENOSPC)
- ntfs_debug("Not enough space to complete allocation, "
- "err -ENOSPC, first free lcn 0x%llx, "
- "could allocate up to 0x%llx "
- "clusters.",
- (unsigned long long)rl[0].lcn,
- (unsigned long long)(count - clusters));
- /* Deallocate all allocated clusters. */
- ntfs_debug("Attempting rollback...");
- err2 = ntfs_cluster_free_from_rl_nolock(vol, rl);
- if (err2) {
- ntfs_error(vol->sb, "Failed to rollback (error %i). "
- "Leaving inconsistent metadata! "
- "Unmount and run chkdsk.", err2);
- NVolSetErrors(vol);
- }
- /* Free the runlist. */
- ntfs_free(rl);
- } else if (err == -ENOSPC)
- ntfs_debug("No space left at all, err = -ENOSPC, first free "
- "lcn = 0x%llx.",
- (long long)vol->data1_zone_pos);
- up_write(&vol->lcnbmp_lock);
- return ERR_PTR(err);
-}
-
-/**
- * __ntfs_cluster_free - free clusters on an ntfs volume
- * @ni: ntfs inode whose runlist describes the clusters to free
- * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters
- * @count: number of clusters to free or -1 for all clusters
- * @ctx: active attribute search context if present or NULL if not
- * @is_rollback: true if this is a rollback operation
- *
- * Free @count clusters starting at the cluster @start_vcn in the runlist
- * described by the vfs inode @ni.
- *
- * If @count is -1, all clusters from @start_vcn to the end of the runlist are
- * deallocated. Thus, to completely free all clusters in a runlist, use
- * @start_vcn = 0 and @count = -1.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record. This is needed when __ntfs_cluster_free() encounters unmapped
- * runlist fragments and allows their mapping. If you do not have the mft
- * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will
- * perform the necessary mapping and unmapping.
- *
- * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it
- * before returning. Thus, @ctx will be left pointing to the same attribute on
- * return as on entry. However, the actual pointers in @ctx may point to
- * different memory locations on return, so you must remember to reset any
- * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(),
- * you will probably want to do:
- * m = ctx->mrec;
- * a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- *
- * @is_rollback should always be 'false', it is for internal use to rollback
- * errors. You probably want to use ntfs_cluster_free() instead.
- *
- * Note, __ntfs_cluster_free() does not modify the runlist, so you have to
- * remove from the runlist or mark sparse the freed runs later.
- *
- * Return the number of deallocated clusters (not counting sparse ones) on
- * success and -errno on error.
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- * is no longer valid, i.e. you need to either call
- * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- * In that case PTR_ERR(@ctx->mrec) will give you the error code for
- * why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- * and is locked on return. Note the runlist may be modified when
- * needed runlist fragments need to be mapped.
- * - The volume lcn bitmap must be unlocked on entry and is unlocked
- * on return.
- * - This function takes the volume lcn bitmap lock for writing and
- * modifies the bitmap contents.
- * - If @ctx is NULL, the base mft record of @ni must not be mapped on
- * entry and it will be left unmapped on return.
- * - If @ctx is not NULL, the base mft record must be mapped on entry
- * and it will be left mapped on return.
- */
-s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
- ntfs_attr_search_ctx *ctx, const bool is_rollback)
-{
- s64 delta, to_free, total_freed, real_freed;
- ntfs_volume *vol;
- struct inode *lcnbmp_vi;
- runlist_element *rl;
- int err;
-
- BUG_ON(!ni);
- ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count "
- "0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn,
- (unsigned long long)count,
- is_rollback ? " (rollback)" : "");
- vol = ni->vol;
- lcnbmp_vi = vol->lcnbmp_ino;
- BUG_ON(!lcnbmp_vi);
- BUG_ON(start_vcn < 0);
- BUG_ON(count < -1);
- /*
- * Lock the lcn bitmap for writing but only if not rolling back. We
- * must hold the lock all the way including through rollback otherwise
- * rollback is not possible because once we have cleared a bit and
- * dropped the lock, anyone could have set the bit again, thus
- * allocating the cluster for another use.
- */
- if (likely(!is_rollback))
- down_write(&vol->lcnbmp_lock);
-
- total_freed = real_freed = 0;
-
- rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx);
- if (IS_ERR(rl)) {
- if (!is_rollback)
- ntfs_error(vol->sb, "Failed to find first runlist "
- "element (error %li), aborting.",
- PTR_ERR(rl));
- err = PTR_ERR(rl);
- goto err_out;
- }
- if (unlikely(rl->lcn < LCN_HOLE)) {
- if (!is_rollback)
- ntfs_error(vol->sb, "First runlist element has "
- "invalid lcn, aborting.");
- err = -EIO;
- goto err_out;
- }
- /* Find the starting cluster inside the run that needs freeing. */
- delta = start_vcn - rl->vcn;
-
- /* The number of clusters in this run that need freeing. */
- to_free = rl->length - delta;
- if (count >= 0 && to_free > count)
- to_free = count;
-
- if (likely(rl->lcn >= 0)) {
- /* Do the actual freeing of the clusters in this run. */
- err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta,
- to_free, likely(!is_rollback) ? 0 : 1);
- if (unlikely(err)) {
- if (!is_rollback)
- ntfs_error(vol->sb, "Failed to clear first run "
- "(error %i), aborting.", err);
- goto err_out;
- }
- /* We have freed @to_free real clusters. */
- real_freed = to_free;
- };
- /* Go to the next run and adjust the number of clusters left to free. */
- ++rl;
- if (count >= 0)
- count -= to_free;
-
- /* Keep track of the total "freed" clusters, including sparse ones. */
- total_freed = to_free;
- /*
- * Loop over the remaining runs, using @count as a capping value, and
- * free them.
- */
- for (; rl->length && count != 0; ++rl) {
- if (unlikely(rl->lcn < LCN_HOLE)) {
- VCN vcn;
-
- /* Attempt to map runlist. */
- vcn = rl->vcn;
- rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx);
- if (IS_ERR(rl)) {
- err = PTR_ERR(rl);
- if (!is_rollback)
- ntfs_error(vol->sb, "Failed to map "
- "runlist fragment or "
- "failed to find "
- "subsequent runlist "
- "element.");
- goto err_out;
- }
- if (unlikely(rl->lcn < LCN_HOLE)) {
- if (!is_rollback)
- ntfs_error(vol->sb, "Runlist element "
- "has invalid lcn "
- "(0x%llx).",
- (unsigned long long)
- rl->lcn);
- err = -EIO;
- goto err_out;
- }
- }
- /* The number of clusters in this run that need freeing. */
- to_free = rl->length;
- if (count >= 0 && to_free > count)
- to_free = count;
-
- if (likely(rl->lcn >= 0)) {
- /* Do the actual freeing of the clusters in the run. */
- err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn,
- to_free, likely(!is_rollback) ? 0 : 1);
- if (unlikely(err)) {
- if (!is_rollback)
- ntfs_error(vol->sb, "Failed to clear "
- "subsequent run.");
- goto err_out;
- }
- /* We have freed @to_free real clusters. */
- real_freed += to_free;
- }
- /* Adjust the number of clusters left to free. */
- if (count >= 0)
- count -= to_free;
-
- /* Update the total done clusters. */
- total_freed += to_free;
- }
- if (likely(!is_rollback))
- up_write(&vol->lcnbmp_lock);
-
- BUG_ON(count > 0);
-
- /* We are done. Return the number of actually freed clusters. */
- ntfs_debug("Done.");
- return real_freed;
-err_out:
- if (is_rollback)
- return err;
- /* If no real clusters were freed, no need to rollback. */
- if (!real_freed) {
- up_write(&vol->lcnbmp_lock);
- return err;
- }
- /*
- * Attempt to rollback and if that succeeds just return the error code.
- * If rollback fails, set the volume errors flag, emit an error
- * message, and return the error code.
- */
- delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true);
- if (delta < 0) {
- ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving "
- "inconsistent metadata! Unmount and run "
- "chkdsk.", (int)delta);
- NVolSetErrors(vol);
- }
- up_write(&vol->lcnbmp_lock);
- ntfs_error(vol->sb, "Aborting (error %i).", err);
- return err;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
deleted file mode 100644
index 1589a6d8434b..000000000000
--- a/fs/ntfs/lcnalloc.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation. Part of the
- * Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_LCNALLOC_H
-#define _LINUX_NTFS_LCNALLOC_H
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-
-#include "attrib.h"
-#include "types.h"
-#include "inode.h"
-#include "runlist.h"
-#include "volume.h"
-
-typedef enum {
- FIRST_ZONE = 0, /* For sanity checking. */
- MFT_ZONE = 0, /* Allocate from $MFT zone. */
- DATA_ZONE = 1, /* Allocate from $DATA zone. */
- LAST_ZONE = 1, /* For sanity checking. */
-} NTFS_CLUSTER_ALLOCATION_ZONES;
-
-extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
- const VCN start_vcn, const s64 count, const LCN start_lcn,
- const NTFS_CLUSTER_ALLOCATION_ZONES zone,
- const bool is_extension);
-
-extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
- s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback);
-
-/**
- * ntfs_cluster_free - free clusters on an ntfs volume
- * @ni: ntfs inode whose runlist describes the clusters to free
- * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters
- * @count: number of clusters to free or -1 for all clusters
- * @ctx: active attribute search context if present or NULL if not
- *
- * Free @count clusters starting at the cluster @start_vcn in the runlist
- * described by the ntfs inode @ni.
- *
- * If @count is -1, all clusters from @start_vcn to the end of the runlist are
- * deallocated. Thus, to completely free all clusters in a runlist, use
- * @start_vcn = 0 and @count = -1.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record. This is needed when ntfs_cluster_free() encounters unmapped runlist
- * fragments and allows their mapping. If you do not have the mft record
- * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform
- * the necessary mapping and unmapping.
- *
- * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it
- * before returning. Thus, @ctx will be left pointing to the same attribute on
- * return as on entry. However, the actual pointers in @ctx may point to
- * different memory locations on return, so you must remember to reset any
- * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(),
- * you will probably want to do:
- * m = ctx->mrec;
- * a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- *
- * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove
- * from the runlist or mark sparse the freed runs later.
- *
- * Return the number of deallocated clusters (not counting sparse ones) on
- * success and -errno on error.
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- * is no longer valid, i.e. you need to either call
- * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- * In that case PTR_ERR(@ctx->mrec) will give you the error code for
- * why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- * and is locked on return. Note the runlist may be modified when
- * needed runlist fragments need to be mapped.
- * - The volume lcn bitmap must be unlocked on entry and is unlocked
- * on return.
- * - This function takes the volume lcn bitmap lock for writing and
- * modifies the bitmap contents.
- * - If @ctx is NULL, the base mft record of @ni must not be mapped on
- * entry and it will be left unmapped on return.
- * - If @ctx is not NULL, the base mft record must be mapped on entry
- * and it will be left mapped on return.
- */
-static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
- s64 count, ntfs_attr_search_ctx *ctx)
-{
- return __ntfs_cluster_free(ni, start_vcn, count, ctx, false);
-}
-
-extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
- const runlist_element *rl);
-
-/**
- * ntfs_cluster_free_from_rl - free clusters from runlist
- * @vol: mounted ntfs volume on which to free the clusters
- * @rl: runlist describing the clusters to free
- *
- * Free all the clusters described by the runlist @rl on the volume @vol. In
- * the case of an error being returned, at least some of the clusters were not
- * freed.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - This function takes the volume lcn bitmap lock for writing and
- * modifies the bitmap contents.
- * - The caller must have locked the runlist @rl for reading or
- * writing.
- */
-static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol,
- const runlist_element *rl)
-{
- int ret;
-
- down_write(&vol->lcnbmp_lock);
- ret = ntfs_cluster_free_from_rl_nolock(vol, rl);
- up_write(&vol->lcnbmp_lock);
- return ret;
-}
-
-#endif /* NTFS_RW */
-
-#endif /* defined _LINUX_NTFS_LCNALLOC_H */
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
deleted file mode 100644
index 6ce60ffc6ac0..000000000000
--- a/fs/ntfs/logfile.c
+++ /dev/null
@@ -1,849 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2002-2007 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/highmem.h>
-#include <linux/buffer_head.h>
-#include <linux/bitops.h>
-#include <linux/log2.h>
-#include <linux/bio.h>
-
-#include "attrib.h"
-#include "aops.h"
-#include "debug.h"
-#include "logfile.h"
-#include "malloc.h"
-#include "volume.h"
-#include "ntfs.h"
-
-/**
- * ntfs_check_restart_page_header - check the page header for consistency
- * @vi: $LogFile inode to which the restart page header belongs
- * @rp: restart page header to check
- * @pos: position in @vi at which the restart page header resides
- *
- * Check the restart page header @rp for consistency and return 'true' if it is
- * consistent and 'false' otherwise.
- *
- * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
- * require the full restart page.
- */
-static bool ntfs_check_restart_page_header(struct inode *vi,
- RESTART_PAGE_HEADER *rp, s64 pos)
-{
- u32 logfile_system_page_size, logfile_log_page_size;
- u16 ra_ofs, usa_count, usa_ofs, usa_end = 0;
- bool have_usa = true;
-
- ntfs_debug("Entering.");
- /*
- * If the system or log page sizes are smaller than the ntfs block size
- * or either is not a power of 2 we cannot handle this log file.
- */
- logfile_system_page_size = le32_to_cpu(rp->system_page_size);
- logfile_log_page_size = le32_to_cpu(rp->log_page_size);
- if (logfile_system_page_size < NTFS_BLOCK_SIZE ||
- logfile_log_page_size < NTFS_BLOCK_SIZE ||
- logfile_system_page_size &
- (logfile_system_page_size - 1) ||
- !is_power_of_2(logfile_log_page_size)) {
- ntfs_error(vi->i_sb, "$LogFile uses unsupported page size.");
- return false;
- }
- /*
- * We must be either at !pos (1st restart page) or at pos = system page
- * size (2nd restart page).
- */
- if (pos && pos != logfile_system_page_size) {
- ntfs_error(vi->i_sb, "Found restart area in incorrect "
- "position in $LogFile.");
- return false;
- }
- /* We only know how to handle version 1.1. */
- if (sle16_to_cpu(rp->major_ver) != 1 ||
- sle16_to_cpu(rp->minor_ver) != 1) {
- ntfs_error(vi->i_sb, "$LogFile version %i.%i is not "
- "supported. (This driver supports version "
- "1.1 only.)", (int)sle16_to_cpu(rp->major_ver),
- (int)sle16_to_cpu(rp->minor_ver));
- return false;
- }
- /*
- * If chkdsk has been run the restart page may not be protected by an
- * update sequence array.
- */
- if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) {
- have_usa = false;
- goto skip_usa_checks;
- }
- /* Verify the size of the update sequence array. */
- usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS);
- if (usa_count != le16_to_cpu(rp->usa_count)) {
- ntfs_error(vi->i_sb, "$LogFile restart page specifies "
- "inconsistent update sequence array count.");
- return false;
- }
- /* Verify the position of the update sequence array. */
- usa_ofs = le16_to_cpu(rp->usa_ofs);
- usa_end = usa_ofs + usa_count * sizeof(u16);
- if (usa_ofs < sizeof(RESTART_PAGE_HEADER) ||
- usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) {
- ntfs_error(vi->i_sb, "$LogFile restart page specifies "
- "inconsistent update sequence array offset.");
- return false;
- }
-skip_usa_checks:
- /*
- * Verify the position of the restart area. It must be:
- * - aligned to 8-byte boundary,
- * - after the update sequence array, and
- * - within the system page size.
- */
- ra_ofs = le16_to_cpu(rp->restart_area_offset);
- if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end :
- ra_ofs < sizeof(RESTART_PAGE_HEADER)) ||
- ra_ofs > logfile_system_page_size) {
- ntfs_error(vi->i_sb, "$LogFile restart page specifies "
- "inconsistent restart area offset.");
- return false;
- }
- /*
- * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn
- * set.
- */
- if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) {
- ntfs_error(vi->i_sb, "$LogFile restart page is not modified "
- "by chkdsk but a chkdsk LSN is specified.");
- return false;
- }
- ntfs_debug("Done.");
- return true;
-}
-
-/**
- * ntfs_check_restart_area - check the restart area for consistency
- * @vi: $LogFile inode to which the restart page belongs
- * @rp: restart page whose restart area to check
- *
- * Check the restart area of the restart page @rp for consistency and return
- * 'true' if it is consistent and 'false' otherwise.
- *
- * This function assumes that the restart page header has already been
- * consistency checked.
- *
- * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
- * require the full restart page.
- */
-static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
-{
- u64 file_size;
- RESTART_AREA *ra;
- u16 ra_ofs, ra_len, ca_ofs;
- u8 fs_bits;
-
- ntfs_debug("Entering.");
- ra_ofs = le16_to_cpu(rp->restart_area_offset);
- ra = (RESTART_AREA*)((u8*)rp + ra_ofs);
- /*
- * Everything before ra->file_size must be before the first word
- * protected by an update sequence number. This ensures that it is
- * safe to access ra->client_array_offset.
- */
- if (ra_ofs + offsetof(RESTART_AREA, file_size) >
- NTFS_BLOCK_SIZE - sizeof(u16)) {
- ntfs_error(vi->i_sb, "$LogFile restart area specifies "
- "inconsistent file offset.");
- return false;
- }
- /*
- * Now that we can access ra->client_array_offset, make sure everything
- * up to the log client array is before the first word protected by an
- * update sequence number. This ensures we can access all of the
- * restart area elements safely. Also, the client array offset must be
- * aligned to an 8-byte boundary.
- */
- ca_ofs = le16_to_cpu(ra->client_array_offset);
- if (((ca_ofs + 7) & ~7) != ca_ofs ||
- ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) {
- ntfs_error(vi->i_sb, "$LogFile restart area specifies "
- "inconsistent client array offset.");
- return false;
- }
- /*
- * The restart area must end within the system page size both when
- * calculated manually and as specified by ra->restart_area_length.
- * Also, the calculated length must not exceed the specified length.
- */
- ra_len = ca_ofs + le16_to_cpu(ra->log_clients) *
- sizeof(LOG_CLIENT_RECORD);
- if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) ||
- ra_ofs + le16_to_cpu(ra->restart_area_length) >
- le32_to_cpu(rp->system_page_size) ||
- ra_len > le16_to_cpu(ra->restart_area_length)) {
- ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds "
- "of the system page size specified by the "
- "restart page header and/or the specified "
- "restart area length is inconsistent.");
- return false;
- }
- /*
- * The ra->client_free_list and ra->client_in_use_list must be either
- * LOGFILE_NO_CLIENT or less than ra->log_clients or they are
- * overflowing the client array.
- */
- if ((ra->client_free_list != LOGFILE_NO_CLIENT &&
- le16_to_cpu(ra->client_free_list) >=
- le16_to_cpu(ra->log_clients)) ||
- (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
- le16_to_cpu(ra->client_in_use_list) >=
- le16_to_cpu(ra->log_clients))) {
- ntfs_error(vi->i_sb, "$LogFile restart area specifies "
- "overflowing client free and/or in use lists.");
- return false;
- }
- /*
- * Check ra->seq_number_bits against ra->file_size for consistency.
- * We cannot just use ffs() because the file size is not a power of 2.
- */
- file_size = (u64)sle64_to_cpu(ra->file_size);
- fs_bits = 0;
- while (file_size) {
- file_size >>= 1;
- fs_bits++;
- }
- if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) {
- ntfs_error(vi->i_sb, "$LogFile restart area specifies "
- "inconsistent sequence number bits.");
- return false;
- }
- /* The log record header length must be a multiple of 8. */
- if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) !=
- le16_to_cpu(ra->log_record_header_length)) {
- ntfs_error(vi->i_sb, "$LogFile restart area specifies "
- "inconsistent log record header length.");
- return false;
- }
- /* Dito for the log page data offset. */
- if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) !=
- le16_to_cpu(ra->log_page_data_offset)) {
- ntfs_error(vi->i_sb, "$LogFile restart area specifies "
- "inconsistent log page data offset.");
- return false;
- }
- ntfs_debug("Done.");
- return true;
-}
-
-/**
- * ntfs_check_log_client_array - check the log client array for consistency
- * @vi: $LogFile inode to which the restart page belongs
- * @rp: restart page whose log client array to check
- *
- * Check the log client array of the restart page @rp for consistency and
- * return 'true' if it is consistent and 'false' otherwise.
- *
- * This function assumes that the restart page header and the restart area have
- * already been consistency checked.
- *
- * Unlike ntfs_check_restart_page_header() and ntfs_check_restart_area(), this
- * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full
- * restart page and the page must be multi sector transfer deprotected.
- */
-static bool ntfs_check_log_client_array(struct inode *vi,
- RESTART_PAGE_HEADER *rp)
-{
- RESTART_AREA *ra;
- LOG_CLIENT_RECORD *ca, *cr;
- u16 nr_clients, idx;
- bool in_free_list, idx_is_first;
-
- ntfs_debug("Entering.");
- ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
- ca = (LOG_CLIENT_RECORD*)((u8*)ra +
- le16_to_cpu(ra->client_array_offset));
- /*
- * Check the ra->client_free_list first and then check the
- * ra->client_in_use_list. Check each of the log client records in
- * each of the lists and check that the array does not overflow the
- * ra->log_clients value. Also keep track of the number of records
- * visited as there cannot be more than ra->log_clients records and
- * that way we detect eventual loops in within a list.
- */
- nr_clients = le16_to_cpu(ra->log_clients);
- idx = le16_to_cpu(ra->client_free_list);
- in_free_list = true;
-check_list:
- for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--,
- idx = le16_to_cpu(cr->next_client)) {
- if (!nr_clients || idx >= le16_to_cpu(ra->log_clients))
- goto err_out;
- /* Set @cr to the current log client record. */
- cr = ca + idx;
- /* The first log client record must not have a prev_client. */
- if (idx_is_first) {
- if (cr->prev_client != LOGFILE_NO_CLIENT)
- goto err_out;
- idx_is_first = false;
- }
- }
- /* Switch to and check the in use list if we just did the free list. */
- if (in_free_list) {
- in_free_list = false;
- idx = le16_to_cpu(ra->client_in_use_list);
- goto check_list;
- }
- ntfs_debug("Done.");
- return true;
-err_out:
- ntfs_error(vi->i_sb, "$LogFile log client array is corrupt.");
- return false;
-}
-
-/**
- * ntfs_check_and_load_restart_page - check the restart page for consistency
- * @vi: $LogFile inode to which the restart page belongs
- * @rp: restart page to check
- * @pos: position in @vi at which the restart page resides
- * @wrp: [OUT] copy of the multi sector transfer deprotected restart page
- * @lsn: [OUT] set to the current logfile lsn on success
- *
- * Check the restart page @rp for consistency and return 0 if it is consistent
- * and -errno otherwise. The restart page may have been modified by chkdsk in
- * which case its magic is CHKD instead of RSTR.
- *
- * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
- * require the full restart page.
- *
- * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a
- * copy of the complete multi sector transfer deprotected page. On failure,
- * *@wrp is undefined.
- *
- * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current
- * logfile lsn according to this restart page. On failure, *@lsn is undefined.
- *
- * The following error codes are defined:
- * -EINVAL - The restart page is inconsistent.
- * -ENOMEM - Not enough memory to load the restart page.
- * -EIO - Failed to reading from $LogFile.
- */
-static int ntfs_check_and_load_restart_page(struct inode *vi,
- RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp,
- LSN *lsn)
-{
- RESTART_AREA *ra;
- RESTART_PAGE_HEADER *trp;
- int size, err;
-
- ntfs_debug("Entering.");
- /* Check the restart page header for consistency. */
- if (!ntfs_check_restart_page_header(vi, rp, pos)) {
- /* Error output already done inside the function. */
- return -EINVAL;
- }
- /* Check the restart area for consistency. */
- if (!ntfs_check_restart_area(vi, rp)) {
- /* Error output already done inside the function. */
- return -EINVAL;
- }
- ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
- /*
- * Allocate a buffer to store the whole restart page so we can multi
- * sector transfer deprotect it.
- */
- trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size));
- if (!trp) {
- ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile "
- "restart page buffer.");
- return -ENOMEM;
- }
- /*
- * Read the whole of the restart page into the buffer. If it fits
- * completely inside @rp, just copy it from there. Otherwise map all
- * the required pages and copy the data from them.
- */
- size = PAGE_SIZE - (pos & ~PAGE_MASK);
- if (size >= le32_to_cpu(rp->system_page_size)) {
- memcpy(trp, rp, le32_to_cpu(rp->system_page_size));
- } else {
- pgoff_t idx;
- struct page *page;
- int have_read, to_read;
-
- /* First copy what we already have in @rp. */
- memcpy(trp, rp, size);
- /* Copy the remaining data one page at a time. */
- have_read = size;
- to_read = le32_to_cpu(rp->system_page_size) - size;
- idx = (pos + size) >> PAGE_SHIFT;
- BUG_ON((pos + size) & ~PAGE_MASK);
- do {
- page = ntfs_map_page(vi->i_mapping, idx);
- if (IS_ERR(page)) {
- ntfs_error(vi->i_sb, "Error mapping $LogFile "
- "page (index %lu).", idx);
- err = PTR_ERR(page);
- if (err != -EIO && err != -ENOMEM)
- err = -EIO;
- goto err_out;
- }
- size = min_t(int, to_read, PAGE_SIZE);
- memcpy((u8*)trp + have_read, page_address(page), size);
- ntfs_unmap_page(page);
- have_read += size;
- to_read -= size;
- idx++;
- } while (to_read > 0);
- }
- /*
- * Perform the multi sector transfer deprotection on the buffer if the
- * restart page is protected.
- */
- if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count))
- && post_read_mst_fixup((NTFS_RECORD*)trp,
- le32_to_cpu(rp->system_page_size))) {
- /*
- * A multi sector tranfer error was detected. We only need to
- * abort if the restart page contents exceed the multi sector
- * transfer fixup of the first sector.
- */
- if (le16_to_cpu(rp->restart_area_offset) +
- le16_to_cpu(ra->restart_area_length) >
- NTFS_BLOCK_SIZE - sizeof(u16)) {
- ntfs_error(vi->i_sb, "Multi sector transfer error "
- "detected in $LogFile restart page.");
- err = -EINVAL;
- goto err_out;
- }
- }
- /*
- * If the restart page is modified by chkdsk or there are no active
- * logfile clients, the logfile is consistent. Otherwise, need to
- * check the log client records for consistency, too.
- */
- err = 0;
- if (ntfs_is_rstr_record(rp->magic) &&
- ra->client_in_use_list != LOGFILE_NO_CLIENT) {
- if (!ntfs_check_log_client_array(vi, trp)) {
- err = -EINVAL;
- goto err_out;
- }
- }
- if (lsn) {
- if (ntfs_is_rstr_record(rp->magic))
- *lsn = sle64_to_cpu(ra->current_lsn);
- else /* if (ntfs_is_chkd_record(rp->magic)) */
- *lsn = sle64_to_cpu(rp->chkdsk_lsn);
- }
- ntfs_debug("Done.");
- if (wrp)
- *wrp = trp;
- else {
-err_out:
- ntfs_free(trp);
- }
- return err;
-}
-
-/**
- * ntfs_check_logfile - check the journal for consistency
- * @log_vi: struct inode of loaded journal $LogFile to check
- * @rp: [OUT] on success this is a copy of the current restart page
- *
- * Check the $LogFile journal for consistency and return 'true' if it is
- * consistent and 'false' if not. On success, the current restart page is
- * returned in *@rp. Caller must call ntfs_free(*@rp) when finished with it.
- *
- * At present we only check the two restart pages and ignore the log record
- * pages.
- *
- * Note that the MstProtected flag is not set on the $LogFile inode and hence
- * when reading pages they are not deprotected. This is because we do not know
- * if the $LogFile was created on a system with a different page size to ours
- * yet and mst deprotection would fail if our page size is smaller.
- */
-bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
-{
- s64 size, pos;
- LSN rstr1_lsn, rstr2_lsn;
- ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
- struct address_space *mapping = log_vi->i_mapping;
- struct page *page = NULL;
- u8 *kaddr = NULL;
- RESTART_PAGE_HEADER *rstr1_ph = NULL;
- RESTART_PAGE_HEADER *rstr2_ph = NULL;
- int log_page_size, err;
- bool logfile_is_empty = true;
- u8 log_page_bits;
-
- ntfs_debug("Entering.");
- /* An empty $LogFile must have been clean before it got emptied. */
- if (NVolLogFileEmpty(vol))
- goto is_empty;
- size = i_size_read(log_vi);
- /* Make sure the file doesn't exceed the maximum allowed size. */
- if (size > MaxLogFileSize)
- size = MaxLogFileSize;
- /*
- * Truncate size to a multiple of the page cache size or the default
- * log page size if the page cache size is between the default log page
- * log page size if the page cache size is between the default log page
- * size and twice that.
- */
- if (PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <=
- DefaultLogPageSize * 2)
- log_page_size = DefaultLogPageSize;
- else
- log_page_size = PAGE_SIZE;
- /*
- * Use ntfs_ffs() instead of ffs() to enable the compiler to
- * optimize log_page_size and log_page_bits into constants.
- */
- log_page_bits = ntfs_ffs(log_page_size) - 1;
- size &= ~(s64)(log_page_size - 1);
- /*
- * Ensure the log file is big enough to store at least the two restart
- * pages and the minimum number of log record pages.
- */
- if (size < log_page_size * 2 || (size - log_page_size * 2) >>
- log_page_bits < MinLogRecordPages) {
- ntfs_error(vol->sb, "$LogFile is too small.");
- return false;
- }
- /*
- * Read through the file looking for a restart page. Since the restart
- * page header is at the beginning of a page we only need to search at
- * what could be the beginning of a page (for each page size) rather
- * than scanning the whole file byte by byte. If all potential places
- * contain empty and uninitialzed records, the log file can be assumed
- * to be empty.
- */
- for (pos = 0; pos < size; pos <<= 1) {
- pgoff_t idx = pos >> PAGE_SHIFT;
- if (!page || page->index != idx) {
- if (page)
- ntfs_unmap_page(page);
- page = ntfs_map_page(mapping, idx);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Error mapping $LogFile "
- "page (index %lu).", idx);
- goto err_out;
- }
- }
- kaddr = (u8*)page_address(page) + (pos & ~PAGE_MASK);
- /*
- * A non-empty block means the logfile is not empty while an
- * empty block after a non-empty block has been encountered
- * means we are done.
- */
- if (!ntfs_is_empty_recordp((le32*)kaddr))
- logfile_is_empty = false;
- else if (!logfile_is_empty)
- break;
- /*
- * A log record page means there cannot be a restart page after
- * this so no need to continue searching.
- */
- if (ntfs_is_rcrd_recordp((le32*)kaddr))
- break;
- /* If not a (modified by chkdsk) restart page, continue. */
- if (!ntfs_is_rstr_recordp((le32*)kaddr) &&
- !ntfs_is_chkd_recordp((le32*)kaddr)) {
- if (!pos)
- pos = NTFS_BLOCK_SIZE >> 1;
- continue;
- }
- /*
- * Check the (modified by chkdsk) restart page for consistency
- * and get a copy of the complete multi sector transfer
- * deprotected restart page.
- */
- err = ntfs_check_and_load_restart_page(log_vi,
- (RESTART_PAGE_HEADER*)kaddr, pos,
- !rstr1_ph ? &rstr1_ph : &rstr2_ph,
- !rstr1_ph ? &rstr1_lsn : &rstr2_lsn);
- if (!err) {
- /*
- * If we have now found the first (modified by chkdsk)
- * restart page, continue looking for the second one.
- */
- if (!pos) {
- pos = NTFS_BLOCK_SIZE >> 1;
- continue;
- }
- /*
- * We have now found the second (modified by chkdsk)
- * restart page, so we can stop looking.
- */
- break;
- }
- /*
- * Error output already done inside the function. Note, we do
- * not abort if the restart page was invalid as we might still
- * find a valid one further in the file.
- */
- if (err != -EINVAL) {
- ntfs_unmap_page(page);
- goto err_out;
- }
- /* Continue looking. */
- if (!pos)
- pos = NTFS_BLOCK_SIZE >> 1;
- }
- if (page)
- ntfs_unmap_page(page);
- if (logfile_is_empty) {
- NVolSetLogFileEmpty(vol);
-is_empty:
- ntfs_debug("Done. ($LogFile is empty.)");
- return true;
- }
- if (!rstr1_ph) {
- BUG_ON(rstr2_ph);
- ntfs_error(vol->sb, "Did not find any restart pages in "
- "$LogFile and it was not empty.");
- return false;
- }
- /* If both restart pages were found, use the more recent one. */
- if (rstr2_ph) {
- /*
- * If the second restart area is more recent, switch to it.
- * Otherwise just throw it away.
- */
- if (rstr2_lsn > rstr1_lsn) {
- ntfs_debug("Using second restart page as it is more "
- "recent.");
- ntfs_free(rstr1_ph);
- rstr1_ph = rstr2_ph;
- /* rstr1_lsn = rstr2_lsn; */
- } else {
- ntfs_debug("Using first restart page as it is more "
- "recent.");
- ntfs_free(rstr2_ph);
- }
- rstr2_ph = NULL;
- }
- /* All consistency checks passed. */
- if (rp)
- *rp = rstr1_ph;
- else
- ntfs_free(rstr1_ph);
- ntfs_debug("Done.");
- return true;
-err_out:
- if (rstr1_ph)
- ntfs_free(rstr1_ph);
- return false;
-}
-
-/**
- * ntfs_is_logfile_clean - check in the journal if the volume is clean
- * @log_vi: struct inode of loaded journal $LogFile to check
- * @rp: copy of the current restart page
- *
- * Analyze the $LogFile journal and return 'true' if it indicates the volume was
- * shutdown cleanly and 'false' if not.
- *
- * At present we only look at the two restart pages and ignore the log record
- * pages. This is a little bit crude in that there will be a very small number
- * of cases where we think that a volume is dirty when in fact it is clean.
- * This should only affect volumes that have not been shutdown cleanly but did
- * not have any pending, non-check-pointed i/o, i.e. they were completely idle
- * at least for the five seconds preceding the unclean shutdown.
- *
- * This function assumes that the $LogFile journal has already been consistency
- * checked by a call to ntfs_check_logfile() and in particular if the $LogFile
- * is empty this function requires that NVolLogFileEmpty() is true otherwise an
- * empty volume will be reported as dirty.
- */
-bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
-{
- ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
- RESTART_AREA *ra;
-
- ntfs_debug("Entering.");
- /* An empty $LogFile must have been clean before it got emptied. */
- if (NVolLogFileEmpty(vol)) {
- ntfs_debug("Done. ($LogFile is empty.)");
- return true;
- }
- BUG_ON(!rp);
- if (!ntfs_is_rstr_record(rp->magic) &&
- !ntfs_is_chkd_record(rp->magic)) {
- ntfs_error(vol->sb, "Restart page buffer is invalid. This is "
- "probably a bug in that the $LogFile should "
- "have been consistency checked before calling "
- "this function.");
- return false;
- }
- ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
- /*
- * If the $LogFile has active clients, i.e. it is open, and we do not
- * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags,
- * we assume there was an unclean shutdown.
- */
- if (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
- !(ra->flags & RESTART_VOLUME_IS_CLEAN)) {
- ntfs_debug("Done. $LogFile indicates a dirty shutdown.");
- return false;
- }
- /* $LogFile indicates a clean shutdown. */
- ntfs_debug("Done. $LogFile indicates a clean shutdown.");
- return true;
-}
-
-/**
- * ntfs_empty_logfile - empty the contents of the $LogFile journal
- * @log_vi: struct inode of loaded journal $LogFile to empty
- *
- * Empty the contents of the $LogFile journal @log_vi and return 'true' on
- * success and 'false' on error.
- *
- * This function assumes that the $LogFile journal has already been consistency
- * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean()
- * has been used to ensure that the $LogFile is clean.
- */
-bool ntfs_empty_logfile(struct inode *log_vi)
-{
- VCN vcn, end_vcn;
- ntfs_inode *log_ni = NTFS_I(log_vi);
- ntfs_volume *vol = log_ni->vol;
- struct super_block *sb = vol->sb;
- runlist_element *rl;
- unsigned long flags;
- unsigned block_size, block_size_bits;
- int err;
- bool should_wait = true;
-
- ntfs_debug("Entering.");
- if (NVolLogFileEmpty(vol)) {
- ntfs_debug("Done.");
- return true;
- }
- /*
- * We cannot use ntfs_attr_set() because we may be still in the middle
- * of a mount operation. Thus we do the emptying by hand by first
- * zapping the page cache pages for the $LogFile/$DATA attribute and
- * then emptying each of the buffers in each of the clusters specified
- * by the runlist by hand.
- */
- block_size = sb->s_blocksize;
- block_size_bits = sb->s_blocksize_bits;
- vcn = 0;
- read_lock_irqsave(&log_ni->size_lock, flags);
- end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >>
- vol->cluster_size_bits;
- read_unlock_irqrestore(&log_ni->size_lock, flags);
- truncate_inode_pages(log_vi->i_mapping, 0);
- down_write(&log_ni->runlist.lock);
- rl = log_ni->runlist.rl;
- if (unlikely(!rl || vcn < rl->vcn || !rl->length)) {
-map_vcn:
- err = ntfs_map_runlist_nolock(log_ni, vcn, NULL);
- if (err) {
- ntfs_error(sb, "Failed to map runlist fragment (error "
- "%d).", -err);
- goto err;
- }
- rl = log_ni->runlist.rl;
- BUG_ON(!rl || vcn < rl->vcn || !rl->length);
- }
- /* Seek to the runlist element containing @vcn. */
- while (rl->length && vcn >= rl[1].vcn)
- rl++;
- do {
- LCN lcn;
- sector_t block, end_block;
- s64 len;
-
- /*
- * If this run is not mapped map it now and start again as the
- * runlist will have been updated.
- */
- lcn = rl->lcn;
- if (unlikely(lcn == LCN_RL_NOT_MAPPED)) {
- vcn = rl->vcn;
- goto map_vcn;
- }
- /* If this run is not valid abort with an error. */
- if (unlikely(!rl->length || lcn < LCN_HOLE))
- goto rl_err;
- /* Skip holes. */
- if (lcn == LCN_HOLE)
- continue;
- block = lcn << vol->cluster_size_bits >> block_size_bits;
- len = rl->length;
- if (rl[1].vcn > end_vcn)
- len = end_vcn - rl->vcn;
- end_block = (lcn + len) << vol->cluster_size_bits >>
- block_size_bits;
- /* Iterate over the blocks in the run and empty them. */
- do {
- struct buffer_head *bh;
-
- /* Obtain the buffer, possibly not uptodate. */
- bh = sb_getblk(sb, block);
- BUG_ON(!bh);
- /* Setup buffer i/o submission. */
- lock_buffer(bh);
- bh->b_end_io = end_buffer_write_sync;
- get_bh(bh);
- /* Set the entire contents of the buffer to 0xff. */
- memset(bh->b_data, -1, block_size);
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
- if (buffer_dirty(bh))
- clear_buffer_dirty(bh);
- /*
- * Submit the buffer and wait for i/o to complete but
- * only for the first buffer so we do not miss really
- * serious i/o errors. Once the first buffer has
- * completed ignore errors afterwards as we can assume
- * that if one buffer worked all of them will work.
- */
- submit_bh(REQ_OP_WRITE, bh);
- if (should_wait) {
- should_wait = false;
- wait_on_buffer(bh);
- if (unlikely(!buffer_uptodate(bh)))
- goto io_err;
- }
- brelse(bh);
- } while (++block < end_block);
- } while ((++rl)->vcn < end_vcn);
- up_write(&log_ni->runlist.lock);
- /*
- * Zap the pages again just in case any got instantiated whilst we were
- * emptying the blocks by hand. FIXME: We may not have completed
- * writing to all the buffer heads yet so this may happen too early.
- * We really should use a kernel thread to do the emptying
- * asynchronously and then we can also set the volume dirty and output
- * an error message if emptying should fail.
- */
- truncate_inode_pages(log_vi->i_mapping, 0);
- /* Set the flag so we do not have to do it again on remount. */
- NVolSetLogFileEmpty(vol);
- ntfs_debug("Done.");
- return true;
-io_err:
- ntfs_error(sb, "Failed to write buffer. Unmount and run chkdsk.");
- goto dirty_err;
-rl_err:
- ntfs_error(sb, "Runlist is corrupt. Unmount and run chkdsk.");
-dirty_err:
- NVolSetErrors(vol);
- err = -EIO;
-err:
- up_write(&log_ni->runlist.lock);
- ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).",
- -err);
- return false;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
deleted file mode 100644
index 429d4909cc72..000000000000
--- a/fs/ntfs/logfile.h
+++ /dev/null
@@ -1,295 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * logfile.h - Defines for NTFS kernel journal ($LogFile) handling. Part of
- * the Linux-NTFS project.
- *
- * Copyright (c) 2000-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_LOGFILE_H
-#define _LINUX_NTFS_LOGFILE_H
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-
-#include "types.h"
-#include "endian.h"
-#include "layout.h"
-
-/*
- * Journal ($LogFile) organization:
- *
- * Two restart areas present in the first two pages (restart pages, one restart
- * area in each page). When the volume is dismounted they should be identical,
- * except for the update sequence array which usually has a different update
- * sequence number.
- *
- * These are followed by log records organized in pages headed by a log record
- * header going up to log file size. Not all pages contain log records when a
- * volume is first formatted, but as the volume ages, all records will be used.
- * When the log file fills up, the records at the beginning are purged (by
- * modifying the oldest_lsn to a higher value presumably) and writing begins
- * at the beginning of the file. Effectively, the log file is viewed as a
- * circular entity.
- *
- * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept
- * versions <= 1.x, including 0.-1. (Yes, that is a minus one in there!) We
- * probably only want to support 1.1 as this seems to be the current version
- * and we don't know how that differs from the older versions. The only
- * exception is if the journal is clean as marked by the two restart pages
- * then it doesn't matter whether we are on an earlier version. We can just
- * reinitialize the logfile and start again with version 1.1.
- */
-
-/* Some $LogFile related constants. */
-#define MaxLogFileSize 0x100000000ULL
-#define DefaultLogPageSize 4096
-#define MinLogRecordPages 48
-
-/*
- * Log file restart page header (begins the restart area).
- */
-typedef struct {
-/*Ofs*/
-/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
-/* 0*/ NTFS_RECORD_TYPE magic; /* The magic is "RSTR". */
-/* 4*/ le16 usa_ofs; /* See NTFS_RECORD definition in layout.h.
- When creating, set this to be immediately
- after this header structure (without any
- alignment). */
-/* 6*/ le16 usa_count; /* See NTFS_RECORD definition in layout.h. */
-
-/* 8*/ leLSN chkdsk_lsn; /* The last log file sequence number found by
- chkdsk. Only used when the magic is changed
- to "CHKD". Otherwise this is zero. */
-/* 16*/ le32 system_page_size; /* Byte size of system pages when the log file
- was created, has to be >= 512 and a power of
- 2. Use this to calculate the required size
- of the usa (usa_count) and add it to usa_ofs.
- Then verify that the result is less than the
- value of the restart_area_offset. */
-/* 20*/ le32 log_page_size; /* Byte size of log file pages, has to be >=
- 512 and a power of 2. The default is 4096
- and is used when the system page size is
- between 4096 and 8192. Otherwise this is
- set to the system page size instead. */
-/* 24*/ le16 restart_area_offset;/* Byte offset from the start of this header to
- the RESTART_AREA. Value has to be aligned
- to 8-byte boundary. When creating, set this
- to be after the usa. */
-/* 26*/ sle16 minor_ver; /* Log file minor version. Only check if major
- version is 1. */
-/* 28*/ sle16 major_ver; /* Log file major version. We only support
- version 1.1. */
-/* sizeof() = 30 (0x1e) bytes */
-} __attribute__ ((__packed__)) RESTART_PAGE_HEADER;
-
-/*
- * Constant for the log client indices meaning that there are no client records
- * in this particular client array. Also inside the client records themselves,
- * this means that there are no client records preceding or following this one.
- */
-#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff)
-#define LOGFILE_NO_CLIENT_CPU 0xffff
-
-/*
- * These are the so far known RESTART_AREA_* flags (16-bit) which contain
- * information about the log file in which they are present.
- */
-enum {
- RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002),
- RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
-} __attribute__ ((__packed__));
-
-typedef le16 RESTART_AREA_FLAGS;
-
-/*
- * Log file restart area record. The offset of this record is found by adding
- * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found
- * in it. See notes at restart_area_offset above.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/ leLSN current_lsn; /* The current, i.e. last LSN inside the log
- when the restart area was last written.
- This happens often but what is the interval?
- Is it just fixed time or is it every time a
- check point is written or somethine else?
- On create set to 0. */
-/* 8*/ le16 log_clients; /* Number of log client records in the array of
- log client records which follows this
- restart area. Must be 1. */
-/* 10*/ le16 client_free_list; /* The index of the first free log client record
- in the array of log client records.
- LOGFILE_NO_CLIENT means that there are no
- free log client records in the array.
- If != LOGFILE_NO_CLIENT, check that
- log_clients > client_free_list. On Win2k
- and presumably earlier, on a clean volume
- this is != LOGFILE_NO_CLIENT, and it should
- be 0, i.e. the first (and only) client
- record is free and thus the logfile is
- closed and hence clean. A dirty volume
- would have left the logfile open and hence
- this would be LOGFILE_NO_CLIENT. On WinXP
- and presumably later, the logfile is always
- open, even on clean shutdown so this should
- always be LOGFILE_NO_CLIENT. */
-/* 12*/ le16 client_in_use_list;/* The index of the first in-use log client
- record in the array of log client records.
- LOGFILE_NO_CLIENT means that there are no
- in-use log client records in the array. If
- != LOGFILE_NO_CLIENT check that log_clients
- > client_in_use_list. On Win2k and
- presumably earlier, on a clean volume this
- is LOGFILE_NO_CLIENT, i.e. there are no
- client records in use and thus the logfile
- is closed and hence clean. A dirty volume
- would have left the logfile open and hence
- this would be != LOGFILE_NO_CLIENT, and it
- should be 0, i.e. the first (and only)
- client record is in use. On WinXP and
- presumably later, the logfile is always
- open, even on clean shutdown so this should
- always be 0. */
-/* 14*/ RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour. On Win2k
- and presumably earlier this is always 0. On
- WinXP and presumably later, if the logfile
- was shutdown cleanly, the second bit,
- RESTART_VOLUME_IS_CLEAN, is set. This bit
- is cleared when the volume is mounted by
- WinXP and set when the volume is dismounted,
- thus if the logfile is dirty, this bit is
- clear. Thus we don't need to check the
- Windows version to determine if the logfile
- is clean. Instead if the logfile is closed,
- we know it must be clean. If it is open and
- this bit is set, we also know it must be
- clean. If on the other hand the logfile is
- open and this bit is clear, we can be almost
- certain that the logfile is dirty. */
-/* 16*/ le32 seq_number_bits; /* How many bits to use for the sequence
- number. This is calculated as 67 - the
- number of bits required to store the logfile
- size in bytes and this can be used in with
- the specified file_size as a consistency
- check. */
-/* 20*/ le16 restart_area_length;/* Length of the restart area including the
- client array. Following checks required if
- version matches. Otherwise, skip them.
- restart_area_offset + restart_area_length
- has to be <= system_page_size. Also,
- restart_area_length has to be >=
- client_array_offset + (log_clients *
- sizeof(log client record)). */
-/* 22*/ le16 client_array_offset;/* Offset from the start of this record to
- the first log client record if versions are
- matched. When creating, set this to be
- after this restart area structure, aligned
- to 8-bytes boundary. If the versions do not
- match, this is ignored and the offset is
- assumed to be (sizeof(RESTART_AREA) + 7) &
- ~7, i.e. rounded up to first 8-byte
- boundary. Either way, client_array_offset
- has to be aligned to an 8-byte boundary.
- Also, restart_area_offset +
- client_array_offset has to be <= 510.
- Finally, client_array_offset + (log_clients
- * sizeof(log client record)) has to be <=
- system_page_size. On Win2k and presumably
- earlier, this is 0x30, i.e. immediately
- following this record. On WinXP and
- presumably later, this is 0x40, i.e. there
- are 16 extra bytes between this record and
- the client array. This probably means that
- the RESTART_AREA record is actually bigger
- in WinXP and later. */
-/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the
- restart_area_offset + the offset of the
- file_size are > 510 then corruption has
- occurred. This is the very first check when
- starting with the restart_area as if it
- fails it means that some of the above values
- will be corrupted by the multi sector
- transfer protection. The file_size has to
- be rounded down to be a multiple of the
- log_page_size in the RESTART_PAGE_HEADER and
- then it has to be at least big enough to
- store the two restart pages and 48 (0x30)
- log record pages. */
-/* 32*/ le32 last_lsn_data_length;/* Length of data of last LSN, not including
- the log record header. On create set to
- 0. */
-/* 36*/ le16 log_record_header_length;/* Byte size of the log record header.
- If the version matches then check that the
- value of log_record_header_length is a
- multiple of 8, i.e.
- (log_record_header_length + 7) & ~7 ==
- log_record_header_length. When creating set
- it to sizeof(LOG_RECORD_HEADER), aligned to
- 8 bytes. */
-/* 38*/ le16 log_page_data_offset;/* Offset to the start of data in a log record
- page. Must be a multiple of 8. On create
- set it to immediately after the update
- sequence array of the log record page. */
-/* 40*/ le32 restart_log_open_count;/* A counter that gets incremented every
- time the logfile is restarted which happens
- at mount time when the logfile is opened.
- When creating set to a random value. Win2k
- sets it to the low 32 bits of the current
- system time in NTFS format (see time.h). */
-/* 44*/ le32 reserved; /* Reserved/alignment to 8-byte boundary. */
-/* sizeof() = 48 (0x30) bytes */
-} __attribute__ ((__packed__)) RESTART_AREA;
-
-/*
- * Log client record. The offset of this record is found by adding the offset
- * of the RESTART_AREA to the client_array_offset value found in it.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/ leLSN oldest_lsn; /* Oldest LSN needed by this client. On create
- set to 0. */
-/* 8*/ leLSN client_restart_lsn;/* LSN at which this client needs to restart
- the volume, i.e. the current position within
- the log file. At present, if clean this
- should = current_lsn in restart area but it
- probably also = current_lsn when dirty most
- of the time. At create set to 0. */
-/* 16*/ le16 prev_client; /* The offset to the previous log client record
- in the array of log client records.
- LOGFILE_NO_CLIENT means there is no previous
- client record, i.e. this is the first one.
- This is always LOGFILE_NO_CLIENT. */
-/* 18*/ le16 next_client; /* The offset to the next log client record in
- the array of log client records.
- LOGFILE_NO_CLIENT means there are no next
- client records, i.e. this is the last one.
- This is always LOGFILE_NO_CLIENT. */
-/* 20*/ le16 seq_number; /* On Win2k and presumably earlier, this is set
- to zero every time the logfile is restarted
- and it is incremented when the logfile is
- closed at dismount time. Thus it is 0 when
- dirty and 1 when clean. On WinXP and
- presumably later, this is always 0. */
-/* 22*/ u8 reserved[6]; /* Reserved/alignment. */
-/* 28*/ le32 client_name_length;/* Length of client name in bytes. Should
- always be 8. */
-/* 32*/ ntfschar client_name[64];/* Name of the client in Unicode. Should
- always be "NTFS" with the remaining bytes
- set to 0. */
-/* sizeof() = 160 (0xa0) bytes */
-} __attribute__ ((__packed__)) LOG_CLIENT_RECORD;
-
-extern bool ntfs_check_logfile(struct inode *log_vi,
- RESTART_PAGE_HEADER **rp);
-
-extern bool ntfs_is_logfile_clean(struct inode *log_vi,
- const RESTART_PAGE_HEADER *rp);
-
-extern bool ntfs_empty_logfile(struct inode *log_vi);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_LOGFILE_H */
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
deleted file mode 100644
index 7068425735f1..000000000000
--- a/fs/ntfs/malloc.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_MALLOC_H
-#define _LINUX_NTFS_MALLOC_H
-
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-
-/**
- * __ntfs_malloc - allocate memory in multiples of pages
- * @size: number of bytes to allocate
- * @gfp_mask: extra flags for the allocator
- *
- * Internal function. You probably want ntfs_malloc_nofs()...
- *
- * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
- * returns a pointer to the allocated memory.
- *
- * If there was insufficient memory to complete the request, return NULL.
- * Depending on @gfp_mask the allocation may be guaranteed to succeed.
- */
-static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
-{
- if (likely(size <= PAGE_SIZE)) {
- BUG_ON(!size);
- /* kmalloc() has per-CPU caches so is faster for now. */
- return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
- /* return (void *)__get_free_page(gfp_mask); */
- }
- if (likely((size >> PAGE_SHIFT) < totalram_pages()))
- return __vmalloc(size, gfp_mask);
- return NULL;
-}
-
-/**
- * ntfs_malloc_nofs - allocate memory in multiples of pages
- * @size: number of bytes to allocate
- *
- * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
- * returns a pointer to the allocated memory.
- *
- * If there was insufficient memory to complete the request, return NULL.
- */
-static inline void *ntfs_malloc_nofs(unsigned long size)
-{
- return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM);
-}
-
-/**
- * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages
- * @size: number of bytes to allocate
- *
- * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
- * returns a pointer to the allocated memory.
- *
- * This function guarantees that the allocation will succeed. It will sleep
- * for as long as it takes to complete the allocation.
- *
- * If there was insufficient memory to complete the request, return NULL.
- */
-static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
-{
- return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL);
-}
-
-static inline void ntfs_free(void *addr)
-{
- kvfree(addr);
-}
-
-#endif /* _LINUX_NTFS_MALLOC_H */
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
deleted file mode 100644
index 6fd1dc4b08c8..000000000000
--- a/fs/ntfs/mft.c
+++ /dev/null
@@ -1,2907 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/buffer_head.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/bio.h>
-
-#include "attrib.h"
-#include "aops.h"
-#include "bitmap.h"
-#include "debug.h"
-#include "dir.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "ntfs.h"
-
-#define MAX_BHS (PAGE_SIZE / NTFS_BLOCK_SIZE)
-
-/**
- * map_mft_record_page - map the page in which a specific mft record resides
- * @ni: ntfs inode whose mft record page to map
- *
- * This maps the page in which the mft record of the ntfs inode @ni is situated
- * and returns a pointer to the mft record within the mapped page.
- *
- * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
- * contains the negative error code returned.
- */
-static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
-{
- loff_t i_size;
- ntfs_volume *vol = ni->vol;
- struct inode *mft_vi = vol->mft_ino;
- struct page *page;
- unsigned long index, end_index;
- unsigned ofs;
-
- BUG_ON(ni->page);
- /*
- * The index into the page cache and the offset within the page cache
- * page of the wanted mft record. FIXME: We need to check for
- * overflowing the unsigned long, but I don't think we would ever get
- * here if the volume was that big...
- */
- index = (u64)ni->mft_no << vol->mft_record_size_bits >>
- PAGE_SHIFT;
- ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
-
- i_size = i_size_read(mft_vi);
- /* The maximum valid index into the page cache for $MFT's data. */
- end_index = i_size >> PAGE_SHIFT;
-
- /* If the wanted index is out of bounds the mft record doesn't exist. */
- if (unlikely(index >= end_index)) {
- if (index > end_index || (i_size & ~PAGE_MASK) < ofs +
- vol->mft_record_size) {
- page = ERR_PTR(-ENOENT);
- ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
- "which is beyond the end of the mft. "
- "This is probably a bug in the ntfs "
- "driver.", ni->mft_no);
- goto err_out;
- }
- }
- /* Read, map, and pin the page. */
- page = ntfs_map_page(mft_vi->i_mapping, index);
- if (!IS_ERR(page)) {
- /* Catch multi sector transfer fixup errors. */
- if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) +
- ofs)))) {
- ni->page = page;
- ni->page_ofs = ofs;
- return page_address(page) + ofs;
- }
- ntfs_error(vol->sb, "Mft record 0x%lx is corrupt. "
- "Run chkdsk.", ni->mft_no);
- ntfs_unmap_page(page);
- page = ERR_PTR(-EIO);
- NVolSetErrors(vol);
- }
-err_out:
- ni->page = NULL;
- ni->page_ofs = 0;
- return (void*)page;
-}
-
-/**
- * map_mft_record - map, pin and lock an mft record
- * @ni: ntfs inode whose MFT record to map
- *
- * First, take the mrec_lock mutex. We might now be sleeping, while waiting
- * for the mutex if it was already locked by someone else.
- *
- * The page of the record is mapped using map_mft_record_page() before being
- * returned to the caller.
- *
- * This in turn uses ntfs_map_page() to get the page containing the wanted mft
- * record (it in turn calls read_cache_page() which reads it in from disk if
- * necessary, increments the use count on the page so that it cannot disappear
- * under us and returns a reference to the page cache page).
- *
- * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
- * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
- * and the post-read mst fixups on each mft record in the page have been
- * performed, the page gets PG_uptodate set and PG_locked cleared (this is done
- * in our asynchronous I/O completion handler end_buffer_read_mft_async()).
- * ntfs_map_page() waits for PG_locked to become clear and checks if
- * PG_uptodate is set and returns an error code if not. This provides
- * sufficient protection against races when reading/using the page.
- *
- * However there is the write mapping to think about. Doing the above described
- * checking here will be fine, because when initiating the write we will set
- * PG_locked and clear PG_uptodate making sure nobody is touching the page
- * contents. Doing the locking this way means that the commit to disk code in
- * the page cache code paths is automatically sufficiently locked with us as
- * we will not touch a page that has been locked or is not uptodate. The only
- * locking problem then is them locking the page while we are accessing it.
- *
- * So that code will end up having to own the mrec_lock of all mft
- * records/inodes present in the page before I/O can proceed. In that case we
- * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
- * accessing anything without owning the mrec_lock mutex. But we do need to
- * use them because of the read_cache_page() invocation and the code becomes so
- * much simpler this way that it is well worth it.
- *
- * The mft record is now ours and we return a pointer to it. You need to check
- * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
- * the error code.
- *
- * NOTE: Caller is responsible for setting the mft record dirty before calling
- * unmap_mft_record(). This is obviously only necessary if the caller really
- * modified the mft record...
- * Q: Do we want to recycle one of the VFS inode state bits instead?
- * A: No, the inode ones mean we want to change the mft record, not we want to
- * write it out.
- */
-MFT_RECORD *map_mft_record(ntfs_inode *ni)
-{
- MFT_RECORD *m;
-
- ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
-
- /* Make sure the ntfs inode doesn't go away. */
- atomic_inc(&ni->count);
-
- /* Serialize access to this mft record. */
- mutex_lock(&ni->mrec_lock);
-
- m = map_mft_record_page(ni);
- if (!IS_ERR(m))
- return m;
-
- mutex_unlock(&ni->mrec_lock);
- atomic_dec(&ni->count);
- ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
- return m;
-}
-
-/**
- * unmap_mft_record_page - unmap the page in which a specific mft record resides
- * @ni: ntfs inode whose mft record page to unmap
- *
- * This unmaps the page in which the mft record of the ntfs inode @ni is
- * situated and returns. This is a NOOP if highmem is not configured.
- *
- * The unmap happens via ntfs_unmap_page() which in turn decrements the use
- * count on the page thus releasing it from the pinned state.
- *
- * We do not actually unmap the page from memory of course, as that will be
- * done by the page cache code itself when memory pressure increases or
- * whatever.
- */
-static inline void unmap_mft_record_page(ntfs_inode *ni)
-{
- BUG_ON(!ni->page);
-
- // TODO: If dirty, blah...
- ntfs_unmap_page(ni->page);
- ni->page = NULL;
- ni->page_ofs = 0;
- return;
-}
-
-/**
- * unmap_mft_record - release a mapped mft record
- * @ni: ntfs inode whose MFT record to unmap
- *
- * We release the page mapping and the mrec_lock mutex which unmaps the mft
- * record and releases it for others to get hold of. We also release the ntfs
- * inode by decrementing the ntfs inode reference count.
- *
- * NOTE: If caller has modified the mft record, it is imperative to set the mft
- * record dirty BEFORE calling unmap_mft_record().
- */
-void unmap_mft_record(ntfs_inode *ni)
-{
- struct page *page = ni->page;
-
- BUG_ON(!page);
-
- ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
-
- unmap_mft_record_page(ni);
- mutex_unlock(&ni->mrec_lock);
- atomic_dec(&ni->count);
- /*
- * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
- * ntfs_clear_extent_inode() in the extent inode case, and to the
- * caller in the non-extent, yet pure ntfs inode case, to do the actual
- * tear down of all structures and freeing of all allocated memory.
- */
- return;
-}
-
-/**
- * map_extent_mft_record - load an extent inode and attach it to its base
- * @base_ni: base ntfs inode
- * @mref: mft reference of the extent inode to load
- * @ntfs_ino: on successful return, pointer to the ntfs_inode structure
- *
- * Load the extent mft record @mref and attach it to its base inode @base_ni.
- * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise
- * PTR_ERR(result) gives the negative error code.
- *
- * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
- * structure of the mapped extent inode.
- */
-MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
- ntfs_inode **ntfs_ino)
-{
- MFT_RECORD *m;
- ntfs_inode *ni = NULL;
- ntfs_inode **extent_nis = NULL;
- int i;
- unsigned long mft_no = MREF(mref);
- u16 seq_no = MSEQNO(mref);
- bool destroy_ni = false;
-
- ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
- mft_no, base_ni->mft_no);
- /* Make sure the base ntfs inode doesn't go away. */
- atomic_inc(&base_ni->count);
- /*
- * Check if this extent inode has already been added to the base inode,
- * in which case just return it. If not found, add it to the base
- * inode before returning it.
- */
- mutex_lock(&base_ni->extent_lock);
- if (base_ni->nr_extents > 0) {
- extent_nis = base_ni->ext.extent_ntfs_inos;
- for (i = 0; i < base_ni->nr_extents; i++) {
- if (mft_no != extent_nis[i]->mft_no)
- continue;
- ni = extent_nis[i];
- /* Make sure the ntfs inode doesn't go away. */
- atomic_inc(&ni->count);
- break;
- }
- }
- if (likely(ni != NULL)) {
- mutex_unlock(&base_ni->extent_lock);
- atomic_dec(&base_ni->count);
- /* We found the record; just have to map and return it. */
- m = map_mft_record(ni);
- /* map_mft_record() has incremented this on success. */
- atomic_dec(&ni->count);
- if (!IS_ERR(m)) {
- /* Verify the sequence number. */
- if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
- ntfs_debug("Done 1.");
- *ntfs_ino = ni;
- return m;
- }
- unmap_mft_record(ni);
- ntfs_error(base_ni->vol->sb, "Found stale extent mft "
- "reference! Corrupt filesystem. "
- "Run chkdsk.");
- return ERR_PTR(-EIO);
- }
-map_err_out:
- ntfs_error(base_ni->vol->sb, "Failed to map extent "
- "mft record, error code %ld.", -PTR_ERR(m));
- return m;
- }
- /* Record wasn't there. Get a new ntfs inode and initialize it. */
- ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
- if (unlikely(!ni)) {
- mutex_unlock(&base_ni->extent_lock);
- atomic_dec(&base_ni->count);
- return ERR_PTR(-ENOMEM);
- }
- ni->vol = base_ni->vol;
- ni->seq_no = seq_no;
- ni->nr_extents = -1;
- ni->ext.base_ntfs_ino = base_ni;
- /* Now map the record. */
- m = map_mft_record(ni);
- if (IS_ERR(m)) {
- mutex_unlock(&base_ni->extent_lock);
- atomic_dec(&base_ni->count);
- ntfs_clear_extent_inode(ni);
- goto map_err_out;
- }
- /* Verify the sequence number if it is present. */
- if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
- ntfs_error(base_ni->vol->sb, "Found stale extent mft "
- "reference! Corrupt filesystem. Run chkdsk.");
- destroy_ni = true;
- m = ERR_PTR(-EIO);
- goto unm_err_out;
- }
- /* Attach extent inode to base inode, reallocating memory if needed. */
- if (!(base_ni->nr_extents & 3)) {
- ntfs_inode **tmp;
- int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
-
- tmp = kmalloc(new_size, GFP_NOFS);
- if (unlikely(!tmp)) {
- ntfs_error(base_ni->vol->sb, "Failed to allocate "
- "internal buffer.");
- destroy_ni = true;
- m = ERR_PTR(-ENOMEM);
- goto unm_err_out;
- }
- if (base_ni->nr_extents) {
- BUG_ON(!base_ni->ext.extent_ntfs_inos);
- memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
- 4 * sizeof(ntfs_inode *));
- kfree(base_ni->ext.extent_ntfs_inos);
- }
- base_ni->ext.extent_ntfs_inos = tmp;
- }
- base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
- mutex_unlock(&base_ni->extent_lock);
- atomic_dec(&base_ni->count);
- ntfs_debug("Done 2.");
- *ntfs_ino = ni;
- return m;
-unm_err_out:
- unmap_mft_record(ni);
- mutex_unlock(&base_ni->extent_lock);
- atomic_dec(&base_ni->count);
- /*
- * If the extent inode was not attached to the base inode we need to
- * release it or we will leak memory.
- */
- if (destroy_ni)
- ntfs_clear_extent_inode(ni);
- return m;
-}
-
-#ifdef NTFS_RW
-
-/**
- * __mark_mft_record_dirty - set the mft record and the page containing it dirty
- * @ni: ntfs inode describing the mapped mft record
- *
- * Internal function. Users should call mark_mft_record_dirty() instead.
- *
- * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
- * as well as the page containing the mft record, dirty. Also, mark the base
- * vfs inode dirty. This ensures that any changes to the mft record are
- * written out to disk.
- *
- * NOTE: We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
- * on the base vfs inode, because even though file data may have been modified,
- * it is dirty in the inode meta data rather than the data page cache of the
- * inode, and thus there are no data pages that need writing out. Therefore, a
- * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
- * other hand, is not sufficient, because ->write_inode needs to be called even
- * in case of fdatasync. This needs to happen or the file data would not
- * necessarily hit the device synchronously, even though the vfs inode has the
- * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just
- * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
- * which is not what I_DIRTY_SYNC on its own would suggest.
- */
-void __mark_mft_record_dirty(ntfs_inode *ni)
-{
- ntfs_inode *base_ni;
-
- ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
- BUG_ON(NInoAttr(ni));
- mark_ntfs_record_dirty(ni->page, ni->page_ofs);
- /* Determine the base vfs inode and mark it dirty, too. */
- mutex_lock(&ni->extent_lock);
- if (likely(ni->nr_extents >= 0))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- mutex_unlock(&ni->extent_lock);
- __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC);
-}
-
-static const char *ntfs_please_email = "Please email "
- "linux-ntfs-dev@lists.sourceforge.net and say that you saw "
- "this message. Thank you.";
-
-/**
- * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror
- * @vol: ntfs volume on which the mft record to synchronize resides
- * @mft_no: mft record number of mft record to synchronize
- * @m: mapped, mst protected (extent) mft record to synchronize
- *
- * Write the mapped, mst protected (extent) mft record @m with mft record
- * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol,
- * bypassing the page cache and the $MFTMirr inode itself.
- *
- * This function is only for use at umount time when the mft mirror inode has
- * already been disposed off. We BUG() if we are called while the mft mirror
- * inode is still attached to the volume.
- *
- * On success return 0. On error return -errno.
- *
- * NOTE: This function is not implemented yet as I am not convinced it can
- * actually be triggered considering the sequence of commits we do in super.c::
- * ntfs_put_super(). But just in case we provide this place holder as the
- * alternative would be either to BUG() or to get a NULL pointer dereference
- * and Oops.
- */
-static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol,
- const unsigned long mft_no, MFT_RECORD *m)
-{
- BUG_ON(vol->mftmirr_ino);
- ntfs_error(vol->sb, "Umount time mft mirror syncing is not "
- "implemented yet. %s", ntfs_please_email);
- return -EOPNOTSUPP;
-}
-
-/**
- * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror
- * @vol: ntfs volume on which the mft record to synchronize resides
- * @mft_no: mft record number of mft record to synchronize
- * @m: mapped, mst protected (extent) mft record to synchronize
- * @sync: if true, wait for i/o completion
- *
- * Write the mapped, mst protected (extent) mft record @m with mft record
- * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol.
- *
- * On success return 0. On error return -errno and set the volume errors flag
- * in the ntfs volume @vol.
- *
- * NOTE: We always perform synchronous i/o and ignore the @sync parameter.
- *
- * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
- * schedule i/o via ->writepage or do it via kntfsd or whatever.
- */
-int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
- MFT_RECORD *m, int sync)
-{
- struct page *page;
- unsigned int blocksize = vol->sb->s_blocksize;
- int max_bhs = vol->mft_record_size / blocksize;
- struct buffer_head *bhs[MAX_BHS];
- struct buffer_head *bh, *head;
- u8 *kmirr;
- runlist_element *rl;
- unsigned int block_start, block_end, m_start, m_end, page_ofs;
- int i_bhs, nr_bhs, err = 0;
- unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
-
- ntfs_debug("Entering for inode 0x%lx.", mft_no);
- BUG_ON(!max_bhs);
- if (WARN_ON(max_bhs > MAX_BHS))
- return -EINVAL;
- if (unlikely(!vol->mftmirr_ino)) {
- /* This could happen during umount... */
- err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
- if (likely(!err))
- return err;
- goto err_out;
- }
- /* Get the page containing the mirror copy of the mft record @m. */
- page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >>
- (PAGE_SHIFT - vol->mft_record_size_bits));
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to map mft mirror page.");
- err = PTR_ERR(page);
- goto err_out;
- }
- lock_page(page);
- BUG_ON(!PageUptodate(page));
- ClearPageUptodate(page);
- /* Offset of the mft mirror record inside the page. */
- page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
- /* The address in the page of the mirror copy of the mft record @m. */
- kmirr = page_address(page) + page_ofs;
- /* Copy the mst protected mft record to the mirror. */
- memcpy(kmirr, m, vol->mft_record_size);
- /* Create uptodate buffers if not present. */
- if (unlikely(!page_has_buffers(page))) {
- struct buffer_head *tail;
-
- bh = head = alloc_page_buffers(page, blocksize, true);
- do {
- set_buffer_uptodate(bh);
- tail = bh;
- bh = bh->b_this_page;
- } while (bh);
- tail->b_this_page = head;
- attach_page_private(page, head);
- }
- bh = head = page_buffers(page);
- BUG_ON(!bh);
- rl = NULL;
- nr_bhs = 0;
- block_start = 0;
- m_start = kmirr - (u8*)page_address(page);
- m_end = m_start + vol->mft_record_size;
- do {
- block_end = block_start + blocksize;
- /* If the buffer is outside the mft record, skip it. */
- if (block_end <= m_start)
- continue;
- if (unlikely(block_start >= m_end))
- break;
- /* Need to map the buffer if it is not mapped already. */
- if (unlikely(!buffer_mapped(bh))) {
- VCN vcn;
- LCN lcn;
- unsigned int vcn_ofs;
-
- bh->b_bdev = vol->sb->s_bdev;
- /* Obtain the vcn and offset of the current block. */
- vcn = ((VCN)mft_no << vol->mft_record_size_bits) +
- (block_start - m_start);
- vcn_ofs = vcn & vol->cluster_size_mask;
- vcn >>= vol->cluster_size_bits;
- if (!rl) {
- down_read(&NTFS_I(vol->mftmirr_ino)->
- runlist.lock);
- rl = NTFS_I(vol->mftmirr_ino)->runlist.rl;
- /*
- * $MFTMirr always has the whole of its runlist
- * in memory.
- */
- BUG_ON(!rl);
- }
- /* Seek to element containing target vcn. */
- while (rl->length && rl[1].vcn <= vcn)
- rl++;
- lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
- /* For $MFTMirr, only lcn >= 0 is a successful remap. */
- if (likely(lcn >= 0)) {
- /* Setup buffer head to correct block. */
- bh->b_blocknr = ((lcn <<
- vol->cluster_size_bits) +
- vcn_ofs) >> blocksize_bits;
- set_buffer_mapped(bh);
- } else {
- bh->b_blocknr = -1;
- ntfs_error(vol->sb, "Cannot write mft mirror "
- "record 0x%lx because its "
- "location on disk could not "
- "be determined (error code "
- "%lli).", mft_no,
- (long long)lcn);
- err = -EIO;
- }
- }
- BUG_ON(!buffer_uptodate(bh));
- BUG_ON(!nr_bhs && (m_start != block_start));
- BUG_ON(nr_bhs >= max_bhs);
- bhs[nr_bhs++] = bh;
- BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
- } while (block_start = block_end, (bh = bh->b_this_page) != head);
- if (unlikely(rl))
- up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock);
- if (likely(!err)) {
- /* Lock buffers and start synchronous write i/o on them. */
- for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
- struct buffer_head *tbh = bhs[i_bhs];
-
- if (!trylock_buffer(tbh))
- BUG();
- BUG_ON(!buffer_uptodate(tbh));
- clear_buffer_dirty(tbh);
- get_bh(tbh);
- tbh->b_end_io = end_buffer_write_sync;
- submit_bh(REQ_OP_WRITE, tbh);
- }
- /* Wait on i/o completion of buffers. */
- for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
- struct buffer_head *tbh = bhs[i_bhs];
-
- wait_on_buffer(tbh);
- if (unlikely(!buffer_uptodate(tbh))) {
- err = -EIO;
- /*
- * Set the buffer uptodate so the page and
- * buffer states do not become out of sync.
- */
- set_buffer_uptodate(tbh);
- }
- }
- } else /* if (unlikely(err)) */ {
- /* Clean the buffers. */
- for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
- clear_buffer_dirty(bhs[i_bhs]);
- }
- /* Current state: all buffers are clean, unlocked, and uptodate. */
- /* Remove the mst protection fixups again. */
- post_write_mst_fixup((NTFS_RECORD*)kmirr);
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
- ntfs_unmap_page(page);
- if (likely(!err)) {
- ntfs_debug("Done.");
- } else {
- ntfs_error(vol->sb, "I/O error while writing mft mirror "
- "record 0x%lx!", mft_no);
-err_out:
- ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error "
- "code %i). Volume will be left marked dirty "
- "on umount. Run ntfsfix on the partition "
- "after umounting to correct this.", -err);
- NVolSetErrors(vol);
- }
- return err;
-}
-
-/**
- * write_mft_record_nolock - write out a mapped (extent) mft record
- * @ni: ntfs inode describing the mapped (extent) mft record
- * @m: mapped (extent) mft record to write
- * @sync: if true, wait for i/o completion
- *
- * Write the mapped (extent) mft record @m described by the (regular or extent)
- * ntfs inode @ni to backing store. If the mft record @m has a counterpart in
- * the mft mirror, that is also updated.
- *
- * We only write the mft record if the ntfs inode @ni is dirty and the first
- * buffer belonging to its mft record is dirty, too. We ignore the dirty state
- * of subsequent buffers because we could have raced with
- * fs/ntfs/aops.c::mark_ntfs_record_dirty().
- *
- * On success, clean the mft record and return 0. On error, leave the mft
- * record dirty and return -errno.
- *
- * NOTE: We always perform synchronous i/o and ignore the @sync parameter.
- * However, if the mft record has a counterpart in the mft mirror and @sync is
- * true, we write the mft record, wait for i/o completion, and only then write
- * the mft mirror copy. This ensures that if the system crashes either the mft
- * or the mft mirror will contain a self-consistent mft record @m. If @sync is
- * false on the other hand, we start i/o on both and then wait for completion
- * on them. This provides a speedup but no longer guarantees that you will end
- * up with a self-consistent mft record in the case of a crash but if you asked
- * for asynchronous writing you probably do not care about that anyway.
- *
- * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
- * schedule i/o via ->writepage or do it via kntfsd or whatever.
- */
-int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
-{
- ntfs_volume *vol = ni->vol;
- struct page *page = ni->page;
- unsigned int blocksize = vol->sb->s_blocksize;
- unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
- int max_bhs = vol->mft_record_size / blocksize;
- struct buffer_head *bhs[MAX_BHS];
- struct buffer_head *bh, *head;
- runlist_element *rl;
- unsigned int block_start, block_end, m_start, m_end;
- int i_bhs, nr_bhs, err = 0;
-
- ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
- BUG_ON(NInoAttr(ni));
- BUG_ON(!max_bhs);
- BUG_ON(!PageLocked(page));
- if (WARN_ON(max_bhs > MAX_BHS)) {
- err = -EINVAL;
- goto err_out;
- }
- /*
- * If the ntfs_inode is clean no need to do anything. If it is dirty,
- * mark it as clean now so that it can be redirtied later on if needed.
- * There is no danger of races since the caller is holding the locks
- * for the mft record @m and the page it is in.
- */
- if (!NInoTestClearDirty(ni))
- goto done;
- bh = head = page_buffers(page);
- BUG_ON(!bh);
- rl = NULL;
- nr_bhs = 0;
- block_start = 0;
- m_start = ni->page_ofs;
- m_end = m_start + vol->mft_record_size;
- do {
- block_end = block_start + blocksize;
- /* If the buffer is outside the mft record, skip it. */
- if (block_end <= m_start)
- continue;
- if (unlikely(block_start >= m_end))
- break;
- /*
- * If this block is not the first one in the record, we ignore
- * the buffer's dirty state because we could have raced with a
- * parallel mark_ntfs_record_dirty().
- */
- if (block_start == m_start) {
- /* This block is the first one in the record. */
- if (!buffer_dirty(bh)) {
- BUG_ON(nr_bhs);
- /* Clean records are not written out. */
- break;
- }
- }
- /* Need to map the buffer if it is not mapped already. */
- if (unlikely(!buffer_mapped(bh))) {
- VCN vcn;
- LCN lcn;
- unsigned int vcn_ofs;
-
- bh->b_bdev = vol->sb->s_bdev;
- /* Obtain the vcn and offset of the current block. */
- vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) +
- (block_start - m_start);
- vcn_ofs = vcn & vol->cluster_size_mask;
- vcn >>= vol->cluster_size_bits;
- if (!rl) {
- down_read(&NTFS_I(vol->mft_ino)->runlist.lock);
- rl = NTFS_I(vol->mft_ino)->runlist.rl;
- BUG_ON(!rl);
- }
- /* Seek to element containing target vcn. */
- while (rl->length && rl[1].vcn <= vcn)
- rl++;
- lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
- /* For $MFT, only lcn >= 0 is a successful remap. */
- if (likely(lcn >= 0)) {
- /* Setup buffer head to correct block. */
- bh->b_blocknr = ((lcn <<
- vol->cluster_size_bits) +
- vcn_ofs) >> blocksize_bits;
- set_buffer_mapped(bh);
- } else {
- bh->b_blocknr = -1;
- ntfs_error(vol->sb, "Cannot write mft record "
- "0x%lx because its location "
- "on disk could not be "
- "determined (error code %lli).",
- ni->mft_no, (long long)lcn);
- err = -EIO;
- }
- }
- BUG_ON(!buffer_uptodate(bh));
- BUG_ON(!nr_bhs && (m_start != block_start));
- BUG_ON(nr_bhs >= max_bhs);
- bhs[nr_bhs++] = bh;
- BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
- } while (block_start = block_end, (bh = bh->b_this_page) != head);
- if (unlikely(rl))
- up_read(&NTFS_I(vol->mft_ino)->runlist.lock);
- if (!nr_bhs)
- goto done;
- if (unlikely(err))
- goto cleanup_out;
- /* Apply the mst protection fixups. */
- err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size);
- if (err) {
- ntfs_error(vol->sb, "Failed to apply mst fixups!");
- goto cleanup_out;
- }
- flush_dcache_mft_record_page(ni);
- /* Lock buffers and start synchronous write i/o on them. */
- for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
- struct buffer_head *tbh = bhs[i_bhs];
-
- if (!trylock_buffer(tbh))
- BUG();
- BUG_ON(!buffer_uptodate(tbh));
- clear_buffer_dirty(tbh);
- get_bh(tbh);
- tbh->b_end_io = end_buffer_write_sync;
- submit_bh(REQ_OP_WRITE, tbh);
- }
- /* Synchronize the mft mirror now if not @sync. */
- if (!sync && ni->mft_no < vol->mftmirr_size)
- ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
- /* Wait on i/o completion of buffers. */
- for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
- struct buffer_head *tbh = bhs[i_bhs];
-
- wait_on_buffer(tbh);
- if (unlikely(!buffer_uptodate(tbh))) {
- err = -EIO;
- /*
- * Set the buffer uptodate so the page and buffer
- * states do not become out of sync.
- */
- if (PageUptodate(page))
- set_buffer_uptodate(tbh);
- }
- }
- /* If @sync, now synchronize the mft mirror. */
- if (sync && ni->mft_no < vol->mftmirr_size)
- ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
- /* Remove the mst protection fixups again. */
- post_write_mst_fixup((NTFS_RECORD*)m);
- flush_dcache_mft_record_page(ni);
- if (unlikely(err)) {
- /* I/O error during writing. This is really bad! */
- ntfs_error(vol->sb, "I/O error while writing mft record "
- "0x%lx! Marking base inode as bad. You "
- "should unmount the volume and run chkdsk.",
- ni->mft_no);
- goto err_out;
- }
-done:
- ntfs_debug("Done.");
- return 0;
-cleanup_out:
- /* Clean the buffers. */
- for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
- clear_buffer_dirty(bhs[i_bhs]);
-err_out:
- /*
- * Current state: all buffers are clean, unlocked, and uptodate.
- * The caller should mark the base inode as bad so that no more i/o
- * happens. ->clear_inode() will still be invoked so all extent inodes
- * and other allocated memory will be freed.
- */
- if (err == -ENOMEM) {
- ntfs_error(vol->sb, "Not enough memory to write mft record. "
- "Redirtying so the write is retried later.");
- mark_mft_record_dirty(ni);
- err = 0;
- } else
- NVolSetErrors(vol);
- return err;
-}
-
-/**
- * ntfs_may_write_mft_record - check if an mft record may be written out
- * @vol: [IN] ntfs volume on which the mft record to check resides
- * @mft_no: [IN] mft record number of the mft record to check
- * @m: [IN] mapped mft record to check
- * @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned
- *
- * Check if the mapped (base or extent) mft record @m with mft record number
- * @mft_no belonging to the ntfs volume @vol may be written out. If necessary
- * and possible the ntfs inode of the mft record is locked and the base vfs
- * inode is pinned. The locked ntfs inode is then returned in @locked_ni. The
- * caller is responsible for unlocking the ntfs inode and unpinning the base
- * vfs inode.
- *
- * Return 'true' if the mft record may be written out and 'false' if not.
- *
- * The caller has locked the page and cleared the uptodate flag on it which
- * means that we can safely write out any dirty mft records that do not have
- * their inodes in icache as determined by ilookup5() as anyone
- * opening/creating such an inode would block when attempting to map the mft
- * record in read_cache_page() until we are finished with the write out.
- *
- * Here is a description of the tests we perform:
- *
- * If the inode is found in icache we know the mft record must be a base mft
- * record. If it is dirty, we do not write it and return 'false' as the vfs
- * inode write paths will result in the access times being updated which would
- * cause the base mft record to be redirtied and written out again. (We know
- * the access time update will modify the base mft record because Windows
- * chkdsk complains if the standard information attribute is not in the base
- * mft record.)
- *
- * If the inode is in icache and not dirty, we attempt to lock the mft record
- * and if we find the lock was already taken, it is not safe to write the mft
- * record and we return 'false'.
- *
- * If we manage to obtain the lock we have exclusive access to the mft record,
- * which also allows us safe writeout of the mft record. We then set
- * @locked_ni to the locked ntfs inode and return 'true'.
- *
- * Note we cannot just lock the mft record and sleep while waiting for the lock
- * because this would deadlock due to lock reversal (normally the mft record is
- * locked before the page is locked but we already have the page locked here
- * when we try to lock the mft record).
- *
- * If the inode is not in icache we need to perform further checks.
- *
- * If the mft record is not a FILE record or it is a base mft record, we can
- * safely write it and return 'true'.
- *
- * We now know the mft record is an extent mft record. We check if the inode
- * corresponding to its base mft record is in icache and obtain a reference to
- * it if it is. If it is not, we can safely write it and return 'true'.
- *
- * We now have the base inode for the extent mft record. We check if it has an
- * ntfs inode for the extent mft record attached and if not it is safe to write
- * the extent mft record and we return 'true'.
- *
- * The ntfs inode for the extent mft record is attached to the base inode so we
- * attempt to lock the extent mft record and if we find the lock was already
- * taken, it is not safe to write the extent mft record and we return 'false'.
- *
- * If we manage to obtain the lock we have exclusive access to the extent mft
- * record, which also allows us safe writeout of the extent mft record. We
- * set the ntfs inode of the extent mft record clean and then set @locked_ni to
- * the now locked ntfs inode and return 'true'.
- *
- * Note, the reason for actually writing dirty mft records here and not just
- * relying on the vfs inode dirty code paths is that we can have mft records
- * modified without them ever having actual inodes in memory. Also we can have
- * dirty mft records with clean ntfs inodes in memory. None of the described
- * cases would result in the dirty mft records being written out if we only
- * relied on the vfs inode dirty code paths. And these cases can really occur
- * during allocation of new mft records and in particular when the
- * initialized_size of the $MFT/$DATA attribute is extended and the new space
- * is initialized using ntfs_mft_record_format(). The clean inode can then
- * appear if the mft record is reused for a new inode before it got written
- * out.
- */
-bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
- const MFT_RECORD *m, ntfs_inode **locked_ni)
-{
- struct super_block *sb = vol->sb;
- struct inode *mft_vi = vol->mft_ino;
- struct inode *vi;
- ntfs_inode *ni, *eni, **extent_nis;
- int i;
- ntfs_attr na;
-
- ntfs_debug("Entering for inode 0x%lx.", mft_no);
- /*
- * Normally we do not return a locked inode so set @locked_ni to NULL.
- */
- BUG_ON(!locked_ni);
- *locked_ni = NULL;
- /*
- * Check if the inode corresponding to this mft record is in the VFS
- * inode cache and obtain a reference to it if it is.
- */
- ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
- na.mft_no = mft_no;
- na.name = NULL;
- na.name_len = 0;
- na.type = AT_UNUSED;
- /*
- * Optimize inode 0, i.e. $MFT itself, since we have it in memory and
- * we get here for it rather often.
- */
- if (!mft_no) {
- /* Balance the below iput(). */
- vi = igrab(mft_vi);
- BUG_ON(vi != mft_vi);
- } else {
- /*
- * Have to use ilookup5_nowait() since ilookup5() waits for the
- * inode lock which causes ntfs to deadlock when a concurrent
- * inode write via the inode dirty code paths and the page
- * dirty code path of the inode dirty code path when writing
- * $MFT occurs.
- */
- vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na);
- }
- if (vi) {
- ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
- /* The inode is in icache. */
- ni = NTFS_I(vi);
- /* Take a reference to the ntfs inode. */
- atomic_inc(&ni->count);
- /* If the inode is dirty, do not write this record. */
- if (NInoDirty(ni)) {
- ntfs_debug("Inode 0x%lx is dirty, do not write it.",
- mft_no);
- atomic_dec(&ni->count);
- iput(vi);
- return false;
- }
- ntfs_debug("Inode 0x%lx is not dirty.", mft_no);
- /* The inode is not dirty, try to take the mft record lock. */
- if (unlikely(!mutex_trylock(&ni->mrec_lock))) {
- ntfs_debug("Mft record 0x%lx is already locked, do "
- "not write it.", mft_no);
- atomic_dec(&ni->count);
- iput(vi);
- return false;
- }
- ntfs_debug("Managed to lock mft record 0x%lx, write it.",
- mft_no);
- /*
- * The write has to occur while we hold the mft record lock so
- * return the locked ntfs inode.
- */
- *locked_ni = ni;
- return true;
- }
- ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
- /* The inode is not in icache. */
- /* Write the record if it is not a mft record (type "FILE"). */
- if (!ntfs_is_mft_record(m->magic)) {
- ntfs_debug("Mft record 0x%lx is not a FILE record, write it.",
- mft_no);
- return true;
- }
- /* Write the mft record if it is a base inode. */
- if (!m->base_mft_record) {
- ntfs_debug("Mft record 0x%lx is a base record, write it.",
- mft_no);
- return true;
- }
- /*
- * This is an extent mft record. Check if the inode corresponding to
- * its base mft record is in icache and obtain a reference to it if it
- * is.
- */
- na.mft_no = MREF_LE(m->base_mft_record);
- ntfs_debug("Mft record 0x%lx is an extent record. Looking for base "
- "inode 0x%lx in icache.", mft_no, na.mft_no);
- if (!na.mft_no) {
- /* Balance the below iput(). */
- vi = igrab(mft_vi);
- BUG_ON(vi != mft_vi);
- } else
- vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode,
- &na);
- if (!vi) {
- /*
- * The base inode is not in icache, write this extent mft
- * record.
- */
- ntfs_debug("Base inode 0x%lx is not in icache, write the "
- "extent record.", na.mft_no);
- return true;
- }
- ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
- /*
- * The base inode is in icache. Check if it has the extent inode
- * corresponding to this extent mft record attached.
- */
- ni = NTFS_I(vi);
- mutex_lock(&ni->extent_lock);
- if (ni->nr_extents <= 0) {
- /*
- * The base inode has no attached extent inodes, write this
- * extent mft record.
- */
- mutex_unlock(&ni->extent_lock);
- iput(vi);
- ntfs_debug("Base inode 0x%lx has no attached extent inodes, "
- "write the extent record.", na.mft_no);
- return true;
- }
- /* Iterate over the attached extent inodes. */
- extent_nis = ni->ext.extent_ntfs_inos;
- for (eni = NULL, i = 0; i < ni->nr_extents; ++i) {
- if (mft_no == extent_nis[i]->mft_no) {
- /*
- * Found the extent inode corresponding to this extent
- * mft record.
- */
- eni = extent_nis[i];
- break;
- }
- }
- /*
- * If the extent inode was not attached to the base inode, write this
- * extent mft record.
- */
- if (!eni) {
- mutex_unlock(&ni->extent_lock);
- iput(vi);
- ntfs_debug("Extent inode 0x%lx is not attached to its base "
- "inode 0x%lx, write the extent record.",
- mft_no, na.mft_no);
- return true;
- }
- ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.",
- mft_no, na.mft_no);
- /* Take a reference to the extent ntfs inode. */
- atomic_inc(&eni->count);
- mutex_unlock(&ni->extent_lock);
- /*
- * Found the extent inode coresponding to this extent mft record.
- * Try to take the mft record lock.
- */
- if (unlikely(!mutex_trylock(&eni->mrec_lock))) {
- atomic_dec(&eni->count);
- iput(vi);
- ntfs_debug("Extent mft record 0x%lx is already locked, do "
- "not write it.", mft_no);
- return false;
- }
- ntfs_debug("Managed to lock extent mft record 0x%lx, write it.",
- mft_no);
- if (NInoTestClearDirty(eni))
- ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.",
- mft_no);
- /*
- * The write has to occur while we hold the mft record lock so return
- * the locked extent ntfs inode.
- */
- *locked_ni = eni;
- return true;
-}
-
-static const char *es = " Leaving inconsistent metadata. Unmount and run "
- "chkdsk.";
-
-/**
- * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name
- * @vol: volume on which to search for a free mft record
- * @base_ni: open base inode if allocating an extent mft record or NULL
- *
- * Search for a free mft record in the mft bitmap attribute on the ntfs volume
- * @vol.
- *
- * If @base_ni is NULL start the search at the default allocator position.
- *
- * If @base_ni is not NULL start the search at the mft record after the base
- * mft record @base_ni.
- *
- * Return the free mft record on success and -errno on error. An error code of
- * -ENOSPC means that there are no free mft records in the currently
- * initialized mft bitmap.
- *
- * Locking: Caller must hold vol->mftbmp_lock for writing.
- */
-static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
- ntfs_inode *base_ni)
-{
- s64 pass_end, ll, data_pos, pass_start, ofs, bit;
- unsigned long flags;
- struct address_space *mftbmp_mapping;
- u8 *buf, *byte;
- struct page *page;
- unsigned int page_ofs, size;
- u8 pass, b;
-
- ntfs_debug("Searching for free mft record in the currently "
- "initialized mft bitmap.");
- mftbmp_mapping = vol->mftbmp_ino->i_mapping;
- /*
- * Set the end of the pass making sure we do not overflow the mft
- * bitmap.
- */
- read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags);
- pass_end = NTFS_I(vol->mft_ino)->allocated_size >>
- vol->mft_record_size_bits;
- read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags);
- read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
- ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3;
- read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
- if (pass_end > ll)
- pass_end = ll;
- pass = 1;
- if (!base_ni)
- data_pos = vol->mft_data_pos;
- else
- data_pos = base_ni->mft_no + 1;
- if (data_pos < 24)
- data_pos = 24;
- if (data_pos >= pass_end) {
- data_pos = 24;
- pass = 2;
- /* This happens on a freshly formatted volume. */
- if (data_pos >= pass_end)
- return -ENOSPC;
- }
- pass_start = data_pos;
- ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, "
- "pass_end 0x%llx, data_pos 0x%llx.", pass,
- (long long)pass_start, (long long)pass_end,
- (long long)data_pos);
- /* Loop until a free mft record is found. */
- for (; pass <= 2;) {
- /* Cap size to pass_end. */
- ofs = data_pos >> 3;
- page_ofs = ofs & ~PAGE_MASK;
- size = PAGE_SIZE - page_ofs;
- ll = ((pass_end + 7) >> 3) - ofs;
- if (size > ll)
- size = ll;
- size <<= 3;
- /*
- * If we are still within the active pass, search the next page
- * for a zero bit.
- */
- if (size) {
- page = ntfs_map_page(mftbmp_mapping,
- ofs >> PAGE_SHIFT);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to read mft "
- "bitmap, aborting.");
- return PTR_ERR(page);
- }
- buf = (u8*)page_address(page) + page_ofs;
- bit = data_pos & 7;
- data_pos &= ~7ull;
- ntfs_debug("Before inner for loop: size 0x%x, "
- "data_pos 0x%llx, bit 0x%llx", size,
- (long long)data_pos, (long long)bit);
- for (; bit < size && data_pos + bit < pass_end;
- bit &= ~7ull, bit += 8) {
- byte = buf + (bit >> 3);
- if (*byte == 0xff)
- continue;
- b = ffz((unsigned long)*byte);
- if (b < 8 && b >= (bit & 7)) {
- ll = data_pos + (bit & ~7ull) + b;
- if (unlikely(ll > (1ll << 32))) {
- ntfs_unmap_page(page);
- return -ENOSPC;
- }
- *byte |= 1 << b;
- flush_dcache_page(page);
- set_page_dirty(page);
- ntfs_unmap_page(page);
- ntfs_debug("Done. (Found and "
- "allocated mft record "
- "0x%llx.)",
- (long long)ll);
- return ll;
- }
- }
- ntfs_debug("After inner for loop: size 0x%x, "
- "data_pos 0x%llx, bit 0x%llx", size,
- (long long)data_pos, (long long)bit);
- data_pos += size;
- ntfs_unmap_page(page);
- /*
- * If the end of the pass has not been reached yet,
- * continue searching the mft bitmap for a zero bit.
- */
- if (data_pos < pass_end)
- continue;
- }
- /* Do the next pass. */
- if (++pass == 2) {
- /*
- * Starting the second pass, in which we scan the first
- * part of the zone which we omitted earlier.
- */
- pass_end = pass_start;
- data_pos = pass_start = 24;
- ntfs_debug("pass %i, pass_start 0x%llx, pass_end "
- "0x%llx.", pass, (long long)pass_start,
- (long long)pass_end);
- if (data_pos >= pass_end)
- break;
- }
- }
- /* No free mft records in currently initialized mft bitmap. */
- ntfs_debug("Done. (No free mft records left in currently initialized "
- "mft bitmap.)");
- return -ENOSPC;
-}
-
-/**
- * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster
- * @vol: volume on which to extend the mft bitmap attribute
- *
- * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster.
- *
- * Note: Only changes allocated_size, i.e. does not touch initialized_size or
- * data_size.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - Caller must hold vol->mftbmp_lock for writing.
- * - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for
- * writing and releases it before returning.
- * - This function takes vol->lcnbmp_lock for writing and releases it
- * before returning.
- */
-static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
-{
- LCN lcn;
- s64 ll;
- unsigned long flags;
- struct page *page;
- ntfs_inode *mft_ni, *mftbmp_ni;
- runlist_element *rl, *rl2 = NULL;
- ntfs_attr_search_ctx *ctx = NULL;
- MFT_RECORD *mrec;
- ATTR_RECORD *a = NULL;
- int ret, mp_size;
- u32 old_alen = 0;
- u8 *b, tb;
- struct {
- u8 added_cluster:1;
- u8 added_run:1;
- u8 mp_rebuilt:1;
- } status = { 0, 0, 0 };
-
- ntfs_debug("Extending mft bitmap allocation.");
- mft_ni = NTFS_I(vol->mft_ino);
- mftbmp_ni = NTFS_I(vol->mftbmp_ino);
- /*
- * Determine the last lcn of the mft bitmap. The allocated size of the
- * mft bitmap cannot be zero so we are ok to do this.
- */
- down_write(&mftbmp_ni->runlist.lock);
- read_lock_irqsave(&mftbmp_ni->size_lock, flags);
- ll = mftbmp_ni->allocated_size;
- read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
- rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
- (ll - 1) >> vol->cluster_size_bits, NULL);
- if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
- up_write(&mftbmp_ni->runlist.lock);
- ntfs_error(vol->sb, "Failed to determine last allocated "
- "cluster of mft bitmap attribute.");
- if (!IS_ERR(rl))
- ret = -EIO;
- else
- ret = PTR_ERR(rl);
- return ret;
- }
- lcn = rl->lcn + rl->length;
- ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.",
- (long long)lcn);
- /*
- * Attempt to get the cluster following the last allocated cluster by
- * hand as it may be in the MFT zone so the allocator would not give it
- * to us.
- */
- ll = lcn >> 3;
- page = ntfs_map_page(vol->lcnbmp_ino->i_mapping,
- ll >> PAGE_SHIFT);
- if (IS_ERR(page)) {
- up_write(&mftbmp_ni->runlist.lock);
- ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
- return PTR_ERR(page);
- }
- b = (u8*)page_address(page) + (ll & ~PAGE_MASK);
- tb = 1 << (lcn & 7ull);
- down_write(&vol->lcnbmp_lock);
- if (*b != 0xff && !(*b & tb)) {
- /* Next cluster is free, allocate it. */
- *b |= tb;
- flush_dcache_page(page);
- set_page_dirty(page);
- up_write(&vol->lcnbmp_lock);
- ntfs_unmap_page(page);
- /* Update the mft bitmap runlist. */
- rl->length++;
- rl[1].vcn++;
- status.added_cluster = 1;
- ntfs_debug("Appending one cluster to mft bitmap.");
- } else {
- up_write(&vol->lcnbmp_lock);
- ntfs_unmap_page(page);
- /* Allocate a cluster from the DATA_ZONE. */
- rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE,
- true);
- if (IS_ERR(rl2)) {
- up_write(&mftbmp_ni->runlist.lock);
- ntfs_error(vol->sb, "Failed to allocate a cluster for "
- "the mft bitmap.");
- return PTR_ERR(rl2);
- }
- rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2);
- if (IS_ERR(rl)) {
- up_write(&mftbmp_ni->runlist.lock);
- ntfs_error(vol->sb, "Failed to merge runlists for mft "
- "bitmap.");
- if (ntfs_cluster_free_from_rl(vol, rl2)) {
- ntfs_error(vol->sb, "Failed to deallocate "
- "allocated cluster.%s", es);
- NVolSetErrors(vol);
- }
- ntfs_free(rl2);
- return PTR_ERR(rl);
- }
- mftbmp_ni->runlist.rl = rl;
- status.added_run = 1;
- ntfs_debug("Adding one run to mft bitmap.");
- /* Find the last run in the new runlist. */
- for (; rl[1].length; rl++)
- ;
- }
- /*
- * Update the attribute record as well. Note: @rl is the last
- * (non-terminator) runlist element of mft bitmap.
- */
- mrec = map_mft_record(mft_ni);
- if (IS_ERR(mrec)) {
- ntfs_error(vol->sb, "Failed to map mft record.");
- ret = PTR_ERR(mrec);
- goto undo_alloc;
- }
- ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
- if (unlikely(!ctx)) {
- ntfs_error(vol->sb, "Failed to get search context.");
- ret = -ENOMEM;
- goto undo_alloc;
- }
- ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
- mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
- 0, ctx);
- if (unlikely(ret)) {
- ntfs_error(vol->sb, "Failed to find last attribute extent of "
- "mft bitmap attribute.");
- if (ret == -ENOENT)
- ret = -EIO;
- goto undo_alloc;
- }
- a = ctx->attr;
- ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
- /* Search back for the previous last allocated cluster of mft bitmap. */
- for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) {
- if (ll >= rl2->vcn)
- break;
- }
- BUG_ON(ll < rl2->vcn);
- BUG_ON(ll >= rl2->vcn + rl2->length);
- /* Get the size for the new mapping pairs array for this extent. */
- mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
- if (unlikely(mp_size <= 0)) {
- ntfs_error(vol->sb, "Get size for mapping pairs failed for "
- "mft bitmap attribute extent.");
- ret = mp_size;
- if (!ret)
- ret = -EIO;
- goto undo_alloc;
- }
- /* Expand the attribute record if necessary. */
- old_alen = le32_to_cpu(a->length);
- ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
- if (unlikely(ret)) {
- if (ret != -ENOSPC) {
- ntfs_error(vol->sb, "Failed to resize attribute "
- "record for mft bitmap attribute.");
- goto undo_alloc;
- }
- // TODO: Deal with this by moving this extent to a new mft
- // record or by starting a new extent in a new mft record or by
- // moving other attributes out of this mft record.
- // Note: It will need to be a special mft record and if none of
- // those are available it gets rather complicated...
- ntfs_error(vol->sb, "Not enough space in this mft record to "
- "accommodate extended mft bitmap attribute "
- "extent. Cannot handle this yet.");
- ret = -EOPNOTSUPP;
- goto undo_alloc;
- }
- status.mp_rebuilt = 1;
- /* Generate the mapping pairs array directly into the attr record. */
- ret = ntfs_mapping_pairs_build(vol, (u8*)a +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
- mp_size, rl2, ll, -1, NULL);
- if (unlikely(ret)) {
- ntfs_error(vol->sb, "Failed to build mapping pairs array for "
- "mft bitmap attribute.");
- goto undo_alloc;
- }
- /* Update the highest_vcn. */
- a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
- /*
- * We now have extended the mft bitmap allocated_size by one cluster.
- * Reflect this in the ntfs_inode structure and the attribute record.
- */
- if (a->data.non_resident.lowest_vcn) {
- /*
- * We are not in the first attribute extent, switch to it, but
- * first ensure the changes will make it to disk later.
- */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_reinit_search_ctx(ctx);
- ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
- mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL,
- 0, ctx);
- if (unlikely(ret)) {
- ntfs_error(vol->sb, "Failed to find first attribute "
- "extent of mft bitmap attribute.");
- goto restore_undo_alloc;
- }
- a = ctx->attr;
- }
- write_lock_irqsave(&mftbmp_ni->size_lock, flags);
- mftbmp_ni->allocated_size += vol->cluster_size;
- a->data.non_resident.allocated_size =
- cpu_to_sle64(mftbmp_ni->allocated_size);
- write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
- /* Ensure the changes make it to disk. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
- up_write(&mftbmp_ni->runlist.lock);
- ntfs_debug("Done.");
- return 0;
-restore_undo_alloc:
- ntfs_attr_reinit_search_ctx(ctx);
- if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
- mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
- 0, ctx)) {
- ntfs_error(vol->sb, "Failed to find last attribute extent of "
- "mft bitmap attribute.%s", es);
- write_lock_irqsave(&mftbmp_ni->size_lock, flags);
- mftbmp_ni->allocated_size += vol->cluster_size;
- write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
- up_write(&mftbmp_ni->runlist.lock);
- /*
- * The only thing that is now wrong is ->allocated_size of the
- * base attribute extent which chkdsk should be able to fix.
- */
- NVolSetErrors(vol);
- return ret;
- }
- a = ctx->attr;
- a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2);
-undo_alloc:
- if (status.added_cluster) {
- /* Truncate the last run in the runlist by one cluster. */
- rl->length--;
- rl[1].vcn--;
- } else if (status.added_run) {
- lcn = rl->lcn;
- /* Remove the last run from the runlist. */
- rl->lcn = rl[1].lcn;
- rl->length = 0;
- }
- /* Deallocate the cluster. */
- down_write(&vol->lcnbmp_lock);
- if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
- ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es);
- NVolSetErrors(vol);
- }
- up_write(&vol->lcnbmp_lock);
- if (status.mp_rebuilt) {
- if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset),
- old_alen - le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset),
- rl2, ll, -1, NULL)) {
- ntfs_error(vol->sb, "Failed to restore mapping pairs "
- "array.%s", es);
- NVolSetErrors(vol);
- }
- if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
- ntfs_error(vol->sb, "Failed to restore attribute "
- "record.%s", es);
- NVolSetErrors(vol);
- }
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- }
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (!IS_ERR(mrec))
- unmap_mft_record(mft_ni);
- up_write(&mftbmp_ni->runlist.lock);
- return ret;
-}
-
-/**
- * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
- * @vol: volume on which to extend the mft bitmap attribute
- *
- * Extend the initialized portion of the mft bitmap attribute on the ntfs
- * volume @vol by 8 bytes.
- *
- * Note: Only changes initialized_size and data_size, i.e. requires that
- * allocated_size is big enough to fit the new initialized_size.
- *
- * Return 0 on success and -error on error.
- *
- * Locking: Caller must hold vol->mftbmp_lock for writing.
- */
-static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
-{
- s64 old_data_size, old_initialized_size;
- unsigned long flags;
- struct inode *mftbmp_vi;
- ntfs_inode *mft_ni, *mftbmp_ni;
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *mrec;
- ATTR_RECORD *a;
- int ret;
-
- ntfs_debug("Extending mft bitmap initiailized (and data) size.");
- mft_ni = NTFS_I(vol->mft_ino);
- mftbmp_vi = vol->mftbmp_ino;
- mftbmp_ni = NTFS_I(mftbmp_vi);
- /* Get the attribute record. */
- mrec = map_mft_record(mft_ni);
- if (IS_ERR(mrec)) {
- ntfs_error(vol->sb, "Failed to map mft record.");
- return PTR_ERR(mrec);
- }
- ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
- if (unlikely(!ctx)) {
- ntfs_error(vol->sb, "Failed to get search context.");
- ret = -ENOMEM;
- goto unm_err_out;
- }
- ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
- mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(ret)) {
- ntfs_error(vol->sb, "Failed to find first attribute extent of "
- "mft bitmap attribute.");
- if (ret == -ENOENT)
- ret = -EIO;
- goto put_err_out;
- }
- a = ctx->attr;
- write_lock_irqsave(&mftbmp_ni->size_lock, flags);
- old_data_size = i_size_read(mftbmp_vi);
- old_initialized_size = mftbmp_ni->initialized_size;
- /*
- * We can simply update the initialized_size before filling the space
- * with zeroes because the caller is holding the mft bitmap lock for
- * writing which ensures that no one else is trying to access the data.
- */
- mftbmp_ni->initialized_size += 8;
- a->data.non_resident.initialized_size =
- cpu_to_sle64(mftbmp_ni->initialized_size);
- if (mftbmp_ni->initialized_size > old_data_size) {
- i_size_write(mftbmp_vi, mftbmp_ni->initialized_size);
- a->data.non_resident.data_size =
- cpu_to_sle64(mftbmp_ni->initialized_size);
- }
- write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
- /* Ensure the changes make it to disk. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
- /* Initialize the mft bitmap attribute value with zeroes. */
- ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0);
- if (likely(!ret)) {
- ntfs_debug("Done. (Wrote eight initialized bytes to mft "
- "bitmap.");
- return 0;
- }
- ntfs_error(vol->sb, "Failed to write to mft bitmap.");
- /* Try to recover from the error. */
- mrec = map_mft_record(mft_ni);
- if (IS_ERR(mrec)) {
- ntfs_error(vol->sb, "Failed to map mft record.%s", es);
- NVolSetErrors(vol);
- return ret;
- }
- ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
- if (unlikely(!ctx)) {
- ntfs_error(vol->sb, "Failed to get search context.%s", es);
- NVolSetErrors(vol);
- goto unm_err_out;
- }
- if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
- mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) {
- ntfs_error(vol->sb, "Failed to find first attribute extent of "
- "mft bitmap attribute.%s", es);
- NVolSetErrors(vol);
-put_err_out:
- ntfs_attr_put_search_ctx(ctx);
-unm_err_out:
- unmap_mft_record(mft_ni);
- goto err_out;
- }
- a = ctx->attr;
- write_lock_irqsave(&mftbmp_ni->size_lock, flags);
- mftbmp_ni->initialized_size = old_initialized_size;
- a->data.non_resident.initialized_size =
- cpu_to_sle64(old_initialized_size);
- if (i_size_read(mftbmp_vi) != old_data_size) {
- i_size_write(mftbmp_vi, old_data_size);
- a->data.non_resident.data_size = cpu_to_sle64(old_data_size);
- }
- write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
-#ifdef DEBUG
- read_lock_irqsave(&mftbmp_ni->size_lock, flags);
- ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
- "data_size 0x%llx, initialized_size 0x%llx.",
- (long long)mftbmp_ni->allocated_size,
- (long long)i_size_read(mftbmp_vi),
- (long long)mftbmp_ni->initialized_size);
- read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-#endif /* DEBUG */
-err_out:
- return ret;
-}
-
-/**
- * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
- * @vol: volume on which to extend the mft data attribute
- *
- * Extend the mft data attribute on the ntfs volume @vol by 16 mft records
- * worth of clusters or if not enough space for this by one mft record worth
- * of clusters.
- *
- * Note: Only changes allocated_size, i.e. does not touch initialized_size or
- * data_size.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - Caller must hold vol->mftbmp_lock for writing.
- * - This function takes NTFS_I(vol->mft_ino)->runlist.lock for
- * writing and releases it before returning.
- * - This function calls functions which take vol->lcnbmp_lock for
- * writing and release it before returning.
- */
-static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
-{
- LCN lcn;
- VCN old_last_vcn;
- s64 min_nr, nr, ll;
- unsigned long flags;
- ntfs_inode *mft_ni;
- runlist_element *rl, *rl2;
- ntfs_attr_search_ctx *ctx = NULL;
- MFT_RECORD *mrec;
- ATTR_RECORD *a = NULL;
- int ret, mp_size;
- u32 old_alen = 0;
- bool mp_rebuilt = false;
-
- ntfs_debug("Extending mft data allocation.");
- mft_ni = NTFS_I(vol->mft_ino);
- /*
- * Determine the preferred allocation location, i.e. the last lcn of
- * the mft data attribute. The allocated size of the mft data
- * attribute cannot be zero so we are ok to do this.
- */
- down_write(&mft_ni->runlist.lock);
- read_lock_irqsave(&mft_ni->size_lock, flags);
- ll = mft_ni->allocated_size;
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- rl = ntfs_attr_find_vcn_nolock(mft_ni,
- (ll - 1) >> vol->cluster_size_bits, NULL);
- if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
- up_write(&mft_ni->runlist.lock);
- ntfs_error(vol->sb, "Failed to determine last allocated "
- "cluster of mft data attribute.");
- if (!IS_ERR(rl))
- ret = -EIO;
- else
- ret = PTR_ERR(rl);
- return ret;
- }
- lcn = rl->lcn + rl->length;
- ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn);
- /* Minimum allocation is one mft record worth of clusters. */
- min_nr = vol->mft_record_size >> vol->cluster_size_bits;
- if (!min_nr)
- min_nr = 1;
- /* Want to allocate 16 mft records worth of clusters. */
- nr = vol->mft_record_size << 4 >> vol->cluster_size_bits;
- if (!nr)
- nr = min_nr;
- /* Ensure we do not go above 2^32-1 mft records. */
- read_lock_irqsave(&mft_ni->size_lock, flags);
- ll = mft_ni->allocated_size;
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
- vol->mft_record_size_bits >= (1ll << 32))) {
- nr = min_nr;
- if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
- vol->mft_record_size_bits >= (1ll << 32))) {
- ntfs_warning(vol->sb, "Cannot allocate mft record "
- "because the maximum number of inodes "
- "(2^32) has already been reached.");
- up_write(&mft_ni->runlist.lock);
- return -ENOSPC;
- }
- }
- ntfs_debug("Trying mft data allocation with %s cluster count %lli.",
- nr > min_nr ? "default" : "minimal", (long long)nr);
- old_last_vcn = rl[1].vcn;
- do {
- rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
- true);
- if (!IS_ERR(rl2))
- break;
- if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
- ntfs_error(vol->sb, "Failed to allocate the minimal "
- "number of clusters (%lli) for the "
- "mft data attribute.", (long long)nr);
- up_write(&mft_ni->runlist.lock);
- return PTR_ERR(rl2);
- }
- /*
- * There is not enough space to do the allocation, but there
- * might be enough space to do a minimal allocation so try that
- * before failing.
- */
- nr = min_nr;
- ntfs_debug("Retrying mft data allocation with minimal cluster "
- "count %lli.", (long long)nr);
- } while (1);
- rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2);
- if (IS_ERR(rl)) {
- up_write(&mft_ni->runlist.lock);
- ntfs_error(vol->sb, "Failed to merge runlists for mft data "
- "attribute.");
- if (ntfs_cluster_free_from_rl(vol, rl2)) {
- ntfs_error(vol->sb, "Failed to deallocate clusters "
- "from the mft data attribute.%s", es);
- NVolSetErrors(vol);
- }
- ntfs_free(rl2);
- return PTR_ERR(rl);
- }
- mft_ni->runlist.rl = rl;
- ntfs_debug("Allocated %lli clusters.", (long long)nr);
- /* Find the last run in the new runlist. */
- for (; rl[1].length; rl++)
- ;
- /* Update the attribute record as well. */
- mrec = map_mft_record(mft_ni);
- if (IS_ERR(mrec)) {
- ntfs_error(vol->sb, "Failed to map mft record.");
- ret = PTR_ERR(mrec);
- goto undo_alloc;
- }
- ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
- if (unlikely(!ctx)) {
- ntfs_error(vol->sb, "Failed to get search context.");
- ret = -ENOMEM;
- goto undo_alloc;
- }
- ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
- CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx);
- if (unlikely(ret)) {
- ntfs_error(vol->sb, "Failed to find last attribute extent of "
- "mft data attribute.");
- if (ret == -ENOENT)
- ret = -EIO;
- goto undo_alloc;
- }
- a = ctx->attr;
- ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
- /* Search back for the previous last allocated cluster of mft bitmap. */
- for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) {
- if (ll >= rl2->vcn)
- break;
- }
- BUG_ON(ll < rl2->vcn);
- BUG_ON(ll >= rl2->vcn + rl2->length);
- /* Get the size for the new mapping pairs array for this extent. */
- mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
- if (unlikely(mp_size <= 0)) {
- ntfs_error(vol->sb, "Get size for mapping pairs failed for "
- "mft data attribute extent.");
- ret = mp_size;
- if (!ret)
- ret = -EIO;
- goto undo_alloc;
- }
- /* Expand the attribute record if necessary. */
- old_alen = le32_to_cpu(a->length);
- ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
- if (unlikely(ret)) {
- if (ret != -ENOSPC) {
- ntfs_error(vol->sb, "Failed to resize attribute "
- "record for mft data attribute.");
- goto undo_alloc;
- }
- // TODO: Deal with this by moving this extent to a new mft
- // record or by starting a new extent in a new mft record or by
- // moving other attributes out of this mft record.
- // Note: Use the special reserved mft records and ensure that
- // this extent is not required to find the mft record in
- // question. If no free special records left we would need to
- // move an existing record away, insert ours in its place, and
- // then place the moved record into the newly allocated space
- // and we would then need to update all references to this mft
- // record appropriately. This is rather complicated...
- ntfs_error(vol->sb, "Not enough space in this mft record to "
- "accommodate extended mft data attribute "
- "extent. Cannot handle this yet.");
- ret = -EOPNOTSUPP;
- goto undo_alloc;
- }
- mp_rebuilt = true;
- /* Generate the mapping pairs array directly into the attr record. */
- ret = ntfs_mapping_pairs_build(vol, (u8*)a +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
- mp_size, rl2, ll, -1, NULL);
- if (unlikely(ret)) {
- ntfs_error(vol->sb, "Failed to build mapping pairs array of "
- "mft data attribute.");
- goto undo_alloc;
- }
- /* Update the highest_vcn. */
- a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
- /*
- * We now have extended the mft data allocated_size by nr clusters.
- * Reflect this in the ntfs_inode structure and the attribute record.
- * @rl is the last (non-terminator) runlist element of mft data
- * attribute.
- */
- if (a->data.non_resident.lowest_vcn) {
- /*
- * We are not in the first attribute extent, switch to it, but
- * first ensure the changes will make it to disk later.
- */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_reinit_search_ctx(ctx);
- ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name,
- mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0,
- ctx);
- if (unlikely(ret)) {
- ntfs_error(vol->sb, "Failed to find first attribute "
- "extent of mft data attribute.");
- goto restore_undo_alloc;
- }
- a = ctx->attr;
- }
- write_lock_irqsave(&mft_ni->size_lock, flags);
- mft_ni->allocated_size += nr << vol->cluster_size_bits;
- a->data.non_resident.allocated_size =
- cpu_to_sle64(mft_ni->allocated_size);
- write_unlock_irqrestore(&mft_ni->size_lock, flags);
- /* Ensure the changes make it to disk. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
- up_write(&mft_ni->runlist.lock);
- ntfs_debug("Done.");
- return 0;
-restore_undo_alloc:
- ntfs_attr_reinit_search_ctx(ctx);
- if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
- CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) {
- ntfs_error(vol->sb, "Failed to find last attribute extent of "
- "mft data attribute.%s", es);
- write_lock_irqsave(&mft_ni->size_lock, flags);
- mft_ni->allocated_size += nr << vol->cluster_size_bits;
- write_unlock_irqrestore(&mft_ni->size_lock, flags);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
- up_write(&mft_ni->runlist.lock);
- /*
- * The only thing that is now wrong is ->allocated_size of the
- * base attribute extent which chkdsk should be able to fix.
- */
- NVolSetErrors(vol);
- return ret;
- }
- ctx->attr->data.non_resident.highest_vcn =
- cpu_to_sle64(old_last_vcn - 1);
-undo_alloc:
- if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) {
- ntfs_error(vol->sb, "Failed to free clusters from mft data "
- "attribute.%s", es);
- NVolSetErrors(vol);
- }
-
- if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
- ntfs_error(vol->sb, "Failed to truncate mft data attribute "
- "runlist.%s", es);
- NVolSetErrors(vol);
- }
- if (ctx) {
- a = ctx->attr;
- if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
- if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset),
- old_alen - le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset),
- rl2, ll, -1, NULL)) {
- ntfs_error(vol->sb, "Failed to restore mapping pairs "
- "array.%s", es);
- NVolSetErrors(vol);
- }
- if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
- ntfs_error(vol->sb, "Failed to restore attribute "
- "record.%s", es);
- NVolSetErrors(vol);
- }
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- } else if (IS_ERR(ctx->mrec)) {
- ntfs_error(vol->sb, "Failed to restore attribute search "
- "context.%s", es);
- NVolSetErrors(vol);
- }
- ntfs_attr_put_search_ctx(ctx);
- }
- if (!IS_ERR(mrec))
- unmap_mft_record(mft_ni);
- up_write(&mft_ni->runlist.lock);
- return ret;
-}
-
-/**
- * ntfs_mft_record_layout - layout an mft record into a memory buffer
- * @vol: volume to which the mft record will belong
- * @mft_no: mft reference specifying the mft record number
- * @m: destination buffer of size >= @vol->mft_record_size bytes
- *
- * Layout an empty, unused mft record with the mft record number @mft_no into
- * the buffer @m. The volume @vol is needed because the mft record structure
- * was modified in NTFS 3.1 so we need to know which volume version this mft
- * record will be used on.
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
- MFT_RECORD *m)
-{
- ATTR_RECORD *a;
-
- ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
- if (mft_no >= (1ll << 32)) {
- ntfs_error(vol->sb, "Mft record number 0x%llx exceeds "
- "maximum of 2^32.", (long long)mft_no);
- return -ERANGE;
- }
- /* Start by clearing the whole mft record to gives us a clean slate. */
- memset(m, 0, vol->mft_record_size);
- /* Aligned to 2-byte boundary. */
- if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver))
- m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1);
- else {
- m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
- /*
- * Set the NTFS 3.1+ specific fields while we know that the
- * volume version is 3.1+.
- */
- m->reserved = 0;
- m->mft_record_number = cpu_to_le32((u32)mft_no);
- }
- m->magic = magic_FILE;
- if (vol->mft_record_size >= NTFS_BLOCK_SIZE)
- m->usa_count = cpu_to_le16(vol->mft_record_size /
- NTFS_BLOCK_SIZE + 1);
- else {
- m->usa_count = cpu_to_le16(1);
- ntfs_warning(vol->sb, "Sector size is bigger than mft record "
- "size. Setting usa_count to 1. If chkdsk "
- "reports this as corruption, please email "
- "linux-ntfs-dev@lists.sourceforge.net stating "
- "that you saw this message and that the "
- "modified filesystem created was corrupt. "
- "Thank you.");
- }
- /* Set the update sequence number to 1. */
- *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1);
- m->lsn = 0;
- m->sequence_number = cpu_to_le16(1);
- m->link_count = 0;
- /*
- * Place the attributes straight after the update sequence array,
- * aligned to 8-byte boundary.
- */
- m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
- (le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
- m->flags = 0;
- /*
- * Using attrs_offset plus eight bytes (for the termination attribute).
- * attrs_offset is already aligned to 8-byte boundary, so no need to
- * align again.
- */
- m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8);
- m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
- m->base_mft_record = 0;
- m->next_attr_instance = 0;
- /* Add the termination attribute. */
- a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
- a->type = AT_END;
- a->length = 0;
- ntfs_debug("Done.");
- return 0;
-}
-
-/**
- * ntfs_mft_record_format - format an mft record on an ntfs volume
- * @vol: volume on which to format the mft record
- * @mft_no: mft record number to format
- *
- * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused
- * mft record into the appropriate place of the mft data attribute. This is
- * used when extending the mft data attribute.
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
-{
- loff_t i_size;
- struct inode *mft_vi = vol->mft_ino;
- struct page *page;
- MFT_RECORD *m;
- pgoff_t index, end_index;
- unsigned int ofs;
- int err;
-
- ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
- /*
- * The index into the page cache and the offset within the page cache
- * page of the wanted mft record.
- */
- index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT;
- ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
- /* The maximum valid index into the page cache for $MFT's data. */
- i_size = i_size_read(mft_vi);
- end_index = i_size >> PAGE_SHIFT;
- if (unlikely(index >= end_index)) {
- if (unlikely(index > end_index || ofs + vol->mft_record_size >=
- (i_size & ~PAGE_MASK))) {
- ntfs_error(vol->sb, "Tried to format non-existing mft "
- "record 0x%llx.", (long long)mft_no);
- return -ENOENT;
- }
- }
- /* Read, map, and pin the page containing the mft record. */
- page = ntfs_map_page(mft_vi->i_mapping, index);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to map page containing mft record "
- "to format 0x%llx.", (long long)mft_no);
- return PTR_ERR(page);
- }
- lock_page(page);
- BUG_ON(!PageUptodate(page));
- ClearPageUptodate(page);
- m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
- err = ntfs_mft_record_layout(vol, mft_no, m);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.",
- (long long)mft_no);
- SetPageUptodate(page);
- unlock_page(page);
- ntfs_unmap_page(page);
- return err;
- }
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
- /*
- * Make sure the mft record is written out to disk. We could use
- * ilookup5() to check if an inode is in icache and so on but this is
- * unnecessary as ntfs_writepage() will write the dirty record anyway.
- */
- mark_ntfs_record_dirty(page, ofs);
- ntfs_unmap_page(page);
- ntfs_debug("Done.");
- return 0;
-}
-
-/**
- * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
- * @vol: [IN] volume on which to allocate the mft record
- * @mode: [IN] mode if want a file or directory, i.e. base inode or 0
- * @base_ni: [IN] open base inode if allocating an extent mft record or NULL
- * @mrec: [OUT] on successful return this is the mapped mft record
- *
- * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol.
- *
- * If @base_ni is NULL make the mft record a base mft record, i.e. a file or
- * direvctory inode, and allocate it at the default allocator position. In
- * this case @mode is the file mode as given to us by the caller. We in
- * particular use @mode to distinguish whether a file or a directory is being
- * created (S_IFDIR(mode) and S_IFREG(mode), respectively).
- *
- * If @base_ni is not NULL make the allocated mft record an extent record,
- * allocate it starting at the mft record after the base mft record and attach
- * the allocated and opened ntfs inode to the base inode @base_ni. In this
- * case @mode must be 0 as it is meaningless for extent inodes.
- *
- * You need to check the return value with IS_ERR(). If false, the function
- * was successful and the return value is the now opened ntfs inode of the
- * allocated mft record. *@mrec is then set to the allocated, mapped, pinned,
- * and locked mft record. If IS_ERR() is true, the function failed and the
- * error code is obtained from PTR_ERR(return value). *@mrec is undefined in
- * this case.
- *
- * Allocation strategy:
- *
- * To find a free mft record, we scan the mft bitmap for a zero bit. To
- * optimize this we start scanning at the place specified by @base_ni or if
- * @base_ni is NULL we start where we last stopped and we perform wrap around
- * when we reach the end. Note, we do not try to allocate mft records below
- * number 24 because numbers 0 to 15 are the defined system files anyway and 16
- * to 24 are special in that they are used for storing extension mft records
- * for the $DATA attribute of $MFT. This is required to avoid the possibility
- * of creating a runlist with a circular dependency which once written to disk
- * can never be read in again. Windows will only use records 16 to 24 for
- * normal files if the volume is completely out of space. We never use them
- * which means that when the volume is really out of space we cannot create any
- * more files while Windows can still create up to 8 small files. We can start
- * doing this at some later time, it does not matter much for now.
- *
- * When scanning the mft bitmap, we only search up to the last allocated mft
- * record. If there are no free records left in the range 24 to number of
- * allocated mft records, then we extend the $MFT/$DATA attribute in order to
- * create free mft records. We extend the allocated size of $MFT/$DATA by 16
- * records at a time or one cluster, if cluster size is above 16kiB. If there
- * is not sufficient space to do this, we try to extend by a single mft record
- * or one cluster, if cluster size is above the mft record size.
- *
- * No matter how many mft records we allocate, we initialize only the first
- * allocated mft record, incrementing mft data size and initialized size
- * accordingly, open an ntfs_inode for it and return it to the caller, unless
- * there are less than 24 mft records, in which case we allocate and initialize
- * mft records until we reach record 24 which we consider as the first free mft
- * record for use by normal files.
- *
- * If during any stage we overflow the initialized data in the mft bitmap, we
- * extend the initialized size (and data size) by 8 bytes, allocating another
- * cluster if required. The bitmap data size has to be at least equal to the
- * number of mft records in the mft, but it can be bigger, in which case the
- * superflous bits are padded with zeroes.
- *
- * Thus, when we return successfully (IS_ERR() is false), we will have:
- * - initialized / extended the mft bitmap if necessary,
- * - initialized / extended the mft data if necessary,
- * - set the bit corresponding to the mft record being allocated in the
- * mft bitmap,
- * - opened an ntfs_inode for the allocated mft record, and we will have
- * - returned the ntfs_inode as well as the allocated mapped, pinned, and
- * locked mft record.
- *
- * On error, the volume will be left in a consistent state and no record will
- * be allocated. If rolling back a partial operation fails, we may leave some
- * inconsistent metadata in which case we set NVolErrors() so the volume is
- * left dirty when unmounted.
- *
- * Note, this function cannot make use of most of the normal functions, like
- * for example for attribute resizing, etc, because when the run list overflows
- * the base mft record and an attribute list is used, it is very important that
- * the extension mft records used to store the $DATA attribute of $MFT can be
- * reached without having to read the information contained inside them, as
- * this would make it impossible to find them in the first place after the
- * volume is unmounted. $MFT/$BITMAP probably does not need to follow this
- * rule because the bitmap is not essential for finding the mft records, but on
- * the other hand, handling the bitmap in this special way would make life
- * easier because otherwise there might be circular invocations of functions
- * when reading the bitmap.
- */
-ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
- ntfs_inode *base_ni, MFT_RECORD **mrec)
-{
- s64 ll, bit, old_data_initialized, old_data_size;
- unsigned long flags;
- struct inode *vi;
- struct page *page;
- ntfs_inode *mft_ni, *mftbmp_ni, *ni;
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- pgoff_t index;
- unsigned int ofs;
- int err;
- le16 seq_no, usn;
- bool record_formatted = false;
-
- if (base_ni) {
- ntfs_debug("Entering (allocating an extent mft record for "
- "base mft record 0x%llx).",
- (long long)base_ni->mft_no);
- /* @mode and @base_ni are mutually exclusive. */
- BUG_ON(mode);
- } else
- ntfs_debug("Entering (allocating a base mft record).");
- if (mode) {
- /* @mode and @base_ni are mutually exclusive. */
- BUG_ON(base_ni);
- /* We only support creation of normal files and directories. */
- if (!S_ISREG(mode) && !S_ISDIR(mode))
- return ERR_PTR(-EOPNOTSUPP);
- }
- BUG_ON(!mrec);
- mft_ni = NTFS_I(vol->mft_ino);
- mftbmp_ni = NTFS_I(vol->mftbmp_ino);
- down_write(&vol->mftbmp_lock);
- bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni);
- if (bit >= 0) {
- ntfs_debug("Found and allocated free record (#1), bit 0x%llx.",
- (long long)bit);
- goto have_alloc_rec;
- }
- if (bit != -ENOSPC) {
- up_write(&vol->mftbmp_lock);
- return ERR_PTR(bit);
- }
- /*
- * No free mft records left. If the mft bitmap already covers more
- * than the currently used mft records, the next records are all free,
- * so we can simply allocate the first unused mft record.
- * Note: We also have to make sure that the mft bitmap at least covers
- * the first 24 mft records as they are special and whilst they may not
- * be in use, we do not allocate from them.
- */
- read_lock_irqsave(&mft_ni->size_lock, flags);
- ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- read_lock_irqsave(&mftbmp_ni->size_lock, flags);
- old_data_initialized = mftbmp_ni->initialized_size;
- read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
- if (old_data_initialized << 3 > ll && old_data_initialized > 3) {
- bit = ll;
- if (bit < 24)
- bit = 24;
- if (unlikely(bit >= (1ll << 32)))
- goto max_err_out;
- ntfs_debug("Found free record (#2), bit 0x%llx.",
- (long long)bit);
- goto found_free_rec;
- }
- /*
- * The mft bitmap needs to be expanded until it covers the first unused
- * mft record that we can allocate.
- * Note: The smallest mft record we allocate is mft record 24.
- */
- bit = old_data_initialized << 3;
- if (unlikely(bit >= (1ll << 32)))
- goto max_err_out;
- read_lock_irqsave(&mftbmp_ni->size_lock, flags);
- old_data_size = mftbmp_ni->allocated_size;
- ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, "
- "data_size 0x%llx, initialized_size 0x%llx.",
- (long long)old_data_size,
- (long long)i_size_read(vol->mftbmp_ino),
- (long long)old_data_initialized);
- read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
- if (old_data_initialized + 8 > old_data_size) {
- /* Need to extend bitmap by one more cluster. */
- ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
- err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
- if (unlikely(err)) {
- up_write(&vol->mftbmp_lock);
- goto err_out;
- }
-#ifdef DEBUG
- read_lock_irqsave(&mftbmp_ni->size_lock, flags);
- ntfs_debug("Status of mftbmp after allocation extension: "
- "allocated_size 0x%llx, data_size 0x%llx, "
- "initialized_size 0x%llx.",
- (long long)mftbmp_ni->allocated_size,
- (long long)i_size_read(vol->mftbmp_ino),
- (long long)mftbmp_ni->initialized_size);
- read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-#endif /* DEBUG */
- }
- /*
- * We now have sufficient allocated space, extend the initialized_size
- * as well as the data_size if necessary and fill the new space with
- * zeroes.
- */
- err = ntfs_mft_bitmap_extend_initialized_nolock(vol);
- if (unlikely(err)) {
- up_write(&vol->mftbmp_lock);
- goto err_out;
- }
-#ifdef DEBUG
- read_lock_irqsave(&mftbmp_ni->size_lock, flags);
- ntfs_debug("Status of mftbmp after initialized extension: "
- "allocated_size 0x%llx, data_size 0x%llx, "
- "initialized_size 0x%llx.",
- (long long)mftbmp_ni->allocated_size,
- (long long)i_size_read(vol->mftbmp_ino),
- (long long)mftbmp_ni->initialized_size);
- read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-#endif /* DEBUG */
- ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
-found_free_rec:
- /* @bit is the found free mft record, allocate it in the mft bitmap. */
- ntfs_debug("At found_free_rec.");
- err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap.");
- up_write(&vol->mftbmp_lock);
- goto err_out;
- }
- ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit);
-have_alloc_rec:
- /*
- * The mft bitmap is now uptodate. Deal with mft data attribute now.
- * Note, we keep hold of the mft bitmap lock for writing until all
- * modifications to the mft data attribute are complete, too, as they
- * will impact decisions for mft bitmap and mft record allocation done
- * by a parallel allocation and if the lock is not maintained a
- * parallel allocation could allocate the same mft record as this one.
- */
- ll = (bit + 1) << vol->mft_record_size_bits;
- read_lock_irqsave(&mft_ni->size_lock, flags);
- old_data_initialized = mft_ni->initialized_size;
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- if (ll <= old_data_initialized) {
- ntfs_debug("Allocated mft record already initialized.");
- goto mft_rec_already_initialized;
- }
- ntfs_debug("Initializing allocated mft record.");
- /*
- * The mft record is outside the initialized data. Extend the mft data
- * attribute until it covers the allocated record. The loop is only
- * actually traversed more than once when a freshly formatted volume is
- * first written to so it optimizes away nicely in the common case.
- */
- read_lock_irqsave(&mft_ni->size_lock, flags);
- ntfs_debug("Status of mft data before extension: "
- "allocated_size 0x%llx, data_size 0x%llx, "
- "initialized_size 0x%llx.",
- (long long)mft_ni->allocated_size,
- (long long)i_size_read(vol->mft_ino),
- (long long)mft_ni->initialized_size);
- while (ll > mft_ni->allocated_size) {
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- err = ntfs_mft_data_extend_allocation_nolock(vol);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Failed to extend mft data "
- "allocation.");
- goto undo_mftbmp_alloc_nolock;
- }
- read_lock_irqsave(&mft_ni->size_lock, flags);
- ntfs_debug("Status of mft data after allocation extension: "
- "allocated_size 0x%llx, data_size 0x%llx, "
- "initialized_size 0x%llx.",
- (long long)mft_ni->allocated_size,
- (long long)i_size_read(vol->mft_ino),
- (long long)mft_ni->initialized_size);
- }
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- /*
- * Extend mft data initialized size (and data size of course) to reach
- * the allocated mft record, formatting the mft records allong the way.
- * Note: We only modify the ntfs_inode structure as that is all that is
- * needed by ntfs_mft_record_format(). We will update the attribute
- * record itself in one fell swoop later on.
- */
- write_lock_irqsave(&mft_ni->size_lock, flags);
- old_data_initialized = mft_ni->initialized_size;
- old_data_size = vol->mft_ino->i_size;
- while (ll > mft_ni->initialized_size) {
- s64 new_initialized_size, mft_no;
-
- new_initialized_size = mft_ni->initialized_size +
- vol->mft_record_size;
- mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
- if (new_initialized_size > i_size_read(vol->mft_ino))
- i_size_write(vol->mft_ino, new_initialized_size);
- write_unlock_irqrestore(&mft_ni->size_lock, flags);
- ntfs_debug("Initializing mft record 0x%llx.",
- (long long)mft_no);
- err = ntfs_mft_record_format(vol, mft_no);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Failed to format mft record.");
- goto undo_data_init;
- }
- write_lock_irqsave(&mft_ni->size_lock, flags);
- mft_ni->initialized_size = new_initialized_size;
- }
- write_unlock_irqrestore(&mft_ni->size_lock, flags);
- record_formatted = true;
- /* Update the mft data attribute record to reflect the new sizes. */
- m = map_mft_record(mft_ni);
- if (IS_ERR(m)) {
- ntfs_error(vol->sb, "Failed to map mft record.");
- err = PTR_ERR(m);
- goto undo_data_init;
- }
- ctx = ntfs_attr_get_search_ctx(mft_ni, m);
- if (unlikely(!ctx)) {
- ntfs_error(vol->sb, "Failed to get search context.");
- err = -ENOMEM;
- unmap_mft_record(mft_ni);
- goto undo_data_init;
- }
- err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Failed to find first attribute extent of "
- "mft data attribute.");
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
- goto undo_data_init;
- }
- a = ctx->attr;
- read_lock_irqsave(&mft_ni->size_lock, flags);
- a->data.non_resident.initialized_size =
- cpu_to_sle64(mft_ni->initialized_size);
- a->data.non_resident.data_size =
- cpu_to_sle64(i_size_read(vol->mft_ino));
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- /* Ensure the changes make it to disk. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
- read_lock_irqsave(&mft_ni->size_lock, flags);
- ntfs_debug("Status of mft data after mft record initialization: "
- "allocated_size 0x%llx, data_size 0x%llx, "
- "initialized_size 0x%llx.",
- (long long)mft_ni->allocated_size,
- (long long)i_size_read(vol->mft_ino),
- (long long)mft_ni->initialized_size);
- BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size);
- BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino));
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
-mft_rec_already_initialized:
- /*
- * We can finally drop the mft bitmap lock as the mft data attribute
- * has been fully updated. The only disparity left is that the
- * allocated mft record still needs to be marked as in use to match the
- * set bit in the mft bitmap but this is actually not a problem since
- * this mft record is not referenced from anywhere yet and the fact
- * that it is allocated in the mft bitmap means that no-one will try to
- * allocate it either.
- */
- up_write(&vol->mftbmp_lock);
- /*
- * We now have allocated and initialized the mft record. Calculate the
- * index of and the offset within the page cache page the record is in.
- */
- index = bit << vol->mft_record_size_bits >> PAGE_SHIFT;
- ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK;
- /* Read, map, and pin the page containing the mft record. */
- page = ntfs_map_page(vol->mft_ino->i_mapping, index);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to map page containing allocated "
- "mft record 0x%llx.", (long long)bit);
- err = PTR_ERR(page);
- goto undo_mftbmp_alloc;
- }
- lock_page(page);
- BUG_ON(!PageUptodate(page));
- ClearPageUptodate(page);
- m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
- /* If we just formatted the mft record no need to do it again. */
- if (!record_formatted) {
- /* Sanity check that the mft record is really not in use. */
- if (ntfs_is_file_record(m->magic) &&
- (m->flags & MFT_RECORD_IN_USE)) {
- ntfs_error(vol->sb, "Mft record 0x%llx was marked "
- "free in mft bitmap but is marked "
- "used itself. Corrupt filesystem. "
- "Unmount and run chkdsk.",
- (long long)bit);
- err = -EIO;
- SetPageUptodate(page);
- unlock_page(page);
- ntfs_unmap_page(page);
- NVolSetErrors(vol);
- goto undo_mftbmp_alloc;
- }
- /*
- * We need to (re-)format the mft record, preserving the
- * sequence number if it is not zero as well as the update
- * sequence number if it is not zero or -1 (0xffff). This
- * means we do not need to care whether or not something went
- * wrong with the previous mft record.
- */
- seq_no = m->sequence_number;
- usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs));
- err = ntfs_mft_record_layout(vol, bit, m);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Failed to layout allocated mft "
- "record 0x%llx.", (long long)bit);
- SetPageUptodate(page);
- unlock_page(page);
- ntfs_unmap_page(page);
- goto undo_mftbmp_alloc;
- }
- if (seq_no)
- m->sequence_number = seq_no;
- if (usn && le16_to_cpu(usn) != 0xffff)
- *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn;
- }
- /* Set the mft record itself in use. */
- m->flags |= MFT_RECORD_IN_USE;
- if (S_ISDIR(mode))
- m->flags |= MFT_RECORD_IS_DIRECTORY;
- flush_dcache_page(page);
- SetPageUptodate(page);
- if (base_ni) {
- MFT_RECORD *m_tmp;
-
- /*
- * Setup the base mft record in the extent mft record. This
- * completes initialization of the allocated extent mft record
- * and we can simply use it with map_extent_mft_record().
- */
- m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
- base_ni->seq_no);
- /*
- * Allocate an extent inode structure for the new mft record,
- * attach it to the base inode @base_ni and map, pin, and lock
- * its, i.e. the allocated, mft record.
- */
- m_tmp = map_extent_mft_record(base_ni, bit, &ni);
- if (IS_ERR(m_tmp)) {
- ntfs_error(vol->sb, "Failed to map allocated extent "
- "mft record 0x%llx.", (long long)bit);
- err = PTR_ERR(m_tmp);
- /* Set the mft record itself not in use. */
- m->flags &= cpu_to_le16(
- ~le16_to_cpu(MFT_RECORD_IN_USE));
- flush_dcache_page(page);
- /* Make sure the mft record is written out to disk. */
- mark_ntfs_record_dirty(page, ofs);
- unlock_page(page);
- ntfs_unmap_page(page);
- goto undo_mftbmp_alloc;
- }
- BUG_ON(m != m_tmp);
- /*
- * Make sure the allocated mft record is written out to disk.
- * No need to set the inode dirty because the caller is going
- * to do that anyway after finishing with the new extent mft
- * record (e.g. at a minimum a new attribute will be added to
- * the mft record.
- */
- mark_ntfs_record_dirty(page, ofs);
- unlock_page(page);
- /*
- * Need to unmap the page since map_extent_mft_record() mapped
- * it as well so we have it mapped twice at the moment.
- */
- ntfs_unmap_page(page);
- } else {
- /*
- * Allocate a new VFS inode and set it up. NOTE: @vi->i_nlink
- * is set to 1 but the mft record->link_count is 0. The caller
- * needs to bear this in mind.
- */
- vi = new_inode(vol->sb);
- if (unlikely(!vi)) {
- err = -ENOMEM;
- /* Set the mft record itself not in use. */
- m->flags &= cpu_to_le16(
- ~le16_to_cpu(MFT_RECORD_IN_USE));
- flush_dcache_page(page);
- /* Make sure the mft record is written out to disk. */
- mark_ntfs_record_dirty(page, ofs);
- unlock_page(page);
- ntfs_unmap_page(page);
- goto undo_mftbmp_alloc;
- }
- vi->i_ino = bit;
-
- /* The owner and group come from the ntfs volume. */
- vi->i_uid = vol->uid;
- vi->i_gid = vol->gid;
-
- /* Initialize the ntfs specific part of @vi. */
- ntfs_init_big_inode(vi);
- ni = NTFS_I(vi);
- /*
- * Set the appropriate mode, attribute type, and name. For
- * directories, also setup the index values to the defaults.
- */
- if (S_ISDIR(mode)) {
- vi->i_mode = S_IFDIR | S_IRWXUGO;
- vi->i_mode &= ~vol->dmask;
-
- NInoSetMstProtected(ni);
- ni->type = AT_INDEX_ALLOCATION;
- ni->name = I30;
- ni->name_len = 4;
-
- ni->itype.index.block_size = 4096;
- ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1;
- ni->itype.index.collation_rule = COLLATION_FILE_NAME;
- if (vol->cluster_size <= ni->itype.index.block_size) {
- ni->itype.index.vcn_size = vol->cluster_size;
- ni->itype.index.vcn_size_bits =
- vol->cluster_size_bits;
- } else {
- ni->itype.index.vcn_size = vol->sector_size;
- ni->itype.index.vcn_size_bits =
- vol->sector_size_bits;
- }
- } else {
- vi->i_mode = S_IFREG | S_IRWXUGO;
- vi->i_mode &= ~vol->fmask;
-
- ni->type = AT_DATA;
- ni->name = NULL;
- ni->name_len = 0;
- }
- if (IS_RDONLY(vi))
- vi->i_mode &= ~S_IWUGO;
-
- /* Set the inode times to the current time. */
- simple_inode_init_ts(vi);
- /*
- * Set the file size to 0, the ntfs inode sizes are set to 0 by
- * the call to ntfs_init_big_inode() below.
- */
- vi->i_size = 0;
- vi->i_blocks = 0;
-
- /* Set the sequence number. */
- vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
- /*
- * Manually map, pin, and lock the mft record as we already
- * have its page mapped and it is very easy to do.
- */
- atomic_inc(&ni->count);
- mutex_lock(&ni->mrec_lock);
- ni->page = page;
- ni->page_ofs = ofs;
- /*
- * Make sure the allocated mft record is written out to disk.
- * NOTE: We do not set the ntfs inode dirty because this would
- * fail in ntfs_write_inode() because the inode does not have a
- * standard information attribute yet. Also, there is no need
- * to set the inode dirty because the caller is going to do
- * that anyway after finishing with the new mft record (e.g. at
- * a minimum some new attributes will be added to the mft
- * record.
- */
- mark_ntfs_record_dirty(page, ofs);
- unlock_page(page);
-
- /* Add the inode to the inode hash for the superblock. */
- insert_inode_hash(vi);
-
- /* Update the default mft allocation position. */
- vol->mft_data_pos = bit + 1;
- }
- /*
- * Return the opened, allocated inode of the allocated mft record as
- * well as the mapped, pinned, and locked mft record.
- */
- ntfs_debug("Returning opened, allocated %sinode 0x%llx.",
- base_ni ? "extent " : "", (long long)bit);
- *mrec = m;
- return ni;
-undo_data_init:
- write_lock_irqsave(&mft_ni->size_lock, flags);
- mft_ni->initialized_size = old_data_initialized;
- i_size_write(vol->mft_ino, old_data_size);
- write_unlock_irqrestore(&mft_ni->size_lock, flags);
- goto undo_mftbmp_alloc_nolock;
-undo_mftbmp_alloc:
- down_write(&vol->mftbmp_lock);
-undo_mftbmp_alloc_nolock:
- if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) {
- ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
- NVolSetErrors(vol);
- }
- up_write(&vol->mftbmp_lock);
-err_out:
- return ERR_PTR(err);
-max_err_out:
- ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum "
- "number of inodes (2^32) has already been reached.");
- up_write(&vol->mftbmp_lock);
- return ERR_PTR(-ENOSPC);
-}
-
-/**
- * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume
- * @ni: ntfs inode of the mapped extent mft record to free
- * @m: mapped extent mft record of the ntfs inode @ni
- *
- * Free the mapped extent mft record @m of the extent ntfs inode @ni.
- *
- * Note that this function unmaps the mft record and closes and destroys @ni
- * internally and hence you cannot use either @ni nor @m any more after this
- * function returns success.
- *
- * On success return 0 and on error return -errno. @ni and @m are still valid
- * in this case and have not been freed.
- *
- * For some errors an error message is displayed and the success code 0 is
- * returned and the volume is then left dirty on umount. This makes sense in
- * case we could not rollback the changes that were already done since the
- * caller no longer wants to reference this mft record so it does not matter to
- * the caller if something is wrong with it as long as it is properly detached
- * from the base inode.
- */
-int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
-{
- unsigned long mft_no = ni->mft_no;
- ntfs_volume *vol = ni->vol;
- ntfs_inode *base_ni;
- ntfs_inode **extent_nis;
- int i, err;
- le16 old_seq_no;
- u16 seq_no;
-
- BUG_ON(NInoAttr(ni));
- BUG_ON(ni->nr_extents != -1);
-
- mutex_lock(&ni->extent_lock);
- base_ni = ni->ext.base_ntfs_ino;
- mutex_unlock(&ni->extent_lock);
-
- BUG_ON(base_ni->nr_extents <= 0);
-
- ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n",
- mft_no, base_ni->mft_no);
-
- mutex_lock(&base_ni->extent_lock);
-
- /* Make sure we are holding the only reference to the extent inode. */
- if (atomic_read(&ni->count) > 2) {
- ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, "
- "not freeing.", base_ni->mft_no);
- mutex_unlock(&base_ni->extent_lock);
- return -EBUSY;
- }
-
- /* Dissociate the ntfs inode from the base inode. */
- extent_nis = base_ni->ext.extent_ntfs_inos;
- err = -ENOENT;
- for (i = 0; i < base_ni->nr_extents; i++) {
- if (ni != extent_nis[i])
- continue;
- extent_nis += i;
- base_ni->nr_extents--;
- memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) *
- sizeof(ntfs_inode*));
- err = 0;
- break;
- }
-
- mutex_unlock(&base_ni->extent_lock);
-
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to "
- "its base inode 0x%lx.", mft_no,
- base_ni->mft_no);
- BUG();
- }
-
- /*
- * The extent inode is no longer attached to the base inode so no one
- * can get a reference to it any more.
- */
-
- /* Mark the mft record as not in use. */
- m->flags &= ~MFT_RECORD_IN_USE;
-
- /* Increment the sequence number, skipping zero, if it is not zero. */
- old_seq_no = m->sequence_number;
- seq_no = le16_to_cpu(old_seq_no);
- if (seq_no == 0xffff)
- seq_no = 1;
- else if (seq_no)
- seq_no++;
- m->sequence_number = cpu_to_le16(seq_no);
-
- /*
- * Set the ntfs inode dirty and write it out. We do not need to worry
- * about the base inode here since whatever caused the extent mft
- * record to be freed is guaranteed to do it already.
- */
- NInoSetDirty(ni);
- err = write_mft_record(ni, m, 0);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not "
- "freeing.", mft_no);
- goto rollback;
- }
-rollback_error:
- /* Unmap and throw away the now freed extent inode. */
- unmap_extent_mft_record(ni);
- ntfs_clear_extent_inode(ni);
-
- /* Clear the bit in the $MFT/$BITMAP corresponding to this record. */
- down_write(&vol->mftbmp_lock);
- err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no);
- up_write(&vol->mftbmp_lock);
- if (unlikely(err)) {
- /*
- * The extent inode is gone but we failed to deallocate it in
- * the mft bitmap. Just emit a warning and leave the volume
- * dirty on umount.
- */
- ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
- NVolSetErrors(vol);
- }
- return 0;
-rollback:
- /* Rollback what we did... */
- mutex_lock(&base_ni->extent_lock);
- extent_nis = base_ni->ext.extent_ntfs_inos;
- if (!(base_ni->nr_extents & 3)) {
- int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*);
-
- extent_nis = kmalloc(new_size, GFP_NOFS);
- if (unlikely(!extent_nis)) {
- ntfs_error(vol->sb, "Failed to allocate internal "
- "buffer during rollback.%s", es);
- mutex_unlock(&base_ni->extent_lock);
- NVolSetErrors(vol);
- goto rollback_error;
- }
- if (base_ni->nr_extents) {
- BUG_ON(!base_ni->ext.extent_ntfs_inos);
- memcpy(extent_nis, base_ni->ext.extent_ntfs_inos,
- new_size - 4 * sizeof(ntfs_inode*));
- kfree(base_ni->ext.extent_ntfs_inos);
- }
- base_ni->ext.extent_ntfs_inos = extent_nis;
- }
- m->flags |= MFT_RECORD_IN_USE;
- m->sequence_number = old_seq_no;
- extent_nis[base_ni->nr_extents++] = ni;
- mutex_unlock(&base_ni->extent_lock);
- mark_mft_record_dirty(ni);
- return err;
-}
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h
deleted file mode 100644
index 49c001af16ed..000000000000
--- a/fs/ntfs/mft.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * mft.h - Defines for mft record handling in NTFS Linux kernel driver.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_MFT_H
-#define _LINUX_NTFS_MFT_H
-
-#include <linux/fs.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-
-#include "inode.h"
-
-extern MFT_RECORD *map_mft_record(ntfs_inode *ni);
-extern void unmap_mft_record(ntfs_inode *ni);
-
-extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
- ntfs_inode **ntfs_ino);
-
-static inline void unmap_extent_mft_record(ntfs_inode *ni)
-{
- unmap_mft_record(ni);
- return;
-}
-
-#ifdef NTFS_RW
-
-/**
- * flush_dcache_mft_record_page - flush_dcache_page() for mft records
- * @ni: ntfs inode structure of mft record
- *
- * Call flush_dcache_page() for the page in which an mft record resides.
- *
- * This must be called every time an mft record is modified, just after the
- * modification.
- */
-static inline void flush_dcache_mft_record_page(ntfs_inode *ni)
-{
- flush_dcache_page(ni->page);
-}
-
-extern void __mark_mft_record_dirty(ntfs_inode *ni);
-
-/**
- * mark_mft_record_dirty - set the mft record and the page containing it dirty
- * @ni: ntfs inode describing the mapped mft record
- *
- * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
- * as well as the page containing the mft record, dirty. Also, mark the base
- * vfs inode dirty. This ensures that any changes to the mft record are
- * written out to disk.
- *
- * NOTE: Do not do anything if the mft record is already marked dirty.
- */
-static inline void mark_mft_record_dirty(ntfs_inode *ni)
-{
- if (!NInoTestSetDirty(ni))
- __mark_mft_record_dirty(ni);
-}
-
-extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
- MFT_RECORD *m, int sync);
-
-extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync);
-
-/**
- * write_mft_record - write out a mapped (extent) mft record
- * @ni: ntfs inode describing the mapped (extent) mft record
- * @m: mapped (extent) mft record to write
- * @sync: if true, wait for i/o completion
- *
- * This is just a wrapper for write_mft_record_nolock() (see mft.c), which
- * locks the page for the duration of the write. This ensures that there are
- * no race conditions between writing the mft record via the dirty inode code
- * paths and via the page cache write back code paths or between writing
- * neighbouring mft records residing in the same page.
- *
- * Locking the page also serializes us against ->read_folio() if the page is not
- * uptodate.
- *
- * On success, clean the mft record and return 0. On error, leave the mft
- * record dirty and return -errno.
- */
-static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync)
-{
- struct page *page = ni->page;
- int err;
-
- BUG_ON(!page);
- lock_page(page);
- err = write_mft_record_nolock(ni, m, sync);
- unlock_page(page);
- return err;
-}
-
-extern bool ntfs_may_write_mft_record(ntfs_volume *vol,
- const unsigned long mft_no, const MFT_RECORD *m,
- ntfs_inode **locked_ni);
-
-extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
- ntfs_inode *base_ni, MFT_RECORD **mrec);
-extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_MFT_H */
diff --git a/fs/ntfs/mst.c b/fs/ntfs/mst.c
deleted file mode 100644
index 16b3c884abfc..000000000000
--- a/fs/ntfs/mst.c
+++ /dev/null
@@ -1,189 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * mst.c - NTFS multi sector transfer protection handling code. Part of the
- * Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#include "ntfs.h"
-
-/**
- * post_read_mst_fixup - deprotect multi sector transfer protected data
- * @b: pointer to the data to deprotect
- * @size: size in bytes of @b
- *
- * Perform the necessary post read multi sector transfer fixup and detect the
- * presence of incomplete multi sector transfers. - In that case, overwrite the
- * magic of the ntfs record header being processed with "BAAD" (in memory only!)
- * and abort processing.
- *
- * Return 0 on success and -EINVAL on error ("BAAD" magic will be present).
- *
- * NOTE: We consider the absence / invalidity of an update sequence array to
- * mean that the structure is not protected at all and hence doesn't need to
- * be fixed up. Thus, we return success and not failure in this case. This is
- * in contrast to pre_write_mst_fixup(), see below.
- */
-int post_read_mst_fixup(NTFS_RECORD *b, const u32 size)
-{
- u16 usa_ofs, usa_count, usn;
- u16 *usa_pos, *data_pos;
-
- /* Setup the variables. */
- usa_ofs = le16_to_cpu(b->usa_ofs);
- /* Decrement usa_count to get number of fixups. */
- usa_count = le16_to_cpu(b->usa_count) - 1;
- /* Size and alignment checks. */
- if ( size & (NTFS_BLOCK_SIZE - 1) ||
- usa_ofs & 1 ||
- usa_ofs + (usa_count * 2) > size ||
- (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
- return 0;
- /* Position of usn in update sequence array. */
- usa_pos = (u16*)b + usa_ofs/sizeof(u16);
- /*
- * The update sequence number which has to be equal to each of the
- * u16 values before they are fixed up. Note no need to care for
- * endianness since we are comparing and moving data for on disk
- * structures which means the data is consistent. - If it is
- * consistenty the wrong endianness it doesn't make any difference.
- */
- usn = *usa_pos;
- /*
- * Position in protected data of first u16 that needs fixing up.
- */
- data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
- /*
- * Check for incomplete multi sector transfer(s).
- */
- while (usa_count--) {
- if (*data_pos != usn) {
- /*
- * Incomplete multi sector transfer detected! )-:
- * Set the magic to "BAAD" and return failure.
- * Note that magic_BAAD is already converted to le32.
- */
- b->magic = magic_BAAD;
- return -EINVAL;
- }
- data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
- }
- /* Re-setup the variables. */
- usa_count = le16_to_cpu(b->usa_count) - 1;
- data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
- /* Fixup all sectors. */
- while (usa_count--) {
- /*
- * Increment position in usa and restore original data from
- * the usa into the data buffer.
- */
- *data_pos = *(++usa_pos);
- /* Increment position in data as well. */
- data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
- }
- return 0;
-}
-
-/**
- * pre_write_mst_fixup - apply multi sector transfer protection
- * @b: pointer to the data to protect
- * @size: size in bytes of @b
- *
- * Perform the necessary pre write multi sector transfer fixup on the data
- * pointer to by @b of @size.
- *
- * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed
- * (assumed not needed). This is in contrast to post_read_mst_fixup() above.
- *
- * NOTE: We consider the absence / invalidity of an update sequence array to
- * mean that the structure is not subject to protection and hence doesn't need
- * to be fixed up. This means that you have to create a valid update sequence
- * array header in the ntfs record before calling this function, otherwise it
- * will fail (the header needs to contain the position of the update sequence
- * array together with the number of elements in the array). You also need to
- * initialise the update sequence number before calling this function
- * otherwise a random word will be used (whatever was in the record at that
- * position at that time).
- */
-int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size)
-{
- le16 *usa_pos, *data_pos;
- u16 usa_ofs, usa_count, usn;
- le16 le_usn;
-
- /* Sanity check + only fixup if it makes sense. */
- if (!b || ntfs_is_baad_record(b->magic) ||
- ntfs_is_hole_record(b->magic))
- return -EINVAL;
- /* Setup the variables. */
- usa_ofs = le16_to_cpu(b->usa_ofs);
- /* Decrement usa_count to get number of fixups. */
- usa_count = le16_to_cpu(b->usa_count) - 1;
- /* Size and alignment checks. */
- if ( size & (NTFS_BLOCK_SIZE - 1) ||
- usa_ofs & 1 ||
- usa_ofs + (usa_count * 2) > size ||
- (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
- return -EINVAL;
- /* Position of usn in update sequence array. */
- usa_pos = (le16*)((u8*)b + usa_ofs);
- /*
- * Cyclically increment the update sequence number
- * (skipping 0 and -1, i.e. 0xffff).
- */
- usn = le16_to_cpup(usa_pos) + 1;
- if (usn == 0xffff || !usn)
- usn = 1;
- le_usn = cpu_to_le16(usn);
- *usa_pos = le_usn;
- /* Position in data of first u16 that needs fixing up. */
- data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
- /* Fixup all sectors. */
- while (usa_count--) {
- /*
- * Increment the position in the usa and save the
- * original data from the data buffer into the usa.
- */
- *(++usa_pos) = *data_pos;
- /* Apply fixup to data. */
- *data_pos = le_usn;
- /* Increment position in data as well. */
- data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
- }
- return 0;
-}
-
-/**
- * post_write_mst_fixup - fast deprotect multi sector transfer protected data
- * @b: pointer to the data to deprotect
- *
- * Perform the necessary post write multi sector transfer fixup, not checking
- * for any errors, because we assume we have just used pre_write_mst_fixup(),
- * thus the data will be fine or we would never have gotten here.
- */
-void post_write_mst_fixup(NTFS_RECORD *b)
-{
- le16 *usa_pos, *data_pos;
-
- u16 usa_ofs = le16_to_cpu(b->usa_ofs);
- u16 usa_count = le16_to_cpu(b->usa_count) - 1;
-
- /* Position of usn in update sequence array. */
- usa_pos = (le16*)b + usa_ofs/sizeof(le16);
-
- /* Position in protected data of first u16 that needs fixing up. */
- data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
-
- /* Fixup all sectors. */
- while (usa_count--) {
- /*
- * Increment position in usa and restore original data from
- * the usa into the data buffer.
- */
- *data_pos = *(++usa_pos);
-
- /* Increment position in data as well. */
- data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
- }
-}
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
deleted file mode 100644
index d7498ddc4a72..000000000000
--- a/fs/ntfs/namei.c
+++ /dev/null
@@ -1,392 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS
- * project.
- *
- * Copyright (c) 2001-2006 Anton Altaparmakov
- */
-
-#include <linux/dcache.h>
-#include <linux/exportfs.h>
-#include <linux/security.h>
-#include <linux/slab.h>
-
-#include "attrib.h"
-#include "debug.h"
-#include "dir.h"
-#include "mft.h"
-#include "ntfs.h"
-
-/**
- * ntfs_lookup - find the inode represented by a dentry in a directory inode
- * @dir_ino: directory inode in which to look for the inode
- * @dent: dentry representing the inode to look for
- * @flags: lookup flags
- *
- * In short, ntfs_lookup() looks for the inode represented by the dentry @dent
- * in the directory inode @dir_ino and if found attaches the inode to the
- * dentry @dent.
- *
- * In more detail, the dentry @dent specifies which inode to look for by
- * supplying the name of the inode in @dent->d_name.name. ntfs_lookup()
- * converts the name to Unicode and walks the contents of the directory inode
- * @dir_ino looking for the converted Unicode name. If the name is found in the
- * directory, the corresponding inode is loaded by calling ntfs_iget() on its
- * inode number and the inode is associated with the dentry @dent via a call to
- * d_splice_alias().
- *
- * If the name is not found in the directory, a NULL inode is inserted into the
- * dentry @dent via a call to d_add(). The dentry is then termed a negative
- * dentry.
- *
- * Only if an actual error occurs, do we return an error via ERR_PTR().
- *
- * In order to handle the case insensitivity issues of NTFS with regards to the
- * dcache and the dcache requiring only one dentry per directory, we deal with
- * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining
- * a case sensitive dcache. This means that we get the full benefit of dcache
- * speed when the file/directory is looked up with the same case as returned by
- * ->ntfs_readdir() but that a lookup for any other case (or for the short file
- * name) will not find anything in dcache and will enter ->ntfs_lookup()
- * instead, where we search the directory for a fully matching file name
- * (including case) and if that is not found, we search for a file name that
- * matches with different case and if that has non-POSIX semantics we return
- * that. We actually do only one search (case sensitive) and keep tabs on
- * whether we have found a case insensitive match in the process.
- *
- * To simplify matters for us, we do not treat the short vs long filenames as
- * two hard links but instead if the lookup matches a short filename, we
- * return the dentry for the corresponding long filename instead.
- *
- * There are three cases we need to distinguish here:
- *
- * 1) @dent perfectly matches (i.e. including case) a directory entry with a
- * file name in the WIN32 or POSIX namespaces. In this case
- * ntfs_lookup_inode_by_name() will return with name set to NULL and we
- * just d_splice_alias() @dent.
- * 2) @dent matches (not including case) a directory entry with a file name in
- * the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return
- * with name set to point to a kmalloc()ed ntfs_name structure containing
- * the properly cased little endian Unicode name. We convert the name to the
- * current NLS code page, search if a dentry with this name already exists
- * and if so return that instead of @dent. At this point things are
- * complicated by the possibility of 'disconnected' dentries due to NFS
- * which we deal with appropriately (see the code comments). The VFS will
- * then destroy the old @dent and use the one we returned. If a dentry is
- * not found, we allocate a new one, d_splice_alias() it, and return it as
- * above.
- * 3) @dent matches either perfectly or not (i.e. we don't care about case) a
- * directory entry with a file name in the DOS namespace. In this case
- * ntfs_lookup_inode_by_name() will return with name set to point to a
- * kmalloc()ed ntfs_name structure containing the mft reference (cpu endian)
- * of the inode. We use the mft reference to read the inode and to find the
- * file name in the WIN32 namespace corresponding to the matched short file
- * name. We then convert the name to the current NLS code page, and proceed
- * searching for a dentry with this name, etc, as in case 2), above.
- *
- * Locking: Caller must hold i_mutex on the directory.
- */
-static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
- unsigned int flags)
-{
- ntfs_volume *vol = NTFS_SB(dir_ino->i_sb);
- struct inode *dent_inode;
- ntfschar *uname;
- ntfs_name *name = NULL;
- MFT_REF mref;
- unsigned long dent_ino;
- int uname_len;
-
- ntfs_debug("Looking up %pd in directory inode 0x%lx.",
- dent, dir_ino->i_ino);
- /* Convert the name of the dentry to Unicode. */
- uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len,
- &uname);
- if (uname_len < 0) {
- if (uname_len != -ENAMETOOLONG)
- ntfs_error(vol->sb, "Failed to convert name to "
- "Unicode.");
- return ERR_PTR(uname_len);
- }
- mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len,
- &name);
- kmem_cache_free(ntfs_name_cache, uname);
- if (!IS_ERR_MREF(mref)) {
- dent_ino = MREF(mref);
- ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino);
- dent_inode = ntfs_iget(vol->sb, dent_ino);
- if (!IS_ERR(dent_inode)) {
- /* Consistency check. */
- if (is_bad_inode(dent_inode) || MSEQNO(mref) ==
- NTFS_I(dent_inode)->seq_no ||
- dent_ino == FILE_MFT) {
- /* Perfect WIN32/POSIX match. -- Case 1. */
- if (!name) {
- ntfs_debug("Done. (Case 1.)");
- return d_splice_alias(dent_inode, dent);
- }
- /*
- * We are too indented. Handle imperfect
- * matches and short file names further below.
- */
- goto handle_name;
- }
- ntfs_error(vol->sb, "Found stale reference to inode "
- "0x%lx (reference sequence number = "
- "0x%x, inode sequence number = 0x%x), "
- "returning -EIO. Run chkdsk.",
- dent_ino, MSEQNO(mref),
- NTFS_I(dent_inode)->seq_no);
- iput(dent_inode);
- dent_inode = ERR_PTR(-EIO);
- } else
- ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with "
- "error code %li.", dent_ino,
- PTR_ERR(dent_inode));
- kfree(name);
- /* Return the error code. */
- return ERR_CAST(dent_inode);
- }
- /* It is guaranteed that @name is no longer allocated at this point. */
- if (MREF_ERR(mref) == -ENOENT) {
- ntfs_debug("Entry was not found, adding negative dentry.");
- /* The dcache will handle negative entries. */
- d_add(dent, NULL);
- ntfs_debug("Done.");
- return NULL;
- }
- ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error "
- "code %i.", -MREF_ERR(mref));
- return ERR_PTR(MREF_ERR(mref));
- // TODO: Consider moving this lot to a separate function! (AIA)
-handle_name:
- {
- MFT_RECORD *m;
- ntfs_attr_search_ctx *ctx;
- ntfs_inode *ni = NTFS_I(dent_inode);
- int err;
- struct qstr nls_name;
-
- nls_name.name = NULL;
- if (name->type != FILE_NAME_DOS) { /* Case 2. */
- ntfs_debug("Case 2.");
- nls_name.len = (unsigned)ntfs_ucstonls(vol,
- (ntfschar*)&name->name, name->len,
- (unsigned char**)&nls_name.name, 0);
- kfree(name);
- } else /* if (name->type == FILE_NAME_DOS) */ { /* Case 3. */
- FILE_NAME_ATTR *fn;
-
- ntfs_debug("Case 3.");
- kfree(name);
-
- /* Find the WIN32 name corresponding to the matched DOS name. */
- ni = NTFS_I(dent_inode);
- m = map_mft_record(ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- ctx = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- do {
- ATTR_RECORD *a;
- u32 val_len;
-
- err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0,
- NULL, 0, ctx);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Inode corrupt: No WIN32 "
- "namespace counterpart to DOS "
- "file name. Run chkdsk.");
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- /* Consistency checks. */
- a = ctx->attr;
- if (a->non_resident || a->flags)
- goto eio_err_out;
- val_len = le32_to_cpu(a->data.resident.value_length);
- if (le16_to_cpu(a->data.resident.value_offset) +
- val_len > le32_to_cpu(a->length))
- goto eio_err_out;
- fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu(
- ctx->attr->data.resident.value_offset));
- if ((u32)(fn->file_name_length * sizeof(ntfschar) +
- sizeof(FILE_NAME_ATTR)) > val_len)
- goto eio_err_out;
- } while (fn->file_name_type != FILE_NAME_WIN32);
-
- /* Convert the found WIN32 name to current NLS code page. */
- nls_name.len = (unsigned)ntfs_ucstonls(vol,
- (ntfschar*)&fn->file_name, fn->file_name_length,
- (unsigned char**)&nls_name.name, 0);
-
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
- }
- m = NULL;
- ctx = NULL;
-
- /* Check if a conversion error occurred. */
- if ((signed)nls_name.len < 0) {
- err = (signed)nls_name.len;
- goto err_out;
- }
- nls_name.hash = full_name_hash(dent, nls_name.name, nls_name.len);
-
- dent = d_add_ci(dent, dent_inode, &nls_name);
- kfree(nls_name.name);
- return dent;
-
-eio_err_out:
- ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk.");
- err = -EIO;
-err_out:
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(ni);
- iput(dent_inode);
- ntfs_error(vol->sb, "Failed, returning error code %i.", err);
- return ERR_PTR(err);
- }
-}
-
-/*
- * Inode operations for directories.
- */
-const struct inode_operations ntfs_dir_inode_ops = {
- .lookup = ntfs_lookup, /* VFS: Lookup directory. */
-};
-
-/**
- * ntfs_get_parent - find the dentry of the parent of a given directory dentry
- * @child_dent: dentry of the directory whose parent directory to find
- *
- * Find the dentry for the parent directory of the directory specified by the
- * dentry @child_dent. This function is called from
- * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the
- * default ->decode_fh() which is export_decode_fh() in the same file.
- *
- * The code is based on the ext3 ->get_parent() implementation found in
- * fs/ext3/namei.c::ext3_get_parent().
- *
- * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down.
- *
- * Return the dentry of the parent directory on success or the error code on
- * error (IS_ERR() is true).
- */
-static struct dentry *ntfs_get_parent(struct dentry *child_dent)
-{
- struct inode *vi = d_inode(child_dent);
- ntfs_inode *ni = NTFS_I(vi);
- MFT_RECORD *mrec;
- ntfs_attr_search_ctx *ctx;
- ATTR_RECORD *attr;
- FILE_NAME_ATTR *fn;
- unsigned long parent_ino;
- int err;
-
- ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
- /* Get the mft record of the inode belonging to the child dentry. */
- mrec = map_mft_record(ni);
- if (IS_ERR(mrec))
- return ERR_CAST(mrec);
- /* Find the first file name attribute in the mft record. */
- ctx = ntfs_attr_get_search_ctx(ni, mrec);
- if (unlikely(!ctx)) {
- unmap_mft_record(ni);
- return ERR_PTR(-ENOMEM);
- }
-try_next:
- err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL,
- 0, ctx);
- if (unlikely(err)) {
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
- if (err == -ENOENT)
- ntfs_error(vi->i_sb, "Inode 0x%lx does not have a "
- "file name attribute. Run chkdsk.",
- vi->i_ino);
- return ERR_PTR(err);
- }
- attr = ctx->attr;
- if (unlikely(attr->non_resident))
- goto try_next;
- fn = (FILE_NAME_ATTR *)((u8 *)attr +
- le16_to_cpu(attr->data.resident.value_offset));
- if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) >
- (u8*)attr + le32_to_cpu(attr->length)))
- goto try_next;
- /* Get the inode number of the parent directory. */
- parent_ino = MREF_LE(fn->parent_directory);
- /* Release the search context and the mft record of the child. */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
-
- return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino));
-}
-
-static struct inode *ntfs_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
-{
- struct inode *inode;
-
- inode = ntfs_iget(sb, ino);
- if (!IS_ERR(inode)) {
- if (is_bad_inode(inode) || inode->i_generation != generation) {
- iput(inode);
- inode = ERR_PTR(-ESTALE);
- }
- }
-
- return inode;
-}
-
-static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
-{
- return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
- ntfs_nfs_get_inode);
-}
-
-static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
-{
- return generic_fh_to_parent(sb, fid, fh_len, fh_type,
- ntfs_nfs_get_inode);
-}
-
-/*
- * Export operations allowing NFS exporting of mounted NTFS partitions.
- *
- * We use the default ->encode_fh() for now. Note that they
- * use 32 bits to store the inode number which is an unsigned long so on 64-bit
- * architectures is usually 64 bits so it would all fail horribly on huge
- * volumes. I guess we need to define our own encode and decode fh functions
- * that store 64-bit inode numbers at some point but for now we will ignore the
- * problem...
- *
- * We also use the default ->get_name() helper (used by ->decode_fh() via
- * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs
- * independent.
- *
- * The default ->get_parent() just returns -EACCES so we have to provide our
- * own and the default ->get_dentry() is incompatible with NTFS due to not
- * allowing the inode number 0 which is used in NTFS for the system file $MFT
- * and due to using iget() whereas NTFS needs ntfs_iget().
- */
-const struct export_operations ntfs_export_ops = {
- .encode_fh = generic_encode_ino32_fh,
- .get_parent = ntfs_get_parent, /* Find the parent of a given
- directory. */
- .fh_to_dentry = ntfs_fh_to_dentry,
- .fh_to_parent = ntfs_fh_to_parent,
-};
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
deleted file mode 100644
index e81376ea9152..000000000000
--- a/fs/ntfs/ntfs.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * ntfs.h - Defines for NTFS Linux kernel driver.
- *
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
- * Copyright (C) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_H
-#define _LINUX_NTFS_H
-
-#include <linux/stddef.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
-#include <linux/nls.h>
-#include <linux/smp.h>
-#include <linux/pagemap.h>
-
-#include "types.h"
-#include "volume.h"
-#include "layout.h"
-
-typedef enum {
- NTFS_BLOCK_SIZE = 512,
- NTFS_BLOCK_SIZE_BITS = 9,
- NTFS_SB_MAGIC = 0x5346544e, /* 'NTFS' */
- NTFS_MAX_NAME_LEN = 255,
- NTFS_MAX_ATTR_NAME_LEN = 255,
- NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */
- NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_SIZE,
-} NTFS_CONSTANTS;
-
-/* Global variables. */
-
-/* Slab caches (from super.c). */
-extern struct kmem_cache *ntfs_name_cache;
-extern struct kmem_cache *ntfs_inode_cache;
-extern struct kmem_cache *ntfs_big_inode_cache;
-extern struct kmem_cache *ntfs_attr_ctx_cache;
-extern struct kmem_cache *ntfs_index_ctx_cache;
-
-/* The various operations structs defined throughout the driver files. */
-extern const struct address_space_operations ntfs_normal_aops;
-extern const struct address_space_operations ntfs_compressed_aops;
-extern const struct address_space_operations ntfs_mst_aops;
-
-extern const struct file_operations ntfs_file_ops;
-extern const struct inode_operations ntfs_file_inode_ops;
-
-extern const struct file_operations ntfs_dir_ops;
-extern const struct inode_operations ntfs_dir_inode_ops;
-
-extern const struct file_operations ntfs_empty_file_ops;
-extern const struct inode_operations ntfs_empty_inode_ops;
-
-extern const struct export_operations ntfs_export_ops;
-
-/**
- * NTFS_SB - return the ntfs volume given a vfs super block
- * @sb: VFS super block
- *
- * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb.
- */
-static inline ntfs_volume *NTFS_SB(struct super_block *sb)
-{
- return sb->s_fs_info;
-}
-
-/* Declarations of functions and global variables. */
-
-/* From fs/ntfs/compress.c */
-extern int ntfs_read_compressed_block(struct page *page);
-extern int allocate_compression_buffers(void);
-extern void free_compression_buffers(void);
-
-/* From fs/ntfs/super.c */
-#define default_upcase_len 0x10000
-extern struct mutex ntfs_lock;
-
-typedef struct {
- int val;
- char *str;
-} option_t;
-extern const option_t on_errors_arr[];
-
-/* From fs/ntfs/mst.c */
-extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size);
-extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size);
-extern void post_write_mst_fixup(NTFS_RECORD *b);
-
-/* From fs/ntfs/unistr.c */
-extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
- const ntfschar *s2, size_t s2_len,
- const IGNORE_CASE_BOOL ic,
- const ntfschar *upcase, const u32 upcase_size);
-extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
- const ntfschar *name2, const u32 name2_len,
- const int err_val, const IGNORE_CASE_BOOL ic,
- const ntfschar *upcase, const u32 upcase_len);
-extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n);
-extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
- const ntfschar *upcase, const u32 upcase_size);
-extern void ntfs_upcase_name(ntfschar *name, u32 name_len,
- const ntfschar *upcase, const u32 upcase_len);
-extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
- const ntfschar *upcase, const u32 upcase_len);
-extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
- FILE_NAME_ATTR *file_name_attr2,
- const int err_val, const IGNORE_CASE_BOOL ic,
- const ntfschar *upcase, const u32 upcase_len);
-extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
- const int ins_len, ntfschar **outs);
-extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
- const int ins_len, unsigned char **outs, int outs_len);
-
-/* From fs/ntfs/upcase.c */
-extern ntfschar *generate_default_upcase(void);
-
-static inline int ntfs_ffs(int x)
-{
- int r = 1;
-
- if (!x)
- return 0;
- if (!(x & 0xffff)) {
- x >>= 16;
- r += 16;
- }
- if (!(x & 0xff)) {
- x >>= 8;
- r += 8;
- }
- if (!(x & 0xf)) {
- x >>= 4;
- r += 4;
- }
- if (!(x & 3)) {
- x >>= 2;
- r += 2;
- }
- if (!(x & 1)) {
- x >>= 1;
- r += 1;
- }
- return r;
-}
-
-#endif /* _LINUX_NTFS_H */
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
deleted file mode 100644
index 9160480222fd..000000000000
--- a/fs/ntfs/quota.c
+++ /dev/null
@@ -1,103 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * quota.c - NTFS kernel quota ($Quota) handling. Part of the Linux-NTFS
- * project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include "index.h"
-#include "quota.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/**
- * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume
- * @vol: ntfs volume on which to mark the quotas out of date
- *
- * Mark the quotas out of date on the ntfs volume @vol and return 'true' on
- * success and 'false' on error.
- */
-bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
-{
- ntfs_index_context *ictx;
- QUOTA_CONTROL_ENTRY *qce;
- const le32 qid = QUOTA_DEFAULTS_ID;
- int err;
-
- ntfs_debug("Entering.");
- if (NVolQuotaOutOfDate(vol))
- goto done;
- if (!vol->quota_ino || !vol->quota_q_ino) {
- ntfs_error(vol->sb, "Quota inodes are not open.");
- return false;
- }
- inode_lock(vol->quota_q_ino);
- ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
- if (!ictx) {
- ntfs_error(vol->sb, "Failed to get index context.");
- goto err_out;
- }
- err = ntfs_index_lookup(&qid, sizeof(qid), ictx);
- if (err) {
- if (err == -ENOENT)
- ntfs_error(vol->sb, "Quota defaults entry is not "
- "present.");
- else
- ntfs_error(vol->sb, "Lookup of quota defaults entry "
- "failed.");
- goto err_out;
- }
- if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) {
- ntfs_error(vol->sb, "Quota defaults entry size is invalid. "
- "Run chkdsk.");
- goto err_out;
- }
- qce = (QUOTA_CONTROL_ENTRY*)ictx->data;
- if (le32_to_cpu(qce->version) != QUOTA_VERSION) {
- ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not "
- "supported.", le32_to_cpu(qce->version));
- goto err_out;
- }
- ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags));
- /* If quotas are already marked out of date, no need to do anything. */
- if (qce->flags & QUOTA_FLAG_OUT_OF_DATE)
- goto set_done;
- /*
- * If quota tracking is neither requested, nor enabled and there are no
- * pending deletes, no need to mark the quotas out of date.
- */
- if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED |
- QUOTA_FLAG_TRACKING_REQUESTED |
- QUOTA_FLAG_PENDING_DELETES)))
- goto set_done;
- /*
- * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date.
- * This is verified on WinXP to be sufficient to cause windows to
- * rescan the volume on boot and update all quota entries.
- */
- qce->flags |= QUOTA_FLAG_OUT_OF_DATE;
- /* Ensure the modified flags are written to disk. */
- ntfs_index_entry_flush_dcache_page(ictx);
- ntfs_index_entry_mark_dirty(ictx);
-set_done:
- ntfs_index_ctx_put(ictx);
- inode_unlock(vol->quota_q_ino);
- /*
- * We set the flag so we do not try to mark the quotas out of date
- * again on remount.
- */
- NVolSetQuotaOutOfDate(vol);
-done:
- ntfs_debug("Done.");
- return true;
-err_out:
- if (ictx)
- ntfs_index_ctx_put(ictx);
- inode_unlock(vol->quota_q_ino);
- return false;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h
deleted file mode 100644
index fe3132a3d6d2..000000000000
--- a/fs/ntfs/quota.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * quota.h - Defines for NTFS kernel quota ($Quota) handling. Part of the
- * Linux-NTFS project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_QUOTA_H
-#define _LINUX_NTFS_QUOTA_H
-
-#ifdef NTFS_RW
-
-#include "types.h"
-#include "volume.h"
-
-extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_QUOTA_H */
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
deleted file mode 100644
index 0d448e9881f7..000000000000
--- a/fs/ntfs/runlist.c
+++ /dev/null
@@ -1,1893 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2007 Anton Altaparmakov
- * Copyright (c) 2002-2005 Richard Russon
- */
-
-#include "debug.h"
-#include "dir.h"
-#include "endian.h"
-#include "malloc.h"
-#include "ntfs.h"
-
-/**
- * ntfs_rl_mm - runlist memmove
- *
- * It is up to the caller to serialize access to the runlist @base.
- */
-static inline void ntfs_rl_mm(runlist_element *base, int dst, int src,
- int size)
-{
- if (likely((dst != src) && (size > 0)))
- memmove(base + dst, base + src, size * sizeof(*base));
-}
-
-/**
- * ntfs_rl_mc - runlist memory copy
- *
- * It is up to the caller to serialize access to the runlists @dstbase and
- * @srcbase.
- */
-static inline void ntfs_rl_mc(runlist_element *dstbase, int dst,
- runlist_element *srcbase, int src, int size)
-{
- if (likely(size > 0))
- memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase));
-}
-
-/**
- * ntfs_rl_realloc - Reallocate memory for runlists
- * @rl: original runlist
- * @old_size: number of runlist elements in the original runlist @rl
- * @new_size: number of runlist elements we need space for
- *
- * As the runlists grow, more memory will be required. To prevent the
- * kernel having to allocate and reallocate large numbers of small bits of
- * memory, this function returns an entire page of memory.
- *
- * It is up to the caller to serialize access to the runlist @rl.
- *
- * N.B. If the new allocation doesn't require a different number of pages in
- * memory, the function will return the original pointer.
- *
- * On success, return a pointer to the newly allocated, or recycled, memory.
- * On error, return -errno. The following error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_realloc(runlist_element *rl,
- int old_size, int new_size)
-{
- runlist_element *new_rl;
-
- old_size = PAGE_ALIGN(old_size * sizeof(*rl));
- new_size = PAGE_ALIGN(new_size * sizeof(*rl));
- if (old_size == new_size)
- return rl;
-
- new_rl = ntfs_malloc_nofs(new_size);
- if (unlikely(!new_rl))
- return ERR_PTR(-ENOMEM);
-
- if (likely(rl != NULL)) {
- if (unlikely(old_size > new_size))
- old_size = new_size;
- memcpy(new_rl, rl, old_size);
- ntfs_free(rl);
- }
- return new_rl;
-}
-
-/**
- * ntfs_rl_realloc_nofail - Reallocate memory for runlists
- * @rl: original runlist
- * @old_size: number of runlist elements in the original runlist @rl
- * @new_size: number of runlist elements we need space for
- *
- * As the runlists grow, more memory will be required. To prevent the
- * kernel having to allocate and reallocate large numbers of small bits of
- * memory, this function returns an entire page of memory.
- *
- * This function guarantees that the allocation will succeed. It will sleep
- * for as long as it takes to complete the allocation.
- *
- * It is up to the caller to serialize access to the runlist @rl.
- *
- * N.B. If the new allocation doesn't require a different number of pages in
- * memory, the function will return the original pointer.
- *
- * On success, return a pointer to the newly allocated, or recycled, memory.
- * On error, return -errno. The following error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl,
- int old_size, int new_size)
-{
- runlist_element *new_rl;
-
- old_size = PAGE_ALIGN(old_size * sizeof(*rl));
- new_size = PAGE_ALIGN(new_size * sizeof(*rl));
- if (old_size == new_size)
- return rl;
-
- new_rl = ntfs_malloc_nofs_nofail(new_size);
- BUG_ON(!new_rl);
-
- if (likely(rl != NULL)) {
- if (unlikely(old_size > new_size))
- old_size = new_size;
- memcpy(new_rl, rl, old_size);
- ntfs_free(rl);
- }
- return new_rl;
-}
-
-/**
- * ntfs_are_rl_mergeable - test if two runlists can be joined together
- * @dst: original runlist
- * @src: new runlist to test for mergeability with @dst
- *
- * Test if two runlists can be joined together. For this, their VCNs and LCNs
- * must be adjacent.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * Return: true Success, the runlists can be merged.
- * false Failure, the runlists cannot be merged.
- */
-static inline bool ntfs_are_rl_mergeable(runlist_element *dst,
- runlist_element *src)
-{
- BUG_ON(!dst);
- BUG_ON(!src);
-
- /* We can merge unmapped regions even if they are misaligned. */
- if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED))
- return true;
- /* If the runs are misaligned, we cannot merge them. */
- if ((dst->vcn + dst->length) != src->vcn)
- return false;
- /* If both runs are non-sparse and contiguous, we can merge them. */
- if ((dst->lcn >= 0) && (src->lcn >= 0) &&
- ((dst->lcn + dst->length) == src->lcn))
- return true;
- /* If we are merging two holes, we can merge them. */
- if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE))
- return true;
- /* Cannot merge. */
- return false;
-}
-
-/**
- * __ntfs_rl_merge - merge two runlists without testing if they can be merged
- * @dst: original, destination runlist
- * @src: new runlist to merge with @dst
- *
- * Merge the two runlists, writing into the destination runlist @dst. The
- * caller must make sure the runlists can be merged or this will corrupt the
- * destination runlist.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- */
-static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src)
-{
- dst->length += src->length;
-}
-
-/**
- * ntfs_rl_append - append a runlist after a given element
- * @dst: original runlist to be worked on
- * @dsize: number of elements in @dst (including end marker)
- * @src: runlist to be inserted into @dst
- * @ssize: number of elements in @src (excluding end marker)
- * @loc: append the new runlist @src after this element in @dst
- *
- * Append the runlist @src after element @loc in @dst. Merge the right end of
- * the new runlist, if necessary. Adjust the size of the hole before the
- * appended runlist.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_append(runlist_element *dst,
- int dsize, runlist_element *src, int ssize, int loc)
-{
- bool right = false; /* Right end of @src needs merging. */
- int marker; /* End of the inserted runs. */
-
- BUG_ON(!dst);
- BUG_ON(!src);
-
- /* First, check if the right hand end needs merging. */
- if ((loc + 1) < dsize)
- right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
-
- /* Space required: @dst size + @src size, less one if we merged. */
- dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right);
- if (IS_ERR(dst))
- return dst;
- /*
- * We are guaranteed to succeed from here so can start modifying the
- * original runlists.
- */
-
- /* First, merge the right hand end, if necessary. */
- if (right)
- __ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
-
- /* First run after the @src runs that have been inserted. */
- marker = loc + ssize + 1;
-
- /* Move the tail of @dst out of the way, then copy in @src. */
- ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right));
- ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
-
- /* Adjust the size of the preceding hole. */
- dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
-
- /* We may have changed the length of the file, so fix the end marker */
- if (dst[marker].lcn == LCN_ENOENT)
- dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
-
- return dst;
-}
-
-/**
- * ntfs_rl_insert - insert a runlist into another
- * @dst: original runlist to be worked on
- * @dsize: number of elements in @dst (including end marker)
- * @src: new runlist to be inserted
- * @ssize: number of elements in @src (excluding end marker)
- * @loc: insert the new runlist @src before this element in @dst
- *
- * Insert the runlist @src before element @loc in the runlist @dst. Merge the
- * left end of the new runlist, if necessary. Adjust the size of the hole
- * after the inserted runlist.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
- int dsize, runlist_element *src, int ssize, int loc)
-{
- bool left = false; /* Left end of @src needs merging. */
- bool disc = false; /* Discontinuity between @dst and @src. */
- int marker; /* End of the inserted runs. */
-
- BUG_ON(!dst);
- BUG_ON(!src);
-
- /*
- * disc => Discontinuity between the end of @dst and the start of @src.
- * This means we might need to insert a "not mapped" run.
- */
- if (loc == 0)
- disc = (src[0].vcn > 0);
- else {
- s64 merged_length;
-
- left = ntfs_are_rl_mergeable(dst + loc - 1, src);
-
- merged_length = dst[loc - 1].length;
- if (left)
- merged_length += src->length;
-
- disc = (src[0].vcn > dst[loc - 1].vcn + merged_length);
- }
- /*
- * Space required: @dst size + @src size, less one if we merged, plus
- * one if there was a discontinuity.
- */
- dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc);
- if (IS_ERR(dst))
- return dst;
- /*
- * We are guaranteed to succeed from here so can start modifying the
- * original runlist.
- */
- if (left)
- __ntfs_rl_merge(dst + loc - 1, src);
- /*
- * First run after the @src runs that have been inserted.
- * Nominally, @marker equals @loc + @ssize, i.e. location + number of
- * runs in @src. However, if @left, then the first run in @src has
- * been merged with one in @dst. And if @disc, then @dst and @src do
- * not meet and we need an extra run to fill the gap.
- */
- marker = loc + ssize - left + disc;
-
- /* Move the tail of @dst out of the way, then copy in @src. */
- ntfs_rl_mm(dst, marker, loc, dsize - loc);
- ntfs_rl_mc(dst, loc + disc, src, left, ssize - left);
-
- /* Adjust the VCN of the first run after the insertion... */
- dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
- /* ... and the length. */
- if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED)
- dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn;
-
- /* Writing beyond the end of the file and there is a discontinuity. */
- if (disc) {
- if (loc > 0) {
- dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length;
- dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
- } else {
- dst[loc].vcn = 0;
- dst[loc].length = dst[loc + 1].vcn;
- }
- dst[loc].lcn = LCN_RL_NOT_MAPPED;
- }
- return dst;
-}
-
-/**
- * ntfs_rl_replace - overwrite a runlist element with another runlist
- * @dst: original runlist to be worked on
- * @dsize: number of elements in @dst (including end marker)
- * @src: new runlist to be inserted
- * @ssize: number of elements in @src (excluding end marker)
- * @loc: index in runlist @dst to overwrite with @src
- *
- * Replace the runlist element @dst at @loc with @src. Merge the left and
- * right ends of the inserted runlist, if necessary.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_replace(runlist_element *dst,
- int dsize, runlist_element *src, int ssize, int loc)
-{
- signed delta;
- bool left = false; /* Left end of @src needs merging. */
- bool right = false; /* Right end of @src needs merging. */
- int tail; /* Start of tail of @dst. */
- int marker; /* End of the inserted runs. */
-
- BUG_ON(!dst);
- BUG_ON(!src);
-
- /* First, see if the left and right ends need merging. */
- if ((loc + 1) < dsize)
- right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
- if (loc > 0)
- left = ntfs_are_rl_mergeable(dst + loc - 1, src);
- /*
- * Allocate some space. We will need less if the left, right, or both
- * ends get merged. The -1 accounts for the run being replaced.
- */
- delta = ssize - 1 - left - right;
- if (delta > 0) {
- dst = ntfs_rl_realloc(dst, dsize, dsize + delta);
- if (IS_ERR(dst))
- return dst;
- }
- /*
- * We are guaranteed to succeed from here so can start modifying the
- * original runlists.
- */
-
- /* First, merge the left and right ends, if necessary. */
- if (right)
- __ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
- if (left)
- __ntfs_rl_merge(dst + loc - 1, src);
- /*
- * Offset of the tail of @dst. This needs to be moved out of the way
- * to make space for the runs to be copied from @src, i.e. the first
- * run of the tail of @dst.
- * Nominally, @tail equals @loc + 1, i.e. location, skipping the
- * replaced run. However, if @right, then one of @dst's runs is
- * already merged into @src.
- */
- tail = loc + right + 1;
- /*
- * First run after the @src runs that have been inserted, i.e. where
- * the tail of @dst needs to be moved to.
- * Nominally, @marker equals @loc + @ssize, i.e. location + number of
- * runs in @src. However, if @left, then the first run in @src has
- * been merged with one in @dst.
- */
- marker = loc + ssize - left;
-
- /* Move the tail of @dst out of the way, then copy in @src. */
- ntfs_rl_mm(dst, marker, tail, dsize - tail);
- ntfs_rl_mc(dst, loc, src, left, ssize - left);
-
- /* We may have changed the length of the file, so fix the end marker. */
- if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT)
- dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
- return dst;
-}
-
-/**
- * ntfs_rl_split - insert a runlist into the centre of a hole
- * @dst: original runlist to be worked on
- * @dsize: number of elements in @dst (including end marker)
- * @src: new runlist to be inserted
- * @ssize: number of elements in @src (excluding end marker)
- * @loc: index in runlist @dst at which to split and insert @src
- *
- * Split the runlist @dst at @loc into two and insert @new in between the two
- * fragments. No merging of runlists is necessary. Adjust the size of the
- * holes either side.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize,
- runlist_element *src, int ssize, int loc)
-{
- BUG_ON(!dst);
- BUG_ON(!src);
-
- /* Space required: @dst size + @src size + one new hole. */
- dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1);
- if (IS_ERR(dst))
- return dst;
- /*
- * We are guaranteed to succeed from here so can start modifying the
- * original runlists.
- */
-
- /* Move the tail of @dst out of the way, then copy in @src. */
- ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc);
- ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
-
- /* Adjust the size of the holes either size of @src. */
- dst[loc].length = dst[loc+1].vcn - dst[loc].vcn;
- dst[loc+ssize+1].vcn = dst[loc+ssize].vcn + dst[loc+ssize].length;
- dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn;
-
- return dst;
-}
-
-/**
- * ntfs_runlists_merge - merge two runlists into one
- * @drl: original runlist to be worked on
- * @srl: new runlist to be merged into @drl
- *
- * First we sanity check the two runlists @srl and @drl to make sure that they
- * are sensible and can be merged. The runlist @srl must be either after the
- * runlist @drl or completely within a hole (or unmapped region) in @drl.
- *
- * It is up to the caller to serialize access to the runlists @drl and @srl.
- *
- * Merging of runlists is necessary in two cases:
- * 1. When attribute lists are used and a further extent is being mapped.
- * 2. When new clusters are allocated to fill a hole or extend a file.
- *
- * There are four possible ways @srl can be merged. It can:
- * - be inserted at the beginning of a hole,
- * - split the hole in two and be inserted between the two fragments,
- * - be appended at the end of a hole, or it can
- * - replace the whole hole.
- * It can also be appended to the end of the runlist, which is just a variant
- * of the insert case.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @drl and @srl are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- * -ERANGE - The runlists overlap and cannot be merged.
- */
-runlist_element *ntfs_runlists_merge(runlist_element *drl,
- runlist_element *srl)
-{
- int di, si; /* Current index into @[ds]rl. */
- int sstart; /* First index with lcn > LCN_RL_NOT_MAPPED. */
- int dins; /* Index into @drl at which to insert @srl. */
- int dend, send; /* Last index into @[ds]rl. */
- int dfinal, sfinal; /* The last index into @[ds]rl with
- lcn >= LCN_HOLE. */
- int marker = 0;
- VCN marker_vcn = 0;
-
-#ifdef DEBUG
- ntfs_debug("dst:");
- ntfs_debug_dump_runlist(drl);
- ntfs_debug("src:");
- ntfs_debug_dump_runlist(srl);
-#endif
-
- /* Check for silly calling... */
- if (unlikely(!srl))
- return drl;
- if (IS_ERR(srl) || IS_ERR(drl))
- return ERR_PTR(-EINVAL);
-
- /* Check for the case where the first mapping is being done now. */
- if (unlikely(!drl)) {
- drl = srl;
- /* Complete the source runlist if necessary. */
- if (unlikely(drl[0].vcn)) {
- /* Scan to the end of the source runlist. */
- for (dend = 0; likely(drl[dend].length); dend++)
- ;
- dend++;
- drl = ntfs_rl_realloc(drl, dend, dend + 1);
- if (IS_ERR(drl))
- return drl;
- /* Insert start element at the front of the runlist. */
- ntfs_rl_mm(drl, 1, 0, dend);
- drl[0].vcn = 0;
- drl[0].lcn = LCN_RL_NOT_MAPPED;
- drl[0].length = drl[1].vcn;
- }
- goto finished;
- }
-
- si = di = 0;
-
- /* Skip any unmapped start element(s) in the source runlist. */
- while (srl[si].length && srl[si].lcn < LCN_HOLE)
- si++;
-
- /* Can't have an entirely unmapped source runlist. */
- BUG_ON(!srl[si].length);
-
- /* Record the starting points. */
- sstart = si;
-
- /*
- * Skip forward in @drl until we reach the position where @srl needs to
- * be inserted. If we reach the end of @drl, @srl just needs to be
- * appended to @drl.
- */
- for (; drl[di].length; di++) {
- if (drl[di].vcn + drl[di].length > srl[sstart].vcn)
- break;
- }
- dins = di;
-
- /* Sanity check for illegal overlaps. */
- if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) &&
- (srl[si].lcn >= 0)) {
- ntfs_error(NULL, "Run lists overlap. Cannot merge!");
- return ERR_PTR(-ERANGE);
- }
-
- /* Scan to the end of both runlists in order to know their sizes. */
- for (send = si; srl[send].length; send++)
- ;
- for (dend = di; drl[dend].length; dend++)
- ;
-
- if (srl[send].lcn == LCN_ENOENT)
- marker_vcn = srl[marker = send].vcn;
-
- /* Scan to the last element with lcn >= LCN_HOLE. */
- for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--)
- ;
- for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--)
- ;
-
- {
- bool start;
- bool finish;
- int ds = dend + 1; /* Number of elements in drl & srl */
- int ss = sfinal - sstart + 1;
-
- start = ((drl[dins].lcn < LCN_RL_NOT_MAPPED) || /* End of file */
- (drl[dins].vcn == srl[sstart].vcn)); /* Start of hole */
- finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) && /* End of file */
- ((drl[dins].vcn + drl[dins].length) <= /* End of hole */
- (srl[send - 1].vcn + srl[send - 1].length)));
-
- /* Or we will lose an end marker. */
- if (finish && !drl[dins].length)
- ss++;
- if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn))
- finish = false;
-#if 0
- ntfs_debug("dfinal = %i, dend = %i", dfinal, dend);
- ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send);
- ntfs_debug("start = %i, finish = %i", start, finish);
- ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins);
-#endif
- if (start) {
- if (finish)
- drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins);
- else
- drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins);
- } else {
- if (finish)
- drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins);
- else
- drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins);
- }
- if (IS_ERR(drl)) {
- ntfs_error(NULL, "Merge failed.");
- return drl;
- }
- ntfs_free(srl);
- if (marker) {
- ntfs_debug("Triggering marker code.");
- for (ds = dend; drl[ds].length; ds++)
- ;
- /* We only need to care if @srl ended after @drl. */
- if (drl[ds].vcn <= marker_vcn) {
- int slots = 0;
-
- if (drl[ds].vcn == marker_vcn) {
- ntfs_debug("Old marker = 0x%llx, replacing "
- "with LCN_ENOENT.",
- (unsigned long long)
- drl[ds].lcn);
- drl[ds].lcn = LCN_ENOENT;
- goto finished;
- }
- /*
- * We need to create an unmapped runlist element in
- * @drl or extend an existing one before adding the
- * ENOENT terminator.
- */
- if (drl[ds].lcn == LCN_ENOENT) {
- ds--;
- slots = 1;
- }
- if (drl[ds].lcn != LCN_RL_NOT_MAPPED) {
- /* Add an unmapped runlist element. */
- if (!slots) {
- drl = ntfs_rl_realloc_nofail(drl, ds,
- ds + 2);
- slots = 2;
- }
- ds++;
- /* Need to set vcn if it isn't set already. */
- if (slots != 1)
- drl[ds].vcn = drl[ds - 1].vcn +
- drl[ds - 1].length;
- drl[ds].lcn = LCN_RL_NOT_MAPPED;
- /* We now used up a slot. */
- slots--;
- }
- drl[ds].length = marker_vcn - drl[ds].vcn;
- /* Finally add the ENOENT terminator. */
- ds++;
- if (!slots)
- drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1);
- drl[ds].vcn = marker_vcn;
- drl[ds].lcn = LCN_ENOENT;
- drl[ds].length = (s64)0;
- }
- }
- }
-
-finished:
- /* The merge was completed successfully. */
- ntfs_debug("Merged runlist:");
- ntfs_debug_dump_runlist(drl);
- return drl;
-}
-
-/**
- * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist
- * @vol: ntfs volume on which the attribute resides
- * @attr: attribute record whose mapping pairs array to decompress
- * @old_rl: optional runlist in which to insert @attr's runlist
- *
- * It is up to the caller to serialize access to the runlist @old_rl.
- *
- * Decompress the attribute @attr's mapping pairs array into a runlist. On
- * success, return the decompressed runlist.
- *
- * If @old_rl is not NULL, decompressed runlist is inserted into the
- * appropriate place in @old_rl and the resultant, combined runlist is
- * returned. The original @old_rl is deallocated.
- *
- * On error, return -errno. @old_rl is left unmodified in that case.
- *
- * The following error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EIO - Corrupt runlist.
- * -EINVAL - Invalid parameters were passed in.
- * -ERANGE - The two runlists overlap.
- *
- * FIXME: For now we take the conceptionally simplest approach of creating the
- * new runlist disregarding the already existing one and then splicing the
- * two into one, if that is possible (we check for overlap and discard the new
- * runlist if overlap present before returning ERR_PTR(-ERANGE)).
- */
-runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
- const ATTR_RECORD *attr, runlist_element *old_rl)
-{
- VCN vcn; /* Current vcn. */
- LCN lcn; /* Current lcn. */
- s64 deltaxcn; /* Change in [vl]cn. */
- runlist_element *rl; /* The output runlist. */
- u8 *buf; /* Current position in mapping pairs array. */
- u8 *attr_end; /* End of attribute. */
- int rlsize; /* Size of runlist buffer. */
- u16 rlpos; /* Current runlist position in units of
- runlist_elements. */
- u8 b; /* Current byte offset in buf. */
-
-#ifdef DEBUG
- /* Make sure attr exists and is non-resident. */
- if (!attr || !attr->non_resident || sle64_to_cpu(
- attr->data.non_resident.lowest_vcn) < (VCN)0) {
- ntfs_error(vol->sb, "Invalid arguments.");
- return ERR_PTR(-EINVAL);
- }
-#endif
- /* Start at vcn = lowest_vcn and lcn 0. */
- vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn);
- lcn = 0;
- /* Get start of the mapping pairs array. */
- buf = (u8*)attr + le16_to_cpu(
- attr->data.non_resident.mapping_pairs_offset);
- attr_end = (u8*)attr + le32_to_cpu(attr->length);
- if (unlikely(buf < (u8*)attr || buf > attr_end)) {
- ntfs_error(vol->sb, "Corrupt attribute.");
- return ERR_PTR(-EIO);
- }
- /* If the mapping pairs array is valid but empty, nothing to do. */
- if (!vcn && !*buf)
- return old_rl;
- /* Current position in runlist array. */
- rlpos = 0;
- /* Allocate first page and set current runlist size to one page. */
- rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE);
- if (unlikely(!rl))
- return ERR_PTR(-ENOMEM);
- /* Insert unmapped starting element if necessary. */
- if (vcn) {
- rl->vcn = 0;
- rl->lcn = LCN_RL_NOT_MAPPED;
- rl->length = vcn;
- rlpos++;
- }
- while (buf < attr_end && *buf) {
- /*
- * Allocate more memory if needed, including space for the
- * not-mapped and terminator elements. ntfs_malloc_nofs()
- * operates on whole pages only.
- */
- if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) {
- runlist_element *rl2;
-
- rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
- if (unlikely(!rl2)) {
- ntfs_free(rl);
- return ERR_PTR(-ENOMEM);
- }
- memcpy(rl2, rl, rlsize);
- ntfs_free(rl);
- rl = rl2;
- rlsize += PAGE_SIZE;
- }
- /* Enter the current vcn into the current runlist element. */
- rl[rlpos].vcn = vcn;
- /*
- * Get the change in vcn, i.e. the run length in clusters.
- * Doing it this way ensures that we signextend negative values.
- * A negative run length doesn't make any sense, but hey, I
- * didn't make up the NTFS specs and Windows NT4 treats the run
- * length as a signed value so that's how it is...
- */
- b = *buf & 0xf;
- if (b) {
- if (unlikely(buf + b > attr_end))
- goto io_error;
- for (deltaxcn = (s8)buf[b--]; b; b--)
- deltaxcn = (deltaxcn << 8) + buf[b];
- } else { /* The length entry is compulsory. */
- ntfs_error(vol->sb, "Missing length entry in mapping "
- "pairs array.");
- deltaxcn = (s64)-1;
- }
- /*
- * Assume a negative length to indicate data corruption and
- * hence clean-up and return NULL.
- */
- if (unlikely(deltaxcn < 0)) {
- ntfs_error(vol->sb, "Invalid length in mapping pairs "
- "array.");
- goto err_out;
- }
- /*
- * Enter the current run length into the current runlist
- * element.
- */
- rl[rlpos].length = deltaxcn;
- /* Increment the current vcn by the current run length. */
- vcn += deltaxcn;
- /*
- * There might be no lcn change at all, as is the case for
- * sparse clusters on NTFS 3.0+, in which case we set the lcn
- * to LCN_HOLE.
- */
- if (!(*buf & 0xf0))
- rl[rlpos].lcn = LCN_HOLE;
- else {
- /* Get the lcn change which really can be negative. */
- u8 b2 = *buf & 0xf;
- b = b2 + ((*buf >> 4) & 0xf);
- if (buf + b > attr_end)
- goto io_error;
- for (deltaxcn = (s8)buf[b--]; b > b2; b--)
- deltaxcn = (deltaxcn << 8) + buf[b];
- /* Change the current lcn to its new value. */
- lcn += deltaxcn;
-#ifdef DEBUG
- /*
- * On NTFS 1.2-, apparently can have lcn == -1 to
- * indicate a hole. But we haven't verified ourselves
- * whether it is really the lcn or the deltaxcn that is
- * -1. So if either is found give us a message so we
- * can investigate it further!
- */
- if (vol->major_ver < 3) {
- if (unlikely(deltaxcn == (LCN)-1))
- ntfs_error(vol->sb, "lcn delta == -1");
- if (unlikely(lcn == (LCN)-1))
- ntfs_error(vol->sb, "lcn == -1");
- }
-#endif
- /* Check lcn is not below -1. */
- if (unlikely(lcn < (LCN)-1)) {
- ntfs_error(vol->sb, "Invalid LCN < -1 in "
- "mapping pairs array.");
- goto err_out;
- }
- /* Enter the current lcn into the runlist element. */
- rl[rlpos].lcn = lcn;
- }
- /* Get to the next runlist element. */
- rlpos++;
- /* Increment the buffer position to the next mapping pair. */
- buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1;
- }
- if (unlikely(buf >= attr_end))
- goto io_error;
- /*
- * If there is a highest_vcn specified, it must be equal to the final
- * vcn in the runlist - 1, or something has gone badly wrong.
- */
- deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn);
- if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) {
-mpa_err:
- ntfs_error(vol->sb, "Corrupt mapping pairs array in "
- "non-resident attribute.");
- goto err_out;
- }
- /* Setup not mapped runlist element if this is the base extent. */
- if (!attr->data.non_resident.lowest_vcn) {
- VCN max_cluster;
-
- max_cluster = ((sle64_to_cpu(
- attr->data.non_resident.allocated_size) +
- vol->cluster_size - 1) >>
- vol->cluster_size_bits) - 1;
- /*
- * A highest_vcn of zero means this is a single extent
- * attribute so simply terminate the runlist with LCN_ENOENT).
- */
- if (deltaxcn) {
- /*
- * If there is a difference between the highest_vcn and
- * the highest cluster, the runlist is either corrupt
- * or, more likely, there are more extents following
- * this one.
- */
- if (deltaxcn < max_cluster) {
- ntfs_debug("More extents to follow; deltaxcn "
- "= 0x%llx, max_cluster = "
- "0x%llx",
- (unsigned long long)deltaxcn,
- (unsigned long long)
- max_cluster);
- rl[rlpos].vcn = vcn;
- vcn += rl[rlpos].length = max_cluster -
- deltaxcn;
- rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
- rlpos++;
- } else if (unlikely(deltaxcn > max_cluster)) {
- ntfs_error(vol->sb, "Corrupt attribute. "
- "deltaxcn = 0x%llx, "
- "max_cluster = 0x%llx",
- (unsigned long long)deltaxcn,
- (unsigned long long)
- max_cluster);
- goto mpa_err;
- }
- }
- rl[rlpos].lcn = LCN_ENOENT;
- } else /* Not the base extent. There may be more extents to follow. */
- rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
-
- /* Setup terminating runlist element. */
- rl[rlpos].vcn = vcn;
- rl[rlpos].length = (s64)0;
- /* If no existing runlist was specified, we are done. */
- if (!old_rl) {
- ntfs_debug("Mapping pairs array successfully decompressed:");
- ntfs_debug_dump_runlist(rl);
- return rl;
- }
- /* Now combine the new and old runlists checking for overlaps. */
- old_rl = ntfs_runlists_merge(old_rl, rl);
- if (!IS_ERR(old_rl))
- return old_rl;
- ntfs_free(rl);
- ntfs_error(vol->sb, "Failed to merge runlists.");
- return old_rl;
-io_error:
- ntfs_error(vol->sb, "Corrupt attribute.");
-err_out:
- ntfs_free(rl);
- return ERR_PTR(-EIO);
-}
-
-/**
- * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist
- * @rl: runlist to use for conversion
- * @vcn: vcn to convert
- *
- * Convert the virtual cluster number @vcn of an attribute into a logical
- * cluster number (lcn) of a device using the runlist @rl to map vcns to their
- * corresponding lcns.
- *
- * It is up to the caller to serialize access to the runlist @rl.
- *
- * Since lcns must be >= 0, we use negative return codes with special meaning:
- *
- * Return code Meaning / Description
- * ==================================================
- * LCN_HOLE Hole / not allocated on disk.
- * LCN_RL_NOT_MAPPED This is part of the runlist which has not been
- * inserted into the runlist yet.
- * LCN_ENOENT There is no such vcn in the attribute.
- *
- * Locking: - The caller must have locked the runlist (for reading or writing).
- * - This function does not touch the lock, nor does it modify the
- * runlist.
- */
-LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn)
-{
- int i;
-
- BUG_ON(vcn < 0);
- /*
- * If rl is NULL, assume that we have found an unmapped runlist. The
- * caller can then attempt to map it and fail appropriately if
- * necessary.
- */
- if (unlikely(!rl))
- return LCN_RL_NOT_MAPPED;
-
- /* Catch out of lower bounds vcn. */
- if (unlikely(vcn < rl[0].vcn))
- return LCN_ENOENT;
-
- for (i = 0; likely(rl[i].length); i++) {
- if (unlikely(vcn < rl[i+1].vcn)) {
- if (likely(rl[i].lcn >= (LCN)0))
- return rl[i].lcn + (vcn - rl[i].vcn);
- return rl[i].lcn;
- }
- }
- /*
- * The terminator element is setup to the correct value, i.e. one of
- * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT.
- */
- if (likely(rl[i].lcn < (LCN)0))
- return rl[i].lcn;
- /* Just in case... We could replace this with BUG() some day. */
- return LCN_ENOENT;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_rl_find_vcn_nolock - find a vcn in a runlist
- * @rl: runlist to search
- * @vcn: vcn to find
- *
- * Find the virtual cluster number @vcn in the runlist @rl and return the
- * address of the runlist element containing the @vcn on success.
- *
- * Return NULL if @rl is NULL or @vcn is in an unmapped part/out of bounds of
- * the runlist.
- *
- * Locking: The runlist must be locked on entry.
- */
-runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn)
-{
- BUG_ON(vcn < 0);
- if (unlikely(!rl || vcn < rl[0].vcn))
- return NULL;
- while (likely(rl->length)) {
- if (unlikely(vcn < rl[1].vcn)) {
- if (likely(rl->lcn >= LCN_HOLE))
- return rl;
- return NULL;
- }
- rl++;
- }
- if (likely(rl->lcn == LCN_ENOENT))
- return rl;
- return NULL;
-}
-
-/**
- * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number
- * @n: number for which to get the number of bytes for
- *
- * Return the number of bytes required to store @n unambiguously as
- * a signed number.
- *
- * This is used in the context of the mapping pairs array to determine how
- * many bytes will be needed in the array to store a given logical cluster
- * number (lcn) or a specific run length.
- *
- * Return the number of bytes written. This function cannot fail.
- */
-static inline int ntfs_get_nr_significant_bytes(const s64 n)
-{
- s64 l = n;
- int i;
- s8 j;
-
- i = 0;
- do {
- l >>= 8;
- i++;
- } while (l != 0 && l != -1);
- j = (n >> 8 * (i - 1)) & 0xff;
- /* If the sign bit is wrong, we need an extra byte. */
- if ((n < 0 && j >= 0) || (n > 0 && j < 0))
- i++;
- return i;
-}
-
-/**
- * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array
- * @vol: ntfs volume (needed for the ntfs version)
- * @rl: locked runlist to determine the size of the mapping pairs of
- * @first_vcn: first vcn which to include in the mapping pairs array
- * @last_vcn: last vcn which to include in the mapping pairs array
- *
- * Walk the locked runlist @rl and calculate the size in bytes of the mapping
- * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and
- * finishing with vcn @last_vcn.
- *
- * A @last_vcn of -1 means end of runlist and in that case the size of the
- * mapping pairs array corresponding to the runlist starting at vcn @first_vcn
- * and finishing at the end of the runlist is determined.
- *
- * This for example allows us to allocate a buffer of the right size when
- * building the mapping pairs array.
- *
- * If @rl is NULL, just return 1 (for the single terminator byte).
- *
- * Return the calculated size in bytes on success. On error, return -errno.
- * The following error codes are defined:
- * -EINVAL - Run list contains unmapped elements. Make sure to only pass
- * fully mapped runlists to this function.
- * -EIO - The runlist is corrupt.
- *
- * Locking: @rl must be locked on entry (either for reading or writing), it
- * remains locked throughout, and is left locked upon return.
- */
-int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
- const runlist_element *rl, const VCN first_vcn,
- const VCN last_vcn)
-{
- LCN prev_lcn;
- int rls;
- bool the_end = false;
-
- BUG_ON(first_vcn < 0);
- BUG_ON(last_vcn < -1);
- BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
- if (!rl) {
- BUG_ON(first_vcn);
- BUG_ON(last_vcn > 0);
- return 1;
- }
- /* Skip to runlist element containing @first_vcn. */
- while (rl->length && first_vcn >= rl[1].vcn)
- rl++;
- if (unlikely((!rl->length && first_vcn > rl->vcn) ||
- first_vcn < rl->vcn))
- return -EINVAL;
- prev_lcn = 0;
- /* Always need the termining zero byte. */
- rls = 1;
- /* Do the first partial run if present. */
- if (first_vcn > rl->vcn) {
- s64 delta, length = rl->length;
-
- /* We know rl->length != 0 already. */
- if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
- goto err_out;
- /*
- * If @stop_vcn is given and finishes inside this run, cap the
- * run length.
- */
- if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
- s64 s1 = last_vcn + 1;
- if (unlikely(rl[1].vcn > s1))
- length = s1 - rl->vcn;
- the_end = true;
- }
- delta = first_vcn - rl->vcn;
- /* Header byte + length. */
- rls += 1 + ntfs_get_nr_significant_bytes(length - delta);
- /*
- * If the logical cluster number (lcn) denotes a hole and we
- * are on NTFS 3.0+, we don't store it at all, i.e. we need
- * zero space. On earlier NTFS versions we just store the lcn.
- * Note: this assumes that on NTFS 1.2-, holes are stored with
- * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
- */
- if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
- prev_lcn = rl->lcn;
- if (likely(rl->lcn >= 0))
- prev_lcn += delta;
- /* Change in lcn. */
- rls += ntfs_get_nr_significant_bytes(prev_lcn);
- }
- /* Go to next runlist element. */
- rl++;
- }
- /* Do the full runs. */
- for (; rl->length && !the_end; rl++) {
- s64 length = rl->length;
-
- if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
- goto err_out;
- /*
- * If @stop_vcn is given and finishes inside this run, cap the
- * run length.
- */
- if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
- s64 s1 = last_vcn + 1;
- if (unlikely(rl[1].vcn > s1))
- length = s1 - rl->vcn;
- the_end = true;
- }
- /* Header byte + length. */
- rls += 1 + ntfs_get_nr_significant_bytes(length);
- /*
- * If the logical cluster number (lcn) denotes a hole and we
- * are on NTFS 3.0+, we don't store it at all, i.e. we need
- * zero space. On earlier NTFS versions we just store the lcn.
- * Note: this assumes that on NTFS 1.2-, holes are stored with
- * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
- */
- if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
- /* Change in lcn. */
- rls += ntfs_get_nr_significant_bytes(rl->lcn -
- prev_lcn);
- prev_lcn = rl->lcn;
- }
- }
- return rls;
-err_out:
- if (rl->lcn == LCN_RL_NOT_MAPPED)
- rls = -EINVAL;
- else
- rls = -EIO;
- return rls;
-}
-
-/**
- * ntfs_write_significant_bytes - write the significant bytes of a number
- * @dst: destination buffer to write to
- * @dst_max: pointer to last byte of destination buffer for bounds checking
- * @n: number whose significant bytes to write
- *
- * Store in @dst, the minimum bytes of the number @n which are required to
- * identify @n unambiguously as a signed number, taking care not to exceed
- * @dest_max, the maximum position within @dst to which we are allowed to
- * write.
- *
- * This is used when building the mapping pairs array of a runlist to compress
- * a given logical cluster number (lcn) or a specific run length to the minimum
- * size possible.
- *
- * Return the number of bytes written on success. On error, i.e. the
- * destination buffer @dst is too small, return -ENOSPC.
- */
-static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max,
- const s64 n)
-{
- s64 l = n;
- int i;
- s8 j;
-
- i = 0;
- do {
- if (unlikely(dst > dst_max))
- goto err_out;
- *dst++ = l & 0xffll;
- l >>= 8;
- i++;
- } while (l != 0 && l != -1);
- j = (n >> 8 * (i - 1)) & 0xff;
- /* If the sign bit is wrong, we need an extra byte. */
- if (n < 0 && j >= 0) {
- if (unlikely(dst > dst_max))
- goto err_out;
- i++;
- *dst = (s8)-1;
- } else if (n > 0 && j < 0) {
- if (unlikely(dst > dst_max))
- goto err_out;
- i++;
- *dst = (s8)0;
- }
- return i;
-err_out:
- return -ENOSPC;
-}
-
-/**
- * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist
- * @vol: ntfs volume (needed for the ntfs version)
- * @dst: destination buffer to which to write the mapping pairs array
- * @dst_len: size of destination buffer @dst in bytes
- * @rl: locked runlist for which to build the mapping pairs array
- * @first_vcn: first vcn which to include in the mapping pairs array
- * @last_vcn: last vcn which to include in the mapping pairs array
- * @stop_vcn: first vcn outside destination buffer on success or -ENOSPC
- *
- * Create the mapping pairs array from the locked runlist @rl, starting at vcn
- * @first_vcn and finishing with vcn @last_vcn and save the array in @dst.
- * @dst_len is the size of @dst in bytes and it should be at least equal to the
- * value obtained by calling ntfs_get_size_for_mapping_pairs().
- *
- * A @last_vcn of -1 means end of runlist and in that case the mapping pairs
- * array corresponding to the runlist starting at vcn @first_vcn and finishing
- * at the end of the runlist is created.
- *
- * If @rl is NULL, just write a single terminator byte to @dst.
- *
- * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to
- * the first vcn outside the destination buffer. Note that on error, @dst has
- * been filled with all the mapping pairs that will fit, thus it can be treated
- * as partial success, in that a new attribute extent needs to be created or
- * the next extent has to be used and the mapping pairs build has to be
- * continued with @first_vcn set to *@stop_vcn.
- *
- * Return 0 on success and -errno on error. The following error codes are
- * defined:
- * -EINVAL - Run list contains unmapped elements. Make sure to only pass
- * fully mapped runlists to this function.
- * -EIO - The runlist is corrupt.
- * -ENOSPC - The destination buffer is too small.
- *
- * Locking: @rl must be locked on entry (either for reading or writing), it
- * remains locked throughout, and is left locked upon return.
- */
-int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
- const int dst_len, const runlist_element *rl,
- const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn)
-{
- LCN prev_lcn;
- s8 *dst_max, *dst_next;
- int err = -ENOSPC;
- bool the_end = false;
- s8 len_len, lcn_len;
-
- BUG_ON(first_vcn < 0);
- BUG_ON(last_vcn < -1);
- BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
- BUG_ON(dst_len < 1);
- if (!rl) {
- BUG_ON(first_vcn);
- BUG_ON(last_vcn > 0);
- if (stop_vcn)
- *stop_vcn = 0;
- /* Terminator byte. */
- *dst = 0;
- return 0;
- }
- /* Skip to runlist element containing @first_vcn. */
- while (rl->length && first_vcn >= rl[1].vcn)
- rl++;
- if (unlikely((!rl->length && first_vcn > rl->vcn) ||
- first_vcn < rl->vcn))
- return -EINVAL;
- /*
- * @dst_max is used for bounds checking in
- * ntfs_write_significant_bytes().
- */
- dst_max = dst + dst_len - 1;
- prev_lcn = 0;
- /* Do the first partial run if present. */
- if (first_vcn > rl->vcn) {
- s64 delta, length = rl->length;
-
- /* We know rl->length != 0 already. */
- if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
- goto err_out;
- /*
- * If @stop_vcn is given and finishes inside this run, cap the
- * run length.
- */
- if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
- s64 s1 = last_vcn + 1;
- if (unlikely(rl[1].vcn > s1))
- length = s1 - rl->vcn;
- the_end = true;
- }
- delta = first_vcn - rl->vcn;
- /* Write length. */
- len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
- length - delta);
- if (unlikely(len_len < 0))
- goto size_err;
- /*
- * If the logical cluster number (lcn) denotes a hole and we
- * are on NTFS 3.0+, we don't store it at all, i.e. we need
- * zero space. On earlier NTFS versions we just write the lcn
- * change. FIXME: Do we need to write the lcn change or just
- * the lcn in that case? Not sure as I have never seen this
- * case on NT4. - We assume that we just need to write the lcn
- * change until someone tells us otherwise... (AIA)
- */
- if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
- prev_lcn = rl->lcn;
- if (likely(rl->lcn >= 0))
- prev_lcn += delta;
- /* Write change in lcn. */
- lcn_len = ntfs_write_significant_bytes(dst + 1 +
- len_len, dst_max, prev_lcn);
- if (unlikely(lcn_len < 0))
- goto size_err;
- } else
- lcn_len = 0;
- dst_next = dst + len_len + lcn_len + 1;
- if (unlikely(dst_next > dst_max))
- goto size_err;
- /* Update header byte. */
- *dst = lcn_len << 4 | len_len;
- /* Position at next mapping pairs array element. */
- dst = dst_next;
- /* Go to next runlist element. */
- rl++;
- }
- /* Do the full runs. */
- for (; rl->length && !the_end; rl++) {
- s64 length = rl->length;
-
- if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
- goto err_out;
- /*
- * If @stop_vcn is given and finishes inside this run, cap the
- * run length.
- */
- if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
- s64 s1 = last_vcn + 1;
- if (unlikely(rl[1].vcn > s1))
- length = s1 - rl->vcn;
- the_end = true;
- }
- /* Write length. */
- len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
- length);
- if (unlikely(len_len < 0))
- goto size_err;
- /*
- * If the logical cluster number (lcn) denotes a hole and we
- * are on NTFS 3.0+, we don't store it at all, i.e. we need
- * zero space. On earlier NTFS versions we just write the lcn
- * change. FIXME: Do we need to write the lcn change or just
- * the lcn in that case? Not sure as I have never seen this
- * case on NT4. - We assume that we just need to write the lcn
- * change until someone tells us otherwise... (AIA)
- */
- if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
- /* Write change in lcn. */
- lcn_len = ntfs_write_significant_bytes(dst + 1 +
- len_len, dst_max, rl->lcn - prev_lcn);
- if (unlikely(lcn_len < 0))
- goto size_err;
- prev_lcn = rl->lcn;
- } else
- lcn_len = 0;
- dst_next = dst + len_len + lcn_len + 1;
- if (unlikely(dst_next > dst_max))
- goto size_err;
- /* Update header byte. */
- *dst = lcn_len << 4 | len_len;
- /* Position at next mapping pairs array element. */
- dst = dst_next;
- }
- /* Success. */
- err = 0;
-size_err:
- /* Set stop vcn. */
- if (stop_vcn)
- *stop_vcn = rl->vcn;
- /* Add terminator byte. */
- *dst = 0;
- return err;
-err_out:
- if (rl->lcn == LCN_RL_NOT_MAPPED)
- err = -EINVAL;
- else
- err = -EIO;
- return err;
-}
-
-/**
- * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn
- * @vol: ntfs volume (needed for error output)
- * @runlist: runlist to truncate
- * @new_length: the new length of the runlist in VCNs
- *
- * Truncate the runlist described by @runlist as well as the memory buffer
- * holding the runlist elements to a length of @new_length VCNs.
- *
- * If @new_length lies within the runlist, the runlist elements with VCNs of
- * @new_length and above are discarded. As a special case if @new_length is
- * zero, the runlist is discarded and set to NULL.
- *
- * If @new_length lies beyond the runlist, a sparse runlist element is added to
- * the end of the runlist @runlist or if the last runlist element is a sparse
- * one already, this is extended.
- *
- * Note, no checking is done for unmapped runlist elements. It is assumed that
- * the caller has mapped any elements that need to be mapped already.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: The caller must hold @runlist->lock for writing.
- */
-int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
- const s64 new_length)
-{
- runlist_element *rl;
- int old_size;
-
- ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length);
- BUG_ON(!runlist);
- BUG_ON(new_length < 0);
- rl = runlist->rl;
- if (!new_length) {
- ntfs_debug("Freeing runlist.");
- runlist->rl = NULL;
- if (rl)
- ntfs_free(rl);
- return 0;
- }
- if (unlikely(!rl)) {
- /*
- * Create a runlist consisting of a sparse runlist element of
- * length @new_length followed by a terminator runlist element.
- */
- rl = ntfs_malloc_nofs(PAGE_SIZE);
- if (unlikely(!rl)) {
- ntfs_error(vol->sb, "Not enough memory to allocate "
- "runlist element buffer.");
- return -ENOMEM;
- }
- runlist->rl = rl;
- rl[1].length = rl->vcn = 0;
- rl->lcn = LCN_HOLE;
- rl[1].vcn = rl->length = new_length;
- rl[1].lcn = LCN_ENOENT;
- return 0;
- }
- BUG_ON(new_length < rl->vcn);
- /* Find @new_length in the runlist. */
- while (likely(rl->length && new_length >= rl[1].vcn))
- rl++;
- /*
- * If not at the end of the runlist we need to shrink it.
- * If at the end of the runlist we need to expand it.
- */
- if (rl->length) {
- runlist_element *trl;
- bool is_end;
-
- ntfs_debug("Shrinking runlist.");
- /* Determine the runlist size. */
- trl = rl + 1;
- while (likely(trl->length))
- trl++;
- old_size = trl - runlist->rl + 1;
- /* Truncate the run. */
- rl->length = new_length - rl->vcn;
- /*
- * If a run was partially truncated, make the following runlist
- * element a terminator.
- */
- is_end = false;
- if (rl->length) {
- rl++;
- if (!rl->length)
- is_end = true;
- rl->vcn = new_length;
- rl->length = 0;
- }
- rl->lcn = LCN_ENOENT;
- /* Reallocate memory if necessary. */
- if (!is_end) {
- int new_size = rl - runlist->rl + 1;
- rl = ntfs_rl_realloc(runlist->rl, old_size, new_size);
- if (IS_ERR(rl))
- ntfs_warning(vol->sb, "Failed to shrink "
- "runlist buffer. This just "
- "wastes a bit of memory "
- "temporarily so we ignore it "
- "and return success.");
- else
- runlist->rl = rl;
- }
- } else if (likely(/* !rl->length && */ new_length > rl->vcn)) {
- ntfs_debug("Expanding runlist.");
- /*
- * If there is a previous runlist element and it is a sparse
- * one, extend it. Otherwise need to add a new, sparse runlist
- * element.
- */
- if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE))
- (rl - 1)->length = new_length - (rl - 1)->vcn;
- else {
- /* Determine the runlist size. */
- old_size = rl - runlist->rl + 1;
- /* Reallocate memory if necessary. */
- rl = ntfs_rl_realloc(runlist->rl, old_size,
- old_size + 1);
- if (IS_ERR(rl)) {
- ntfs_error(vol->sb, "Failed to expand runlist "
- "buffer, aborting.");
- return PTR_ERR(rl);
- }
- runlist->rl = rl;
- /*
- * Set @rl to the same runlist element in the new
- * runlist as before in the old runlist.
- */
- rl += old_size - 1;
- /* Add a new, sparse runlist element. */
- rl->lcn = LCN_HOLE;
- rl->length = new_length - rl->vcn;
- /* Add a new terminator runlist element. */
- rl++;
- rl->length = 0;
- }
- rl->vcn = new_length;
- rl->lcn = LCN_ENOENT;
- } else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ {
- /* Runlist already has same size as requested. */
- rl->lcn = LCN_ENOENT;
- }
- ntfs_debug("Done.");
- return 0;
-}
-
-/**
- * ntfs_rl_punch_nolock - punch a hole into a runlist
- * @vol: ntfs volume (needed for error output)
- * @runlist: runlist to punch a hole into
- * @start: starting VCN of the hole to be created
- * @length: size of the hole to be created in units of clusters
- *
- * Punch a hole into the runlist @runlist starting at VCN @start and of size
- * @length clusters.
- *
- * Return 0 on success and -errno on error, in which case @runlist has not been
- * modified.
- *
- * If @start and/or @start + @length are outside the runlist return error code
- * -ENOENT.
- *
- * If the runlist contains unmapped or error elements between @start and @start
- * + @length return error code -EINVAL.
- *
- * Locking: The caller must hold @runlist->lock for writing.
- */
-int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
- const VCN start, const s64 length)
-{
- const VCN end = start + length;
- s64 delta;
- runlist_element *rl, *rl_end, *rl_real_end, *trl;
- int old_size;
- bool lcn_fixup = false;
-
- ntfs_debug("Entering for start 0x%llx, length 0x%llx.",
- (long long)start, (long long)length);
- BUG_ON(!runlist);
- BUG_ON(start < 0);
- BUG_ON(length < 0);
- BUG_ON(end < 0);
- rl = runlist->rl;
- if (unlikely(!rl)) {
- if (likely(!start && !length))
- return 0;
- return -ENOENT;
- }
- /* Find @start in the runlist. */
- while (likely(rl->length && start >= rl[1].vcn))
- rl++;
- rl_end = rl;
- /* Find @end in the runlist. */
- while (likely(rl_end->length && end >= rl_end[1].vcn)) {
- /* Verify there are no unmapped or error elements. */
- if (unlikely(rl_end->lcn < LCN_HOLE))
- return -EINVAL;
- rl_end++;
- }
- /* Check the last element. */
- if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE))
- return -EINVAL;
- /* This covers @start being out of bounds, too. */
- if (!rl_end->length && end > rl_end->vcn)
- return -ENOENT;
- if (!length)
- return 0;
- if (!rl->length)
- return -ENOENT;
- rl_real_end = rl_end;
- /* Determine the runlist size. */
- while (likely(rl_real_end->length))
- rl_real_end++;
- old_size = rl_real_end - runlist->rl + 1;
- /* If @start is in a hole simply extend the hole. */
- if (rl->lcn == LCN_HOLE) {
- /*
- * If both @start and @end are in the same sparse run, we are
- * done.
- */
- if (end <= rl[1].vcn) {
- ntfs_debug("Done (requested hole is already sparse).");
- return 0;
- }
-extend_hole:
- /* Extend the hole. */
- rl->length = end - rl->vcn;
- /* If @end is in a hole, merge it with the current one. */
- if (rl_end->lcn == LCN_HOLE) {
- rl_end++;
- rl->length = rl_end->vcn - rl->vcn;
- }
- /* We have done the hole. Now deal with the remaining tail. */
- rl++;
- /* Cut out all runlist elements up to @end. */
- if (rl < rl_end)
- memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
- sizeof(*rl));
- /* Adjust the beginning of the tail if necessary. */
- if (end > rl->vcn) {
- delta = end - rl->vcn;
- rl->vcn = end;
- rl->length -= delta;
- /* Only adjust the lcn if it is real. */
- if (rl->lcn >= 0)
- rl->lcn += delta;
- }
-shrink_allocation:
- /* Reallocate memory if the allocation changed. */
- if (rl < rl_end) {
- rl = ntfs_rl_realloc(runlist->rl, old_size,
- old_size - (rl_end - rl));
- if (IS_ERR(rl))
- ntfs_warning(vol->sb, "Failed to shrink "
- "runlist buffer. This just "
- "wastes a bit of memory "
- "temporarily so we ignore it "
- "and return success.");
- else
- runlist->rl = rl;
- }
- ntfs_debug("Done (extend hole).");
- return 0;
- }
- /*
- * If @start is at the beginning of a run things are easier as there is
- * no need to split the first run.
- */
- if (start == rl->vcn) {
- /*
- * @start is at the beginning of a run.
- *
- * If the previous run is sparse, extend its hole.
- *
- * If @end is not in the same run, switch the run to be sparse
- * and extend the newly created hole.
- *
- * Thus both of these cases reduce the problem to the above
- * case of "@start is in a hole".
- */
- if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) {
- rl--;
- goto extend_hole;
- }
- if (end >= rl[1].vcn) {
- rl->lcn = LCN_HOLE;
- goto extend_hole;
- }
- /*
- * The final case is when @end is in the same run as @start.
- * For this need to split the run into two. One run for the
- * sparse region between the beginning of the old run, i.e.
- * @start, and @end and one for the remaining non-sparse
- * region, i.e. between @end and the end of the old run.
- */
- trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
- if (IS_ERR(trl))
- goto enomem_out;
- old_size++;
- if (runlist->rl != trl) {
- rl = trl + (rl - runlist->rl);
- rl_end = trl + (rl_end - runlist->rl);
- rl_real_end = trl + (rl_real_end - runlist->rl);
- runlist->rl = trl;
- }
-split_end:
- /* Shift all the runs up by one. */
- memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl));
- /* Finally, setup the two split runs. */
- rl->lcn = LCN_HOLE;
- rl->length = length;
- rl++;
- rl->vcn += length;
- /* Only adjust the lcn if it is real. */
- if (rl->lcn >= 0 || lcn_fixup)
- rl->lcn += length;
- rl->length -= length;
- ntfs_debug("Done (split one).");
- return 0;
- }
- /*
- * @start is neither in a hole nor at the beginning of a run.
- *
- * If @end is in a hole, things are easier as simply truncating the run
- * @start is in to end at @start - 1, deleting all runs after that up
- * to @end, and finally extending the beginning of the run @end is in
- * to be @start is all that is needed.
- */
- if (rl_end->lcn == LCN_HOLE) {
- /* Truncate the run containing @start. */
- rl->length = start - rl->vcn;
- rl++;
- /* Cut out all runlist elements up to @end. */
- if (rl < rl_end)
- memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
- sizeof(*rl));
- /* Extend the beginning of the run @end is in to be @start. */
- rl->vcn = start;
- rl->length = rl[1].vcn - start;
- goto shrink_allocation;
- }
- /*
- * If @end is not in a hole there are still two cases to distinguish.
- * Either @end is or is not in the same run as @start.
- *
- * The second case is easier as it can be reduced to an already solved
- * problem by truncating the run @start is in to end at @start - 1.
- * Then, if @end is in the next run need to split the run into a sparse
- * run followed by a non-sparse run (already covered above) and if @end
- * is not in the next run switching it to be sparse, again reduces the
- * problem to the already covered case of "@start is in a hole".
- */
- if (end >= rl[1].vcn) {
- /*
- * If @end is not in the next run, reduce the problem to the
- * case of "@start is in a hole".
- */
- if (rl[1].length && end >= rl[2].vcn) {
- /* Truncate the run containing @start. */
- rl->length = start - rl->vcn;
- rl++;
- rl->vcn = start;
- rl->lcn = LCN_HOLE;
- goto extend_hole;
- }
- trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
- if (IS_ERR(trl))
- goto enomem_out;
- old_size++;
- if (runlist->rl != trl) {
- rl = trl + (rl - runlist->rl);
- rl_end = trl + (rl_end - runlist->rl);
- rl_real_end = trl + (rl_real_end - runlist->rl);
- runlist->rl = trl;
- }
- /* Truncate the run containing @start. */
- rl->length = start - rl->vcn;
- rl++;
- /*
- * @end is in the next run, reduce the problem to the case
- * where "@start is at the beginning of a run and @end is in
- * the same run as @start".
- */
- delta = rl->vcn - start;
- rl->vcn = start;
- if (rl->lcn >= 0) {
- rl->lcn -= delta;
- /* Need this in case the lcn just became negative. */
- lcn_fixup = true;
- }
- rl->length += delta;
- goto split_end;
- }
- /*
- * The first case from above, i.e. @end is in the same run as @start.
- * We need to split the run into three. One run for the non-sparse
- * region between the beginning of the old run and @start, one for the
- * sparse region between @start and @end, and one for the remaining
- * non-sparse region, i.e. between @end and the end of the old run.
- */
- trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2);
- if (IS_ERR(trl))
- goto enomem_out;
- old_size += 2;
- if (runlist->rl != trl) {
- rl = trl + (rl - runlist->rl);
- rl_end = trl + (rl_end - runlist->rl);
- rl_real_end = trl + (rl_real_end - runlist->rl);
- runlist->rl = trl;
- }
- /* Shift all the runs up by two. */
- memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl));
- /* Finally, setup the three split runs. */
- rl->length = start - rl->vcn;
- rl++;
- rl->vcn = start;
- rl->lcn = LCN_HOLE;
- rl->length = length;
- rl++;
- delta = end - rl->vcn;
- rl->vcn = end;
- rl->lcn += delta;
- rl->length -= delta;
- ntfs_debug("Done (split both).");
- return 0;
-enomem_out:
- ntfs_error(vol->sb, "Not enough memory to extend runlist buffer.");
- return -ENOMEM;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h
deleted file mode 100644
index 38de0a375f59..000000000000
--- a/fs/ntfs/runlist.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * runlist.h - Defines for runlist handling in NTFS Linux kernel driver.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_RUNLIST_H
-#define _LINUX_NTFS_RUNLIST_H
-
-#include "types.h"
-#include "layout.h"
-#include "volume.h"
-
-/**
- * runlist_element - in memory vcn to lcn mapping array element
- * @vcn: starting vcn of the current array element
- * @lcn: starting lcn of the current array element
- * @length: length in clusters of the current array element
- *
- * The last vcn (in fact the last vcn + 1) is reached when length == 0.
- *
- * When lcn == -1 this means that the count vcns starting at vcn are not
- * physically allocated (i.e. this is a hole / data is sparse).
- */
-typedef struct { /* In memory vcn to lcn mapping structure element. */
- VCN vcn; /* vcn = Starting virtual cluster number. */
- LCN lcn; /* lcn = Starting logical cluster number. */
- s64 length; /* Run length in clusters. */
-} runlist_element;
-
-/**
- * runlist - in memory vcn to lcn mapping array including a read/write lock
- * @rl: pointer to an array of runlist elements
- * @lock: read/write spinlock for serializing access to @rl
- *
- */
-typedef struct {
- runlist_element *rl;
- struct rw_semaphore lock;
-} runlist;
-
-static inline void ntfs_init_runlist(runlist *rl)
-{
- rl->rl = NULL;
- init_rwsem(&rl->lock);
-}
-
-typedef enum {
- LCN_HOLE = -1, /* Keep this as highest value or die! */
- LCN_RL_NOT_MAPPED = -2,
- LCN_ENOENT = -3,
- LCN_ENOMEM = -4,
- LCN_EIO = -5,
-} LCN_SPECIAL_VALUES;
-
-extern runlist_element *ntfs_runlists_merge(runlist_element *drl,
- runlist_element *srl);
-
-extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
- const ATTR_RECORD *attr, runlist_element *old_rl);
-
-extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn);
-
-#ifdef NTFS_RW
-
-extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl,
- const VCN vcn);
-
-extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
- const runlist_element *rl, const VCN first_vcn,
- const VCN last_vcn);
-
-extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
- const int dst_len, const runlist_element *rl,
- const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn);
-
-extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol,
- runlist *const runlist, const s64 new_length);
-
-int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
- const VCN start, const s64 length);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_RUNLIST_H */
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
deleted file mode 100644
index 56a7d5bd33e4..000000000000
--- a/fs/ntfs/super.c
+++ /dev/null
@@ -1,3202 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2001,2002 Richard Russon
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/stddef.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/spinlock.h>
-#include <linux/blkdev.h> /* For bdev_logical_block_size(). */
-#include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
-#include <linux/vfs.h>
-#include <linux/moduleparam.h>
-#include <linux/bitmap.h>
-
-#include "sysctl.h"
-#include "logfile.h"
-#include "quota.h"
-#include "usnjrnl.h"
-#include "dir.h"
-#include "debug.h"
-#include "index.h"
-#include "inode.h"
-#include "aops.h"
-#include "layout.h"
-#include "malloc.h"
-#include "ntfs.h"
-
-/* Number of mounted filesystems which have compression enabled. */
-static unsigned long ntfs_nr_compression_users;
-
-/* A global default upcase table and a corresponding reference count. */
-static ntfschar *default_upcase;
-static unsigned long ntfs_nr_upcase_users;
-
-/* Error constants/strings used in inode.c::ntfs_show_options(). */
-typedef enum {
- /* One of these must be present, default is ON_ERRORS_CONTINUE. */
- ON_ERRORS_PANIC = 0x01,
- ON_ERRORS_REMOUNT_RO = 0x02,
- ON_ERRORS_CONTINUE = 0x04,
- /* Optional, can be combined with any of the above. */
- ON_ERRORS_RECOVER = 0x10,
-} ON_ERRORS_ACTIONS;
-
-const option_t on_errors_arr[] = {
- { ON_ERRORS_PANIC, "panic" },
- { ON_ERRORS_REMOUNT_RO, "remount-ro", },
- { ON_ERRORS_CONTINUE, "continue", },
- { ON_ERRORS_RECOVER, "recover" },
- { 0, NULL }
-};
-
-/**
- * simple_getbool - convert input string to a boolean value
- * @s: input string to convert
- * @setval: where to store the output boolean value
- *
- * Copied from old ntfs driver (which copied from vfat driver).
- *
- * "1", "yes", "true", or an empty string are converted to %true.
- * "0", "no", and "false" are converted to %false.
- *
- * Return: %1 if the string is converted or was empty and *setval contains it;
- * %0 if the string was not valid.
- */
-static int simple_getbool(char *s, bool *setval)
-{
- if (s) {
- if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true"))
- *setval = true;
- else if (!strcmp(s, "0") || !strcmp(s, "no") ||
- !strcmp(s, "false"))
- *setval = false;
- else
- return 0;
- } else
- *setval = true;
- return 1;
-}
-
-/**
- * parse_options - parse the (re)mount options
- * @vol: ntfs volume
- * @opt: string containing the (re)mount options
- *
- * Parse the recognized options in @opt for the ntfs volume described by @vol.
- */
-static bool parse_options(ntfs_volume *vol, char *opt)
-{
- char *p, *v, *ov;
- static char *utf8 = "utf8";
- int errors = 0, sloppy = 0;
- kuid_t uid = INVALID_UID;
- kgid_t gid = INVALID_GID;
- umode_t fmask = (umode_t)-1, dmask = (umode_t)-1;
- int mft_zone_multiplier = -1, on_errors = -1;
- int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1;
- struct nls_table *nls_map = NULL, *old_nls;
-
- /* I am lazy... (-8 */
-#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value) \
- if (!strcmp(p, option)) { \
- if (!v || !*v) \
- variable = default_value; \
- else { \
- variable = simple_strtoul(ov = v, &v, 0); \
- if (*v) \
- goto needs_val; \
- } \
- }
-#define NTFS_GETOPT(option, variable) \
- if (!strcmp(p, option)) { \
- if (!v || !*v) \
- goto needs_arg; \
- variable = simple_strtoul(ov = v, &v, 0); \
- if (*v) \
- goto needs_val; \
- }
-#define NTFS_GETOPT_UID(option, variable) \
- if (!strcmp(p, option)) { \
- uid_t uid_value; \
- if (!v || !*v) \
- goto needs_arg; \
- uid_value = simple_strtoul(ov = v, &v, 0); \
- if (*v) \
- goto needs_val; \
- variable = make_kuid(current_user_ns(), uid_value); \
- if (!uid_valid(variable)) \
- goto needs_val; \
- }
-#define NTFS_GETOPT_GID(option, variable) \
- if (!strcmp(p, option)) { \
- gid_t gid_value; \
- if (!v || !*v) \
- goto needs_arg; \
- gid_value = simple_strtoul(ov = v, &v, 0); \
- if (*v) \
- goto needs_val; \
- variable = make_kgid(current_user_ns(), gid_value); \
- if (!gid_valid(variable)) \
- goto needs_val; \
- }
-#define NTFS_GETOPT_OCTAL(option, variable) \
- if (!strcmp(p, option)) { \
- if (!v || !*v) \
- goto needs_arg; \
- variable = simple_strtoul(ov = v, &v, 8); \
- if (*v) \
- goto needs_val; \
- }
-#define NTFS_GETOPT_BOOL(option, variable) \
- if (!strcmp(p, option)) { \
- bool val; \
- if (!simple_getbool(v, &val)) \
- goto needs_bool; \
- variable = val; \
- }
-#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array) \
- if (!strcmp(p, option)) { \
- int _i; \
- if (!v || !*v) \
- goto needs_arg; \
- ov = v; \
- if (variable == -1) \
- variable = 0; \
- for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \
- if (!strcmp(opt_array[_i].str, v)) { \
- variable |= opt_array[_i].val; \
- break; \
- } \
- if (!opt_array[_i].str || !*opt_array[_i].str) \
- goto needs_val; \
- }
- if (!opt || !*opt)
- goto no_mount_options;
- ntfs_debug("Entering with mount options string: %s", opt);
- while ((p = strsep(&opt, ","))) {
- if ((v = strchr(p, '=')))
- *v++ = 0;
- NTFS_GETOPT_UID("uid", uid)
- else NTFS_GETOPT_GID("gid", gid)
- else NTFS_GETOPT_OCTAL("umask", fmask = dmask)
- else NTFS_GETOPT_OCTAL("fmask", fmask)
- else NTFS_GETOPT_OCTAL("dmask", dmask)
- else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier)
- else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true)
- else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files)
- else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive)
- else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse)
- else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors,
- on_errors_arr)
- else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes"))
- ntfs_warning(vol->sb, "Ignoring obsolete option %s.",
- p);
- else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) {
- if (!strcmp(p, "iocharset"))
- ntfs_warning(vol->sb, "Option iocharset is "
- "deprecated. Please use "
- "option nls=<charsetname> in "
- "the future.");
- if (!v || !*v)
- goto needs_arg;
-use_utf8:
- old_nls = nls_map;
- nls_map = load_nls(v);
- if (!nls_map) {
- if (!old_nls) {
- ntfs_error(vol->sb, "NLS character set "
- "%s not found.", v);
- return false;
- }
- ntfs_error(vol->sb, "NLS character set %s not "
- "found. Using previous one %s.",
- v, old_nls->charset);
- nls_map = old_nls;
- } else /* nls_map */ {
- unload_nls(old_nls);
- }
- } else if (!strcmp(p, "utf8")) {
- bool val = false;
- ntfs_warning(vol->sb, "Option utf8 is no longer "
- "supported, using option nls=utf8. Please "
- "use option nls=utf8 in the future and "
- "make sure utf8 is compiled either as a "
- "module or into the kernel.");
- if (!v || !*v)
- val = true;
- else if (!simple_getbool(v, &val))
- goto needs_bool;
- if (val) {
- v = utf8;
- goto use_utf8;
- }
- } else {
- ntfs_error(vol->sb, "Unrecognized mount option %s.", p);
- if (errors < INT_MAX)
- errors++;
- }
-#undef NTFS_GETOPT_OPTIONS_ARRAY
-#undef NTFS_GETOPT_BOOL
-#undef NTFS_GETOPT
-#undef NTFS_GETOPT_WITH_DEFAULT
- }
-no_mount_options:
- if (errors && !sloppy)
- return false;
- if (sloppy)
- ntfs_warning(vol->sb, "Sloppy option given. Ignoring "
- "unrecognized mount option(s) and continuing.");
- /* Keep this first! */
- if (on_errors != -1) {
- if (!on_errors) {
- ntfs_error(vol->sb, "Invalid errors option argument "
- "or bug in options parser.");
- return false;
- }
- }
- if (nls_map) {
- if (vol->nls_map && vol->nls_map != nls_map) {
- ntfs_error(vol->sb, "Cannot change NLS character set "
- "on remount.");
- return false;
- } /* else (!vol->nls_map) */
- ntfs_debug("Using NLS character set %s.", nls_map->charset);
- vol->nls_map = nls_map;
- } else /* (!nls_map) */ {
- if (!vol->nls_map) {
- vol->nls_map = load_nls_default();
- if (!vol->nls_map) {
- ntfs_error(vol->sb, "Failed to load default "
- "NLS character set.");
- return false;
- }
- ntfs_debug("Using default NLS character set (%s).",
- vol->nls_map->charset);
- }
- }
- if (mft_zone_multiplier != -1) {
- if (vol->mft_zone_multiplier && vol->mft_zone_multiplier !=
- mft_zone_multiplier) {
- ntfs_error(vol->sb, "Cannot change mft_zone_multiplier "
- "on remount.");
- return false;
- }
- if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) {
- ntfs_error(vol->sb, "Invalid mft_zone_multiplier. "
- "Using default value, i.e. 1.");
- mft_zone_multiplier = 1;
- }
- vol->mft_zone_multiplier = mft_zone_multiplier;
- }
- if (!vol->mft_zone_multiplier)
- vol->mft_zone_multiplier = 1;
- if (on_errors != -1)
- vol->on_errors = on_errors;
- if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER)
- vol->on_errors |= ON_ERRORS_CONTINUE;
- if (uid_valid(uid))
- vol->uid = uid;
- if (gid_valid(gid))
- vol->gid = gid;
- if (fmask != (umode_t)-1)
- vol->fmask = fmask;
- if (dmask != (umode_t)-1)
- vol->dmask = dmask;
- if (show_sys_files != -1) {
- if (show_sys_files)
- NVolSetShowSystemFiles(vol);
- else
- NVolClearShowSystemFiles(vol);
- }
- if (case_sensitive != -1) {
- if (case_sensitive)
- NVolSetCaseSensitive(vol);
- else
- NVolClearCaseSensitive(vol);
- }
- if (disable_sparse != -1) {
- if (disable_sparse)
- NVolClearSparseEnabled(vol);
- else {
- if (!NVolSparseEnabled(vol) &&
- vol->major_ver && vol->major_ver < 3)
- ntfs_warning(vol->sb, "Not enabling sparse "
- "support due to NTFS volume "
- "version %i.%i (need at least "
- "version 3.0).", vol->major_ver,
- vol->minor_ver);
- else
- NVolSetSparseEnabled(vol);
- }
- }
- return true;
-needs_arg:
- ntfs_error(vol->sb, "The %s option requires an argument.", p);
- return false;
-needs_bool:
- ntfs_error(vol->sb, "The %s option requires a boolean argument.", p);
- return false;
-needs_val:
- ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov);
- return false;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_write_volume_flags - write new flags to the volume information flags
- * @vol: ntfs volume on which to modify the flags
- * @flags: new flags value for the volume information flags
- *
- * Internal function. You probably want to use ntfs_{set,clear}_volume_flags()
- * instead (see below).
- *
- * Replace the volume information flags on the volume @vol with the value
- * supplied in @flags. Note, this overwrites the volume information flags, so
- * make sure to combine the flags you want to modify with the old flags and use
- * the result when calling ntfs_write_volume_flags().
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags)
-{
- ntfs_inode *ni = NTFS_I(vol->vol_ino);
- MFT_RECORD *m;
- VOLUME_INFORMATION *vi;
- ntfs_attr_search_ctx *ctx;
- int err;
-
- ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.",
- le16_to_cpu(vol->vol_flags), le16_to_cpu(flags));
- if (vol->vol_flags == flags)
- goto done;
- BUG_ON(!ni);
- m = map_mft_record(ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(ni, m);
- if (!ctx) {
- err = -ENOMEM;
- goto put_unm_err_out;
- }
- err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
- ctx);
- if (err)
- goto put_unm_err_out;
- vi = (VOLUME_INFORMATION*)((u8*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset));
- vol->vol_flags = vi->flags = flags;
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
-done:
- ntfs_debug("Done.");
- return 0;
-put_unm_err_out:
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
-err_out:
- ntfs_error(vol->sb, "Failed with error code %i.", -err);
- return err;
-}
-
-/**
- * ntfs_set_volume_flags - set bits in the volume information flags
- * @vol: ntfs volume on which to modify the flags
- * @flags: flags to set on the volume
- *
- * Set the bits in @flags in the volume information flags on the volume @vol.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
-{
- flags &= VOLUME_FLAGS_MASK;
- return ntfs_write_volume_flags(vol, vol->vol_flags | flags);
-}
-
-/**
- * ntfs_clear_volume_flags - clear bits in the volume information flags
- * @vol: ntfs volume on which to modify the flags
- * @flags: flags to clear on the volume
- *
- * Clear the bits in @flags in the volume information flags on the volume @vol.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
-{
- flags &= VOLUME_FLAGS_MASK;
- flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags));
- return ntfs_write_volume_flags(vol, flags);
-}
-
-#endif /* NTFS_RW */
-
-/**
- * ntfs_remount - change the mount options of a mounted ntfs filesystem
- * @sb: superblock of mounted ntfs filesystem
- * @flags: remount flags
- * @opt: remount options string
- *
- * Change the mount options of an already mounted ntfs filesystem.
- *
- * NOTE: The VFS sets the @sb->s_flags remount flags to @flags after
- * ntfs_remount() returns successfully (i.e. returns 0). Otherwise,
- * @sb->s_flags are not changed.
- */
-static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
-{
- ntfs_volume *vol = NTFS_SB(sb);
-
- ntfs_debug("Entering with remount options string: %s", opt);
-
- sync_filesystem(sb);
-
-#ifndef NTFS_RW
- /* For read-only compiled driver, enforce read-only flag. */
- *flags |= SB_RDONLY;
-#else /* NTFS_RW */
- /*
- * For the read-write compiled driver, if we are remounting read-write,
- * make sure there are no volume errors and that no unsupported volume
- * flags are set. Also, empty the logfile journal as it would become
- * stale as soon as something is written to the volume and mark the
- * volume dirty so that chkdsk is run if the volume is not umounted
- * cleanly. Finally, mark the quotas out of date so Windows rescans
- * the volume on boot and updates them.
- *
- * When remounting read-only, mark the volume clean if no volume errors
- * have occurred.
- */
- if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) {
- static const char *es = ". Cannot remount read-write.";
-
- /* Remounting read-write. */
- if (NVolErrors(vol)) {
- ntfs_error(sb, "Volume has errors and is read-only%s",
- es);
- return -EROFS;
- }
- if (vol->vol_flags & VOLUME_IS_DIRTY) {
- ntfs_error(sb, "Volume is dirty and read-only%s", es);
- return -EROFS;
- }
- if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
- ntfs_error(sb, "Volume has been modified by chkdsk "
- "and is read-only%s", es);
- return -EROFS;
- }
- if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
- ntfs_error(sb, "Volume has unsupported flags set "
- "(0x%x) and is read-only%s",
- (unsigned)le16_to_cpu(vol->vol_flags),
- es);
- return -EROFS;
- }
- if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
- ntfs_error(sb, "Failed to set dirty bit in volume "
- "information flags%s", es);
- return -EROFS;
- }
-#if 0
- // TODO: Enable this code once we start modifying anything that
- // is different between NTFS 1.2 and 3.x...
- /* Set NT4 compatibility flag on newer NTFS version volumes. */
- if ((vol->major_ver > 1)) {
- if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
- ntfs_error(sb, "Failed to set NT4 "
- "compatibility flag%s", es);
- NVolSetErrors(vol);
- return -EROFS;
- }
- }
-#endif
- if (!ntfs_empty_logfile(vol->logfile_ino)) {
- ntfs_error(sb, "Failed to empty journal $LogFile%s",
- es);
- NVolSetErrors(vol);
- return -EROFS;
- }
- if (!ntfs_mark_quotas_out_of_date(vol)) {
- ntfs_error(sb, "Failed to mark quotas out of date%s",
- es);
- NVolSetErrors(vol);
- return -EROFS;
- }
- if (!ntfs_stamp_usnjrnl(vol)) {
- ntfs_error(sb, "Failed to stamp transaction log "
- "($UsnJrnl)%s", es);
- NVolSetErrors(vol);
- return -EROFS;
- }
- } else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) {
- /* Remounting read-only. */
- if (!NVolErrors(vol)) {
- if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
- ntfs_warning(sb, "Failed to clear dirty bit "
- "in volume information "
- "flags. Run chkdsk.");
- }
- }
-#endif /* NTFS_RW */
-
- // TODO: Deal with *flags.
-
- if (!parse_options(vol, opt))
- return -EINVAL;
-
- ntfs_debug("Done.");
- return 0;
-}
-
-/**
- * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector
- * @sb: Super block of the device to which @b belongs.
- * @b: Boot sector of device @sb to check.
- * @silent: If 'true', all output will be silenced.
- *
- * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot
- * sector. Returns 'true' if it is valid and 'false' if not.
- *
- * @sb is only needed for warning/error output, i.e. it can be NULL when silent
- * is 'true'.
- */
-static bool is_boot_sector_ntfs(const struct super_block *sb,
- const NTFS_BOOT_SECTOR *b, const bool silent)
-{
- /*
- * Check that checksum == sum of u32 values from b to the checksum
- * field. If checksum is zero, no checking is done. We will work when
- * the checksum test fails, since some utilities update the boot sector
- * ignoring the checksum which leaves the checksum out-of-date. We
- * report a warning if this is the case.
- */
- if ((void*)b < (void*)&b->checksum && b->checksum && !silent) {
- le32 *u;
- u32 i;
-
- for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u)
- i += le32_to_cpup(u);
- if (le32_to_cpu(b->checksum) != i)
- ntfs_warning(sb, "Invalid boot sector checksum.");
- }
- /* Check OEMidentifier is "NTFS " */
- if (b->oem_id != magicNTFS)
- goto not_ntfs;
- /* Check bytes per sector value is between 256 and 4096. */
- if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 ||
- le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000)
- goto not_ntfs;
- /* Check sectors per cluster value is valid. */
- switch (b->bpb.sectors_per_cluster) {
- case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128:
- break;
- default:
- goto not_ntfs;
- }
- /* Check the cluster size is not above the maximum (64kiB). */
- if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) *
- b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE)
- goto not_ntfs;
- /* Check reserved/unused fields are really zero. */
- if (le16_to_cpu(b->bpb.reserved_sectors) ||
- le16_to_cpu(b->bpb.root_entries) ||
- le16_to_cpu(b->bpb.sectors) ||
- le16_to_cpu(b->bpb.sectors_per_fat) ||
- le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats)
- goto not_ntfs;
- /* Check clusters per file mft record value is valid. */
- if ((u8)b->clusters_per_mft_record < 0xe1 ||
- (u8)b->clusters_per_mft_record > 0xf7)
- switch (b->clusters_per_mft_record) {
- case 1: case 2: case 4: case 8: case 16: case 32: case 64:
- break;
- default:
- goto not_ntfs;
- }
- /* Check clusters per index block value is valid. */
- if ((u8)b->clusters_per_index_record < 0xe1 ||
- (u8)b->clusters_per_index_record > 0xf7)
- switch (b->clusters_per_index_record) {
- case 1: case 2: case 4: case 8: case 16: case 32: case 64:
- break;
- default:
- goto not_ntfs;
- }
- /*
- * Check for valid end of sector marker. We will work without it, but
- * many BIOSes will refuse to boot from a bootsector if the magic is
- * incorrect, so we emit a warning.
- */
- if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
- ntfs_warning(sb, "Invalid end of sector marker.");
- return true;
-not_ntfs:
- return false;
-}
-
-/**
- * read_ntfs_boot_sector - read the NTFS boot sector of a device
- * @sb: super block of device to read the boot sector from
- * @silent: if true, suppress all output
- *
- * Reads the boot sector from the device and validates it. If that fails, tries
- * to read the backup boot sector, first from the end of the device a-la NT4 and
- * later and then from the middle of the device a-la NT3.51 and before.
- *
- * If a valid boot sector is found but it is not the primary boot sector, we
- * repair the primary boot sector silently (unless the device is read-only or
- * the primary boot sector is not accessible).
- *
- * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super
- * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized
- * to their respective values.
- *
- * Return the unlocked buffer head containing the boot sector or NULL on error.
- */
-static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb,
- const int silent)
-{
- const char *read_err_str = "Unable to read %s boot sector.";
- struct buffer_head *bh_primary, *bh_backup;
- sector_t nr_blocks = NTFS_SB(sb)->nr_blocks;
-
- /* Try to read primary boot sector. */
- if ((bh_primary = sb_bread(sb, 0))) {
- if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
- bh_primary->b_data, silent))
- return bh_primary;
- if (!silent)
- ntfs_error(sb, "Primary boot sector is invalid.");
- } else if (!silent)
- ntfs_error(sb, read_err_str, "primary");
- if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) {
- if (bh_primary)
- brelse(bh_primary);
- if (!silent)
- ntfs_error(sb, "Mount option errors=recover not used. "
- "Aborting without trying to recover.");
- return NULL;
- }
- /* Try to read NT4+ backup boot sector. */
- if ((bh_backup = sb_bread(sb, nr_blocks - 1))) {
- if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
- bh_backup->b_data, silent))
- goto hotfix_primary_boot_sector;
- brelse(bh_backup);
- } else if (!silent)
- ntfs_error(sb, read_err_str, "backup");
- /* Try to read NT3.51- backup boot sector. */
- if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) {
- if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
- bh_backup->b_data, silent))
- goto hotfix_primary_boot_sector;
- if (!silent)
- ntfs_error(sb, "Could not find a valid backup boot "
- "sector.");
- brelse(bh_backup);
- } else if (!silent)
- ntfs_error(sb, read_err_str, "backup");
- /* We failed. Cleanup and return. */
- if (bh_primary)
- brelse(bh_primary);
- return NULL;
-hotfix_primary_boot_sector:
- if (bh_primary) {
- /*
- * If we managed to read sector zero and the volume is not
- * read-only, copy the found, valid backup boot sector to the
- * primary boot sector. Note we only copy the actual boot
- * sector structure, not the actual whole device sector as that
- * may be bigger and would potentially damage the $Boot system
- * file (FIXME: Would be nice to know if the backup boot sector
- * on a large sector device contains the whole boot loader or
- * just the first 512 bytes).
- */
- if (!sb_rdonly(sb)) {
- ntfs_warning(sb, "Hot-fix: Recovering invalid primary "
- "boot sector from backup copy.");
- memcpy(bh_primary->b_data, bh_backup->b_data,
- NTFS_BLOCK_SIZE);
- mark_buffer_dirty(bh_primary);
- sync_dirty_buffer(bh_primary);
- if (buffer_uptodate(bh_primary)) {
- brelse(bh_backup);
- return bh_primary;
- }
- ntfs_error(sb, "Hot-fix: Device write error while "
- "recovering primary boot sector.");
- } else {
- ntfs_warning(sb, "Hot-fix: Recovery of primary boot "
- "sector failed: Read-only mount.");
- }
- brelse(bh_primary);
- }
- ntfs_warning(sb, "Using backup boot sector.");
- return bh_backup;
-}
-
-/**
- * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol
- * @vol: volume structure to initialise with data from boot sector
- * @b: boot sector to parse
- *
- * Parse the ntfs boot sector @b and store all imporant information therein in
- * the ntfs super block @vol. Return 'true' on success and 'false' on error.
- */
-static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
-{
- unsigned int sectors_per_cluster_bits, nr_hidden_sects;
- int clusters_per_mft_record, clusters_per_index_record;
- s64 ll;
-
- vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector);
- vol->sector_size_bits = ffs(vol->sector_size) - 1;
- ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size,
- vol->sector_size);
- ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits,
- vol->sector_size_bits);
- if (vol->sector_size < vol->sb->s_blocksize) {
- ntfs_error(vol->sb, "Sector size (%i) is smaller than the "
- "device block size (%lu). This is not "
- "supported. Sorry.", vol->sector_size,
- vol->sb->s_blocksize);
- return false;
- }
- ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster);
- sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1;
- ntfs_debug("sectors_per_cluster_bits = 0x%x",
- sectors_per_cluster_bits);
- nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors);
- ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects);
- vol->cluster_size = vol->sector_size << sectors_per_cluster_bits;
- vol->cluster_size_mask = vol->cluster_size - 1;
- vol->cluster_size_bits = ffs(vol->cluster_size) - 1;
- ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size,
- vol->cluster_size);
- ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask);
- ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits);
- if (vol->cluster_size < vol->sector_size) {
- ntfs_error(vol->sb, "Cluster size (%i) is smaller than the "
- "sector size (%i). This is not supported. "
- "Sorry.", vol->cluster_size, vol->sector_size);
- return false;
- }
- clusters_per_mft_record = b->clusters_per_mft_record;
- ntfs_debug("clusters_per_mft_record = %i (0x%x)",
- clusters_per_mft_record, clusters_per_mft_record);
- if (clusters_per_mft_record > 0)
- vol->mft_record_size = vol->cluster_size <<
- (ffs(clusters_per_mft_record) - 1);
- else
- /*
- * When mft_record_size < cluster_size, clusters_per_mft_record
- * = -log2(mft_record_size) bytes. mft_record_size normaly is
- * 1024 bytes, which is encoded as 0xF6 (-10 in decimal).
- */
- vol->mft_record_size = 1 << -clusters_per_mft_record;
- vol->mft_record_size_mask = vol->mft_record_size - 1;
- vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1;
- ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size,
- vol->mft_record_size);
- ntfs_debug("vol->mft_record_size_mask = 0x%x",
- vol->mft_record_size_mask);
- ntfs_debug("vol->mft_record_size_bits = %i (0x%x)",
- vol->mft_record_size_bits, vol->mft_record_size_bits);
- /*
- * We cannot support mft record sizes above the PAGE_SIZE since
- * we store $MFT/$DATA, the table of mft records in the page cache.
- */
- if (vol->mft_record_size > PAGE_SIZE) {
- ntfs_error(vol->sb, "Mft record size (%i) exceeds the "
- "PAGE_SIZE on your system (%lu). "
- "This is not supported. Sorry.",
- vol->mft_record_size, PAGE_SIZE);
- return false;
- }
- /* We cannot support mft record sizes below the sector size. */
- if (vol->mft_record_size < vol->sector_size) {
- ntfs_error(vol->sb, "Mft record size (%i) is smaller than the "
- "sector size (%i). This is not supported. "
- "Sorry.", vol->mft_record_size,
- vol->sector_size);
- return false;
- }
- clusters_per_index_record = b->clusters_per_index_record;
- ntfs_debug("clusters_per_index_record = %i (0x%x)",
- clusters_per_index_record, clusters_per_index_record);
- if (clusters_per_index_record > 0)
- vol->index_record_size = vol->cluster_size <<
- (ffs(clusters_per_index_record) - 1);
- else
- /*
- * When index_record_size < cluster_size,
- * clusters_per_index_record = -log2(index_record_size) bytes.
- * index_record_size normaly equals 4096 bytes, which is
- * encoded as 0xF4 (-12 in decimal).
- */
- vol->index_record_size = 1 << -clusters_per_index_record;
- vol->index_record_size_mask = vol->index_record_size - 1;
- vol->index_record_size_bits = ffs(vol->index_record_size) - 1;
- ntfs_debug("vol->index_record_size = %i (0x%x)",
- vol->index_record_size, vol->index_record_size);
- ntfs_debug("vol->index_record_size_mask = 0x%x",
- vol->index_record_size_mask);
- ntfs_debug("vol->index_record_size_bits = %i (0x%x)",
- vol->index_record_size_bits,
- vol->index_record_size_bits);
- /* We cannot support index record sizes below the sector size. */
- if (vol->index_record_size < vol->sector_size) {
- ntfs_error(vol->sb, "Index record size (%i) is smaller than "
- "the sector size (%i). This is not "
- "supported. Sorry.", vol->index_record_size,
- vol->sector_size);
- return false;
- }
- /*
- * Get the size of the volume in clusters and check for 64-bit-ness.
- * Windows currently only uses 32 bits to save the clusters so we do
- * the same as it is much faster on 32-bit CPUs.
- */
- ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits;
- if ((u64)ll >= 1ULL << 32) {
- ntfs_error(vol->sb, "Cannot handle 64-bit clusters. Sorry.");
- return false;
- }
- vol->nr_clusters = ll;
- ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters);
- /*
- * On an architecture where unsigned long is 32-bits, we restrict the
- * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler
- * will hopefully optimize the whole check away.
- */
- if (sizeof(unsigned long) < 8) {
- if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) {
- ntfs_error(vol->sb, "Volume size (%lluTiB) is too "
- "large for this architecture. "
- "Maximum supported is 2TiB. Sorry.",
- (unsigned long long)ll >> (40 -
- vol->cluster_size_bits));
- return false;
- }
- }
- ll = sle64_to_cpu(b->mft_lcn);
- if (ll >= vol->nr_clusters) {
- ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of "
- "volume. Weird.", (unsigned long long)ll,
- (unsigned long long)ll);
- return false;
- }
- vol->mft_lcn = ll;
- ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn);
- ll = sle64_to_cpu(b->mftmirr_lcn);
- if (ll >= vol->nr_clusters) {
- ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end "
- "of volume. Weird.", (unsigned long long)ll,
- (unsigned long long)ll);
- return false;
- }
- vol->mftmirr_lcn = ll;
- ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn);
-#ifdef NTFS_RW
- /*
- * Work out the size of the mft mirror in number of mft records. If the
- * cluster size is less than or equal to the size taken by four mft
- * records, the mft mirror stores the first four mft records. If the
- * cluster size is bigger than the size taken by four mft records, the
- * mft mirror contains as many mft records as will fit into one
- * cluster.
- */
- if (vol->cluster_size <= (4 << vol->mft_record_size_bits))
- vol->mftmirr_size = 4;
- else
- vol->mftmirr_size = vol->cluster_size >>
- vol->mft_record_size_bits;
- ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size);
-#endif /* NTFS_RW */
- vol->serial_no = le64_to_cpu(b->volume_serial_number);
- ntfs_debug("vol->serial_no = 0x%llx",
- (unsigned long long)vol->serial_no);
- return true;
-}
-
-/**
- * ntfs_setup_allocators - initialize the cluster and mft allocators
- * @vol: volume structure for which to setup the allocators
- *
- * Setup the cluster (lcn) and mft allocators to the starting values.
- */
-static void ntfs_setup_allocators(ntfs_volume *vol)
-{
-#ifdef NTFS_RW
- LCN mft_zone_size, mft_lcn;
-#endif /* NTFS_RW */
-
- ntfs_debug("vol->mft_zone_multiplier = 0x%x",
- vol->mft_zone_multiplier);
-#ifdef NTFS_RW
- /* Determine the size of the MFT zone. */
- mft_zone_size = vol->nr_clusters;
- switch (vol->mft_zone_multiplier) { /* % of volume size in clusters */
- case 4:
- mft_zone_size >>= 1; /* 50% */
- break;
- case 3:
- mft_zone_size = (mft_zone_size +
- (mft_zone_size >> 1)) >> 2; /* 37.5% */
- break;
- case 2:
- mft_zone_size >>= 2; /* 25% */
- break;
- /* case 1: */
- default:
- mft_zone_size >>= 3; /* 12.5% */
- break;
- }
- /* Setup the mft zone. */
- vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn;
- ntfs_debug("vol->mft_zone_pos = 0x%llx",
- (unsigned long long)vol->mft_zone_pos);
- /*
- * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs
- * source) and if the actual mft_lcn is in the expected place or even
- * further to the front of the volume, extend the mft_zone to cover the
- * beginning of the volume as well. This is in order to protect the
- * area reserved for the mft bitmap as well within the mft_zone itself.
- * On non-standard volumes we do not protect it as the overhead would
- * be higher than the speed increase we would get by doing it.
- */
- mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size;
- if (mft_lcn * vol->cluster_size < 16 * 1024)
- mft_lcn = (16 * 1024 + vol->cluster_size - 1) /
- vol->cluster_size;
- if (vol->mft_zone_start <= mft_lcn)
- vol->mft_zone_start = 0;
- ntfs_debug("vol->mft_zone_start = 0x%llx",
- (unsigned long long)vol->mft_zone_start);
- /*
- * Need to cap the mft zone on non-standard volumes so that it does
- * not point outside the boundaries of the volume. We do this by
- * halving the zone size until we are inside the volume.
- */
- vol->mft_zone_end = vol->mft_lcn + mft_zone_size;
- while (vol->mft_zone_end >= vol->nr_clusters) {
- mft_zone_size >>= 1;
- vol->mft_zone_end = vol->mft_lcn + mft_zone_size;
- }
- ntfs_debug("vol->mft_zone_end = 0x%llx",
- (unsigned long long)vol->mft_zone_end);
- /*
- * Set the current position within each data zone to the start of the
- * respective zone.
- */
- vol->data1_zone_pos = vol->mft_zone_end;
- ntfs_debug("vol->data1_zone_pos = 0x%llx",
- (unsigned long long)vol->data1_zone_pos);
- vol->data2_zone_pos = 0;
- ntfs_debug("vol->data2_zone_pos = 0x%llx",
- (unsigned long long)vol->data2_zone_pos);
-
- /* Set the mft data allocation position to mft record 24. */
- vol->mft_data_pos = 24;
- ntfs_debug("vol->mft_data_pos = 0x%llx",
- (unsigned long long)vol->mft_data_pos);
-#endif /* NTFS_RW */
-}
-
-#ifdef NTFS_RW
-
-/**
- * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume
- * @vol: ntfs super block describing device whose mft mirror to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_init_mft_mirror(ntfs_volume *vol)
-{
- struct inode *tmp_ino;
- ntfs_inode *tmp_ni;
-
- ntfs_debug("Entering.");
- /* Get mft mirror inode. */
- tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr);
- if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
- if (!IS_ERR(tmp_ino))
- iput(tmp_ino);
- /* Caller will display error message. */
- return false;
- }
- /*
- * Re-initialize some specifics about $MFTMirr's inode as
- * ntfs_read_inode() will have set up the default ones.
- */
- /* Set uid and gid to root. */
- tmp_ino->i_uid = GLOBAL_ROOT_UID;
- tmp_ino->i_gid = GLOBAL_ROOT_GID;
- /* Regular file. No access for anyone. */
- tmp_ino->i_mode = S_IFREG;
- /* No VFS initiated operations allowed for $MFTMirr. */
- tmp_ino->i_op = &ntfs_empty_inode_ops;
- tmp_ino->i_fop = &ntfs_empty_file_ops;
- /* Put in our special address space operations. */
- tmp_ino->i_mapping->a_ops = &ntfs_mst_aops;
- tmp_ni = NTFS_I(tmp_ino);
- /* The $MFTMirr, like the $MFT is multi sector transfer protected. */
- NInoSetMstProtected(tmp_ni);
- NInoSetSparseDisabled(tmp_ni);
- /*
- * Set up our little cheat allowing us to reuse the async read io
- * completion handler for directories.
- */
- tmp_ni->itype.index.block_size = vol->mft_record_size;
- tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits;
- vol->mftmirr_ino = tmp_ino;
- ntfs_debug("Done.");
- return true;
-}
-
-/**
- * check_mft_mirror - compare contents of the mft mirror with the mft
- * @vol: ntfs super block describing device whose mft mirror to check
- *
- * Return 'true' on success or 'false' on error.
- *
- * Note, this function also results in the mft mirror runlist being completely
- * mapped into memory. The mft mirror write code requires this and will BUG()
- * should it find an unmapped runlist element.
- */
-static bool check_mft_mirror(ntfs_volume *vol)
-{
- struct super_block *sb = vol->sb;
- ntfs_inode *mirr_ni;
- struct page *mft_page, *mirr_page;
- u8 *kmft, *kmirr;
- runlist_element *rl, rl2[2];
- pgoff_t index;
- int mrecs_per_page, i;
-
- ntfs_debug("Entering.");
- /* Compare contents of $MFT and $MFTMirr. */
- mrecs_per_page = PAGE_SIZE / vol->mft_record_size;
- BUG_ON(!mrecs_per_page);
- BUG_ON(!vol->mftmirr_size);
- mft_page = mirr_page = NULL;
- kmft = kmirr = NULL;
- index = i = 0;
- do {
- u32 bytes;
-
- /* Switch pages if necessary. */
- if (!(i % mrecs_per_page)) {
- if (index) {
- ntfs_unmap_page(mft_page);
- ntfs_unmap_page(mirr_page);
- }
- /* Get the $MFT page. */
- mft_page = ntfs_map_page(vol->mft_ino->i_mapping,
- index);
- if (IS_ERR(mft_page)) {
- ntfs_error(sb, "Failed to read $MFT.");
- return false;
- }
- kmft = page_address(mft_page);
- /* Get the $MFTMirr page. */
- mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping,
- index);
- if (IS_ERR(mirr_page)) {
- ntfs_error(sb, "Failed to read $MFTMirr.");
- goto mft_unmap_out;
- }
- kmirr = page_address(mirr_page);
- ++index;
- }
- /* Do not check the record if it is not in use. */
- if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) {
- /* Make sure the record is ok. */
- if (ntfs_is_baad_recordp((le32*)kmft)) {
- ntfs_error(sb, "Incomplete multi sector "
- "transfer detected in mft "
- "record %i.", i);
-mm_unmap_out:
- ntfs_unmap_page(mirr_page);
-mft_unmap_out:
- ntfs_unmap_page(mft_page);
- return false;
- }
- }
- /* Do not check the mirror record if it is not in use. */
- if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) {
- if (ntfs_is_baad_recordp((le32*)kmirr)) {
- ntfs_error(sb, "Incomplete multi sector "
- "transfer detected in mft "
- "mirror record %i.", i);
- goto mm_unmap_out;
- }
- }
- /* Get the amount of data in the current record. */
- bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use);
- if (bytes < sizeof(MFT_RECORD_OLD) ||
- bytes > vol->mft_record_size ||
- ntfs_is_baad_recordp((le32*)kmft)) {
- bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use);
- if (bytes < sizeof(MFT_RECORD_OLD) ||
- bytes > vol->mft_record_size ||
- ntfs_is_baad_recordp((le32*)kmirr))
- bytes = vol->mft_record_size;
- }
- /* Compare the two records. */
- if (memcmp(kmft, kmirr, bytes)) {
- ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not "
- "match. Run ntfsfix or chkdsk.", i);
- goto mm_unmap_out;
- }
- kmft += vol->mft_record_size;
- kmirr += vol->mft_record_size;
- } while (++i < vol->mftmirr_size);
- /* Release the last pages. */
- ntfs_unmap_page(mft_page);
- ntfs_unmap_page(mirr_page);
-
- /* Construct the mft mirror runlist by hand. */
- rl2[0].vcn = 0;
- rl2[0].lcn = vol->mftmirr_lcn;
- rl2[0].length = (vol->mftmirr_size * vol->mft_record_size +
- vol->cluster_size - 1) / vol->cluster_size;
- rl2[1].vcn = rl2[0].length;
- rl2[1].lcn = LCN_ENOENT;
- rl2[1].length = 0;
- /*
- * Because we have just read all of the mft mirror, we know we have
- * mapped the full runlist for it.
- */
- mirr_ni = NTFS_I(vol->mftmirr_ino);
- down_read(&mirr_ni->runlist.lock);
- rl = mirr_ni->runlist.rl;
- /* Compare the two runlists. They must be identical. */
- i = 0;
- do {
- if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn ||
- rl2[i].length != rl[i].length) {
- ntfs_error(sb, "$MFTMirr location mismatch. "
- "Run chkdsk.");
- up_read(&mirr_ni->runlist.lock);
- return false;
- }
- } while (rl2[i++].length);
- up_read(&mirr_ni->runlist.lock);
- ntfs_debug("Done.");
- return true;
-}
-
-/**
- * load_and_check_logfile - load and check the logfile inode for a volume
- * @vol: ntfs super block describing device whose logfile to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_check_logfile(ntfs_volume *vol,
- RESTART_PAGE_HEADER **rp)
-{
- struct inode *tmp_ino;
-
- ntfs_debug("Entering.");
- tmp_ino = ntfs_iget(vol->sb, FILE_LogFile);
- if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
- if (!IS_ERR(tmp_ino))
- iput(tmp_ino);
- /* Caller will display error message. */
- return false;
- }
- if (!ntfs_check_logfile(tmp_ino, rp)) {
- iput(tmp_ino);
- /* ntfs_check_logfile() will have displayed error output. */
- return false;
- }
- NInoSetSparseDisabled(NTFS_I(tmp_ino));
- vol->logfile_ino = tmp_ino;
- ntfs_debug("Done.");
- return true;
-}
-
-#define NTFS_HIBERFIL_HEADER_SIZE 4096
-
-/**
- * check_windows_hibernation_status - check if Windows is suspended on a volume
- * @vol: ntfs super block of device to check
- *
- * Check if Windows is hibernated on the ntfs volume @vol. This is done by
- * looking for the file hiberfil.sys in the root directory of the volume. If
- * the file is not present Windows is definitely not suspended.
- *
- * If hiberfil.sys exists and is less than 4kiB in size it means Windows is
- * definitely suspended (this volume is not the system volume). Caveat: on a
- * system with many volumes it is possible that the < 4kiB check is bogus but
- * for now this should do fine.
- *
- * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the
- * hiberfil header (which is the first 4kiB). If this begins with "hibr",
- * Windows is definitely suspended. If it is completely full of zeroes,
- * Windows is definitely not hibernated. Any other case is treated as if
- * Windows is suspended. This caters for the above mentioned caveat of a
- * system with many volumes where no "hibr" magic would be present and there is
- * no zero header.
- *
- * Return 0 if Windows is not hibernated on the volume, >0 if Windows is
- * hibernated on the volume, and -errno on error.
- */
-static int check_windows_hibernation_status(ntfs_volume *vol)
-{
- MFT_REF mref;
- struct inode *vi;
- struct page *page;
- u32 *kaddr, *kend;
- ntfs_name *name = NULL;
- int ret = 1;
- static const ntfschar hiberfil[13] = { cpu_to_le16('h'),
- cpu_to_le16('i'), cpu_to_le16('b'),
- cpu_to_le16('e'), cpu_to_le16('r'),
- cpu_to_le16('f'), cpu_to_le16('i'),
- cpu_to_le16('l'), cpu_to_le16('.'),
- cpu_to_le16('s'), cpu_to_le16('y'),
- cpu_to_le16('s'), 0 };
-
- ntfs_debug("Entering.");
- /*
- * Find the inode number for the hibernation file by looking up the
- * filename hiberfil.sys in the root directory.
- */
- inode_lock(vol->root_ino);
- mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12,
- &name);
- inode_unlock(vol->root_ino);
- if (IS_ERR_MREF(mref)) {
- ret = MREF_ERR(mref);
- /* If the file does not exist, Windows is not hibernated. */
- if (ret == -ENOENT) {
- ntfs_debug("hiberfil.sys not present. Windows is not "
- "hibernated on the volume.");
- return 0;
- }
- /* A real error occurred. */
- ntfs_error(vol->sb, "Failed to find inode number for "
- "hiberfil.sys.");
- return ret;
- }
- /* We do not care for the type of match that was found. */
- kfree(name);
- /* Get the inode. */
- vi = ntfs_iget(vol->sb, MREF(mref));
- if (IS_ERR(vi) || is_bad_inode(vi)) {
- if (!IS_ERR(vi))
- iput(vi);
- ntfs_error(vol->sb, "Failed to load hiberfil.sys.");
- return IS_ERR(vi) ? PTR_ERR(vi) : -EIO;
- }
- if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) {
- ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx). "
- "Windows is hibernated on the volume. This "
- "is not the system volume.", i_size_read(vi));
- goto iput_out;
- }
- page = ntfs_map_page(vi->i_mapping, 0);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to read from hiberfil.sys.");
- ret = PTR_ERR(page);
- goto iput_out;
- }
- kaddr = (u32*)page_address(page);
- if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) {
- ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is "
- "hibernated on the volume. This is the "
- "system volume.");
- goto unm_iput_out;
- }
- kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr);
- do {
- if (unlikely(*kaddr)) {
- ntfs_debug("hiberfil.sys is larger than 4kiB "
- "(0x%llx), does not contain the "
- "\"hibr\" magic, and does not have a "
- "zero header. Windows is hibernated "
- "on the volume. This is not the "
- "system volume.", i_size_read(vi));
- goto unm_iput_out;
- }
- } while (++kaddr < kend);
- ntfs_debug("hiberfil.sys contains a zero header. Windows is not "
- "hibernated on the volume. This is the system "
- "volume.");
- ret = 0;
-unm_iput_out:
- ntfs_unmap_page(page);
-iput_out:
- iput(vi);
- return ret;
-}
-
-/**
- * load_and_init_quota - load and setup the quota file for a volume if present
- * @vol: ntfs super block describing device whose quota file to load
- *
- * Return 'true' on success or 'false' on error. If $Quota is not present, we
- * leave vol->quota_ino as NULL and return success.
- */
-static bool load_and_init_quota(ntfs_volume *vol)
-{
- MFT_REF mref;
- struct inode *tmp_ino;
- ntfs_name *name = NULL;
- static const ntfschar Quota[7] = { cpu_to_le16('$'),
- cpu_to_le16('Q'), cpu_to_le16('u'),
- cpu_to_le16('o'), cpu_to_le16('t'),
- cpu_to_le16('a'), 0 };
- static ntfschar Q[3] = { cpu_to_le16('$'),
- cpu_to_le16('Q'), 0 };
-
- ntfs_debug("Entering.");
- /*
- * Find the inode number for the quota file by looking up the filename
- * $Quota in the extended system files directory $Extend.
- */
- inode_lock(vol->extend_ino);
- mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6,
- &name);
- inode_unlock(vol->extend_ino);
- if (IS_ERR_MREF(mref)) {
- /*
- * If the file does not exist, quotas are disabled and have
- * never been enabled on this volume, just return success.
- */
- if (MREF_ERR(mref) == -ENOENT) {
- ntfs_debug("$Quota not present. Volume does not have "
- "quotas enabled.");
- /*
- * No need to try to set quotas out of date if they are
- * not enabled.
- */
- NVolSetQuotaOutOfDate(vol);
- return true;
- }
- /* A real error occurred. */
- ntfs_error(vol->sb, "Failed to find inode number for $Quota.");
- return false;
- }
- /* We do not care for the type of match that was found. */
- kfree(name);
- /* Get the inode. */
- tmp_ino = ntfs_iget(vol->sb, MREF(mref));
- if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
- if (!IS_ERR(tmp_ino))
- iput(tmp_ino);
- ntfs_error(vol->sb, "Failed to load $Quota.");
- return false;
- }
- vol->quota_ino = tmp_ino;
- /* Get the $Q index allocation attribute. */
- tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2);
- if (IS_ERR(tmp_ino)) {
- ntfs_error(vol->sb, "Failed to load $Quota/$Q index.");
- return false;
- }
- vol->quota_q_ino = tmp_ino;
- ntfs_debug("Done.");
- return true;
-}
-
-/**
- * load_and_init_usnjrnl - load and setup the transaction log if present
- * @vol: ntfs super block describing device whose usnjrnl file to load
- *
- * Return 'true' on success or 'false' on error.
- *
- * If $UsnJrnl is not present or in the process of being disabled, we set
- * NVolUsnJrnlStamped() and return success.
- *
- * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn,
- * i.e. transaction logging has only just been enabled or the journal has been
- * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped()
- * and return success.
- */
-static bool load_and_init_usnjrnl(ntfs_volume *vol)
-{
- MFT_REF mref;
- struct inode *tmp_ino;
- ntfs_inode *tmp_ni;
- struct page *page;
- ntfs_name *name = NULL;
- USN_HEADER *uh;
- static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'),
- cpu_to_le16('U'), cpu_to_le16('s'),
- cpu_to_le16('n'), cpu_to_le16('J'),
- cpu_to_le16('r'), cpu_to_le16('n'),
- cpu_to_le16('l'), 0 };
- static ntfschar Max[5] = { cpu_to_le16('$'),
- cpu_to_le16('M'), cpu_to_le16('a'),
- cpu_to_le16('x'), 0 };
- static ntfschar J[3] = { cpu_to_le16('$'),
- cpu_to_le16('J'), 0 };
-
- ntfs_debug("Entering.");
- /*
- * Find the inode number for the transaction log file by looking up the
- * filename $UsnJrnl in the extended system files directory $Extend.
- */
- inode_lock(vol->extend_ino);
- mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8,
- &name);
- inode_unlock(vol->extend_ino);
- if (IS_ERR_MREF(mref)) {
- /*
- * If the file does not exist, transaction logging is disabled,
- * just return success.
- */
- if (MREF_ERR(mref) == -ENOENT) {
- ntfs_debug("$UsnJrnl not present. Volume does not "
- "have transaction logging enabled.");
-not_enabled:
- /*
- * No need to try to stamp the transaction log if
- * transaction logging is not enabled.
- */
- NVolSetUsnJrnlStamped(vol);
- return true;
- }
- /* A real error occurred. */
- ntfs_error(vol->sb, "Failed to find inode number for "
- "$UsnJrnl.");
- return false;
- }
- /* We do not care for the type of match that was found. */
- kfree(name);
- /* Get the inode. */
- tmp_ino = ntfs_iget(vol->sb, MREF(mref));
- if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) {
- if (!IS_ERR(tmp_ino))
- iput(tmp_ino);
- ntfs_error(vol->sb, "Failed to load $UsnJrnl.");
- return false;
- }
- vol->usnjrnl_ino = tmp_ino;
- /*
- * If the transaction log is in the process of being deleted, we can
- * ignore it.
- */
- if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) {
- ntfs_debug("$UsnJrnl in the process of being disabled. "
- "Volume does not have transaction logging "
- "enabled.");
- goto not_enabled;
- }
- /* Get the $DATA/$Max attribute. */
- tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4);
- if (IS_ERR(tmp_ino)) {
- ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max "
- "attribute.");
- return false;
- }
- vol->usnjrnl_max_ino = tmp_ino;
- if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) {
- ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max "
- "attribute (size is 0x%llx but should be at "
- "least 0x%zx bytes).", i_size_read(tmp_ino),
- sizeof(USN_HEADER));
- return false;
- }
- /* Get the $DATA/$J attribute. */
- tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2);
- if (IS_ERR(tmp_ino)) {
- ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J "
- "attribute.");
- return false;
- }
- vol->usnjrnl_j_ino = tmp_ino;
- /* Verify $J is non-resident and sparse. */
- tmp_ni = NTFS_I(vol->usnjrnl_j_ino);
- if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) {
- ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident "
- "and/or not sparse.");
- return false;
- }
- /* Read the USN_HEADER from $DATA/$Max. */
- page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max "
- "attribute.");
- return false;
- }
- uh = (USN_HEADER*)page_address(page);
- /* Sanity check the $Max. */
- if (unlikely(sle64_to_cpu(uh->allocation_delta) >
- sle64_to_cpu(uh->maximum_size))) {
- ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds "
- "maximum size (0x%llx). $UsnJrnl is corrupt.",
- (long long)sle64_to_cpu(uh->allocation_delta),
- (long long)sle64_to_cpu(uh->maximum_size));
- ntfs_unmap_page(page);
- return false;
- }
- /*
- * If the transaction log has been stamped and nothing has been written
- * to it since, we do not need to stamp it.
- */
- if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >=
- i_size_read(vol->usnjrnl_j_ino))) {
- if (likely(sle64_to_cpu(uh->lowest_valid_usn) ==
- i_size_read(vol->usnjrnl_j_ino))) {
- ntfs_unmap_page(page);
- ntfs_debug("$UsnJrnl is enabled but nothing has been "
- "logged since it was last stamped. "
- "Treating this as if the volume does "
- "not have transaction logging "
- "enabled.");
- goto not_enabled;
- }
- ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) "
- "which is out of bounds (0x%llx). $UsnJrnl "
- "is corrupt.",
- (long long)sle64_to_cpu(uh->lowest_valid_usn),
- i_size_read(vol->usnjrnl_j_ino));
- ntfs_unmap_page(page);
- return false;
- }
- ntfs_unmap_page(page);
- ntfs_debug("Done.");
- return true;
-}
-
-/**
- * load_and_init_attrdef - load the attribute definitions table for a volume
- * @vol: ntfs super block describing device whose attrdef to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_init_attrdef(ntfs_volume *vol)
-{
- loff_t i_size;
- struct super_block *sb = vol->sb;
- struct inode *ino;
- struct page *page;
- pgoff_t index, max_index;
- unsigned int size;
-
- ntfs_debug("Entering.");
- /* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */
- ino = ntfs_iget(sb, FILE_AttrDef);
- if (IS_ERR(ino) || is_bad_inode(ino)) {
- if (!IS_ERR(ino))
- iput(ino);
- goto failed;
- }
- NInoSetSparseDisabled(NTFS_I(ino));
- /* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */
- i_size = i_size_read(ino);
- if (i_size <= 0 || i_size > 0x7fffffff)
- goto iput_failed;
- vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size);
- if (!vol->attrdef)
- goto iput_failed;
- index = 0;
- max_index = i_size >> PAGE_SHIFT;
- size = PAGE_SIZE;
- while (index < max_index) {
- /* Read the attrdef table and copy it into the linear buffer. */
-read_partial_attrdef_page:
- page = ntfs_map_page(ino->i_mapping, index);
- if (IS_ERR(page))
- goto free_iput_failed;
- memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT),
- page_address(page), size);
- ntfs_unmap_page(page);
- }
- if (size == PAGE_SIZE) {
- size = i_size & ~PAGE_MASK;
- if (size)
- goto read_partial_attrdef_page;
- }
- vol->attrdef_size = i_size;
- ntfs_debug("Read %llu bytes from $AttrDef.", i_size);
- iput(ino);
- return true;
-free_iput_failed:
- ntfs_free(vol->attrdef);
- vol->attrdef = NULL;
-iput_failed:
- iput(ino);
-failed:
- ntfs_error(sb, "Failed to initialize attribute definition table.");
- return false;
-}
-
-#endif /* NTFS_RW */
-
-/**
- * load_and_init_upcase - load the upcase table for an ntfs volume
- * @vol: ntfs super block describing device whose upcase to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_init_upcase(ntfs_volume *vol)
-{
- loff_t i_size;
- struct super_block *sb = vol->sb;
- struct inode *ino;
- struct page *page;
- pgoff_t index, max_index;
- unsigned int size;
- int i, max;
-
- ntfs_debug("Entering.");
- /* Read upcase table and setup vol->upcase and vol->upcase_len. */
- ino = ntfs_iget(sb, FILE_UpCase);
- if (IS_ERR(ino) || is_bad_inode(ino)) {
- if (!IS_ERR(ino))
- iput(ino);
- goto upcase_failed;
- }
- /*
- * The upcase size must not be above 64k Unicode characters, must not
- * be zero and must be a multiple of sizeof(ntfschar).
- */
- i_size = i_size_read(ino);
- if (!i_size || i_size & (sizeof(ntfschar) - 1) ||
- i_size > 64ULL * 1024 * sizeof(ntfschar))
- goto iput_upcase_failed;
- vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size);
- if (!vol->upcase)
- goto iput_upcase_failed;
- index = 0;
- max_index = i_size >> PAGE_SHIFT;
- size = PAGE_SIZE;
- while (index < max_index) {
- /* Read the upcase table and copy it into the linear buffer. */
-read_partial_upcase_page:
- page = ntfs_map_page(ino->i_mapping, index);
- if (IS_ERR(page))
- goto iput_upcase_failed;
- memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT),
- page_address(page), size);
- ntfs_unmap_page(page);
- }
- if (size == PAGE_SIZE) {
- size = i_size & ~PAGE_MASK;
- if (size)
- goto read_partial_upcase_page;
- }
- vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS;
- ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).",
- i_size, 64 * 1024 * sizeof(ntfschar));
- iput(ino);
- mutex_lock(&ntfs_lock);
- if (!default_upcase) {
- ntfs_debug("Using volume specified $UpCase since default is "
- "not present.");
- mutex_unlock(&ntfs_lock);
- return true;
- }
- max = default_upcase_len;
- if (max > vol->upcase_len)
- max = vol->upcase_len;
- for (i = 0; i < max; i++)
- if (vol->upcase[i] != default_upcase[i])
- break;
- if (i == max) {
- ntfs_free(vol->upcase);
- vol->upcase = default_upcase;
- vol->upcase_len = max;
- ntfs_nr_upcase_users++;
- mutex_unlock(&ntfs_lock);
- ntfs_debug("Volume specified $UpCase matches default. Using "
- "default.");
- return true;
- }
- mutex_unlock(&ntfs_lock);
- ntfs_debug("Using volume specified $UpCase since it does not match "
- "the default.");
- return true;
-iput_upcase_failed:
- iput(ino);
- ntfs_free(vol->upcase);
- vol->upcase = NULL;
-upcase_failed:
- mutex_lock(&ntfs_lock);
- if (default_upcase) {
- vol->upcase = default_upcase;
- vol->upcase_len = default_upcase_len;
- ntfs_nr_upcase_users++;
- mutex_unlock(&ntfs_lock);
- ntfs_error(sb, "Failed to load $UpCase from the volume. Using "
- "default.");
- return true;
- }
- mutex_unlock(&ntfs_lock);
- ntfs_error(sb, "Failed to initialize upcase table.");
- return false;
-}
-
-/*
- * The lcn and mft bitmap inodes are NTFS-internal inodes with
- * their own special locking rules:
- */
-static struct lock_class_key
- lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key,
- mftbmp_runlist_lock_key, mftbmp_mrec_lock_key;
-
-/**
- * load_system_files - open the system files using normal functions
- * @vol: ntfs super block describing device whose system files to load
- *
- * Open the system files with normal access functions and complete setting up
- * the ntfs super block @vol.
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_system_files(ntfs_volume *vol)
-{
- struct super_block *sb = vol->sb;
- MFT_RECORD *m;
- VOLUME_INFORMATION *vi;
- ntfs_attr_search_ctx *ctx;
-#ifdef NTFS_RW
- RESTART_PAGE_HEADER *rp;
- int err;
-#endif /* NTFS_RW */
-
- ntfs_debug("Entering.");
-#ifdef NTFS_RW
- /* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */
- if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) {
- static const char *es1 = "Failed to load $MFTMirr";
- static const char *es2 = "$MFTMirr does not match $MFT";
- static const char *es3 = ". Run ntfsfix and/or chkdsk.";
-
- /* If a read-write mount, convert it to a read-only mount. */
- if (!sb_rdonly(sb)) {
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors="
- "continue nor on_errors="
- "remount-ro was specified%s",
- !vol->mftmirr_ino ? es1 : es2,
- es3);
- goto iput_mirr_err_out;
- }
- sb->s_flags |= SB_RDONLY;
- ntfs_error(sb, "%s. Mounting read-only%s",
- !vol->mftmirr_ino ? es1 : es2, es3);
- } else
- ntfs_warning(sb, "%s. Will not be able to remount "
- "read-write%s",
- !vol->mftmirr_ino ? es1 : es2, es3);
- /* This will prevent a read-write remount. */
- NVolSetErrors(vol);
- }
-#endif /* NTFS_RW */
- /* Get mft bitmap attribute inode. */
- vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0);
- if (IS_ERR(vol->mftbmp_ino)) {
- ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute.");
- goto iput_mirr_err_out;
- }
- lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock,
- &mftbmp_runlist_lock_key);
- lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock,
- &mftbmp_mrec_lock_key);
- /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */
- if (!load_and_init_upcase(vol))
- goto iput_mftbmp_err_out;
-#ifdef NTFS_RW
- /*
- * Read attribute definitions table and setup @vol->attrdef and
- * @vol->attrdef_size.
- */
- if (!load_and_init_attrdef(vol))
- goto iput_upcase_err_out;
-#endif /* NTFS_RW */
- /*
- * Get the cluster allocation bitmap inode and verify the size, no
- * need for any locking at this stage as we are already running
- * exclusively as we are mount in progress task.
- */
- vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap);
- if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) {
- if (!IS_ERR(vol->lcnbmp_ino))
- iput(vol->lcnbmp_ino);
- goto bitmap_failed;
- }
- lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock,
- &lcnbmp_runlist_lock_key);
- lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock,
- &lcnbmp_mrec_lock_key);
-
- NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino));
- if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) {
- iput(vol->lcnbmp_ino);
-bitmap_failed:
- ntfs_error(sb, "Failed to load $Bitmap.");
- goto iput_attrdef_err_out;
- }
- /*
- * Get the volume inode and setup our cache of the volume flags and
- * version.
- */
- vol->vol_ino = ntfs_iget(sb, FILE_Volume);
- if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) {
- if (!IS_ERR(vol->vol_ino))
- iput(vol->vol_ino);
-volume_failed:
- ntfs_error(sb, "Failed to load $Volume.");
- goto iput_lcnbmp_err_out;
- }
- m = map_mft_record(NTFS_I(vol->vol_ino));
- if (IS_ERR(m)) {
-iput_volume_failed:
- iput(vol->vol_ino);
- goto volume_failed;
- }
- if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) {
- ntfs_error(sb, "Failed to get attribute search context.");
- goto get_ctx_vol_failed;
- }
- if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
- ctx) || ctx->attr->non_resident || ctx->attr->flags) {
-err_put_vol:
- ntfs_attr_put_search_ctx(ctx);
-get_ctx_vol_failed:
- unmap_mft_record(NTFS_I(vol->vol_ino));
- goto iput_volume_failed;
- }
- vi = (VOLUME_INFORMATION*)((char*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset));
- /* Some bounds checks. */
- if ((u8*)vi < (u8*)ctx->attr || (u8*)vi +
- le32_to_cpu(ctx->attr->data.resident.value_length) >
- (u8*)ctx->attr + le32_to_cpu(ctx->attr->length))
- goto err_put_vol;
- /* Copy the volume flags and version to the ntfs_volume structure. */
- vol->vol_flags = vi->flags;
- vol->major_ver = vi->major_ver;
- vol->minor_ver = vi->minor_ver;
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(NTFS_I(vol->vol_ino));
- pr_info("volume version %i.%i.\n", vol->major_ver,
- vol->minor_ver);
- if (vol->major_ver < 3 && NVolSparseEnabled(vol)) {
- ntfs_warning(vol->sb, "Disabling sparse support due to NTFS "
- "volume version %i.%i (need at least version "
- "3.0).", vol->major_ver, vol->minor_ver);
- NVolClearSparseEnabled(vol);
- }
-#ifdef NTFS_RW
- /* Make sure that no unsupported volume flags are set. */
- if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
- static const char *es1a = "Volume is dirty";
- static const char *es1b = "Volume has been modified by chkdsk";
- static const char *es1c = "Volume has unsupported flags set";
- static const char *es2a = ". Run chkdsk and mount in Windows.";
- static const char *es2b = ". Mount in Windows.";
- const char *es1, *es2;
-
- es2 = es2a;
- if (vol->vol_flags & VOLUME_IS_DIRTY)
- es1 = es1a;
- else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
- es1 = es1b;
- es2 = es2b;
- } else {
- es1 = es1c;
- ntfs_warning(sb, "Unsupported volume flags 0x%x "
- "encountered.",
- (unsigned)le16_to_cpu(vol->vol_flags));
- }
- /* If a read-write mount, convert it to a read-only mount. */
- if (!sb_rdonly(sb)) {
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors="
- "continue nor on_errors="
- "remount-ro was specified%s",
- es1, es2);
- goto iput_vol_err_out;
- }
- sb->s_flags |= SB_RDONLY;
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- } else
- ntfs_warning(sb, "%s. Will not be able to remount "
- "read-write%s", es1, es2);
- /*
- * Do not set NVolErrors() because ntfs_remount() re-checks the
- * flags which we need to do in case any flags have changed.
- */
- }
- /*
- * Get the inode for the logfile, check it and determine if the volume
- * was shutdown cleanly.
- */
- rp = NULL;
- if (!load_and_check_logfile(vol, &rp) ||
- !ntfs_is_logfile_clean(vol->logfile_ino, rp)) {
- static const char *es1a = "Failed to load $LogFile";
- static const char *es1b = "$LogFile is not clean";
- static const char *es2 = ". Mount in Windows.";
- const char *es1;
-
- es1 = !vol->logfile_ino ? es1a : es1b;
- /* If a read-write mount, convert it to a read-only mount. */
- if (!sb_rdonly(sb)) {
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors="
- "continue nor on_errors="
- "remount-ro was specified%s",
- es1, es2);
- if (vol->logfile_ino) {
- BUG_ON(!rp);
- ntfs_free(rp);
- }
- goto iput_logfile_err_out;
- }
- sb->s_flags |= SB_RDONLY;
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- } else
- ntfs_warning(sb, "%s. Will not be able to remount "
- "read-write%s", es1, es2);
- /* This will prevent a read-write remount. */
- NVolSetErrors(vol);
- }
- ntfs_free(rp);
-#endif /* NTFS_RW */
- /* Get the root directory inode so we can do path lookups. */
- vol->root_ino = ntfs_iget(sb, FILE_root);
- if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) {
- if (!IS_ERR(vol->root_ino))
- iput(vol->root_ino);
- ntfs_error(sb, "Failed to load root directory.");
- goto iput_logfile_err_out;
- }
-#ifdef NTFS_RW
- /*
- * Check if Windows is suspended to disk on the target volume. If it
- * is hibernated, we must not write *anything* to the disk so set
- * NVolErrors() without setting the dirty volume flag and mount
- * read-only. This will prevent read-write remounting and it will also
- * prevent all writes.
- */
- err = check_windows_hibernation_status(vol);
- if (unlikely(err)) {
- static const char *es1a = "Failed to determine if Windows is "
- "hibernated";
- static const char *es1b = "Windows is hibernated";
- static const char *es2 = ". Run chkdsk.";
- const char *es1;
-
- es1 = err < 0 ? es1a : es1b;
- /* If a read-write mount, convert it to a read-only mount. */
- if (!sb_rdonly(sb)) {
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors="
- "continue nor on_errors="
- "remount-ro was specified%s",
- es1, es2);
- goto iput_root_err_out;
- }
- sb->s_flags |= SB_RDONLY;
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- } else
- ntfs_warning(sb, "%s. Will not be able to remount "
- "read-write%s", es1, es2);
- /* This will prevent a read-write remount. */
- NVolSetErrors(vol);
- }
- /* If (still) a read-write mount, mark the volume dirty. */
- if (!sb_rdonly(sb) && ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
- static const char *es1 = "Failed to set dirty bit in volume "
- "information flags";
- static const char *es2 = ". Run chkdsk.";
-
- /* Convert to a read-only mount. */
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors=continue nor "
- "on_errors=remount-ro was specified%s",
- es1, es2);
- goto iput_root_err_out;
- }
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- sb->s_flags |= SB_RDONLY;
- /*
- * Do not set NVolErrors() because ntfs_remount() might manage
- * to set the dirty flag in which case all would be well.
- */
- }
-#if 0
- // TODO: Enable this code once we start modifying anything that is
- // different between NTFS 1.2 and 3.x...
- /*
- * If (still) a read-write mount, set the NT4 compatibility flag on
- * newer NTFS version volumes.
- */
- if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) &&
- ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
- static const char *es1 = "Failed to set NT4 compatibility flag";
- static const char *es2 = ". Run chkdsk.";
-
- /* Convert to a read-only mount. */
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors=continue nor "
- "on_errors=remount-ro was specified%s",
- es1, es2);
- goto iput_root_err_out;
- }
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- sb->s_flags |= SB_RDONLY;
- NVolSetErrors(vol);
- }
-#endif
- /* If (still) a read-write mount, empty the logfile. */
- if (!sb_rdonly(sb) && !ntfs_empty_logfile(vol->logfile_ino)) {
- static const char *es1 = "Failed to empty $LogFile";
- static const char *es2 = ". Mount in Windows.";
-
- /* Convert to a read-only mount. */
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors=continue nor "
- "on_errors=remount-ro was specified%s",
- es1, es2);
- goto iput_root_err_out;
- }
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- sb->s_flags |= SB_RDONLY;
- NVolSetErrors(vol);
- }
-#endif /* NTFS_RW */
- /* If on NTFS versions before 3.0, we are done. */
- if (unlikely(vol->major_ver < 3))
- return true;
- /* NTFS 3.0+ specific initialization. */
- /* Get the security descriptors inode. */
- vol->secure_ino = ntfs_iget(sb, FILE_Secure);
- if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) {
- if (!IS_ERR(vol->secure_ino))
- iput(vol->secure_ino);
- ntfs_error(sb, "Failed to load $Secure.");
- goto iput_root_err_out;
- }
- // TODO: Initialize security.
- /* Get the extended system files' directory inode. */
- vol->extend_ino = ntfs_iget(sb, FILE_Extend);
- if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) ||
- !S_ISDIR(vol->extend_ino->i_mode)) {
- if (!IS_ERR(vol->extend_ino))
- iput(vol->extend_ino);
- ntfs_error(sb, "Failed to load $Extend.");
- goto iput_sec_err_out;
- }
-#ifdef NTFS_RW
- /* Find the quota file, load it if present, and set it up. */
- if (!load_and_init_quota(vol)) {
- static const char *es1 = "Failed to load $Quota";
- static const char *es2 = ". Run chkdsk.";
-
- /* If a read-write mount, convert it to a read-only mount. */
- if (!sb_rdonly(sb)) {
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors="
- "continue nor on_errors="
- "remount-ro was specified%s",
- es1, es2);
- goto iput_quota_err_out;
- }
- sb->s_flags |= SB_RDONLY;
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- } else
- ntfs_warning(sb, "%s. Will not be able to remount "
- "read-write%s", es1, es2);
- /* This will prevent a read-write remount. */
- NVolSetErrors(vol);
- }
- /* If (still) a read-write mount, mark the quotas out of date. */
- if (!sb_rdonly(sb) && !ntfs_mark_quotas_out_of_date(vol)) {
- static const char *es1 = "Failed to mark quotas out of date";
- static const char *es2 = ". Run chkdsk.";
-
- /* Convert to a read-only mount. */
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors=continue nor "
- "on_errors=remount-ro was specified%s",
- es1, es2);
- goto iput_quota_err_out;
- }
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- sb->s_flags |= SB_RDONLY;
- NVolSetErrors(vol);
- }
- /*
- * Find the transaction log file ($UsnJrnl), load it if present, check
- * it, and set it up.
- */
- if (!load_and_init_usnjrnl(vol)) {
- static const char *es1 = "Failed to load $UsnJrnl";
- static const char *es2 = ". Run chkdsk.";
-
- /* If a read-write mount, convert it to a read-only mount. */
- if (!sb_rdonly(sb)) {
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors="
- "continue nor on_errors="
- "remount-ro was specified%s",
- es1, es2);
- goto iput_usnjrnl_err_out;
- }
- sb->s_flags |= SB_RDONLY;
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- } else
- ntfs_warning(sb, "%s. Will not be able to remount "
- "read-write%s", es1, es2);
- /* This will prevent a read-write remount. */
- NVolSetErrors(vol);
- }
- /* If (still) a read-write mount, stamp the transaction log. */
- if (!sb_rdonly(sb) && !ntfs_stamp_usnjrnl(vol)) {
- static const char *es1 = "Failed to stamp transaction log "
- "($UsnJrnl)";
- static const char *es2 = ". Run chkdsk.";
-
- /* Convert to a read-only mount. */
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors=continue nor "
- "on_errors=remount-ro was specified%s",
- es1, es2);
- goto iput_usnjrnl_err_out;
- }
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- sb->s_flags |= SB_RDONLY;
- NVolSetErrors(vol);
- }
-#endif /* NTFS_RW */
- return true;
-#ifdef NTFS_RW
-iput_usnjrnl_err_out:
- iput(vol->usnjrnl_j_ino);
- iput(vol->usnjrnl_max_ino);
- iput(vol->usnjrnl_ino);
-iput_quota_err_out:
- iput(vol->quota_q_ino);
- iput(vol->quota_ino);
- iput(vol->extend_ino);
-#endif /* NTFS_RW */
-iput_sec_err_out:
- iput(vol->secure_ino);
-iput_root_err_out:
- iput(vol->root_ino);
-iput_logfile_err_out:
-#ifdef NTFS_RW
- iput(vol->logfile_ino);
-iput_vol_err_out:
-#endif /* NTFS_RW */
- iput(vol->vol_ino);
-iput_lcnbmp_err_out:
- iput(vol->lcnbmp_ino);
-iput_attrdef_err_out:
- vol->attrdef_size = 0;
- if (vol->attrdef) {
- ntfs_free(vol->attrdef);
- vol->attrdef = NULL;
- }
-#ifdef NTFS_RW
-iput_upcase_err_out:
-#endif /* NTFS_RW */
- vol->upcase_len = 0;
- mutex_lock(&ntfs_lock);
- if (vol->upcase == default_upcase) {
- ntfs_nr_upcase_users--;
- vol->upcase = NULL;
- }
- mutex_unlock(&ntfs_lock);
- if (vol->upcase) {
- ntfs_free(vol->upcase);
- vol->upcase = NULL;
- }
-iput_mftbmp_err_out:
- iput(vol->mftbmp_ino);
-iput_mirr_err_out:
-#ifdef NTFS_RW
- iput(vol->mftmirr_ino);
-#endif /* NTFS_RW */
- return false;
-}
-
-/**
- * ntfs_put_super - called by the vfs to unmount a volume
- * @sb: vfs superblock of volume to unmount
- *
- * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when
- * the volume is being unmounted (umount system call has been invoked) and it
- * releases all inodes and memory belonging to the NTFS specific part of the
- * super block.
- */
-static void ntfs_put_super(struct super_block *sb)
-{
- ntfs_volume *vol = NTFS_SB(sb);
-
- ntfs_debug("Entering.");
-
-#ifdef NTFS_RW
- /*
- * Commit all inodes while they are still open in case some of them
- * cause others to be dirtied.
- */
- ntfs_commit_inode(vol->vol_ino);
-
- /* NTFS 3.0+ specific. */
- if (vol->major_ver >= 3) {
- if (vol->usnjrnl_j_ino)
- ntfs_commit_inode(vol->usnjrnl_j_ino);
- if (vol->usnjrnl_max_ino)
- ntfs_commit_inode(vol->usnjrnl_max_ino);
- if (vol->usnjrnl_ino)
- ntfs_commit_inode(vol->usnjrnl_ino);
- if (vol->quota_q_ino)
- ntfs_commit_inode(vol->quota_q_ino);
- if (vol->quota_ino)
- ntfs_commit_inode(vol->quota_ino);
- if (vol->extend_ino)
- ntfs_commit_inode(vol->extend_ino);
- if (vol->secure_ino)
- ntfs_commit_inode(vol->secure_ino);
- }
-
- ntfs_commit_inode(vol->root_ino);
-
- down_write(&vol->lcnbmp_lock);
- ntfs_commit_inode(vol->lcnbmp_ino);
- up_write(&vol->lcnbmp_lock);
-
- down_write(&vol->mftbmp_lock);
- ntfs_commit_inode(vol->mftbmp_ino);
- up_write(&vol->mftbmp_lock);
-
- if (vol->logfile_ino)
- ntfs_commit_inode(vol->logfile_ino);
-
- if (vol->mftmirr_ino)
- ntfs_commit_inode(vol->mftmirr_ino);
- ntfs_commit_inode(vol->mft_ino);
-
- /*
- * If a read-write mount and no volume errors have occurred, mark the
- * volume clean. Also, re-commit all affected inodes.
- */
- if (!sb_rdonly(sb)) {
- if (!NVolErrors(vol)) {
- if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
- ntfs_warning(sb, "Failed to clear dirty bit "
- "in volume information "
- "flags. Run chkdsk.");
- ntfs_commit_inode(vol->vol_ino);
- ntfs_commit_inode(vol->root_ino);
- if (vol->mftmirr_ino)
- ntfs_commit_inode(vol->mftmirr_ino);
- ntfs_commit_inode(vol->mft_ino);
- } else {
- ntfs_warning(sb, "Volume has errors. Leaving volume "
- "marked dirty. Run chkdsk.");
- }
- }
-#endif /* NTFS_RW */
-
- iput(vol->vol_ino);
- vol->vol_ino = NULL;
-
- /* NTFS 3.0+ specific clean up. */
- if (vol->major_ver >= 3) {
-#ifdef NTFS_RW
- if (vol->usnjrnl_j_ino) {
- iput(vol->usnjrnl_j_ino);
- vol->usnjrnl_j_ino = NULL;
- }
- if (vol->usnjrnl_max_ino) {
- iput(vol->usnjrnl_max_ino);
- vol->usnjrnl_max_ino = NULL;
- }
- if (vol->usnjrnl_ino) {
- iput(vol->usnjrnl_ino);
- vol->usnjrnl_ino = NULL;
- }
- if (vol->quota_q_ino) {
- iput(vol->quota_q_ino);
- vol->quota_q_ino = NULL;
- }
- if (vol->quota_ino) {
- iput(vol->quota_ino);
- vol->quota_ino = NULL;
- }
-#endif /* NTFS_RW */
- if (vol->extend_ino) {
- iput(vol->extend_ino);
- vol->extend_ino = NULL;
- }
- if (vol->secure_ino) {
- iput(vol->secure_ino);
- vol->secure_ino = NULL;
- }
- }
-
- iput(vol->root_ino);
- vol->root_ino = NULL;
-
- down_write(&vol->lcnbmp_lock);
- iput(vol->lcnbmp_ino);
- vol->lcnbmp_ino = NULL;
- up_write(&vol->lcnbmp_lock);
-
- down_write(&vol->mftbmp_lock);
- iput(vol->mftbmp_ino);
- vol->mftbmp_ino = NULL;
- up_write(&vol->mftbmp_lock);
-
-#ifdef NTFS_RW
- if (vol->logfile_ino) {
- iput(vol->logfile_ino);
- vol->logfile_ino = NULL;
- }
- if (vol->mftmirr_ino) {
- /* Re-commit the mft mirror and mft just in case. */
- ntfs_commit_inode(vol->mftmirr_ino);
- ntfs_commit_inode(vol->mft_ino);
- iput(vol->mftmirr_ino);
- vol->mftmirr_ino = NULL;
- }
- /*
- * We should have no dirty inodes left, due to
- * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
- * the underlying mft records are written out and cleaned.
- */
- ntfs_commit_inode(vol->mft_ino);
- write_inode_now(vol->mft_ino, 1);
-#endif /* NTFS_RW */
-
- iput(vol->mft_ino);
- vol->mft_ino = NULL;
-
- /* Throw away the table of attribute definitions. */
- vol->attrdef_size = 0;
- if (vol->attrdef) {
- ntfs_free(vol->attrdef);
- vol->attrdef = NULL;
- }
- vol->upcase_len = 0;
- /*
- * Destroy the global default upcase table if necessary. Also decrease
- * the number of upcase users if we are a user.
- */
- mutex_lock(&ntfs_lock);
- if (vol->upcase == default_upcase) {
- ntfs_nr_upcase_users--;
- vol->upcase = NULL;
- }
- if (!ntfs_nr_upcase_users && default_upcase) {
- ntfs_free(default_upcase);
- default_upcase = NULL;
- }
- if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
- free_compression_buffers();
- mutex_unlock(&ntfs_lock);
- if (vol->upcase) {
- ntfs_free(vol->upcase);
- vol->upcase = NULL;
- }
-
- unload_nls(vol->nls_map);
-
- sb->s_fs_info = NULL;
- kfree(vol);
-}
-
-/**
- * get_nr_free_clusters - return the number of free clusters on a volume
- * @vol: ntfs volume for which to obtain free cluster count
- *
- * Calculate the number of free clusters on the mounted NTFS volume @vol. We
- * actually calculate the number of clusters in use instead because this
- * allows us to not care about partial pages as these will be just zero filled
- * and hence not be counted as allocated clusters.
- *
- * The only particularity is that clusters beyond the end of the logical ntfs
- * volume will be marked as allocated to prevent errors which means we have to
- * discount those at the end. This is important as the cluster bitmap always
- * has a size in multiples of 8 bytes, i.e. up to 63 clusters could be outside
- * the logical volume and marked in use when they are not as they do not exist.
- *
- * If any pages cannot be read we assume all clusters in the erroring pages are
- * in use. This means we return an underestimate on errors which is better than
- * an overestimate.
- */
-static s64 get_nr_free_clusters(ntfs_volume *vol)
-{
- s64 nr_free = vol->nr_clusters;
- struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
- struct page *page;
- pgoff_t index, max_index;
-
- ntfs_debug("Entering.");
- /* Serialize accesses to the cluster bitmap. */
- down_read(&vol->lcnbmp_lock);
- /*
- * Convert the number of bits into bytes rounded up, then convert into
- * multiples of PAGE_SIZE, rounding up so that if we have one
- * full and one partial page max_index = 2.
- */
- max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_SIZE - 1) >>
- PAGE_SHIFT;
- /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */
- ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
- max_index, PAGE_SIZE / 4);
- for (index = 0; index < max_index; index++) {
- unsigned long *kaddr;
-
- /*
- * Read the page from page cache, getting it from backing store
- * if necessary, and increment the use count.
- */
- page = read_mapping_page(mapping, index, NULL);
- /* Ignore pages which errored synchronously. */
- if (IS_ERR(page)) {
- ntfs_debug("read_mapping_page() error. Skipping "
- "page (index 0x%lx).", index);
- nr_free -= PAGE_SIZE * 8;
- continue;
- }
- kaddr = kmap_atomic(page);
- /*
- * Subtract the number of set bits. If this
- * is the last page and it is partial we don't really care as
- * it just means we do a little extra work but it won't affect
- * the result as all out of range bytes are set to zero by
- * ntfs_readpage().
- */
- nr_free -= bitmap_weight(kaddr,
- PAGE_SIZE * BITS_PER_BYTE);
- kunmap_atomic(kaddr);
- put_page(page);
- }
- ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1);
- /*
- * Fixup for eventual bits outside logical ntfs volume (see function
- * description above).
- */
- if (vol->nr_clusters & 63)
- nr_free += 64 - (vol->nr_clusters & 63);
- up_read(&vol->lcnbmp_lock);
- /* If errors occurred we may well have gone below zero, fix this. */
- if (nr_free < 0)
- nr_free = 0;
- ntfs_debug("Exiting.");
- return nr_free;
-}
-
-/**
- * __get_nr_free_mft_records - return the number of free inodes on a volume
- * @vol: ntfs volume for which to obtain free inode count
- * @nr_free: number of mft records in filesystem
- * @max_index: maximum number of pages containing set bits
- *
- * Calculate the number of free mft records (inodes) on the mounted NTFS
- * volume @vol. We actually calculate the number of mft records in use instead
- * because this allows us to not care about partial pages as these will be just
- * zero filled and hence not be counted as allocated mft record.
- *
- * If any pages cannot be read we assume all mft records in the erroring pages
- * are in use. This means we return an underestimate on errors which is better
- * than an overestimate.
- *
- * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing.
- */
-static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
- s64 nr_free, const pgoff_t max_index)
-{
- struct address_space *mapping = vol->mftbmp_ino->i_mapping;
- struct page *page;
- pgoff_t index;
-
- ntfs_debug("Entering.");
- /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */
- ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
- "0x%lx.", max_index, PAGE_SIZE / 4);
- for (index = 0; index < max_index; index++) {
- unsigned long *kaddr;
-
- /*
- * Read the page from page cache, getting it from backing store
- * if necessary, and increment the use count.
- */
- page = read_mapping_page(mapping, index, NULL);
- /* Ignore pages which errored synchronously. */
- if (IS_ERR(page)) {
- ntfs_debug("read_mapping_page() error. Skipping "
- "page (index 0x%lx).", index);
- nr_free -= PAGE_SIZE * 8;
- continue;
- }
- kaddr = kmap_atomic(page);
- /*
- * Subtract the number of set bits. If this
- * is the last page and it is partial we don't really care as
- * it just means we do a little extra work but it won't affect
- * the result as all out of range bytes are set to zero by
- * ntfs_readpage().
- */
- nr_free -= bitmap_weight(kaddr,
- PAGE_SIZE * BITS_PER_BYTE);
- kunmap_atomic(kaddr);
- put_page(page);
- }
- ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.",
- index - 1);
- /* If errors occurred we may well have gone below zero, fix this. */
- if (nr_free < 0)
- nr_free = 0;
- ntfs_debug("Exiting.");
- return nr_free;
-}
-
-/**
- * ntfs_statfs - return information about mounted NTFS volume
- * @dentry: dentry from mounted volume
- * @sfs: statfs structure in which to return the information
- *
- * Return information about the mounted NTFS volume @dentry in the statfs structure
- * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is
- * called). We interpret the values to be correct of the moment in time at
- * which we are called. Most values are variable otherwise and this isn't just
- * the free values but the totals as well. For example we can increase the
- * total number of file nodes if we run out and we can keep doing this until
- * there is no more space on the volume left at all.
- *
- * Called from vfs_statfs which is used to handle the statfs, fstatfs, and
- * ustat system calls.
- *
- * Return 0 on success or -errno on error.
- */
-static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
-{
- struct super_block *sb = dentry->d_sb;
- s64 size;
- ntfs_volume *vol = NTFS_SB(sb);
- ntfs_inode *mft_ni = NTFS_I(vol->mft_ino);
- pgoff_t max_index;
- unsigned long flags;
-
- ntfs_debug("Entering.");
- /* Type of filesystem. */
- sfs->f_type = NTFS_SB_MAGIC;
- /* Optimal transfer block size. */
- sfs->f_bsize = PAGE_SIZE;
- /*
- * Total data blocks in filesystem in units of f_bsize and since
- * inodes are also stored in data blocs ($MFT is a file) this is just
- * the total clusters.
- */
- sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >>
- PAGE_SHIFT;
- /* Free data blocks in filesystem in units of f_bsize. */
- size = get_nr_free_clusters(vol) << vol->cluster_size_bits >>
- PAGE_SHIFT;
- if (size < 0LL)
- size = 0LL;
- /* Free blocks avail to non-superuser, same as above on NTFS. */
- sfs->f_bavail = sfs->f_bfree = size;
- /* Serialize accesses to the inode bitmap. */
- down_read(&vol->mftbmp_lock);
- read_lock_irqsave(&mft_ni->size_lock, flags);
- size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits;
- /*
- * Convert the maximum number of set bits into bytes rounded up, then
- * convert into multiples of PAGE_SIZE, rounding up so that if we
- * have one full and one partial page max_index = 2.
- */
- max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits)
- + 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- /* Number of inodes in filesystem (at this point in time). */
- sfs->f_files = size;
- /* Free inodes in fs (based on current total count). */
- sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index);
- up_read(&vol->mftbmp_lock);
- /*
- * File system id. This is extremely *nix flavour dependent and even
- * within Linux itself all fs do their own thing. I interpret this to
- * mean a unique id associated with the mounted fs and not the id
- * associated with the filesystem driver, the latter is already given
- * by the filesystem type in sfs->f_type. Thus we use the 64-bit
- * volume serial number splitting it into two 32-bit parts. We enter
- * the least significant 32-bits in f_fsid[0] and the most significant
- * 32-bits in f_fsid[1].
- */
- sfs->f_fsid = u64_to_fsid(vol->serial_no);
- /* Maximum length of filenames. */
- sfs->f_namelen = NTFS_MAX_NAME_LEN;
- return 0;
-}
-
-#ifdef NTFS_RW
-static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
-{
- return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
-}
-#endif
-
-/*
- * The complete super operations.
- */
-static const struct super_operations ntfs_sops = {
- .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */
- .free_inode = ntfs_free_big_inode, /* VFS: Deallocate inode. */
-#ifdef NTFS_RW
- .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to
- disk. */
-#endif /* NTFS_RW */
- .put_super = ntfs_put_super, /* Syscall: umount. */
- .statfs = ntfs_statfs, /* Syscall: statfs */
- .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */
- .evict_inode = ntfs_evict_big_inode, /* VFS: Called when an inode is
- removed from memory. */
- .show_options = ntfs_show_options, /* Show mount options in
- proc. */
-};
-
-/**
- * ntfs_fill_super - mount an ntfs filesystem
- * @sb: super block of ntfs filesystem to mount
- * @opt: string containing the mount options
- * @silent: silence error output
- *
- * ntfs_fill_super() is called by the VFS to mount the device described by @sb
- * with the mount otions in @data with the NTFS filesystem.
- *
- * If @silent is true, remain silent even if errors are detected. This is used
- * during bootup, when the kernel tries to mount the root filesystem with all
- * registered filesystems one after the other until one succeeds. This implies
- * that all filesystems except the correct one will quite correctly and
- * expectedly return an error, but nobody wants to see error messages when in
- * fact this is what is supposed to happen.
- *
- * NOTE: @sb->s_flags contains the mount options flags.
- */
-static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
-{
- ntfs_volume *vol;
- struct buffer_head *bh;
- struct inode *tmp_ino;
- int blocksize, result;
-
- /*
- * We do a pretty difficult piece of bootstrap by reading the
- * MFT (and other metadata) from disk into memory. We'll only
- * release this metadata during umount, so the locking patterns
- * observed during bootstrap do not count. So turn off the
- * observation of locking patterns (strictly for this context
- * only) while mounting NTFS. [The validator is still active
- * otherwise, even for this context: it will for example record
- * lock class registrations.]
- */
- lockdep_off();
- ntfs_debug("Entering.");
-#ifndef NTFS_RW
- sb->s_flags |= SB_RDONLY;
-#endif /* ! NTFS_RW */
- /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */
- sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS);
- vol = NTFS_SB(sb);
- if (!vol) {
- if (!silent)
- ntfs_error(sb, "Allocation of NTFS volume structure "
- "failed. Aborting mount...");
- lockdep_on();
- return -ENOMEM;
- }
- /* Initialize ntfs_volume structure. */
- *vol = (ntfs_volume) {
- .sb = sb,
- /*
- * Default is group and other don't have any access to files or
- * directories while owner has full access. Further, files by
- * default are not executable but directories are of course
- * browseable.
- */
- .fmask = 0177,
- .dmask = 0077,
- };
- init_rwsem(&vol->mftbmp_lock);
- init_rwsem(&vol->lcnbmp_lock);
-
- /* By default, enable sparse support. */
- NVolSetSparseEnabled(vol);
-
- /* Important to get the mount options dealt with now. */
- if (!parse_options(vol, (char*)opt))
- goto err_out_now;
-
- /* We support sector sizes up to the PAGE_SIZE. */
- if (bdev_logical_block_size(sb->s_bdev) > PAGE_SIZE) {
- if (!silent)
- ntfs_error(sb, "Device has unsupported sector size "
- "(%i). The maximum supported sector "
- "size on this architecture is %lu "
- "bytes.",
- bdev_logical_block_size(sb->s_bdev),
- PAGE_SIZE);
- goto err_out_now;
- }
- /*
- * Setup the device access block size to NTFS_BLOCK_SIZE or the hard
- * sector size, whichever is bigger.
- */
- blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE);
- if (blocksize < NTFS_BLOCK_SIZE) {
- if (!silent)
- ntfs_error(sb, "Unable to set device block size.");
- goto err_out_now;
- }
- BUG_ON(blocksize != sb->s_blocksize);
- ntfs_debug("Set device block size to %i bytes (block size bits %i).",
- blocksize, sb->s_blocksize_bits);
- /* Determine the size of the device in units of block_size bytes. */
- vol->nr_blocks = sb_bdev_nr_blocks(sb);
- if (!vol->nr_blocks) {
- if (!silent)
- ntfs_error(sb, "Unable to determine device size.");
- goto err_out_now;
- }
- /* Read the boot sector and return unlocked buffer head to it. */
- if (!(bh = read_ntfs_boot_sector(sb, silent))) {
- if (!silent)
- ntfs_error(sb, "Not an NTFS volume.");
- goto err_out_now;
- }
- /*
- * Extract the data from the boot sector and setup the ntfs volume
- * using it.
- */
- result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data);
- brelse(bh);
- if (!result) {
- if (!silent)
- ntfs_error(sb, "Unsupported NTFS filesystem.");
- goto err_out_now;
- }
- /*
- * If the boot sector indicates a sector size bigger than the current
- * device block size, switch the device block size to the sector size.
- * TODO: It may be possible to support this case even when the set
- * below fails, we would just be breaking up the i/o for each sector
- * into multiple blocks for i/o purposes but otherwise it should just
- * work. However it is safer to leave disabled until someone hits this
- * error message and then we can get them to try it without the setting
- * so we know for sure that it works.
- */
- if (vol->sector_size > blocksize) {
- blocksize = sb_set_blocksize(sb, vol->sector_size);
- if (blocksize != vol->sector_size) {
- if (!silent)
- ntfs_error(sb, "Unable to set device block "
- "size to sector size (%i).",
- vol->sector_size);
- goto err_out_now;
- }
- BUG_ON(blocksize != sb->s_blocksize);
- vol->nr_blocks = sb_bdev_nr_blocks(sb);
- ntfs_debug("Changed device block size to %i bytes (block size "
- "bits %i) to match volume sector size.",
- blocksize, sb->s_blocksize_bits);
- }
- /* Initialize the cluster and mft allocators. */
- ntfs_setup_allocators(vol);
- /* Setup remaining fields in the super block. */
- sb->s_magic = NTFS_SB_MAGIC;
- /*
- * Ntfs allows 63 bits for the file size, i.e. correct would be:
- * sb->s_maxbytes = ~0ULL >> 1;
- * But the kernel uses a long as the page cache page index which on
- * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel
- * defined to the maximum the page cache page index can cope with
- * without overflowing the index or to 2^63 - 1, whichever is smaller.
- */
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- /* Ntfs measures time in 100ns intervals. */
- sb->s_time_gran = 100;
- /*
- * Now load the metadata required for the page cache and our address
- * space operations to function. We do this by setting up a specialised
- * read_inode method and then just calling the normal iget() to obtain
- * the inode for $MFT which is sufficient to allow our normal inode
- * operations and associated address space operations to function.
- */
- sb->s_op = &ntfs_sops;
- tmp_ino = new_inode(sb);
- if (!tmp_ino) {
- if (!silent)
- ntfs_error(sb, "Failed to load essential metadata.");
- goto err_out_now;
- }
- tmp_ino->i_ino = FILE_MFT;
- insert_inode_hash(tmp_ino);
- if (ntfs_read_inode_mount(tmp_ino) < 0) {
- if (!silent)
- ntfs_error(sb, "Failed to load essential metadata.");
- goto iput_tmp_ino_err_out_now;
- }
- mutex_lock(&ntfs_lock);
- /*
- * The current mount is a compression user if the cluster size is
- * less than or equal 4kiB.
- */
- if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) {
- result = allocate_compression_buffers();
- if (result) {
- ntfs_error(NULL, "Failed to allocate buffers "
- "for compression engine.");
- ntfs_nr_compression_users--;
- mutex_unlock(&ntfs_lock);
- goto iput_tmp_ino_err_out_now;
- }
- }
- /*
- * Generate the global default upcase table if necessary. Also
- * temporarily increment the number of upcase users to avoid race
- * conditions with concurrent (u)mounts.
- */
- if (!default_upcase)
- default_upcase = generate_default_upcase();
- ntfs_nr_upcase_users++;
- mutex_unlock(&ntfs_lock);
- /*
- * From now on, ignore @silent parameter. If we fail below this line,
- * it will be due to a corrupt fs or a system error, so we report it.
- */
- /*
- * Open the system files with normal access functions and complete
- * setting up the ntfs super block.
- */
- if (!load_system_files(vol)) {
- ntfs_error(sb, "Failed to load system files.");
- goto unl_upcase_iput_tmp_ino_err_out_now;
- }
-
- /* We grab a reference, simulating an ntfs_iget(). */
- ihold(vol->root_ino);
- if ((sb->s_root = d_make_root(vol->root_ino))) {
- ntfs_debug("Exiting, status successful.");
- /* Release the default upcase if it has no users. */
- mutex_lock(&ntfs_lock);
- if (!--ntfs_nr_upcase_users && default_upcase) {
- ntfs_free(default_upcase);
- default_upcase = NULL;
- }
- mutex_unlock(&ntfs_lock);
- sb->s_export_op = &ntfs_export_ops;
- lockdep_on();
- return 0;
- }
- ntfs_error(sb, "Failed to allocate root directory.");
- /* Clean up after the successful load_system_files() call from above. */
- // TODO: Use ntfs_put_super() instead of repeating all this code...
- // FIXME: Should mark the volume clean as the error is most likely
- // -ENOMEM.
- iput(vol->vol_ino);
- vol->vol_ino = NULL;
- /* NTFS 3.0+ specific clean up. */
- if (vol->major_ver >= 3) {
-#ifdef NTFS_RW
- if (vol->usnjrnl_j_ino) {
- iput(vol->usnjrnl_j_ino);
- vol->usnjrnl_j_ino = NULL;
- }
- if (vol->usnjrnl_max_ino) {
- iput(vol->usnjrnl_max_ino);
- vol->usnjrnl_max_ino = NULL;
- }
- if (vol->usnjrnl_ino) {
- iput(vol->usnjrnl_ino);
- vol->usnjrnl_ino = NULL;
- }
- if (vol->quota_q_ino) {
- iput(vol->quota_q_ino);
- vol->quota_q_ino = NULL;
- }
- if (vol->quota_ino) {
- iput(vol->quota_ino);
- vol->quota_ino = NULL;
- }
-#endif /* NTFS_RW */
- if (vol->extend_ino) {
- iput(vol->extend_ino);
- vol->extend_ino = NULL;
- }
- if (vol->secure_ino) {
- iput(vol->secure_ino);
- vol->secure_ino = NULL;
- }
- }
- iput(vol->root_ino);
- vol->root_ino = NULL;
- iput(vol->lcnbmp_ino);
- vol->lcnbmp_ino = NULL;
- iput(vol->mftbmp_ino);
- vol->mftbmp_ino = NULL;
-#ifdef NTFS_RW
- if (vol->logfile_ino) {
- iput(vol->logfile_ino);
- vol->logfile_ino = NULL;
- }
- if (vol->mftmirr_ino) {
- iput(vol->mftmirr_ino);
- vol->mftmirr_ino = NULL;
- }
-#endif /* NTFS_RW */
- /* Throw away the table of attribute definitions. */
- vol->attrdef_size = 0;
- if (vol->attrdef) {
- ntfs_free(vol->attrdef);
- vol->attrdef = NULL;
- }
- vol->upcase_len = 0;
- mutex_lock(&ntfs_lock);
- if (vol->upcase == default_upcase) {
- ntfs_nr_upcase_users--;
- vol->upcase = NULL;
- }
- mutex_unlock(&ntfs_lock);
- if (vol->upcase) {
- ntfs_free(vol->upcase);
- vol->upcase = NULL;
- }
- if (vol->nls_map) {
- unload_nls(vol->nls_map);
- vol->nls_map = NULL;
- }
- /* Error exit code path. */
-unl_upcase_iput_tmp_ino_err_out_now:
- /*
- * Decrease the number of upcase users and destroy the global default
- * upcase table if necessary.
- */
- mutex_lock(&ntfs_lock);
- if (!--ntfs_nr_upcase_users && default_upcase) {
- ntfs_free(default_upcase);
- default_upcase = NULL;
- }
- if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
- free_compression_buffers();
- mutex_unlock(&ntfs_lock);
-iput_tmp_ino_err_out_now:
- iput(tmp_ino);
- if (vol->mft_ino && vol->mft_ino != tmp_ino)
- iput(vol->mft_ino);
- vol->mft_ino = NULL;
- /* Errors at this stage are irrelevant. */
-err_out_now:
- sb->s_fs_info = NULL;
- kfree(vol);
- ntfs_debug("Failed, returning -EINVAL.");
- lockdep_on();
- return -EINVAL;
-}
-
-/*
- * This is a slab cache to optimize allocations and deallocations of Unicode
- * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN
- * (255) Unicode characters + a terminating NULL Unicode character.
- */
-struct kmem_cache *ntfs_name_cache;
-
-/* Slab caches for efficient allocation/deallocation of inodes. */
-struct kmem_cache *ntfs_inode_cache;
-struct kmem_cache *ntfs_big_inode_cache;
-
-/* Init once constructor for the inode slab cache. */
-static void ntfs_big_inode_init_once(void *foo)
-{
- ntfs_inode *ni = (ntfs_inode *)foo;
-
- inode_init_once(VFS_I(ni));
-}
-
-/*
- * Slab caches to optimize allocations and deallocations of attribute search
- * contexts and index contexts, respectively.
- */
-struct kmem_cache *ntfs_attr_ctx_cache;
-struct kmem_cache *ntfs_index_ctx_cache;
-
-/* Driver wide mutex. */
-DEFINE_MUTEX(ntfs_lock);
-
-static struct dentry *ntfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
-}
-
-static struct file_system_type ntfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "ntfs",
- .mount = ntfs_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("ntfs");
-
-/* Stable names for the slab caches. */
-static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache";
-static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache";
-static const char ntfs_name_cache_name[] = "ntfs_name_cache";
-static const char ntfs_inode_cache_name[] = "ntfs_inode_cache";
-static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache";
-
-static int __init init_ntfs_fs(void)
-{
- int err = 0;
-
- /* This may be ugly but it results in pretty output so who cares. (-8 */
- pr_info("driver " NTFS_VERSION " [Flags: R/"
-#ifdef NTFS_RW
- "W"
-#else
- "O"
-#endif
-#ifdef DEBUG
- " DEBUG"
-#endif
-#ifdef MODULE
- " MODULE"
-#endif
- "].\n");
-
- ntfs_debug("Debug messages are enabled.");
-
- ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name,
- sizeof(ntfs_index_context), 0 /* offset */,
- SLAB_HWCACHE_ALIGN, NULL /* ctor */);
- if (!ntfs_index_ctx_cache) {
- pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name);
- goto ictx_err_out;
- }
- ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name,
- sizeof(ntfs_attr_search_ctx), 0 /* offset */,
- SLAB_HWCACHE_ALIGN, NULL /* ctor */);
- if (!ntfs_attr_ctx_cache) {
- pr_crit("NTFS: Failed to create %s!\n",
- ntfs_attr_ctx_cache_name);
- goto actx_err_out;
- }
-
- ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name,
- (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0,
- SLAB_HWCACHE_ALIGN, NULL);
- if (!ntfs_name_cache) {
- pr_crit("Failed to create %s!\n", ntfs_name_cache_name);
- goto name_err_out;
- }
-
- ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name,
- sizeof(ntfs_inode), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
- if (!ntfs_inode_cache) {
- pr_crit("Failed to create %s!\n", ntfs_inode_cache_name);
- goto inode_err_out;
- }
-
- ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
- sizeof(big_ntfs_inode), 0,
- SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
- SLAB_ACCOUNT, ntfs_big_inode_init_once);
- if (!ntfs_big_inode_cache) {
- pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
- goto big_inode_err_out;
- }
-
- /* Register the ntfs sysctls. */
- err = ntfs_sysctl(1);
- if (err) {
- pr_crit("Failed to register NTFS sysctls!\n");
- goto sysctl_err_out;
- }
-
- err = register_filesystem(&ntfs_fs_type);
- if (!err) {
- ntfs_debug("NTFS driver registered successfully.");
- return 0; /* Success! */
- }
- pr_crit("Failed to register NTFS filesystem driver!\n");
-
- /* Unregister the ntfs sysctls. */
- ntfs_sysctl(0);
-sysctl_err_out:
- kmem_cache_destroy(ntfs_big_inode_cache);
-big_inode_err_out:
- kmem_cache_destroy(ntfs_inode_cache);
-inode_err_out:
- kmem_cache_destroy(ntfs_name_cache);
-name_err_out:
- kmem_cache_destroy(ntfs_attr_ctx_cache);
-actx_err_out:
- kmem_cache_destroy(ntfs_index_ctx_cache);
-ictx_err_out:
- if (!err) {
- pr_crit("Aborting NTFS filesystem driver registration...\n");
- err = -ENOMEM;
- }
- return err;
-}
-
-static void __exit exit_ntfs_fs(void)
-{
- ntfs_debug("Unregistering NTFS driver.");
-
- unregister_filesystem(&ntfs_fs_type);
-
- /*
- * Make sure all delayed rcu free inodes are flushed before we
- * destroy cache.
- */
- rcu_barrier();
- kmem_cache_destroy(ntfs_big_inode_cache);
- kmem_cache_destroy(ntfs_inode_cache);
- kmem_cache_destroy(ntfs_name_cache);
- kmem_cache_destroy(ntfs_attr_ctx_cache);
- kmem_cache_destroy(ntfs_index_ctx_cache);
- /* Unregister the ntfs sysctls. */
- ntfs_sysctl(0);
-}
-
-MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
-MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.");
-MODULE_VERSION(NTFS_VERSION);
-MODULE_LICENSE("GPL");
-#ifdef DEBUG
-module_param(debug_msgs, bint, 0);
-MODULE_PARM_DESC(debug_msgs, "Enable debug messages.");
-#endif
-
-module_init(init_ntfs_fs)
-module_exit(exit_ntfs_fs)
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
deleted file mode 100644
index 4e980170d86a..000000000000
--- a/fs/ntfs/sysctl.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of
- * the Linux-NTFS project. Adapted from the old NTFS driver,
- * Copyright (C) 1997 Martin von Löwis, Régis Duchesne
- *
- * Copyright (c) 2002-2005 Anton Altaparmakov
- */
-
-#ifdef DEBUG
-
-#include <linux/module.h>
-
-#ifdef CONFIG_SYSCTL
-
-#include <linux/proc_fs.h>
-#include <linux/sysctl.h>
-
-#include "sysctl.h"
-#include "debug.h"
-
-/* Definition of the ntfs sysctl. */
-static struct ctl_table ntfs_sysctls[] = {
- {
- .procname = "ntfs-debug",
- .data = &debug_msgs, /* Data pointer and size. */
- .maxlen = sizeof(debug_msgs),
- .mode = 0644, /* Mode, proc handler. */
- .proc_handler = proc_dointvec
- },
-};
-
-/* Storage for the sysctls header. */
-static struct ctl_table_header *sysctls_root_table;
-
-/**
- * ntfs_sysctl - add or remove the debug sysctl
- * @add: add (1) or remove (0) the sysctl
- *
- * Add or remove the debug sysctl. Return 0 on success or -errno on error.
- */
-int ntfs_sysctl(int add)
-{
- if (add) {
- BUG_ON(sysctls_root_table);
- sysctls_root_table = register_sysctl("fs", ntfs_sysctls);
- if (!sysctls_root_table)
- return -ENOMEM;
- } else {
- BUG_ON(!sysctls_root_table);
- unregister_sysctl_table(sysctls_root_table);
- sysctls_root_table = NULL;
- }
- return 0;
-}
-
-#endif /* CONFIG_SYSCTL */
-#endif /* DEBUG */
diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h
deleted file mode 100644
index 96bb2299d2d5..000000000000
--- a/fs/ntfs/sysctl.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of
- * the Linux-NTFS project. Adapted from the old NTFS driver,
- * Copyright (C) 1997 Martin von Löwis, Régis Duchesne
- *
- * Copyright (c) 2002-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_SYSCTL_H
-#define _LINUX_NTFS_SYSCTL_H
-
-
-#if defined(DEBUG) && defined(CONFIG_SYSCTL)
-
-extern int ntfs_sysctl(int add);
-
-#else
-
-/* Just return success. */
-static inline int ntfs_sysctl(int add)
-{
- return 0;
-}
-
-#endif /* DEBUG && CONFIG_SYSCTL */
-#endif /* _LINUX_NTFS_SYSCTL_H */
diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h
deleted file mode 100644
index 6b63261300cc..000000000000
--- a/fs/ntfs/time.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * time.h - NTFS time conversion functions. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_TIME_H
-#define _LINUX_NTFS_TIME_H
-
-#include <linux/time.h> /* For current_kernel_time(). */
-#include <asm/div64.h> /* For do_div(). */
-
-#include "endian.h"
-
-#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000)
-
-/**
- * utc2ntfs - convert Linux UTC time to NTFS time
- * @ts: Linux UTC time to convert to NTFS time
- *
- * Convert the Linux UTC time @ts to its corresponding NTFS time and return
- * that in little endian format.
- *
- * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
- * and a long tv_nsec where tv_sec is the number of 1-second intervals since
- * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
- * intervals since the value of tv_sec.
- *
- * NTFS uses Microsoft's standard time format which is stored in a s64 and is
- * measured as the number of 100-nano-second intervals since 1st January 1601,
- * 00:00:00 UTC.
- */
-static inline sle64 utc2ntfs(const struct timespec64 ts)
-{
- /*
- * Convert the seconds to 100ns intervals, add the nano-seconds
- * converted to 100ns intervals, and then add the NTFS time offset.
- */
- return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 +
- NTFS_TIME_OFFSET);
-}
-
-/**
- * get_current_ntfs_time - get the current time in little endian NTFS format
- *
- * Get the current time from the Linux kernel, convert it to its corresponding
- * NTFS time and return that in little endian format.
- */
-static inline sle64 get_current_ntfs_time(void)
-{
- struct timespec64 ts;
-
- ktime_get_coarse_real_ts64(&ts);
- return utc2ntfs(ts);
-}
-
-/**
- * ntfs2utc - convert NTFS time to Linux time
- * @time: NTFS time (little endian) to convert to Linux UTC
- *
- * Convert the little endian NTFS time @time to its corresponding Linux UTC
- * time and return that in cpu format.
- *
- * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
- * and a long tv_nsec where tv_sec is the number of 1-second intervals since
- * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
- * intervals since the value of tv_sec.
- *
- * NTFS uses Microsoft's standard time format which is stored in a s64 and is
- * measured as the number of 100 nano-second intervals since 1st January 1601,
- * 00:00:00 UTC.
- */
-static inline struct timespec64 ntfs2utc(const sle64 time)
-{
- struct timespec64 ts;
-
- /* Subtract the NTFS time offset. */
- u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET);
- /*
- * Convert the time to 1-second intervals and the remainder to
- * 1-nano-second intervals.
- */
- ts.tv_nsec = do_div(t, 10000000) * 100;
- ts.tv_sec = t;
- return ts;
-}
-
-#endif /* _LINUX_NTFS_TIME_H */
diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h
deleted file mode 100644
index 9a47859e7a06..000000000000
--- a/fs/ntfs/types.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * types.h - Defines for NTFS Linux kernel driver specific types.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_TYPES_H
-#define _LINUX_NTFS_TYPES_H
-
-#include <linux/types.h>
-
-typedef __le16 le16;
-typedef __le32 le32;
-typedef __le64 le64;
-typedef __u16 __bitwise sle16;
-typedef __u32 __bitwise sle32;
-typedef __u64 __bitwise sle64;
-
-/* 2-byte Unicode character type. */
-typedef le16 ntfschar;
-#define UCHAR_T_SIZE_BITS 1
-
-/*
- * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN
- * and VCN, to allow for type checking and better code readability.
- */
-typedef s64 VCN;
-typedef sle64 leVCN;
-typedef s64 LCN;
-typedef sle64 leLCN;
-
-/*
- * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit
- * values. We define our own type LSN, to allow for type checking and better
- * code readability.
- */
-typedef s64 LSN;
-typedef sle64 leLSN;
-
-/*
- * The NTFS transaction log $UsnJrnl uses usn which are signed 64-bit values.
- * We define our own type USN, to allow for type checking and better code
- * readability.
- */
-typedef s64 USN;
-typedef sle64 leUSN;
-
-typedef enum {
- CASE_SENSITIVE = 0,
- IGNORE_CASE = 1,
-} IGNORE_CASE_BOOL;
-
-#endif /* _LINUX_NTFS_TYPES_H */
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
deleted file mode 100644
index a6b6c64f14a9..000000000000
--- a/fs/ntfs/unistr.c
+++ /dev/null
@@ -1,384 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2006 Anton Altaparmakov
- */
-
-#include <linux/slab.h>
-
-#include "types.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/*
- * IMPORTANT
- * =========
- *
- * All these routines assume that the Unicode characters are in little endian
- * encoding inside the strings!!!
- */
-
-/*
- * This is used by the name collation functions to quickly determine what
- * characters are (in)valid.
- */
-static const u8 legal_ansi_char_array[0x40] = {
- 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
- 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
-
- 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
- 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
-
- 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
- 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
-
- 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
- 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
-};
-
-/**
- * ntfs_are_names_equal - compare two Unicode names for equality
- * @s1: name to compare to @s2
- * @s1_len: length in Unicode characters of @s1
- * @s2: name to compare to @s1
- * @s2_len: length in Unicode characters of @s2
- * @ic: ignore case bool
- * @upcase: upcase table (only if @ic == IGNORE_CASE)
- * @upcase_size: length in Unicode characters of @upcase (if present)
- *
- * Compare the names @s1 and @s2 and return 'true' (1) if the names are
- * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE,
- * the @upcase table is used to performa a case insensitive comparison.
- */
-bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
- const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic,
- const ntfschar *upcase, const u32 upcase_size)
-{
- if (s1_len != s2_len)
- return false;
- if (ic == CASE_SENSITIVE)
- return !ntfs_ucsncmp(s1, s2, s1_len);
- return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size);
-}
-
-/**
- * ntfs_collate_names - collate two Unicode names
- * @name1: first Unicode name to compare
- * @name2: second Unicode name to compare
- * @err_val: if @name1 contains an invalid character return this value
- * @ic: either CASE_SENSITIVE or IGNORE_CASE
- * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE)
- * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE)
- *
- * ntfs_collate_names collates two Unicode names and returns:
- *
- * -1 if the first name collates before the second one,
- * 0 if the names match,
- * 1 if the second name collates before the first one, or
- * @err_val if an invalid character is found in @name1 during the comparison.
- *
- * The following characters are considered invalid: '"', '*', '<', '>' and '?'.
- */
-int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
- const ntfschar *name2, const u32 name2_len,
- const int err_val, const IGNORE_CASE_BOOL ic,
- const ntfschar *upcase, const u32 upcase_len)
-{
- u32 cnt, min_len;
- u16 c1, c2;
-
- min_len = name1_len;
- if (name1_len > name2_len)
- min_len = name2_len;
- for (cnt = 0; cnt < min_len; ++cnt) {
- c1 = le16_to_cpu(*name1++);
- c2 = le16_to_cpu(*name2++);
- if (ic) {
- if (c1 < upcase_len)
- c1 = le16_to_cpu(upcase[c1]);
- if (c2 < upcase_len)
- c2 = le16_to_cpu(upcase[c2]);
- }
- if (c1 < 64 && legal_ansi_char_array[c1] & 8)
- return err_val;
- if (c1 < c2)
- return -1;
- if (c1 > c2)
- return 1;
- }
- if (name1_len < name2_len)
- return -1;
- if (name1_len == name2_len)
- return 0;
- /* name1_len > name2_len */
- c1 = le16_to_cpu(*name1);
- if (c1 < 64 && legal_ansi_char_array[c1] & 8)
- return err_val;
- return 1;
-}
-
-/**
- * ntfs_ucsncmp - compare two little endian Unicode strings
- * @s1: first string
- * @s2: second string
- * @n: maximum unicode characters to compare
- *
- * Compare the first @n characters of the Unicode strings @s1 and @s2,
- * The strings in little endian format and appropriate le16_to_cpu()
- * conversion is performed on non-little endian machines.
- *
- * The function returns an integer less than, equal to, or greater than zero
- * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
- * to be less than, to match, or be greater than @s2.
- */
-int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
-{
- u16 c1, c2;
- size_t i;
-
- for (i = 0; i < n; ++i) {
- c1 = le16_to_cpu(s1[i]);
- c2 = le16_to_cpu(s2[i]);
- if (c1 < c2)
- return -1;
- if (c1 > c2)
- return 1;
- if (!c1)
- break;
- }
- return 0;
-}
-
-/**
- * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
- * @s1: first string
- * @s2: second string
- * @n: maximum unicode characters to compare
- * @upcase: upcase table
- * @upcase_size: upcase table size in Unicode characters
- *
- * Compare the first @n characters of the Unicode strings @s1 and @s2,
- * ignoring case. The strings in little endian format and appropriate
- * le16_to_cpu() conversion is performed on non-little endian machines.
- *
- * Each character is uppercased using the @upcase table before the comparison.
- *
- * The function returns an integer less than, equal to, or greater than zero
- * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
- * to be less than, to match, or be greater than @s2.
- */
-int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
- const ntfschar *upcase, const u32 upcase_size)
-{
- size_t i;
- u16 c1, c2;
-
- for (i = 0; i < n; ++i) {
- if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
- c1 = le16_to_cpu(upcase[c1]);
- if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
- c2 = le16_to_cpu(upcase[c2]);
- if (c1 < c2)
- return -1;
- if (c1 > c2)
- return 1;
- if (!c1)
- break;
- }
- return 0;
-}
-
-void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase,
- const u32 upcase_len)
-{
- u32 i;
- u16 u;
-
- for (i = 0; i < name_len; i++)
- if ((u = le16_to_cpu(name[i])) < upcase_len)
- name[i] = upcase[u];
-}
-
-void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
- const ntfschar *upcase, const u32 upcase_len)
-{
- ntfs_upcase_name((ntfschar*)&file_name_attr->file_name,
- file_name_attr->file_name_length, upcase, upcase_len);
-}
-
-int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
- FILE_NAME_ATTR *file_name_attr2,
- const int err_val, const IGNORE_CASE_BOOL ic,
- const ntfschar *upcase, const u32 upcase_len)
-{
- return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name,
- file_name_attr1->file_name_length,
- (ntfschar*)&file_name_attr2->file_name,
- file_name_attr2->file_name_length,
- err_val, ic, upcase, upcase_len);
-}
-
-/**
- * ntfs_nlstoucs - convert NLS string to little endian Unicode string
- * @vol: ntfs volume which we are working with
- * @ins: input NLS string buffer
- * @ins_len: length of input string in bytes
- * @outs: on return contains the allocated output Unicode string buffer
- *
- * Convert the input string @ins, which is in whatever format the loaded NLS
- * map dictates, into a little endian, 2-byte Unicode string.
- *
- * This function allocates the string and the caller is responsible for
- * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it.
- *
- * On success the function returns the number of Unicode characters written to
- * the output string *@outs (>= 0), not counting the terminating Unicode NULL
- * character. *@outs is set to the allocated output string buffer.
- *
- * On error, a negative number corresponding to the error code is returned. In
- * that case the output string is not allocated. Both *@outs and *@outs_len
- * are then undefined.
- *
- * This might look a bit odd due to fast path optimization...
- */
-int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
- const int ins_len, ntfschar **outs)
-{
- struct nls_table *nls = vol->nls_map;
- ntfschar *ucs;
- wchar_t wc;
- int i, o, wc_len;
-
- /* We do not trust outside sources. */
- if (likely(ins)) {
- ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS);
- if (likely(ucs)) {
- for (i = o = 0; i < ins_len; i += wc_len) {
- wc_len = nls->char2uni(ins + i, ins_len - i,
- &wc);
- if (likely(wc_len >= 0 &&
- o < NTFS_MAX_NAME_LEN)) {
- if (likely(wc)) {
- ucs[o++] = cpu_to_le16(wc);
- continue;
- } /* else if (!wc) */
- break;
- } /* else if (wc_len < 0 ||
- o >= NTFS_MAX_NAME_LEN) */
- goto name_err;
- }
- ucs[o] = 0;
- *outs = ucs;
- return o;
- } /* else if (!ucs) */
- ntfs_error(vol->sb, "Failed to allocate buffer for converted "
- "name from ntfs_name_cache.");
- return -ENOMEM;
- } /* else if (!ins) */
- ntfs_error(vol->sb, "Received NULL pointer.");
- return -EINVAL;
-name_err:
- kmem_cache_free(ntfs_name_cache, ucs);
- if (wc_len < 0) {
- ntfs_error(vol->sb, "Name using character set %s contains "
- "characters that cannot be converted to "
- "Unicode.", nls->charset);
- i = -EILSEQ;
- } else /* if (o >= NTFS_MAX_NAME_LEN) */ {
- ntfs_error(vol->sb, "Name is too long (maximum length for a "
- "name on NTFS is %d Unicode characters.",
- NTFS_MAX_NAME_LEN);
- i = -ENAMETOOLONG;
- }
- return i;
-}
-
-/**
- * ntfs_ucstonls - convert little endian Unicode string to NLS string
- * @vol: ntfs volume which we are working with
- * @ins: input Unicode string buffer
- * @ins_len: length of input string in Unicode characters
- * @outs: on return contains the (allocated) output NLS string buffer
- * @outs_len: length of output string buffer in bytes
- *
- * Convert the input little endian, 2-byte Unicode string @ins, of length
- * @ins_len into the string format dictated by the loaded NLS.
- *
- * If *@outs is NULL, this function allocates the string and the caller is
- * responsible for calling kfree(*@outs); when finished with it. In this case
- * @outs_len is ignored and can be 0.
- *
- * On success the function returns the number of bytes written to the output
- * string *@outs (>= 0), not counting the terminating NULL byte. If the output
- * string buffer was allocated, *@outs is set to it.
- *
- * On error, a negative number corresponding to the error code is returned. In
- * that case the output string is not allocated. The contents of *@outs are
- * then undefined.
- *
- * This might look a bit odd due to fast path optimization...
- */
-int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
- const int ins_len, unsigned char **outs, int outs_len)
-{
- struct nls_table *nls = vol->nls_map;
- unsigned char *ns;
- int i, o, ns_len, wc;
-
- /* We don't trust outside sources. */
- if (ins) {
- ns = *outs;
- ns_len = outs_len;
- if (ns && !ns_len) {
- wc = -ENAMETOOLONG;
- goto conversion_err;
- }
- if (!ns) {
- ns_len = ins_len * NLS_MAX_CHARSET_SIZE;
- ns = kmalloc(ns_len + 1, GFP_NOFS);
- if (!ns)
- goto mem_err_out;
- }
- for (i = o = 0; i < ins_len; i++) {
-retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
- ns_len - o);
- if (wc > 0) {
- o += wc;
- continue;
- } else if (!wc)
- break;
- else if (wc == -ENAMETOOLONG && ns != *outs) {
- unsigned char *tc;
- /* Grow in multiples of 64 bytes. */
- tc = kmalloc((ns_len + 64) &
- ~63, GFP_NOFS);
- if (tc) {
- memcpy(tc, ns, ns_len);
- ns_len = ((ns_len + 64) & ~63) - 1;
- kfree(ns);
- ns = tc;
- goto retry;
- } /* No memory so goto conversion_error; */
- } /* wc < 0, real error. */
- goto conversion_err;
- }
- ns[o] = 0;
- *outs = ns;
- return o;
- } /* else (!ins) */
- ntfs_error(vol->sb, "Received NULL pointer.");
- return -EINVAL;
-conversion_err:
- ntfs_error(vol->sb, "Unicode name contains characters that cannot be "
- "converted to character set %s. You might want to "
- "try to use the mount option nls=utf8.", nls->charset);
- if (ns != *outs)
- kfree(ns);
- if (wc != -ENAMETOOLONG)
- wc = -EILSEQ;
- return wc;
-mem_err_out:
- ntfs_error(vol->sb, "Failed to allocate name!");
- return -ENOMEM;
-}
diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c
deleted file mode 100644
index 4ebe84a78dea..000000000000
--- a/fs/ntfs/upcase.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * upcase.c - Generate the full NTFS Unicode upcase table in little endian.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org>
- * Copyright (c) 2001-2006 Anton Altaparmakov
- */
-
-#include "malloc.h"
-#include "ntfs.h"
-
-ntfschar *generate_default_upcase(void)
-{
- static const int uc_run_table[][3] = { /* Start, End, Add */
- {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74},
- {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86},
- {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
- {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128},
- {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112},
- {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126},
- {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8},
- {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8},
- {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8},
- {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7},
- {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16},
- {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26},
- {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32},
- {0}
- };
-
- static const int uc_dup_table[][2] = { /* Start, End */
- {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
- {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
- {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
- {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
- {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
- {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
- {0}
- };
-
- static const int uc_word_table[][2] = { /* Offset, Value */
- {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
- {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
- {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
- {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
- {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
- {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
- {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
- {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
- {0}
- };
-
- int i, r;
- ntfschar *uc;
-
- uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar));
- if (!uc)
- return uc;
- memset(uc, 0, default_upcase_len * sizeof(ntfschar));
- /* Generate the little endian Unicode upcase table used by ntfs. */
- for (i = 0; i < default_upcase_len; i++)
- uc[i] = cpu_to_le16(i);
- for (r = 0; uc_run_table[r][0]; r++)
- for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
- le16_add_cpu(&uc[i], uc_run_table[r][2]);
- for (r = 0; uc_dup_table[r][0]; r++)
- for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
- le16_add_cpu(&uc[i + 1], -1);
- for (r = 0; uc_word_table[r][0]; r++)
- uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]);
- return uc;
-}
diff --git a/fs/ntfs/usnjrnl.c b/fs/ntfs/usnjrnl.c
deleted file mode 100644
index 9097a0b4ef25..000000000000
--- a/fs/ntfs/usnjrnl.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * usnjrnl.h - NTFS kernel transaction log ($UsnJrnl) handling. Part of the
- * Linux-NTFS project.
- *
- * Copyright (c) 2005 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-#include <linux/highmem.h>
-#include <linux/mm.h>
-
-#include "aops.h"
-#include "debug.h"
-#include "endian.h"
-#include "time.h"
-#include "types.h"
-#include "usnjrnl.h"
-#include "volume.h"
-
-/**
- * ntfs_stamp_usnjrnl - stamp the transaction log ($UsnJrnl) on an ntfs volume
- * @vol: ntfs volume on which to stamp the transaction log
- *
- * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return
- * 'true' on success and 'false' on error.
- *
- * This function assumes that the transaction log has already been loaded and
- * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl().
- */
-bool ntfs_stamp_usnjrnl(ntfs_volume *vol)
-{
- ntfs_debug("Entering.");
- if (likely(!NVolUsnJrnlStamped(vol))) {
- sle64 stamp;
- struct page *page;
- USN_HEADER *uh;
-
- page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to read from "
- "$UsnJrnl/$DATA/$Max attribute.");
- return false;
- }
- uh = (USN_HEADER*)page_address(page);
- stamp = get_current_ntfs_time();
- ntfs_debug("Stamping transaction log ($UsnJrnl): old "
- "journal_id 0x%llx, old lowest_valid_usn "
- "0x%llx, new journal_id 0x%llx, new "
- "lowest_valid_usn 0x%llx.",
- (long long)sle64_to_cpu(uh->journal_id),
- (long long)sle64_to_cpu(uh->lowest_valid_usn),
- (long long)sle64_to_cpu(stamp),
- i_size_read(vol->usnjrnl_j_ino));
- uh->lowest_valid_usn =
- cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino));
- uh->journal_id = stamp;
- flush_dcache_page(page);
- set_page_dirty(page);
- ntfs_unmap_page(page);
- /* Set the flag so we do not have to do it again on remount. */
- NVolSetUsnJrnlStamped(vol);
- }
- ntfs_debug("Done.");
- return true;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
deleted file mode 100644
index 85f531b59395..000000000000
--- a/fs/ntfs/usnjrnl.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_USNJRNL_H
-#define _LINUX_NTFS_USNJRNL_H
-
-#ifdef NTFS_RW
-
-#include "types.h"
-#include "endian.h"
-#include "layout.h"
-#include "volume.h"
-
-/*
- * Transaction log ($UsnJrnl) organization:
- *
- * The transaction log records whenever a file is modified in any way. So for
- * example it will record that file "blah" was written to at a particular time
- * but not what was written. If will record that a file was deleted or
- * created, that a file was truncated, etc. See below for all the reason
- * codes used.
- *
- * The transaction log is in the $Extend directory which is in the root
- * directory of each volume. If it is not present it means transaction
- * logging is disabled. If it is present it means transaction logging is
- * either enabled or in the process of being disabled in which case we can
- * ignore it as it will go away as soon as Windows gets its hands on it.
- *
- * To determine whether the transaction logging is enabled or in the process
- * of being disabled, need to check the volume flags in the
- * $VOLUME_INFORMATION attribute in the $Volume system file (which is present
- * in the root directory and has a fixed mft record number, see layout.h).
- * If the flag VOLUME_DELETE_USN_UNDERWAY is set it means the transaction log
- * is in the process of being disabled and if this flag is clear it means the
- * transaction log is enabled.
- *
- * The transaction log consists of two parts; the $DATA/$Max attribute as well
- * as the $DATA/$J attribute. $Max is a header describing the transaction
- * log whilst $J is the transaction log data itself as a sequence of variable
- * sized USN_RECORDs (see below for all the structures).
- *
- * We do not care about transaction logging at this point in time but we still
- * need to let windows know that the transaction log is out of date. To do
- * this we need to stamp the transaction log. This involves setting the
- * lowest_valid_usn field in the $DATA/$Max attribute to the usn to be used
- * for the next added USN_RECORD to the $DATA/$J attribute as well as
- * generating a new journal_id in $DATA/$Max.
- *
- * The journal_id is as of the current version (2.0) of the transaction log
- * simply the 64-bit timestamp of when the journal was either created or last
- * stamped.
- *
- * To determine the next usn there are two ways. The first is to parse
- * $DATA/$J and to find the last USN_RECORD in it and to add its record_length
- * to its usn (which is the byte offset in the $DATA/$J attribute). The
- * second is simply to take the data size of the attribute. Since the usns
- * are simply byte offsets into $DATA/$J, this is exactly the next usn. For
- * obvious reasons we use the second method as it is much simpler and faster.
- *
- * As an aside, note that to actually disable the transaction log, one would
- * need to set the VOLUME_DELETE_USN_UNDERWAY flag (see above), then go
- * through all the mft records on the volume and set the usn field in their
- * $STANDARD_INFORMATION attribute to zero. Once that is done, one would need
- * to delete the transaction log file, i.e. \$Extent\$UsnJrnl, and finally,
- * one would need to clear the VOLUME_DELETE_USN_UNDERWAY flag.
- *
- * Note that if a volume is unmounted whilst the transaction log is being
- * disabled, the process will continue the next time the volume is mounted.
- * This is why we can safely mount read-write when we see a transaction log
- * in the process of being deleted.
- */
-
-/* Some $UsnJrnl related constants. */
-#define UsnJrnlMajorVer 2
-#define UsnJrnlMinorVer 0
-
-/*
- * $DATA/$Max attribute. This is (always?) resident and has a fixed size of
- * 32 bytes. It contains the header describing the transaction log.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/sle64 maximum_size; /* The maximum on-disk size of the $DATA/$J
- attribute. */
-/* 8*/sle64 allocation_delta; /* Number of bytes by which to increase the
- size of the $DATA/$J attribute. */
-/*0x10*/sle64 journal_id; /* Current id of the transaction log. */
-/*0x18*/leUSN lowest_valid_usn; /* Lowest valid usn in $DATA/$J for the
- current journal_id. */
-/* sizeof() = 32 (0x20) bytes */
-} __attribute__ ((__packed__)) USN_HEADER;
-
-/*
- * Reason flags (32-bit). Cumulative flags describing the change(s) to the
- * file since it was last opened. I think the names speak for themselves but
- * if you disagree check out the descriptions in the Linux NTFS project NTFS
- * documentation: http://www.linux-ntfs.org/
- */
-enum {
- USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001),
- USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002),
- USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004),
- USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010),
- USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020),
- USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040),
- USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100),
- USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200),
- USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400),
- USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800),
- USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000),
- USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000),
- USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000),
- USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000),
- USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000),
- USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000),
- USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000),
- USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000),
- USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000),
- USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000),
- USN_REASON_CLOSE = cpu_to_le32(0x80000000),
-};
-
-typedef le32 USN_REASON_FLAGS;
-
-/*
- * Source info flags (32-bit). Information about the source of the change(s)
- * to the file. For detailed descriptions of what these mean, see the Linux
- * NTFS project NTFS documentation:
- * http://www.linux-ntfs.org/
- */
-enum {
- USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001),
- USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002),
- USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004),
-};
-
-typedef le32 USN_SOURCE_INFO_FLAGS;
-
-/*
- * $DATA/$J attribute. This is always non-resident, is marked as sparse, and
- * is of variabled size. It consists of a sequence of variable size
- * USN_RECORDS. The minimum allocated_size is allocation_delta as
- * specified in $DATA/$Max. When the maximum_size specified in $DATA/$Max is
- * exceeded by more than allocation_delta bytes, allocation_delta bytes are
- * allocated and appended to the $DATA/$J attribute and an equal number of
- * bytes at the beginning of the attribute are freed and made sparse. Note the
- * making sparse only happens at volume checkpoints and hence the actual
- * $DATA/$J size can exceed maximum_size + allocation_delta temporarily.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/le32 length; /* Byte size of this record (8-byte
- aligned). */
-/* 4*/le16 major_ver; /* Major version of the transaction log used
- for this record. */
-/* 6*/le16 minor_ver; /* Minor version of the transaction log used
- for this record. */
-/* 8*/leMFT_REF mft_reference;/* The mft reference of the file (or
- directory) described by this record. */
-/*0x10*/leMFT_REF parent_directory;/* The mft reference of the parent
- directory of the file described by this
- record. */
-/*0x18*/leUSN usn; /* The usn of this record. Equals the offset
- within the $DATA/$J attribute. */
-/*0x20*/sle64 time; /* Time when this record was created. */
-/*0x28*/USN_REASON_FLAGS reason;/* Reason flags (see above). */
-/*0x2c*/USN_SOURCE_INFO_FLAGS source_info;/* Source info flags (see above). */
-/*0x30*/le32 security_id; /* File security_id copied from
- $STANDARD_INFORMATION. */
-/*0x34*/FILE_ATTR_FLAGS file_attributes; /* File attributes copied from
- $STANDARD_INFORMATION or $FILE_NAME (not
- sure which). */
-/*0x38*/le16 file_name_size; /* Size of the file name in bytes. */
-/*0x3a*/le16 file_name_offset; /* Offset to the file name in bytes from the
- start of this record. */
-/*0x3c*/ntfschar file_name[0]; /* Use when creating only. When reading use
- file_name_offset to determine the location
- of the name. */
-/* sizeof() = 60 (0x3c) bytes */
-} __attribute__ ((__packed__)) USN_RECORD;
-
-extern bool ntfs_stamp_usnjrnl(ntfs_volume *vol);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_USNJRNL_H */
diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h
deleted file mode 100644
index 930a9ae8a053..000000000000
--- a/fs/ntfs/volume.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part
- * of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2006 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_VOLUME_H
-#define _LINUX_NTFS_VOLUME_H
-
-#include <linux/rwsem.h>
-#include <linux/uidgid.h>
-
-#include "types.h"
-#include "layout.h"
-
-/*
- * The NTFS in memory super block structure.
- */
-typedef struct {
- /*
- * FIXME: Reorder to have commonly used together element within the
- * same cache line, aiming at a cache line size of 32 bytes. Aim for
- * 64 bytes for less commonly used together elements. Put most commonly
- * used elements to front of structure. Obviously do this only when the
- * structure has stabilized... (AIA)
- */
- /* Device specifics. */
- struct super_block *sb; /* Pointer back to the super_block. */
- LCN nr_blocks; /* Number of sb->s_blocksize bytes
- sized blocks on the device. */
- /* Configuration provided by user at mount time. */
- unsigned long flags; /* Miscellaneous flags, see below. */
- kuid_t uid; /* uid that files will be mounted as. */
- kgid_t gid; /* gid that files will be mounted as. */
- umode_t fmask; /* The mask for file permissions. */
- umode_t dmask; /* The mask for directory
- permissions. */
- u8 mft_zone_multiplier; /* Initial mft zone multiplier. */
- u8 on_errors; /* What to do on filesystem errors. */
- /* NTFS bootsector provided information. */
- u16 sector_size; /* in bytes */
- u8 sector_size_bits; /* log2(sector_size) */
- u32 cluster_size; /* in bytes */
- u32 cluster_size_mask; /* cluster_size - 1 */
- u8 cluster_size_bits; /* log2(cluster_size) */
- u32 mft_record_size; /* in bytes */
- u32 mft_record_size_mask; /* mft_record_size - 1 */
- u8 mft_record_size_bits; /* log2(mft_record_size) */
- u32 index_record_size; /* in bytes */
- u32 index_record_size_mask; /* index_record_size - 1 */
- u8 index_record_size_bits; /* log2(index_record_size) */
- LCN nr_clusters; /* Volume size in clusters == number of
- bits in lcn bitmap. */
- LCN mft_lcn; /* Cluster location of mft data. */
- LCN mftmirr_lcn; /* Cluster location of copy of mft. */
- u64 serial_no; /* The volume serial number. */
- /* Mount specific NTFS information. */
- u32 upcase_len; /* Number of entries in upcase[]. */
- ntfschar *upcase; /* The upcase table. */
-
- s32 attrdef_size; /* Size of the attribute definition
- table in bytes. */
- ATTR_DEF *attrdef; /* Table of attribute definitions.
- Obtained from FILE_AttrDef. */
-
-#ifdef NTFS_RW
- /* Variables used by the cluster and mft allocators. */
- s64 mft_data_pos; /* Mft record number at which to
- allocate the next mft record. */
- LCN mft_zone_start; /* First cluster of the mft zone. */
- LCN mft_zone_end; /* First cluster beyond the mft zone. */
- LCN mft_zone_pos; /* Current position in the mft zone. */
- LCN data1_zone_pos; /* Current position in the first data
- zone. */
- LCN data2_zone_pos; /* Current position in the second data
- zone. */
-#endif /* NTFS_RW */
-
- struct inode *mft_ino; /* The VFS inode of $MFT. */
-
- struct inode *mftbmp_ino; /* Attribute inode for $MFT/$BITMAP. */
- struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the
- mft record bitmap ($MFT/$BITMAP). */
-#ifdef NTFS_RW
- struct inode *mftmirr_ino; /* The VFS inode of $MFTMirr. */
- int mftmirr_size; /* Size of mft mirror in mft records. */
-
- struct inode *logfile_ino; /* The VFS inode of $LogFile. */
-#endif /* NTFS_RW */
-
- struct inode *lcnbmp_ino; /* The VFS inode of $Bitmap. */
- struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the
- cluster bitmap ($Bitmap/$DATA). */
-
- struct inode *vol_ino; /* The VFS inode of $Volume. */
- VOLUME_FLAGS vol_flags; /* Volume flags. */
- u8 major_ver; /* Ntfs major version of volume. */
- u8 minor_ver; /* Ntfs minor version of volume. */
-
- struct inode *root_ino; /* The VFS inode of the root
- directory. */
- struct inode *secure_ino; /* The VFS inode of $Secure (NTFS3.0+
- only, otherwise NULL). */
- struct inode *extend_ino; /* The VFS inode of $Extend (NTFS3.0+
- only, otherwise NULL). */
-#ifdef NTFS_RW
- /* $Quota stuff is NTFS3.0+ specific. Unused/NULL otherwise. */
- struct inode *quota_ino; /* The VFS inode of $Quota. */
- struct inode *quota_q_ino; /* Attribute inode for $Quota/$Q. */
- /* $UsnJrnl stuff is NTFS3.0+ specific. Unused/NULL otherwise. */
- struct inode *usnjrnl_ino; /* The VFS inode of $UsnJrnl. */
- struct inode *usnjrnl_max_ino; /* Attribute inode for $UsnJrnl/$Max. */
- struct inode *usnjrnl_j_ino; /* Attribute inode for $UsnJrnl/$J. */
-#endif /* NTFS_RW */
- struct nls_table *nls_map;
-} ntfs_volume;
-
-/*
- * Defined bits for the flags field in the ntfs_volume structure.
- */
-typedef enum {
- NV_Errors, /* 1: Volume has errors, prevent remount rw. */
- NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). */
- NV_CaseSensitive, /* 1: Treat file names as case sensitive and
- create filenames in the POSIX namespace.
- Otherwise be case insensitive but still
- create file names in POSIX namespace. */
- NV_LogFileEmpty, /* 1: $LogFile journal is empty. */
- NV_QuotaOutOfDate, /* 1: $Quota is out of date. */
- NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */
- NV_SparseEnabled, /* 1: May create sparse files. */
-} ntfs_volume_flags;
-
-/*
- * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo()
- * functions.
- */
-#define DEFINE_NVOL_BIT_OPS(flag) \
-static inline int NVol##flag(ntfs_volume *vol) \
-{ \
- return test_bit(NV_##flag, &(vol)->flags); \
-} \
-static inline void NVolSet##flag(ntfs_volume *vol) \
-{ \
- set_bit(NV_##flag, &(vol)->flags); \
-} \
-static inline void NVolClear##flag(ntfs_volume *vol) \
-{ \
- clear_bit(NV_##flag, &(vol)->flags); \
-}
-
-/* Emit the ntfs volume bitops functions. */
-DEFINE_NVOL_BIT_OPS(Errors)
-DEFINE_NVOL_BIT_OPS(ShowSystemFiles)
-DEFINE_NVOL_BIT_OPS(CaseSensitive)
-DEFINE_NVOL_BIT_OPS(LogFileEmpty)
-DEFINE_NVOL_BIT_OPS(QuotaOutOfDate)
-DEFINE_NVOL_BIT_OPS(UsnJrnlStamped)
-DEFINE_NVOL_BIT_OPS(SparseEnabled)
-
-#endif /* _LINUX_NTFS_VOLUME_H */
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index 63f70259edc0..7aadf5010999 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -886,7 +886,7 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
struct runs_tree *run = &ni->file.run;
struct ntfs_sb_info *sbi;
u8 cluster_bits;
- struct ATTRIB *attr = NULL, *attr_b;
+ struct ATTRIB *attr, *attr_b;
struct ATTR_LIST_ENTRY *le, *le_b;
struct mft_inode *mi, *mi_b;
CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end, vcn0, alen;
@@ -904,12 +904,8 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
*len = 0;
up_read(&ni->file.run_lock);
- if (*len) {
- if (*lcn != SPARSE_LCN || !new)
- return 0; /* Fast normal way without allocation. */
- else if (clen > *len)
- clen = *len;
- }
+ if (*len && (*lcn != SPARSE_LCN || !new))
+ return 0; /* Fast normal way without allocation. */
/* No cluster in cache or we need to allocate cluster in hole. */
sbi = ni->mi.sbi;
@@ -918,6 +914,17 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
ni_lock(ni);
down_write(&ni->file.run_lock);
+ /* Repeat the code above (under write lock). */
+ if (!run_lookup_entry(run, vcn, lcn, len, NULL))
+ *len = 0;
+
+ if (*len) {
+ if (*lcn != SPARSE_LCN || !new)
+ goto out; /* normal way without allocation. */
+ if (clen > *len)
+ clen = *len;
+ }
+
le_b = NULL;
attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b);
if (!attr_b) {
@@ -1736,8 +1743,10 @@ repack:
le_b = NULL;
attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL,
0, NULL, &mi_b);
- if (!attr_b)
- return -ENOENT;
+ if (!attr_b) {
+ err = -ENOENT;
+ goto out;
+ }
attr = attr_b;
le = le_b;
@@ -1818,13 +1827,15 @@ ins_ext:
ok:
run_truncate_around(run, vcn);
out:
- if (new_valid > data_size)
- new_valid = data_size;
+ if (attr_b) {
+ if (new_valid > data_size)
+ new_valid = data_size;
- valid_size = le64_to_cpu(attr_b->nres.valid_size);
- if (new_valid != valid_size) {
- attr_b->nres.valid_size = cpu_to_le64(valid_size);
- mi_b->dirty = true;
+ valid_size = le64_to_cpu(attr_b->nres.valid_size);
+ if (new_valid != valid_size) {
+ attr_b->nres.valid_size = cpu_to_le64(valid_size);
+ mi_b->dirty = true;
+ }
}
return err;
@@ -2073,7 +2084,7 @@ next_attr:
/* Update inode size. */
ni->i_valid = valid_size;
- ni->vfs_inode.i_size = data_size;
+ i_size_write(&ni->vfs_inode, data_size);
inode_set_bytes(&ni->vfs_inode, total_size);
ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
mark_inode_dirty(&ni->vfs_inode);
@@ -2488,7 +2499,7 @@ int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
mi_b->dirty = true;
done:
- ni->vfs_inode.i_size += bytes;
+ i_size_write(&ni->vfs_inode, ni->vfs_inode.i_size + bytes);
ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
mark_inode_dirty(&ni->vfs_inode);
diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c
index 7c01735d1219..9f4bd8d26090 100644
--- a/fs/ntfs3/attrlist.c
+++ b/fs/ntfs3/attrlist.c
@@ -29,7 +29,7 @@ static inline bool al_is_valid_le(const struct ntfs_inode *ni,
void al_destroy(struct ntfs_inode *ni)
{
run_close(&ni->attr_list.run);
- kfree(ni->attr_list.le);
+ kvfree(ni->attr_list.le);
ni->attr_list.le = NULL;
ni->attr_list.size = 0;
ni->attr_list.dirty = false;
@@ -127,12 +127,13 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni,
{
size_t off;
u16 sz;
+ const unsigned le_min_size = le_size(0);
if (!le) {
le = ni->attr_list.le;
} else {
sz = le16_to_cpu(le->size);
- if (sz < sizeof(struct ATTR_LIST_ENTRY)) {
+ if (sz < le_min_size) {
/* Impossible 'cause we should not return such le. */
return NULL;
}
@@ -141,7 +142,7 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni,
/* Check boundary. */
off = PtrOffset(ni->attr_list.le, le);
- if (off + sizeof(struct ATTR_LIST_ENTRY) > ni->attr_list.size) {
+ if (off + le_min_size > ni->attr_list.size) {
/* The regular end of list. */
return NULL;
}
@@ -149,8 +150,7 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni,
sz = le16_to_cpu(le->size);
/* Check le for errors. */
- if (sz < sizeof(struct ATTR_LIST_ENTRY) ||
- off + sz > ni->attr_list.size ||
+ if (sz < le_min_size || off + sz > ni->attr_list.size ||
sz < le->name_off + le->name_len * sizeof(short)) {
return NULL;
}
@@ -318,7 +318,7 @@ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name,
memcpy(ptr, al->le, off);
memcpy(Add2Ptr(ptr, off + sz), le, old_size - off);
le = Add2Ptr(ptr, off);
- kfree(al->le);
+ kvfree(al->le);
al->le = ptr;
} else {
memmove(Add2Ptr(le, sz), le, old_size - off);
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index 63f14a0232f6..845f9b22deef 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -124,7 +124,7 @@ void wnd_close(struct wnd_bitmap *wnd)
{
struct rb_node *node, *next;
- kfree(wnd->free_bits);
+ kvfree(wnd->free_bits);
wnd->free_bits = NULL;
run_close(&wnd->run);
@@ -1360,7 +1360,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short));
memset(new_free + wnd->nwnd, 0,
(new_wnd - wnd->nwnd) * sizeof(short));
- kfree(wnd->free_bits);
+ kvfree(wnd->free_bits);
wnd->free_bits = new_free;
}
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index ec0566b322d5..5cf3d9decf64 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -309,11 +309,31 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni,
return 0;
}
- /* NTFS: symlinks are "dir + reparse" or "file + reparse" */
- if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT)
- dt_type = DT_LNK;
- else
- dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
+ /*
+ * NTFS: symlinks are "dir + reparse" or "file + reparse"
+ * Unfortunately reparse attribute is used for many purposes (several dozens).
+ * It is not possible here to know is this name symlink or not.
+ * To get exactly the type of name we should to open inode (read mft).
+ * getattr for opened file (fstat) correctly returns symlink.
+ */
+ dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
+
+ /*
+ * It is not reliable to detect the type of name using duplicated information
+ * stored in parent directory.
+ * The only correct way to get the type of name - read MFT record and find ATTR_STD.
+ * The code below is not good idea.
+ * It does additional locks/reads just to get the type of name.
+ * Should we use additional mount option to enable branch below?
+ */
+ if ((fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) &&
+ ino != ni->mi.rno) {
+ struct inode *inode = ntfs_iget5(sbi->sb, &e->ref, NULL);
+ if (!IS_ERR_OR_NULL(inode)) {
+ dt_type = fs_umode_to_dtype(inode->i_mode);
+ iput(inode);
+ }
+ }
return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type);
}
@@ -495,11 +515,9 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
struct INDEX_HDR *hdr;
const struct ATTR_FILE_NAME *fname;
u32 e_size, off, end;
- u64 vbo = 0;
size_t drs = 0, fles = 0, bit = 0;
- loff_t i_size = ni->vfs_inode.i_size;
struct indx_node *node = NULL;
- u8 index_bits = ni->dir.index_bits;
+ size_t max_indx = i_size_read(&ni->vfs_inode) >> ni->dir.index_bits;
if (is_empty)
*is_empty = true;
@@ -518,8 +536,10 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
e = Add2Ptr(hdr, off);
e_size = le16_to_cpu(e->size);
if (e_size < sizeof(struct NTFS_DE) ||
- off + e_size > end)
+ off + e_size > end) {
+ /* Looks like corruption. */
break;
+ }
if (de_is_last(e))
break;
@@ -543,7 +563,7 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
fles += 1;
}
- if (vbo >= i_size)
+ if (bit >= max_indx)
goto out;
err = indx_used_bit(&ni->dir, ni, &bit);
@@ -553,8 +573,7 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
if (bit == MINUS_ONE_T)
goto out;
- vbo = (u64)bit << index_bits;
- if (vbo >= i_size)
+ if (bit >= max_indx)
goto out;
err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits,
@@ -564,7 +583,6 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
hdr = &node->index->ihdr;
bit += 1;
- vbo = (u64)bit << ni->dir.idx2vbn_bits;
}
out:
@@ -593,5 +611,9 @@ const struct file_operations ntfs_dir_operations = {
.iterate_shared = ntfs_readdir,
.fsync = generic_file_fsync,
.open = ntfs_file_open,
+ .unlocked_ioctl = ntfs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ntfs_compat_ioctl,
+#endif
};
// clang-format on
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index a5a30a24ce5d..5418662c80d8 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -48,7 +48,7 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg)
return 0;
}
-static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
+long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
@@ -61,7 +61,7 @@ static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
}
#ifdef CONFIG_COMPAT
-static long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg)
+long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg)
{
return ntfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
@@ -188,6 +188,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
u32 bh_next, bh_off, to;
sector_t iblock;
struct folio *folio;
+ bool dirty = false;
for (; idx < idx_end; idx += 1, from = 0) {
page_off = (loff_t)idx << PAGE_SHIFT;
@@ -223,29 +224,27 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
/* Ok, it's mapped. Make sure it's up-to-date. */
if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
-
- if (!buffer_uptodate(bh)) {
- err = bh_read(bh, 0);
- if (err < 0) {
- folio_unlock(folio);
- folio_put(folio);
- goto out;
- }
+ else if (bh_read(bh, 0) < 0) {
+ err = -EIO;
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out;
}
mark_buffer_dirty(bh);
-
} while (bh_off = bh_next, iblock += 1,
head != (bh = bh->b_this_page));
folio_zero_segment(folio, from, to);
+ dirty = true;
folio_unlock(folio);
folio_put(folio);
cond_resched();
}
out:
- mark_inode_dirty(inode);
+ if (dirty)
+ mark_inode_dirty(inode);
return err;
}
@@ -261,6 +260,9 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
bool rw = vma->vm_flags & VM_WRITE;
int err;
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
if (is_encrypted(ni)) {
ntfs_inode_warn(inode, "mmap encrypted not supported");
return -EOPNOTSUPP;
@@ -499,10 +501,14 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
ni_lock(ni);
err = attr_punch_hole(ni, vbo, len, &frame_size);
ni_unlock(ni);
+ if (!err)
+ goto ok;
+
if (err != E_NTFS_NOTALIGNED)
goto out;
/* Process not aligned punch. */
+ err = 0;
mask = frame_size - 1;
vbo_a = (vbo + mask) & ~mask;
end_a = end & ~mask;
@@ -525,6 +531,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
ni_lock(ni);
err = attr_punch_hole(ni, vbo_a, end_a - vbo_a, NULL);
ni_unlock(ni);
+ if (err)
+ goto out;
}
} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
/*
@@ -564,6 +572,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
ni_lock(ni);
err = attr_insert_range(ni, vbo, len);
ni_unlock(ni);
+ if (err)
+ goto out;
} else {
/* Check new size. */
u8 cluster_bits = sbi->cluster_bits;
@@ -633,11 +643,18 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
&ni->file.run, i_size, &ni->i_valid,
true, NULL);
ni_unlock(ni);
+ if (err)
+ goto out;
} else if (new_size > i_size) {
- inode->i_size = new_size;
+ i_size_write(inode, new_size);
}
}
+ok:
+ err = file_modified(file);
+ if (err)
+ goto out;
+
out:
if (map_locked)
filemap_invalidate_unlock(mapping);
@@ -663,6 +680,9 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
umode_t mode = inode->i_mode;
int err;
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
err = setattr_prepare(idmap, dentry, attr);
if (err)
goto out;
@@ -676,7 +696,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
goto out;
}
inode_dio_wait(inode);
- oldsize = inode->i_size;
+ oldsize = i_size_read(inode);
newsize = attr->ia_size;
if (newsize <= oldsize)
@@ -688,7 +708,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
goto out;
ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
- inode->i_size = newsize;
+ i_size_write(inode, newsize);
}
setattr_copy(idmap, inode, attr);
@@ -718,6 +738,9 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
struct inode *inode = file->f_mapping->host;
struct ntfs_inode *ni = ntfs_i(inode);
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
if (is_encrypted(ni)) {
ntfs_inode_warn(inode, "encrypted i/o not supported");
return -EOPNOTSUPP;
@@ -752,6 +775,9 @@ static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos,
struct inode *inode = in->f_mapping->host;
struct ntfs_inode *ni = ntfs_i(inode);
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
if (is_encrypted(ni)) {
ntfs_inode_warn(inode, "encrypted i/o not supported");
return -EOPNOTSUPP;
@@ -821,7 +847,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
size_t count = iov_iter_count(from);
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(file);
- loff_t i_size = inode->i_size;
+ loff_t i_size = i_size_read(inode);
struct address_space *mapping = inode->i_mapping;
struct ntfs_inode *ni = ntfs_i(inode);
u64 valid = ni->i_valid;
@@ -1028,6 +1054,8 @@ out:
iocb->ki_pos += written;
if (iocb->ki_pos > ni->i_valid)
ni->i_valid = iocb->ki_pos;
+ if (iocb->ki_pos > i_size)
+ i_size_write(inode, iocb->ki_pos);
return written;
}
@@ -1041,8 +1069,12 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t ret;
+ int err;
struct ntfs_inode *ni = ntfs_i(inode);
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
if (is_encrypted(ni)) {
ntfs_inode_warn(inode, "encrypted i/o not supported");
return -EOPNOTSUPP;
@@ -1068,6 +1100,12 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ret <= 0)
goto out;
+ err = file_modified(iocb->ki_filp);
+ if (err) {
+ ret = err;
+ goto out;
+ }
+
if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) {
/* Should never be here, see ntfs_file_open(). */
ret = -EOPNOTSUPP;
@@ -1097,6 +1135,9 @@ int ntfs_file_open(struct inode *inode, struct file *file)
{
struct ntfs_inode *ni = ntfs_i(inode);
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
if (unlikely((is_compressed(ni) || is_encrypted(ni)) &&
(file->f_flags & O_DIRECT))) {
return -EOPNOTSUPP;
@@ -1138,7 +1179,8 @@ static int ntfs_file_release(struct inode *inode, struct file *file)
down_write(&ni->file.run_lock);
err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run,
- inode->i_size, &ni->i_valid, false, NULL);
+ i_size_read(inode), &ni->i_valid, false,
+ NULL);
up_write(&ni->file.run_lock);
ni_unlock(ni);
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 3df2d9e34b91..7f27382e0ce2 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -778,7 +778,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
run_deallocate(sbi, &ni->attr_list.run, true);
run_close(&ni->attr_list.run);
ni->attr_list.size = 0;
- kfree(ni->attr_list.le);
+ kvfree(ni->attr_list.le);
ni->attr_list.le = NULL;
ni->attr_list.dirty = false;
@@ -927,7 +927,7 @@ int ni_create_attr_list(struct ntfs_inode *ni)
return 0;
out:
- kfree(ni->attr_list.le);
+ kvfree(ni->attr_list.le);
ni->attr_list.le = NULL;
ni->attr_list.size = 0;
return err;
@@ -2099,7 +2099,7 @@ int ni_readpage_cmpr(struct ntfs_inode *ni, struct page *page)
gfp_t gfp_mask;
struct page *pg;
- if (vbo >= ni->vfs_inode.i_size) {
+ if (vbo >= i_size_read(&ni->vfs_inode)) {
SetPageUptodate(page);
err = 0;
goto out;
@@ -2173,7 +2173,7 @@ int ni_decompress_file(struct ntfs_inode *ni)
{
struct ntfs_sb_info *sbi = ni->mi.sbi;
struct inode *inode = &ni->vfs_inode;
- loff_t i_size = inode->i_size;
+ loff_t i_size = i_size_read(inode);
struct address_space *mapping = inode->i_mapping;
gfp_t gfp_mask = mapping_gfp_mask(mapping);
struct page **pages = NULL;
@@ -2508,6 +2508,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages,
err = -EOPNOTSUPP;
goto out1;
#else
+ loff_t i_size = i_size_read(&ni->vfs_inode);
u32 frame_bits = ni_ext_compress_bits(ni);
u64 frame64 = frame_vbo >> frame_bits;
u64 frames, vbo_data;
@@ -2548,7 +2549,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages,
}
}
- frames = (ni->vfs_inode.i_size - 1) >> frame_bits;
+ frames = (i_size - 1) >> frame_bits;
err = attr_wof_frame_info(ni, attr, run, frame64, frames,
frame_bits, &ondisk_size, &vbo_data);
@@ -2556,8 +2557,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages,
goto out2;
if (frame64 == frames) {
- unc_size = 1 + ((ni->vfs_inode.i_size - 1) &
- (frame_size - 1));
+ unc_size = 1 + ((i_size - 1) & (frame_size - 1));
ondisk_size = attr_size(attr) - vbo_data;
} else {
unc_size = frame_size;
@@ -3259,6 +3259,9 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
if (is_bad_inode(inode) || sb_rdonly(sb))
return 0;
+ if (unlikely(ntfs3_forced_shutdown(sb)))
+ return -EIO;
+
if (!ni_trylock(ni)) {
/* 'ni' is under modification, skip for now. */
mark_inode_dirty_sync(inode);
@@ -3288,7 +3291,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
modified = true;
}
- ts = inode_get_mtime(inode);
+ ts = inode_get_ctime(inode);
dup.c_time = kernel2nt(&ts);
if (std->c_time != dup.c_time) {
std->c_time = dup.c_time;
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index 98ccb6650858..855519713bf7 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -465,7 +465,7 @@ static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr)
{
const struct RESTART_AREA *ra;
u16 cl, fl, ul;
- u32 off, l_size, file_dat_bits, file_size_round;
+ u32 off, l_size, seq_bits;
u16 ro = le16_to_cpu(rhdr->ra_off);
u32 sys_page = le32_to_cpu(rhdr->sys_page_size);
@@ -511,13 +511,15 @@ static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr)
/* Make sure the sequence number bits match the log file size. */
l_size = le64_to_cpu(ra->l_size);
- file_dat_bits = sizeof(u64) * 8 - le32_to_cpu(ra->seq_num_bits);
- file_size_round = 1u << (file_dat_bits + 3);
- if (file_size_round != l_size &&
- (file_size_round < l_size || (file_size_round / 2) > l_size)) {
- return false;
+ seq_bits = sizeof(u64) * 8 + 3;
+ while (l_size) {
+ l_size >>= 1;
+ seq_bits -= 1;
}
+ if (seq_bits != ra->seq_num_bits)
+ return false;
+
/* The log page data offset and record header length must be quad-aligned. */
if (!IS_ALIGNED(le16_to_cpu(ra->data_off), 8) ||
!IS_ALIGNED(le16_to_cpu(ra->rec_hdr_len), 8))
@@ -974,6 +976,16 @@ skip_looking:
return e;
}
+struct restart_info {
+ u64 last_lsn;
+ struct RESTART_HDR *r_page;
+ u32 vbo;
+ bool chkdsk_was_run;
+ bool valid_page;
+ bool initialized;
+ bool restart;
+};
+
#define RESTART_SINGLE_PAGE_IO cpu_to_le16(0x0001)
#define NTFSLOG_WRAPPED 0x00000001
@@ -987,6 +999,7 @@ struct ntfs_log {
struct ntfs_inode *ni;
u32 l_size;
+ u32 orig_file_size;
u32 sys_page_size;
u32 sys_page_mask;
u32 page_size;
@@ -1040,6 +1053,8 @@ struct ntfs_log {
struct CLIENT_ID client_id;
u32 client_undo_commit;
+
+ struct restart_info rst_info, rst_info2;
};
static inline u32 lsn_to_vbo(struct ntfs_log *log, const u64 lsn)
@@ -1105,16 +1120,6 @@ static inline bool verify_client_lsn(struct ntfs_log *log,
lsn <= le64_to_cpu(log->ra->current_lsn) && lsn;
}
-struct restart_info {
- u64 last_lsn;
- struct RESTART_HDR *r_page;
- u32 vbo;
- bool chkdsk_was_run;
- bool valid_page;
- bool initialized;
- bool restart;
-};
-
static int read_log_page(struct ntfs_log *log, u32 vbo,
struct RECORD_PAGE_HDR **buffer, bool *usa_error)
{
@@ -1176,7 +1181,7 @@ out:
* restart page header. It will stop the first time we find a
* valid page header.
*/
-static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first,
+static int log_read_rst(struct ntfs_log *log, bool first,
struct restart_info *info)
{
u32 skip, vbo;
@@ -1192,7 +1197,7 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first,
}
/* Loop continuously until we succeed. */
- for (; vbo < l_size; vbo = 2 * vbo + skip, skip = 0) {
+ for (; vbo < log->l_size; vbo = 2 * vbo + skip, skip = 0) {
bool usa_error;
bool brst, bchk;
struct RESTART_AREA *ra;
@@ -1285,22 +1290,17 @@ check_result:
/*
* Ilog_init_pg_hdr - Init @log from restart page header.
*/
-static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size,
- u32 page_size, u16 major_ver, u16 minor_ver)
+static void log_init_pg_hdr(struct ntfs_log *log, u16 major_ver, u16 minor_ver)
{
- log->sys_page_size = sys_page_size;
- log->sys_page_mask = sys_page_size - 1;
- log->page_size = page_size;
- log->page_mask = page_size - 1;
- log->page_bits = blksize_bits(page_size);
+ log->sys_page_size = log->page_size;
+ log->sys_page_mask = log->page_mask;
log->clst_per_page = log->page_size >> log->ni->mi.sbi->cluster_bits;
if (!log->clst_per_page)
log->clst_per_page = 1;
- log->first_page = major_ver >= 2 ?
- 0x22 * page_size :
- ((sys_page_size << 1) + (page_size << 1));
+ log->first_page = major_ver >= 2 ? 0x22 * log->page_size :
+ 4 * log->page_size;
log->major_ver = major_ver;
log->minor_ver = minor_ver;
}
@@ -1308,12 +1308,11 @@ static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size,
/*
* log_create - Init @log in cases when we don't have a restart area to use.
*/
-static void log_create(struct ntfs_log *log, u32 l_size, const u64 last_lsn,
+static void log_create(struct ntfs_log *log, const u64 last_lsn,
u32 open_log_count, bool wrapped, bool use_multi_page)
{
- log->l_size = l_size;
/* All file offsets must be quadword aligned. */
- log->file_data_bits = blksize_bits(l_size) - 3;
+ log->file_data_bits = blksize_bits(log->l_size) - 3;
log->seq_num_mask = (8 << log->file_data_bits) - 1;
log->seq_num_bits = sizeof(u64) * 8 - log->file_data_bits;
log->seq_num = (last_lsn >> log->file_data_bits) + 2;
@@ -3720,10 +3719,8 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
struct ntfs_sb_info *sbi = ni->mi.sbi;
struct ntfs_log *log;
- struct restart_info rst_info, rst_info2;
- u64 rec_lsn, ra_lsn, checkpt_lsn = 0, rlsn = 0;
+ u64 rec_lsn, checkpt_lsn = 0, rlsn = 0;
struct ATTR_NAME_ENTRY *attr_names = NULL;
- struct ATTR_NAME_ENTRY *ane;
struct RESTART_TABLE *dptbl = NULL;
struct RESTART_TABLE *trtbl = NULL;
const struct RESTART_TABLE *rt;
@@ -3741,9 +3738,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
struct TRANSACTION_ENTRY *tr;
struct DIR_PAGE_ENTRY *dp;
u32 i, bytes_per_attr_entry;
- u32 l_size = ni->vfs_inode.i_size;
- u32 orig_file_size = l_size;
- u32 page_size, vbo, tail, off, dlen;
+ u32 vbo, tail, off, dlen;
u32 saved_len, rec_len, transact_id;
bool use_second_page;
struct RESTART_AREA *ra2, *ra = NULL;
@@ -3758,52 +3753,50 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
u16 t16;
u32 t32;
- /* Get the size of page. NOTE: To replay we can use default page. */
-#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2
- page_size = norm_file_page(PAGE_SIZE, &l_size, true);
-#else
- page_size = norm_file_page(PAGE_SIZE, &l_size, false);
-#endif
- if (!page_size)
- return -EINVAL;
-
log = kzalloc(sizeof(struct ntfs_log), GFP_NOFS);
if (!log)
return -ENOMEM;
log->ni = ni;
- log->l_size = l_size;
- log->one_page_buf = kmalloc(page_size, GFP_NOFS);
+ log->l_size = log->orig_file_size = ni->vfs_inode.i_size;
+ /* Get the size of page. NOTE: To replay we can use default page. */
+#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2
+ log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, true);
+#else
+ log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, false);
+#endif
+ if (!log->page_size) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ log->one_page_buf = kmalloc(log->page_size, GFP_NOFS);
if (!log->one_page_buf) {
err = -ENOMEM;
goto out;
}
- log->page_size = page_size;
- log->page_mask = page_size - 1;
- log->page_bits = blksize_bits(page_size);
+ log->page_mask = log->page_size - 1;
+ log->page_bits = blksize_bits(log->page_size);
/* Look for a restart area on the disk. */
- memset(&rst_info, 0, sizeof(struct restart_info));
- err = log_read_rst(log, l_size, true, &rst_info);
+ err = log_read_rst(log, true, &log->rst_info);
if (err)
goto out;
/* remember 'initialized' */
- *initialized = rst_info.initialized;
+ *initialized = log->rst_info.initialized;
- if (!rst_info.restart) {
- if (rst_info.initialized) {
+ if (!log->rst_info.restart) {
+ if (log->rst_info.initialized) {
/* No restart area but the file is not initialized. */
err = -EINVAL;
goto out;
}
- log_init_pg_hdr(log, page_size, page_size, 1, 1);
- log_create(log, l_size, 0, get_random_u32(), false, false);
-
- log->ra = ra;
+ log_init_pg_hdr(log, 1, 1);
+ log_create(log, 0, get_random_u32(), false, false);
ra = log_create_ra(log);
if (!ra) {
@@ -3820,25 +3813,26 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
* If the restart offset above wasn't zero then we won't
* look for a second restart.
*/
- if (rst_info.vbo)
+ if (log->rst_info.vbo)
goto check_restart_area;
- memset(&rst_info2, 0, sizeof(struct restart_info));
- err = log_read_rst(log, l_size, false, &rst_info2);
+ err = log_read_rst(log, false, &log->rst_info2);
if (err)
goto out;
/* Determine which restart area to use. */
- if (!rst_info2.restart || rst_info2.last_lsn <= rst_info.last_lsn)
+ if (!log->rst_info2.restart ||
+ log->rst_info2.last_lsn <= log->rst_info.last_lsn)
goto use_first_page;
use_second_page = true;
- if (rst_info.chkdsk_was_run && page_size != rst_info.vbo) {
+ if (log->rst_info.chkdsk_was_run &&
+ log->page_size != log->rst_info.vbo) {
struct RECORD_PAGE_HDR *sp = NULL;
bool usa_error;
- if (!read_log_page(log, page_size, &sp, &usa_error) &&
+ if (!read_log_page(log, log->page_size, &sp, &usa_error) &&
sp->rhdr.sign == NTFS_CHKD_SIGNATURE) {
use_second_page = false;
}
@@ -3846,52 +3840,43 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
}
if (use_second_page) {
- kfree(rst_info.r_page);
- memcpy(&rst_info, &rst_info2, sizeof(struct restart_info));
- rst_info2.r_page = NULL;
+ kfree(log->rst_info.r_page);
+ memcpy(&log->rst_info, &log->rst_info2,
+ sizeof(struct restart_info));
+ log->rst_info2.r_page = NULL;
}
use_first_page:
- kfree(rst_info2.r_page);
+ kfree(log->rst_info2.r_page);
check_restart_area:
/*
* If the restart area is at offset 0, we want
* to write the second restart area first.
*/
- log->init_ra = !!rst_info.vbo;
+ log->init_ra = !!log->rst_info.vbo;
/* If we have a valid page then grab a pointer to the restart area. */
- ra2 = rst_info.valid_page ?
- Add2Ptr(rst_info.r_page,
- le16_to_cpu(rst_info.r_page->ra_off)) :
+ ra2 = log->rst_info.valid_page ?
+ Add2Ptr(log->rst_info.r_page,
+ le16_to_cpu(log->rst_info.r_page->ra_off)) :
NULL;
- if (rst_info.chkdsk_was_run ||
+ if (log->rst_info.chkdsk_was_run ||
(ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) {
bool wrapped = false;
bool use_multi_page = false;
u32 open_log_count;
/* Do some checks based on whether we have a valid log page. */
- if (!rst_info.valid_page) {
- open_log_count = get_random_u32();
- goto init_log_instance;
- }
- open_log_count = le32_to_cpu(ra2->open_log_count);
-
- /*
- * If the restart page size isn't changing then we want to
- * check how much work we need to do.
- */
- if (page_size != le32_to_cpu(rst_info.r_page->sys_page_size))
- goto init_log_instance;
+ open_log_count = log->rst_info.valid_page ?
+ le32_to_cpu(ra2->open_log_count) :
+ get_random_u32();
-init_log_instance:
- log_init_pg_hdr(log, page_size, page_size, 1, 1);
+ log_init_pg_hdr(log, 1, 1);
- log_create(log, l_size, rst_info.last_lsn, open_log_count,
- wrapped, use_multi_page);
+ log_create(log, log->rst_info.last_lsn, open_log_count, wrapped,
+ use_multi_page);
ra = log_create_ra(log);
if (!ra) {
@@ -3916,28 +3901,27 @@ init_log_instance:
* use the log file. We must use the system page size instead of the
* default size if there is not a clean shutdown.
*/
- t32 = le32_to_cpu(rst_info.r_page->sys_page_size);
- if (page_size != t32) {
- l_size = orig_file_size;
- page_size =
- norm_file_page(t32, &l_size, t32 == DefaultLogPageSize);
+ t32 = le32_to_cpu(log->rst_info.r_page->sys_page_size);
+ if (log->page_size != t32) {
+ log->l_size = log->orig_file_size;
+ log->page_size = norm_file_page(t32, &log->l_size,
+ t32 == DefaultLogPageSize);
}
- if (page_size != t32 ||
- page_size != le32_to_cpu(rst_info.r_page->page_size)) {
+ if (log->page_size != t32 ||
+ log->page_size != le32_to_cpu(log->rst_info.r_page->page_size)) {
err = -EINVAL;
goto out;
}
/* If the file size has shrunk then we won't mount it. */
- if (l_size < le64_to_cpu(ra2->l_size)) {
+ if (log->l_size < le64_to_cpu(ra2->l_size)) {
err = -EINVAL;
goto out;
}
- log_init_pg_hdr(log, page_size, page_size,
- le16_to_cpu(rst_info.r_page->major_ver),
- le16_to_cpu(rst_info.r_page->minor_ver));
+ log_init_pg_hdr(log, le16_to_cpu(log->rst_info.r_page->major_ver),
+ le16_to_cpu(log->rst_info.r_page->minor_ver));
log->l_size = le64_to_cpu(ra2->l_size);
log->seq_num_bits = le32_to_cpu(ra2->seq_num_bits);
@@ -3945,7 +3929,7 @@ init_log_instance:
log->seq_num_mask = (8 << log->file_data_bits) - 1;
log->last_lsn = le64_to_cpu(ra2->current_lsn);
log->seq_num = log->last_lsn >> log->file_data_bits;
- log->ra_off = le16_to_cpu(rst_info.r_page->ra_off);
+ log->ra_off = le16_to_cpu(log->rst_info.r_page->ra_off);
log->restart_size = log->sys_page_size - log->ra_off;
log->record_header_len = le16_to_cpu(ra2->rec_hdr_len);
log->ra_size = le16_to_cpu(ra2->ra_len);
@@ -4045,7 +4029,7 @@ find_oldest:
log->current_avail = current_log_avail(log);
/* Remember which restart area to write first. */
- log->init_ra = rst_info.vbo;
+ log->init_ra = log->rst_info.vbo;
process_log:
/* 1.0, 1.1, 2.0 log->major_ver/minor_ver - short values. */
@@ -4105,7 +4089,7 @@ process_log:
log->client_id.seq_num = cr->seq_num;
log->client_id.client_idx = client;
- err = read_rst_area(log, &rst, &ra_lsn);
+ err = read_rst_area(log, &rst, &checkpt_lsn);
if (err)
goto out;
@@ -4114,9 +4098,8 @@ process_log:
bytes_per_attr_entry = !rst->major_ver ? 0x2C : 0x28;
- checkpt_lsn = le64_to_cpu(rst->check_point_start);
- if (!checkpt_lsn)
- checkpt_lsn = ra_lsn;
+ if (rst->check_point_start)
+ checkpt_lsn = le64_to_cpu(rst->check_point_start);
/* Allocate and Read the Transaction Table. */
if (!rst->transact_table_len)
@@ -4330,23 +4313,20 @@ check_attr_table:
lcb = NULL;
check_attribute_names2:
- if (!rst->attr_names_len)
- goto trace_attribute_table;
-
- ane = attr_names;
- if (!oatbl)
- goto trace_attribute_table;
- while (ane->off) {
- /* TODO: Clear table on exit! */
- oe = Add2Ptr(oatbl, le16_to_cpu(ane->off));
- t16 = le16_to_cpu(ane->name_bytes);
- oe->name_len = t16 / sizeof(short);
- oe->ptr = ane->name;
- oe->is_attr_name = 2;
- ane = Add2Ptr(ane, sizeof(struct ATTR_NAME_ENTRY) + t16);
- }
-
-trace_attribute_table:
+ if (rst->attr_names_len && oatbl) {
+ struct ATTR_NAME_ENTRY *ane = attr_names;
+ while (ane->off) {
+ /* TODO: Clear table on exit! */
+ oe = Add2Ptr(oatbl, le16_to_cpu(ane->off));
+ t16 = le16_to_cpu(ane->name_bytes);
+ oe->name_len = t16 / sizeof(short);
+ oe->ptr = ane->name;
+ oe->is_attr_name = 2;
+ ane = Add2Ptr(ane,
+ sizeof(struct ATTR_NAME_ENTRY) + t16);
+ }
+ }
+
/*
* If the checkpt_lsn is zero, then this is a freshly
* formatted disk and we have no work to do.
@@ -5189,7 +5169,7 @@ out:
kfree(oatbl);
kfree(dptbl);
kfree(attr_names);
- kfree(rst_info.r_page);
+ kfree(log->rst_info.r_page);
kfree(ra);
kfree(log->one_page_buf);
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index fbfe21dbb425..ae2ef5c11868 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -853,7 +853,8 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
/*
* sb can be NULL here. In this case sbi->flags should be 0 too.
*/
- if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR))
+ if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR) ||
+ unlikely(ntfs3_forced_shutdown(sb)))
return;
blocksize = sb->s_blocksize;
@@ -1006,6 +1007,30 @@ static inline __le32 security_hash(const void *sd, size_t bytes)
return cpu_to_le32(hash);
}
+/*
+ * simple wrapper for sb_bread_unmovable.
+ */
+struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block)
+{
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ struct buffer_head *bh;
+
+ if (unlikely(block >= sbi->volume.blocks)) {
+ /* prevent generic message "attempt to access beyond end of device" */
+ ntfs_err(sb, "try to read out of volume at offset 0x%llx",
+ (u64)block << sb->s_blocksize_bits);
+ return NULL;
+ }
+
+ bh = sb_bread_unmovable(sb, block);
+ if (bh)
+ return bh;
+
+ ntfs_err(sb, "failed to read volume at offset 0x%llx",
+ (u64)block << sb->s_blocksize_bits);
+ return NULL;
+}
+
int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer)
{
struct block_device *bdev = sb->s_bdev;
@@ -2128,8 +2153,8 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi,
if (le32_to_cpu(d_security->size) == new_sec_size &&
d_security->key.hash == hash_key.hash &&
!memcmp(d_security + 1, sd, size_sd)) {
- *security_id = d_security->key.sec_id;
/* Such security already exists. */
+ *security_id = d_security->key.sec_id;
err = 0;
goto out;
}
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index cf92b2433f7a..daabaad63aaf 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -1462,7 +1462,7 @@ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
goto out2;
if (in->name == I30_NAME) {
- ni->vfs_inode.i_size = data_size;
+ i_size_write(&ni->vfs_inode, data_size);
inode_set_bytes(&ni->vfs_inode, alloc_size);
}
@@ -1544,7 +1544,7 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
}
if (in->name == I30_NAME)
- ni->vfs_inode.i_size = data_size;
+ i_size_write(&ni->vfs_inode, data_size);
*vbn = bit << indx->idx2vbn_bits;
@@ -2090,7 +2090,7 @@ static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni,
return err;
if (in->name == I30_NAME)
- ni->vfs_inode.i_size = new_data;
+ i_size_write(&ni->vfs_inode, new_data);
bpb = bitmap_size(bit);
if (bpb * 8 == nbits)
@@ -2576,7 +2576,7 @@ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len,
&indx->alloc_run, 0, NULL, false, NULL);
if (in->name == I30_NAME)
- ni->vfs_inode.i_size = 0;
+ i_size_write(&ni->vfs_inode, 0);
err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len,
false, NULL);
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 5e3d71374918..eb7a8c9fba01 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -345,9 +345,7 @@ next_attr:
inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer
.PrintNameLength) /
sizeof(u16);
-
ni->i_valid = inode->i_size;
-
/* Clear directory bit. */
if (ni->ni_flags & NI_FLAG_DIR) {
indx_clear(&ni->dir);
@@ -412,7 +410,6 @@ end_enum:
goto out;
if (!is_match && name) {
- /* Reuse rec as buffer for ascii name. */
err = -ENOENT;
goto out;
}
@@ -427,6 +424,7 @@ end_enum:
if (names != le16_to_cpu(rec->hard_links)) {
/* Correct minor error on the fly. Do not mark inode as dirty. */
+ ntfs_inode_warn(inode, "Correct links count -> %u.", names);
rec->hard_links = cpu_to_le16(names);
ni->mi.dirty = true;
}
@@ -653,9 +651,10 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
off = vbo & (PAGE_SIZE - 1);
folio_set_bh(bh, folio, off);
- err = bh_read(bh, 0);
- if (err < 0)
+ if (bh_read(bh, 0) < 0) {
+ err = -EIO;
goto out;
+ }
folio_zero_segment(folio, off + voff, off + block_size);
}
}
@@ -853,9 +852,13 @@ static int ntfs_resident_writepage(struct folio *folio,
struct writeback_control *wbc, void *data)
{
struct address_space *mapping = data;
- struct ntfs_inode *ni = ntfs_i(mapping->host);
+ struct inode *inode = mapping->host;
+ struct ntfs_inode *ni = ntfs_i(inode);
int ret;
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
ni_lock(ni);
ret = attr_data_write_resident(ni, &folio->page);
ni_unlock(ni);
@@ -869,7 +872,12 @@ static int ntfs_resident_writepage(struct folio *folio,
static int ntfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- if (is_resident(ntfs_i(mapping->host)))
+ struct inode *inode = mapping->host;
+
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
+ if (is_resident(ntfs_i(inode)))
return write_cache_pages(mapping, wbc, ntfs_resident_writepage,
mapping);
return mpage_writepages(mapping, wbc, ntfs_get_block);
@@ -889,6 +897,9 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
struct ntfs_inode *ni = ntfs_i(inode);
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
*pagep = NULL;
if (is_resident(ni)) {
struct page *page =
@@ -974,7 +985,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
}
if (pos + err > inode->i_size) {
- inode->i_size = pos + err;
+ i_size_write(inode, pos + err);
dirty = true;
}
@@ -1306,6 +1317,11 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
goto out1;
}
+ if (unlikely(ntfs3_forced_shutdown(sb))) {
+ err = -EIO;
+ goto out2;
+ }
+
/* Mark rw ntfs as dirty. it will be cleared at umount. */
ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index ee3093be5170..084d19d78397 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -181,6 +181,9 @@ static int ntfs_unlink(struct inode *dir, struct dentry *dentry)
struct ntfs_inode *ni = ntfs_i(dir);
int err;
+ if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
+ return -EIO;
+
ni_lock_dir(ni);
err = ntfs_unlink_inode(dir, dentry);
@@ -199,6 +202,9 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
u32 size = strlen(symname);
struct inode *inode;
+ if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
+ return -EIO;
+
inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, 0,
symname, size, NULL);
@@ -227,6 +233,9 @@ static int ntfs_rmdir(struct inode *dir, struct dentry *dentry)
struct ntfs_inode *ni = ntfs_i(dir);
int err;
+ if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
+ return -EIO;
+
ni_lock_dir(ni);
err = ntfs_unlink_inode(dir, dentry);
@@ -264,6 +273,9 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir,
1024);
static_assert(PATH_MAX >= 4 * 1024);
+ if (unlikely(ntfs3_forced_shutdown(sb)))
+ return -EIO;
+
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
@@ -419,7 +431,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry,
* fnd contains tree's path to insert to.
* If fnd is not NULL then dir is locked.
*/
- inode = ntfs_create_inode(mnt_idmap(file->f_path.mnt), dir, dentry, uni,
+ inode = ntfs_create_inode(file_mnt_idmap(file), dir, dentry, uni,
mode, 0, NULL, 0, fnd);
err = IS_ERR(inode) ? PTR_ERR(inode) :
finish_open(file, dentry, ntfs_file_open);
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 86aecbb01a92..9c7478150a03 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -523,12 +523,10 @@ struct ATTR_LIST_ENTRY {
__le64 vcn; // 0x08: Starting VCN of this attribute.
struct MFT_REF ref; // 0x10: MFT record number with attribute.
__le16 id; // 0x18: struct ATTRIB ID.
- __le16 name[3]; // 0x1A: Just to align. To get real name can use bNameOffset.
+ __le16 name[]; // 0x1A: To get real name use name_off.
}; // sizeof(0x20)
-static_assert(sizeof(struct ATTR_LIST_ENTRY) == 0x20);
-
static inline u32 le_size(u8 name_len)
{
return ALIGN(offsetof(struct ATTR_LIST_ENTRY, name) +
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index f6706143d14b..79356fd29a14 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -61,6 +61,8 @@ enum utf16_endian;
/* sbi->flags */
#define NTFS_FLAGS_NODISCARD 0x00000001
+/* ntfs in shutdown state. */
+#define NTFS_FLAGS_SHUTDOWN_BIT 0x00000002 /* == 4*/
/* Set when LogFile is replaying. */
#define NTFS_FLAGS_LOG_REPLAYING 0x00000008
/* Set when we changed first MFT's which copy must be updated in $MftMirr. */
@@ -226,7 +228,7 @@ struct ntfs_sb_info {
u64 maxbytes; // Maximum size for normal files.
u64 maxbytes_sparse; // Maximum size for sparse file.
- u32 flags; // See NTFS_FLAGS_XXX.
+ unsigned long flags; // See NTFS_FLAGS_
CLST zone_max; // Maximum MFT zone length in clusters
CLST bad_clusters; // The count of marked bad clusters.
@@ -473,7 +475,7 @@ bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn,
int al_update(struct ntfs_inode *ni, int sync);
static inline size_t al_aligned(size_t size)
{
- return (size + 1023) & ~(size_t)1023;
+ return size_add(size, 1023) & ~(size_t)1023;
}
/* Globals from bitfunc.c */
@@ -500,6 +502,8 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
int ntfs_file_open(struct inode *inode, struct file *file);
int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
+long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg);
+long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg);
extern const struct inode_operations ntfs_special_inode_operations;
extern const struct inode_operations ntfs_file_inode_operations;
extern const struct file_operations ntfs_file_operations;
@@ -584,6 +588,7 @@ bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes);
int log_replay(struct ntfs_inode *ni, bool *initialized);
/* Globals from fsntfs.c */
+struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block);
bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes);
int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes,
bool simple);
@@ -872,7 +877,7 @@ int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode,
int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry);
ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-extern const struct xattr_handler * const ntfs_xattr_handlers[];
+extern const struct xattr_handler *const ntfs_xattr_handlers[];
int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size);
void ntfs_get_wsl_perm(struct inode *inode);
@@ -999,6 +1004,11 @@ static inline struct ntfs_sb_info *ntfs_sb(struct super_block *sb)
return sb->s_fs_info;
}
+static inline int ntfs3_forced_shutdown(struct super_block *sb)
+{
+ return test_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags);
+}
+
/*
* ntfs_up_cluster - Align up on cluster boundary.
*/
@@ -1025,19 +1035,6 @@ static inline u64 bytes_to_block(const struct super_block *sb, u64 size)
return (size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
}
-static inline struct buffer_head *ntfs_bread(struct super_block *sb,
- sector_t block)
-{
- struct buffer_head *bh = sb_bread(sb, block);
-
- if (bh)
- return bh;
-
- ntfs_err(sb, "failed to read volume at offset 0x%llx",
- (u64)block << sb->s_blocksize_bits);
- return NULL;
-}
-
static inline struct ntfs_inode *ntfs_i(struct inode *inode)
{
return container_of(inode, struct ntfs_inode, vfs_inode);
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index 53629b1f65e9..6aa3a9d44df1 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -279,7 +279,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
if (t16 > asize)
return NULL;
- if (t16 + le32_to_cpu(attr->res.data_size) > asize)
+ if (le32_to_cpu(attr->res.data_size) > asize - t16)
return NULL;
t32 = sizeof(short) * attr->name_len;
@@ -535,8 +535,20 @@ bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi,
return false;
if (ni && is_attr_indexed(attr)) {
- le16_add_cpu(&ni->mi.mrec->hard_links, -1);
- ni->mi.dirty = true;
+ u16 links = le16_to_cpu(ni->mi.mrec->hard_links);
+ struct ATTR_FILE_NAME *fname =
+ attr->type != ATTR_NAME ?
+ NULL :
+ resident_data_ex(attr,
+ SIZEOF_ATTRIBUTE_FILENAME);
+ if (fname && fname->type == FILE_NAME_DOS) {
+ /* Do not decrease links count deleting DOS name. */
+ } else if (!links) {
+ /* minor error. Not critical. */
+ } else {
+ ni->mi.mrec->hard_links = cpu_to_le16(links - 1);
+ ni->mi.dirty = true;
+ }
}
used -= asize;
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 9153dffde950..cef5467fd928 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -122,13 +122,12 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...)
if (name) {
struct dentry *de = d_find_alias(inode);
- const u32 name_len = ARRAY_SIZE(s_name_buf) - 1;
if (de) {
spin_lock(&de->d_lock);
- snprintf(name, name_len, " \"%s\"", de->d_name.name);
+ snprintf(name, sizeof(s_name_buf), " \"%s\"",
+ de->d_name.name);
spin_unlock(&de->d_lock);
- name[name_len] = 0; /* To be sure. */
} else {
name[0] = 0;
}
@@ -625,7 +624,7 @@ static void ntfs3_free_sbi(struct ntfs_sb_info *sbi)
{
kfree(sbi->new_rec);
kvfree(ntfs_put_shared(sbi->upcase));
- kfree(sbi->def_table);
+ kvfree(sbi->def_table);
kfree(sbi->compress.lznt);
#ifdef CONFIG_NTFS3_LZX_XPRESS
xpress_free_decompressor(sbi->compress.xpress);
@@ -715,6 +714,14 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root)
}
/*
+ * ntfs_shutdown - super_operations::shutdown
+ */
+static void ntfs_shutdown(struct super_block *sb)
+{
+ set_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags);
+}
+
+/*
* ntfs_sync_fs - super_operations::sync_fs
*/
static int ntfs_sync_fs(struct super_block *sb, int wait)
@@ -724,6 +731,9 @@ static int ntfs_sync_fs(struct super_block *sb, int wait)
struct ntfs_inode *ni;
struct inode *inode;
+ if (unlikely(ntfs3_forced_shutdown(sb)))
+ return -EIO;
+
ni = sbi->security.ni;
if (ni) {
inode = &ni->vfs_inode;
@@ -763,6 +773,7 @@ static const struct super_operations ntfs_sops = {
.put_super = ntfs_put_super,
.statfs = ntfs_statfs,
.show_options = ntfs_show_options,
+ .shutdown = ntfs_shutdown,
.sync_fs = ntfs_sync_fs,
.write_inode = ntfs3_write_inode,
};
@@ -866,6 +877,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
u16 fn, ao;
u8 cluster_bits;
u32 boot_off = 0;
+ sector_t boot_block = 0;
const char *hint = "Primary boot";
/* Save original dev_size. Used with alternative boot. */
@@ -873,11 +885,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
sbi->volume.blocks = dev_size >> PAGE_SHIFT;
- bh = ntfs_bread(sb, 0);
+read_boot:
+ bh = ntfs_bread(sb, boot_block);
if (!bh)
- return -EIO;
+ return boot_block ? -EINVAL : -EIO;
-check_boot:
err = -EINVAL;
/* Corrupted image; do not read OOB */
@@ -1108,26 +1120,24 @@ check_boot:
}
out:
- if (err == -EINVAL && !bh->b_blocknr && dev_size0 > PAGE_SHIFT) {
+ brelse(bh);
+
+ if (err == -EINVAL && !boot_block && dev_size0 > PAGE_SHIFT) {
u32 block_size = min_t(u32, sector_size, PAGE_SIZE);
u64 lbo = dev_size0 - sizeof(*boot);
- /*
- * Try alternative boot (last sector)
- */
- brelse(bh);
-
- sb_set_blocksize(sb, block_size);
- bh = ntfs_bread(sb, lbo >> blksize_bits(block_size));
- if (!bh)
- return -EINVAL;
-
+ boot_block = lbo >> blksize_bits(block_size);
boot_off = lbo & (block_size - 1);
- hint = "Alternative boot";
- dev_size = dev_size0; /* restore original size. */
- goto check_boot;
+ if (boot_block && block_size >= boot_off + sizeof(*boot)) {
+ /*
+ * Try alternative boot (last sector)
+ */
+ sb_set_blocksize(sb, block_size);
+ hint = "Alternative boot";
+ dev_size = dev_size0; /* restore original size. */
+ goto read_boot;
+ }
}
- brelse(bh);
return err;
}
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 4274b6f31cfa..53e7d1fa036a 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -219,6 +219,9 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer,
if (!ea->name_len)
break;
+ if (ea->name_len > ea_size)
+ break;
+
if (buffer) {
/* Check if we can use field ea->name */
if (off + ea_size > size)
@@ -744,6 +747,9 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
int err;
struct ntfs_inode *ni = ntfs_i(inode);
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
/* Dispatch request. */
if (!strcmp(name, SYSTEM_DOS_ATTRIB)) {
/* system.dos_attrib */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4d7efefa98c5..1bde1281d514 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -213,7 +213,7 @@ struct o2hb_region {
unsigned int hr_num_pages;
struct page **hr_slot_data;
- struct bdev_handle *hr_bdev_handle;
+ struct file *hr_bdev_file;
struct o2hb_disk_slot *hr_slots;
/* live node map of this region */
@@ -263,7 +263,7 @@ struct o2hb_region {
static inline struct block_device *reg_bdev(struct o2hb_region *reg)
{
- return reg->hr_bdev_handle ? reg->hr_bdev_handle->bdev : NULL;
+ return reg->hr_bdev_file ? file_bdev(reg->hr_bdev_file) : NULL;
}
struct o2hb_bio_wait_ctxt {
@@ -1509,8 +1509,8 @@ static void o2hb_region_release(struct config_item *item)
kfree(reg->hr_slot_data);
}
- if (reg->hr_bdev_handle)
- bdev_release(reg->hr_bdev_handle);
+ if (reg->hr_bdev_file)
+ fput(reg->hr_bdev_file);
kfree(reg->hr_slots);
@@ -1569,7 +1569,7 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
unsigned long block_bytes;
unsigned int block_bits;
- if (reg->hr_bdev_handle)
+ if (reg->hr_bdev_file)
return -EINVAL;
status = o2hb_read_block_input(reg, page, &block_bytes,
@@ -1598,7 +1598,7 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item,
char *p = (char *)page;
ssize_t ret;
- if (reg->hr_bdev_handle)
+ if (reg->hr_bdev_file)
return -EINVAL;
ret = kstrtoull(p, 0, &tmp);
@@ -1623,7 +1623,7 @@ static ssize_t o2hb_region_blocks_store(struct config_item *item,
unsigned long tmp;
char *p = (char *)page;
- if (reg->hr_bdev_handle)
+ if (reg->hr_bdev_file)
return -EINVAL;
tmp = simple_strtoul(p, &p, 0);
@@ -1642,7 +1642,7 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
{
unsigned int ret = 0;
- if (to_o2hb_region(item)->hr_bdev_handle)
+ if (to_o2hb_region(item)->hr_bdev_file)
ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item)));
return ret;
@@ -1753,7 +1753,7 @@ out:
}
/*
- * this is acting as commit; we set up all of hr_bdev_handle and hr_task or
+ * this is acting as commit; we set up all of hr_bdev_file and hr_task or
* nothing
*/
static ssize_t o2hb_region_dev_store(struct config_item *item,
@@ -1769,7 +1769,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
ssize_t ret = -EINVAL;
int live_threshold;
- if (reg->hr_bdev_handle)
+ if (reg->hr_bdev_file)
goto out;
/* We can't heartbeat without having had our node number
@@ -1795,11 +1795,11 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
if (!S_ISBLK(f.file->f_mapping->host->i_mode))
goto out2;
- reg->hr_bdev_handle = bdev_open_by_dev(f.file->f_mapping->host->i_rdev,
+ reg->hr_bdev_file = bdev_file_open_by_dev(f.file->f_mapping->host->i_rdev,
BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL);
- if (IS_ERR(reg->hr_bdev_handle)) {
- ret = PTR_ERR(reg->hr_bdev_handle);
- reg->hr_bdev_handle = NULL;
+ if (IS_ERR(reg->hr_bdev_file)) {
+ ret = PTR_ERR(reg->hr_bdev_file);
+ reg->hr_bdev_file = NULL;
goto out2;
}
@@ -1903,8 +1903,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
out3:
if (ret < 0) {
- bdev_release(reg->hr_bdev_handle);
- reg->hr_bdev_handle = NULL;
+ fput(reg->hr_bdev_file);
+ reg->hr_bdev_file = NULL;
}
out2:
fdput(f);
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index f37174e79fad..6de944818c56 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -27,7 +27,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
struct ocfs2_file_private *fp = file->private_data;
struct ocfs2_lock_res *lockres = &fp->fp_flock;
- if (fl->fl_type == F_WRLCK)
+ if (lock_is_write(fl))
level = 1;
if (!IS_SETLKW(cmd))
trylock = 1;
@@ -53,8 +53,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
*/
locks_init_lock(&request);
- request.fl_type = F_UNLCK;
- request.fl_flags = FL_FLOCK;
+ request.c.flc_type = F_UNLCK;
+ request.c.flc_flags = FL_FLOCK;
locks_lock_file_wait(file, &request);
ocfs2_file_unlock(file);
@@ -100,14 +100,14 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- if (!(fl->fl_flags & FL_FLOCK))
+ if (!(fl->c.flc_flags & FL_FLOCK))
return -ENOLCK;
if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
ocfs2_mount_local(osb))
return locks_lock_file_wait(file, fl);
- if (fl->fl_type == F_UNLCK)
+ if (lock_is_unlock(fl))
return ocfs2_do_funlock(file, cmd, fl);
else
return ocfs2_do_flock(file, inode, cmd, fl);
@@ -118,7 +118,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- if (!(fl->fl_flags & FL_POSIX))
+ if (!(fl->c.flc_flags & FL_POSIX))
return -ENOLCK;
return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 9b76ee66aeb2..c11406cd87a8 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -744,7 +744,7 @@ static int user_plock(struct ocfs2_cluster_connection *conn,
return dlm_posix_cancel(conn->cc_lockspace, ino, file, fl);
else if (IS_GETLK(cmd))
return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
- else if (fl->fl_type == F_UNLCK)
+ else if (lock_is_unlock(fl))
return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
else
return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 6b906424902b..a70aff17d455 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2027,8 +2027,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
- memcpy(&sb->s_uuid, di->id2.i_super.s_uuid,
- sizeof(di->id2.i_super.s_uuid));
+ super_set_uuid(sb, di->id2.i_super.s_uuid,
+ sizeof(di->id2.i_super.s_uuid));
osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
diff --git a/fs/open.c b/fs/open.c
index a84d21e55c39..a7d4bb2c725f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -154,49 +154,52 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length
}
#endif
-long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
+long do_ftruncate(struct file *file, loff_t length, int small)
{
struct inode *inode;
struct dentry *dentry;
- struct fd f;
int error;
- error = -EINVAL;
- if (length < 0)
- goto out;
- error = -EBADF;
- f = fdget(fd);
- if (!f.file)
- goto out;
-
/* explicitly opened as large or we are on 64-bit box */
- if (f.file->f_flags & O_LARGEFILE)
+ if (file->f_flags & O_LARGEFILE)
small = 0;
- dentry = f.file->f_path.dentry;
+ dentry = file->f_path.dentry;
inode = dentry->d_inode;
- error = -EINVAL;
- if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
- goto out_putf;
+ if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE))
+ return -EINVAL;
- error = -EINVAL;
/* Cannot ftruncate over 2^31 bytes without large file support */
if (small && length > MAX_NON_LFS)
- goto out_putf;
+ return -EINVAL;
- error = -EPERM;
/* Check IS_APPEND on real upper inode */
- if (IS_APPEND(file_inode(f.file)))
- goto out_putf;
+ if (IS_APPEND(file_inode(file)))
+ return -EPERM;
sb_start_write(inode->i_sb);
- error = security_file_truncate(f.file);
+ error = security_file_truncate(file);
if (!error)
- error = do_truncate(file_mnt_idmap(f.file), dentry, length,
- ATTR_MTIME | ATTR_CTIME, f.file);
+ error = do_truncate(file_mnt_idmap(file), dentry, length,
+ ATTR_MTIME | ATTR_CTIME, file);
sb_end_write(inode->i_sb);
-out_putf:
+
+ return error;
+}
+
+long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
+{
+ struct fd f;
+ int error;
+
+ if (length < 0)
+ return -EINVAL;
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ error = do_ftruncate(f.file, length, small);
+
fdput(f);
-out:
return error;
}
@@ -1364,7 +1367,7 @@ struct file *filp_open(const char *filename, int flags, umode_t mode)
{
struct filename *name = getname_kernel(filename);
struct file *file = ERR_CAST(name);
-
+
if (!IS_ERR(name)) {
file = file_open_name(name, flags, mode);
putname(name);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index c4b65a6d41cc..4a0779e3ef79 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -446,7 +446,7 @@ static int __init init_openprom_fs(void)
sizeof(struct op_inode_info),
0,
(SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
op_inode_init_once);
if (!op_inode_cachep)
return -ENOMEM;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index b8e25ca51016..8586e2f5d243 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -265,20 +265,18 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
if (IS_ERR(old_file))
return PTR_ERR(old_file);
+ /* Try to use clone_file_range to clone up within the same fs */
+ cloned = vfs_clone_file_range(old_file, 0, new_file, 0, len, 0);
+ if (cloned == len)
+ goto out_fput;
+
+ /* Couldn't clone, so now we try to copy the data */
error = rw_verify_area(READ, old_file, &old_pos, len);
if (!error)
error = rw_verify_area(WRITE, new_file, &new_pos, len);
if (error)
goto out_fput;
- /* Try to use clone_file_range to clone up within the same fs */
- ovl_start_write(dentry);
- cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
- ovl_end_write(dentry);
- if (cloned == len)
- goto out_fput;
- /* Couldn't clone, so now we try to copy the data */
-
/* Check if lower fs supports seek operation */
if (old_file->f_mode & FMODE_LSEEK)
skip_hole = true;
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 984ffdaeed6c..5764f91d283e 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -18,10 +18,11 @@
struct ovl_lookup_data {
struct super_block *sb;
- struct vfsmount *mnt;
+ const struct ovl_layer *layer;
struct qstr name;
bool is_dir;
bool opaque;
+ bool xwhiteouts;
bool stop;
bool last;
char *redirect;
@@ -201,17 +202,13 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
return real;
}
-static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path)
-{
- return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE);
-}
-
static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d,
const char *name,
struct dentry *base, int len,
bool drop_negative)
{
- struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->mnt), name, base, len);
+ struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name,
+ base, len);
if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
if (drop_negative && ret->d_lockref.count == 1) {
@@ -232,10 +229,13 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
size_t prelen, const char *post,
struct dentry **ret, bool drop_negative)
{
+ struct ovl_fs *ofs = OVL_FS(d->sb);
struct dentry *this;
struct path path;
int err;
bool last_element = !post[0];
+ bool is_upper = d->layer->idx == 0;
+ char val;
this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative);
if (IS_ERR(this)) {
@@ -253,8 +253,8 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
}
path.dentry = this;
- path.mnt = d->mnt;
- if (ovl_path_is_whiteout(OVL_FS(d->sb), &path)) {
+ path.mnt = d->layer->mnt;
+ if (ovl_path_is_whiteout(ofs, &path)) {
d->stop = d->opaque = true;
goto put_and_out;
}
@@ -272,7 +272,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
d->stop = true;
goto put_and_out;
}
- err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path, NULL);
+ err = ovl_check_metacopy_xattr(ofs, &path, NULL);
if (err < 0)
goto out_err;
@@ -292,7 +292,12 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
if (d->last)
goto out;
- if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) {
+ /* overlay.opaque=x means xwhiteouts directory */
+ val = ovl_get_opaquedir_val(ofs, &path);
+ if (last_element && !is_upper && val == 'x') {
+ d->xwhiteouts = true;
+ ovl_layer_set_xwhiteouts(ofs, d->layer);
+ } else if (val == 'y') {
d->stop = true;
if (last_element)
d->opaque = true;
@@ -863,7 +868,8 @@ fail:
* Returns next layer in stack starting from top.
* Returns -1 if this is the last layer.
*/
-int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path,
+ const struct ovl_layer **layer)
{
struct ovl_entry *oe = OVL_E(dentry);
struct ovl_path *lowerstack = ovl_lowerstack(oe);
@@ -871,13 +877,16 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
BUG_ON(idx < 0);
if (idx == 0) {
ovl_path_upper(dentry, path);
- if (path->dentry)
+ if (path->dentry) {
+ *layer = &OVL_FS(dentry->d_sb)->layers[0];
return ovl_numlower(oe) ? 1 : -1;
+ }
idx++;
}
BUG_ON(idx > ovl_numlower(oe));
path->dentry = lowerstack[idx - 1].dentry;
- path->mnt = lowerstack[idx - 1].layer->mnt;
+ *layer = lowerstack[idx - 1].layer;
+ path->mnt = (*layer)->mnt;
return (idx < ovl_numlower(oe)) ? idx + 1 : -1;
}
@@ -1055,7 +1064,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
old_cred = ovl_override_creds(dentry->d_sb);
upperdir = ovl_dentry_upper(dentry->d_parent);
if (upperdir) {
- d.mnt = ovl_upper_mnt(ofs);
+ d.layer = &ofs->layers[0];
err = ovl_lookup_layer(upperdir, &d, &upperdentry, true);
if (err)
goto out;
@@ -1111,7 +1120,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
else if (d.is_dir || !ofs->numdatalayer)
d.last = lower.layer->idx == ovl_numlower(roe);
- d.mnt = lower.layer->mnt;
+ d.layer = lower.layer;
err = ovl_lookup_layer(lower.dentry, &d, &this, false);
if (err)
goto out_put;
@@ -1278,6 +1287,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (upperopaque)
ovl_dentry_set_opaque(dentry);
+ if (d.xwhiteouts)
+ ovl_dentry_set_xwhiteouts(dentry);
if (upperdentry)
ovl_dentry_set_upper_alias(dentry);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 5ba11eb43767..ee949f3e7c77 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -50,7 +50,6 @@ enum ovl_xattr {
OVL_XATTR_METACOPY,
OVL_XATTR_PROTATTR,
OVL_XATTR_XWHITEOUT,
- OVL_XATTR_XWHITEOUTS,
};
enum ovl_inode_flag {
@@ -70,6 +69,8 @@ enum ovl_entry_flag {
OVL_E_UPPER_ALIAS,
OVL_E_OPAQUE,
OVL_E_CONNECTED,
+ /* Lower stack may contain xwhiteout entries */
+ OVL_E_XWHITEOUTS,
};
enum {
@@ -477,6 +478,10 @@ bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
bool ovl_dentry_is_whiteout(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry);
+bool ovl_dentry_has_xwhiteouts(struct dentry *dentry);
+void ovl_dentry_set_xwhiteouts(struct dentry *dentry);
+void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs,
+ const struct ovl_layer *layer);
bool ovl_dentry_has_upper_alias(struct dentry *dentry);
void ovl_dentry_set_upper_alias(struct dentry *dentry);
bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags);
@@ -494,11 +499,10 @@ struct file *ovl_path_open(const struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry, int flags);
void ovl_copy_up_end(struct dentry *dentry);
bool ovl_already_copied_up(struct dentry *dentry, int flags);
-bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
- enum ovl_xattr ox);
+char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path,
+ enum ovl_xattr ox);
bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path);
-bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
const struct path *upperpath);
@@ -573,7 +577,13 @@ static inline bool ovl_is_impuredir(struct super_block *sb,
.mnt = ovl_upper_mnt(ofs),
};
- return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE);
+ return ovl_get_dir_xattr_val(ofs, &upperpath, OVL_XATTR_IMPURE) == 'y';
+}
+
+static inline char ovl_get_opaquedir_val(struct ovl_fs *ofs,
+ const struct path *path)
+{
+ return ovl_get_dir_xattr_val(ofs, path, OVL_XATTR_OPAQUE);
}
static inline bool ovl_redirect_follow(struct ovl_fs *ofs)
@@ -680,7 +690,8 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh);
struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
struct dentry *origin, bool verify);
-int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path,
+ const struct ovl_layer **layer);
int ovl_verify_lowerdata(struct dentry *dentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 5fa9c58af65f..cb449ab310a7 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -40,6 +40,8 @@ struct ovl_layer {
int idx;
/* One fsid per unique underlying sb (upper fsid == 0) */
int fsid;
+ /* xwhiteouts were found on this layer */
+ bool has_xwhiteouts;
};
struct ovl_path {
@@ -59,7 +61,7 @@ struct ovl_fs {
unsigned int numfs;
/* Number of data-only lower layers */
unsigned int numdatalayer;
- const struct ovl_layer *layers;
+ struct ovl_layer *layers;
struct ovl_sb *fs;
/* workbasedir is the path at workdir= mount option */
struct dentry *workbasedir;
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index 112b4b12f825..36dcc530ac28 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -280,12 +280,20 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path,
{
struct ovl_fs_context *ctx = fc->fs_private;
- if (ovl_dentry_weird(path->dentry))
- return invalfc(fc, "filesystem on %s not supported", name);
-
if (!d_is_dir(path->dentry))
return invalfc(fc, "%s is not a directory", name);
+ /*
+ * Root dentries of case-insensitive capable filesystems might
+ * not have the dentry operations set, but still be incompatible
+ * with overlayfs. Check explicitly to prevent post-mount
+ * failures.
+ */
+ if (sb_has_encoding(path->mnt->mnt_sb))
+ return invalfc(fc, "case-insensitive capable filesystem on %s not supported", name);
+
+ if (ovl_dentry_weird(path->dentry))
+ return invalfc(fc, "filesystem on %s not supported", name);
/*
* Check whether upper path is read-only here to report failures
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index e71156baa7bc..0ca8af060b0c 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -305,8 +305,6 @@ static inline int ovl_dir_read(const struct path *realpath,
if (IS_ERR(realfile))
return PTR_ERR(realfile);
- rdd->in_xwhiteouts_dir = rdd->dentry &&
- ovl_path_check_xwhiteouts_xattr(OVL_FS(rdd->dentry->d_sb), realpath);
rdd->first_maybe_whiteout = NULL;
rdd->ctx.pos = 0;
do {
@@ -359,10 +357,13 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
.is_lowest = false,
};
int idx, next;
+ const struct ovl_layer *layer;
for (idx = 0; idx != -1; idx = next) {
- next = ovl_path_next(idx, dentry, &realpath);
+ next = ovl_path_next(idx, dentry, &realpath, &layer);
rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry;
+ rdd.in_xwhiteouts_dir = layer->has_xwhiteouts &&
+ ovl_dentry_has_xwhiteouts(dentry);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 4ab66e3d4cff..36d4b8b1f784 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -28,41 +28,38 @@ MODULE_LICENSE("GPL");
struct ovl_dir_cache;
-static struct dentry *ovl_d_real(struct dentry *dentry,
- const struct inode *inode)
+static struct dentry *ovl_d_real(struct dentry *dentry, enum d_real_type type)
{
- struct dentry *real = NULL, *lower;
+ struct dentry *upper, *lower;
int err;
- /*
- * vfs is only expected to call d_real() with NULL from d_real_inode()
- * and with overlay inode from file_dentry() on an overlay file.
- *
- * TODO: remove @inode argument from d_real() API, remove code in this
- * function that deals with non-NULL @inode and remove d_real() call
- * from file_dentry().
- */
- if (inode && d_inode(dentry) == inode)
- return dentry;
- else if (inode)
+ switch (type) {
+ case D_REAL_DATA:
+ case D_REAL_METADATA:
+ break;
+ default:
goto bug;
+ }
if (!d_is_reg(dentry)) {
/* d_real_inode() is only relevant for regular files */
return dentry;
}
- real = ovl_dentry_upper(dentry);
- if (real && (inode == d_inode(real)))
- return real;
+ upper = ovl_dentry_upper(dentry);
+ if (upper && (type == D_REAL_METADATA ||
+ ovl_has_upperdata(d_inode(dentry))))
+ return upper;
- if (real && !inode && ovl_has_upperdata(d_inode(dentry)))
- return real;
+ if (type == D_REAL_METADATA) {
+ lower = ovl_dentry_lower(dentry);
+ goto real_lower;
+ }
/*
- * Best effort lazy lookup of lowerdata for !inode case to return
+ * Best effort lazy lookup of lowerdata for D_REAL_DATA case to return
* the real lowerdata dentry. The only current caller of d_real() with
- * NULL inode is d_real_inode() from trace_uprobe and this caller is
+ * D_REAL_DATA is d_real_inode() from trace_uprobe and this caller is
* likely going to be followed reading from the file, before placing
* uprobes on offset within the file, so lowerdata should be available
* when setting the uprobe.
@@ -73,18 +70,13 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
lower = ovl_dentry_lowerdata(dentry);
if (!lower)
goto bug;
- real = lower;
- /* Handle recursion */
- real = d_real(real, inode);
+real_lower:
+ /* Handle recursion into stacked lower fs */
+ return d_real(lower, type);
- if (!inode || inode == d_inode(real))
- return real;
bug:
- WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n",
- __func__, dentry, inode ? inode->i_sb->s_id : "NULL",
- inode ? inode->i_ino : 0, real,
- real && d_inode(real) ? d_inode(real)->i_ino : 0);
+ WARN(1, "%s(%pd4, %d): real dentry not found\n", __func__, dentry, type);
return dentry;
}
@@ -1249,6 +1241,7 @@ static struct dentry *ovl_get_root(struct super_block *sb,
struct ovl_entry *oe)
{
struct dentry *root;
+ struct ovl_fs *ofs = OVL_FS(sb);
struct ovl_path *lowerpath = ovl_lowerstack(oe);
unsigned long ino = d_inode(lowerpath->dentry)->i_ino;
int fsid = lowerpath->layer->fsid;
@@ -1270,6 +1263,20 @@ static struct dentry *ovl_get_root(struct super_block *sb,
ovl_set_flag(OVL_IMPURE, d_inode(root));
}
+ /* Look for xwhiteouts marker except in the lowermost layer */
+ for (int i = 0; i < ovl_numlower(oe) - 1; i++, lowerpath++) {
+ struct path path = {
+ .mnt = lowerpath->layer->mnt,
+ .dentry = lowerpath->dentry,
+ };
+
+ /* overlay.opaque=x means xwhiteouts directory */
+ if (ovl_get_opaquedir_val(ofs, &path) == 'x') {
+ ovl_layer_set_xwhiteouts(ofs, lowerpath->layer);
+ ovl_dentry_set_xwhiteouts(root);
+ }
+ }
+
/* Root is always merge -> can have whiteouts */
ovl_set_flag(OVL_WHITEOUTS, d_inode(root));
ovl_dentry_set_flag(OVL_E_CONNECTED, root);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 0217094c23ea..d285d1d7baad 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -461,6 +461,33 @@ void ovl_dentry_set_opaque(struct dentry *dentry)
ovl_dentry_set_flag(OVL_E_OPAQUE, dentry);
}
+bool ovl_dentry_has_xwhiteouts(struct dentry *dentry)
+{
+ return ovl_dentry_test_flag(OVL_E_XWHITEOUTS, dentry);
+}
+
+void ovl_dentry_set_xwhiteouts(struct dentry *dentry)
+{
+ ovl_dentry_set_flag(OVL_E_XWHITEOUTS, dentry);
+}
+
+/*
+ * ovl_layer_set_xwhiteouts() is called before adding the overlay dir
+ * dentry to dcache, while readdir of that same directory happens after
+ * the overlay dir dentry is in dcache, so if some cpu observes that
+ * ovl_dentry_is_xwhiteouts(), it will also observe layer->has_xwhiteouts
+ * for the layers where xwhiteouts marker was found in that merge dir.
+ */
+void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs,
+ const struct ovl_layer *layer)
+{
+ if (layer->has_xwhiteouts)
+ return;
+
+ /* Write once to read-mostly layer properties */
+ ofs->layers[layer->idx].has_xwhiteouts = true;
+}
+
/*
* For hard links and decoded file handles, it's possible for ovl_dentry_upper()
* to return positive, while there's no actual upper alias for the inode.
@@ -739,19 +766,6 @@ bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path)
return res >= 0;
}
-bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path)
-{
- struct dentry *dentry = path->dentry;
- int res;
-
- /* xattr.whiteouts must be a directory */
- if (!d_is_dir(dentry))
- return false;
-
- res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUTS, NULL, 0);
- return res >= 0;
-}
-
/*
* Load persistent uuid from xattr into s_uuid if found, or store a new
* random generated value in s_uuid and in xattr.
@@ -760,13 +774,14 @@ bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
const struct path *upperpath)
{
bool set = false;
+ uuid_t uuid;
int res;
/* Try to load existing persistent uuid */
- res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_UUID, sb->s_uuid.b,
+ res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_UUID, uuid.b,
UUID_SIZE);
if (res == UUID_SIZE)
- return true;
+ goto set_uuid;
if (res != -ENODATA)
goto fail;
@@ -794,37 +809,37 @@ bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
}
/* Generate overlay instance uuid */
- uuid_gen(&sb->s_uuid);
+ uuid_gen(&uuid);
/* Try to store persistent uuid */
set = true;
- res = ovl_setxattr(ofs, upperpath->dentry, OVL_XATTR_UUID, sb->s_uuid.b,
+ res = ovl_setxattr(ofs, upperpath->dentry, OVL_XATTR_UUID, uuid.b,
UUID_SIZE);
- if (res == 0)
- return true;
+ if (res)
+ goto fail;
+
+set_uuid:
+ super_set_uuid(sb, uuid.b, sizeof(uuid));
+ return true;
fail:
- memset(sb->s_uuid.b, 0, UUID_SIZE);
ofs->config.uuid = OVL_UUID_NULL;
pr_warn("failed to %s uuid (%pd2, err=%i); falling back to uuid=null.\n",
set ? "set" : "get", upperpath->dentry, res);
return false;
}
-bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
- enum ovl_xattr ox)
+char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path,
+ enum ovl_xattr ox)
{
int res;
char val;
if (!d_is_dir(path->dentry))
- return false;
+ return 0;
res = ovl_path_getxattr(ofs, path, ox, &val, 1);
- if (res == 1 && val == 'y')
- return true;
-
- return false;
+ return res == 1 ? val : 0;
}
#define OVL_XATTR_OPAQUE_POSTFIX "opaque"
@@ -837,7 +852,6 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
#define OVL_XATTR_METACOPY_POSTFIX "metacopy"
#define OVL_XATTR_PROTATTR_POSTFIX "protattr"
#define OVL_XATTR_XWHITEOUT_POSTFIX "whiteout"
-#define OVL_XATTR_XWHITEOUTS_POSTFIX "whiteouts"
#define OVL_XATTR_TAB_ENTRY(x) \
[x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \
@@ -854,7 +868,6 @@ const char *const ovl_xattr_table[][2] = {
OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT),
- OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUTS),
};
int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
diff --git a/fs/pidfs.c b/fs/pidfs.c
new file mode 100644
index 000000000000..8fd71a00be9c
--- /dev/null
+++ b/fs/pidfs.c
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include <linux/mount.h>
+#include <linux/pid.h>
+#include <linux/pidfs.h>
+#include <linux/pid_namespace.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
+#include <linux/pseudo_fs.h>
+#include <linux/seq_file.h>
+#include <uapi/linux/pidfd.h>
+
+#include "internal.h"
+
+static int pidfd_release(struct inode *inode, struct file *file)
+{
+#ifndef CONFIG_FS_PID
+ struct pid *pid = file->private_data;
+
+ file->private_data = NULL;
+ put_pid(pid);
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+/**
+ * pidfd_show_fdinfo - print information about a pidfd
+ * @m: proc fdinfo file
+ * @f: file referencing a pidfd
+ *
+ * Pid:
+ * This function will print the pid that a given pidfd refers to in the
+ * pid namespace of the procfs instance.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its pid. This is
+ * similar to calling getppid() on a process whose parent is outside of
+ * its pid namespace.
+ *
+ * NSpid:
+ * If pid namespaces are supported then this function will also print
+ * the pid of a given pidfd refers to for all descendant pid namespaces
+ * starting from the current pid namespace of the instance, i.e. the
+ * Pid field and the first entry in the NSpid field will be identical.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its first NSpid
+ * entry and no others will be shown.
+ * Note that this differs from the Pid and NSpid fields in
+ * /proc/<pid>/status where Pid and NSpid are always shown relative to
+ * the pid namespace of the procfs instance. The difference becomes
+ * obvious when sending around a pidfd between pid namespaces from a
+ * different branch of the tree, i.e. where no ancestral relation is
+ * present between the pid namespaces:
+ * - create two new pid namespaces ns1 and ns2 in the initial pid
+ * namespace (also take care to create new mount namespaces in the
+ * new pid namespace and mount procfs)
+ * - create a process with a pidfd in ns1
+ * - send pidfd from ns1 to ns2
+ * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
+ * have exactly one entry, which is 0
+ */
+static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+ struct pid *pid = pidfd_pid(f);
+ struct pid_namespace *ns;
+ pid_t nr = -1;
+
+ if (likely(pid_has_task(pid, PIDTYPE_PID))) {
+ ns = proc_pid_ns(file_inode(m->file)->i_sb);
+ nr = pid_nr_ns(pid, ns);
+ }
+
+ seq_put_decimal_ll(m, "Pid:\t", nr);
+
+#ifdef CONFIG_PID_NS
+ seq_put_decimal_ll(m, "\nNSpid:\t", nr);
+ if (nr > 0) {
+ int i;
+
+ /* If nr is non-zero it means that 'pid' is valid and that
+ * ns, i.e. the pid namespace associated with the procfs
+ * instance, is in the pid namespace hierarchy of pid.
+ * Start at one below the already printed level.
+ */
+ for (i = ns->level + 1; i <= pid->level; i++)
+ seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
+ }
+#endif
+ seq_putc(m, '\n');
+}
+#endif
+
+/*
+ * Poll support for process exit notification.
+ */
+static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
+{
+ struct pid *pid = pidfd_pid(file);
+ bool thread = file->f_flags & PIDFD_THREAD;
+ struct task_struct *task;
+ __poll_t poll_flags = 0;
+
+ poll_wait(file, &pid->wait_pidfd, pts);
+ /*
+ * Depending on PIDFD_THREAD, inform pollers when the thread
+ * or the whole thread-group exits.
+ */
+ guard(rcu)();
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
+ else if (task->exit_state && (thread || thread_group_empty(task)))
+ poll_flags = EPOLLIN | EPOLLRDNORM;
+
+ return poll_flags;
+}
+
+static const struct file_operations pidfs_file_operations = {
+ .release = pidfd_release,
+ .poll = pidfd_poll,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = pidfd_show_fdinfo,
+#endif
+};
+
+struct pid *pidfd_pid(const struct file *file)
+{
+ if (file->f_op != &pidfs_file_operations)
+ return ERR_PTR(-EBADF);
+#ifdef CONFIG_FS_PID
+ return file_inode(file)->i_private;
+#else
+ return file->private_data;
+#endif
+}
+
+#ifdef CONFIG_FS_PID
+static struct vfsmount *pidfs_mnt __ro_after_init;
+
+/*
+ * The vfs falls back to simple_setattr() if i_op->setattr() isn't
+ * implemented. Let's reject it completely until we have a clean
+ * permission concept for pidfds.
+ */
+static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
+static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ return 0;
+}
+
+static const struct inode_operations pidfs_inode_operations = {
+ .getattr = pidfs_getattr,
+ .setattr = pidfs_setattr,
+};
+
+static void pidfs_evict_inode(struct inode *inode)
+{
+ struct pid *pid = inode->i_private;
+
+ clear_inode(inode);
+ put_pid(pid);
+}
+
+static const struct super_operations pidfs_sops = {
+ .drop_inode = generic_delete_inode,
+ .evict_inode = pidfs_evict_inode,
+ .statfs = simple_statfs,
+};
+
+static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+ return dynamic_dname(buffer, buflen, "pidfd:[%lu]",
+ d_inode(dentry)->i_ino);
+}
+
+static const struct dentry_operations pidfs_dentry_operations = {
+ .d_delete = always_delete_dentry,
+ .d_dname = pidfs_dname,
+ .d_prune = stashed_dentry_prune,
+};
+
+static void pidfs_init_inode(struct inode *inode, void *data)
+{
+ inode->i_private = data;
+ inode->i_flags |= S_PRIVATE;
+ inode->i_mode |= S_IRWXU;
+ inode->i_op = &pidfs_inode_operations;
+ inode->i_fop = &pidfs_file_operations;
+}
+
+static void pidfs_put_data(void *data)
+{
+ struct pid *pid = data;
+ put_pid(pid);
+}
+
+static const struct stashed_operations pidfs_stashed_ops = {
+ .init_inode = pidfs_init_inode,
+ .put_data = pidfs_put_data,
+};
+
+static int pidfs_init_fs_context(struct fs_context *fc)
+{
+ struct pseudo_fs_context *ctx;
+
+ ctx = init_pseudo(fc, PID_FS_MAGIC);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ops = &pidfs_sops;
+ ctx->dops = &pidfs_dentry_operations;
+ fc->s_fs_info = (void *)&pidfs_stashed_ops;
+ return 0;
+}
+
+static struct file_system_type pidfs_type = {
+ .name = "pidfs",
+ .init_fs_context = pidfs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
+{
+
+ struct file *pidfd_file;
+ struct path path;
+ int ret;
+
+ /*
+ * Inode numbering for pidfs start at RESERVED_PIDS + 1.
+ * This avoids collisions with the root inode which is 1
+ * for pseudo filesystems.
+ */
+ ret = path_from_stashed(&pid->stashed, pid->ino, pidfs_mnt,
+ get_pid(pid), &path);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ pidfd_file = dentry_open(&path, flags, current_cred());
+ path_put(&path);
+ return pidfd_file;
+}
+
+void __init pidfs_init(void)
+{
+ pidfs_mnt = kern_mount(&pidfs_type);
+ if (IS_ERR(pidfs_mnt))
+ panic("Failed to mount pidfs pseudo filesystem");
+}
+
+bool is_pidfs_sb(const struct super_block *sb)
+{
+ return sb == pidfs_mnt->mnt_sb;
+}
+
+#else /* !CONFIG_FS_PID */
+
+struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
+{
+ struct file *pidfd_file;
+
+ pidfd_file = anon_inode_getfile("[pidfd]", &pidfs_file_operations, pid,
+ flags | O_RDWR);
+ if (IS_ERR(pidfd_file))
+ return pidfd_file;
+
+ get_pid(pid);
+ return pidfd_file;
+}
+
+void __init pidfs_init(void) { }
+bool is_pidfs_sb(const struct super_block *sb)
+{
+ return false;
+}
+#endif
diff --git a/fs/pipe.c b/fs/pipe.c
index f1adbfe743d4..50c8a8596b52 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -76,18 +76,20 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
* -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
*/
-static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
+#define cmp_int(l, r) ((l > r) - (l < r))
+
+#ifdef CONFIG_PROVE_LOCKING
+static int pipe_lock_cmp_fn(const struct lockdep_map *a,
+ const struct lockdep_map *b)
{
- if (pipe->files)
- mutex_lock_nested(&pipe->mutex, subclass);
+ return cmp_int((unsigned long) a, (unsigned long) b);
}
+#endif
void pipe_lock(struct pipe_inode_info *pipe)
{
- /*
- * pipe_lock() nests non-pipe inode locks (for writing to a file)
- */
- pipe_lock_nested(pipe, I_MUTEX_PARENT);
+ if (pipe->files)
+ mutex_lock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_lock);
@@ -98,28 +100,16 @@ void pipe_unlock(struct pipe_inode_info *pipe)
}
EXPORT_SYMBOL(pipe_unlock);
-static inline void __pipe_lock(struct pipe_inode_info *pipe)
-{
- mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
-}
-
-static inline void __pipe_unlock(struct pipe_inode_info *pipe)
-{
- mutex_unlock(&pipe->mutex);
-}
-
void pipe_double_lock(struct pipe_inode_info *pipe1,
struct pipe_inode_info *pipe2)
{
BUG_ON(pipe1 == pipe2);
- if (pipe1 < pipe2) {
- pipe_lock_nested(pipe1, I_MUTEX_PARENT);
- pipe_lock_nested(pipe2, I_MUTEX_CHILD);
- } else {
- pipe_lock_nested(pipe2, I_MUTEX_PARENT);
- pipe_lock_nested(pipe1, I_MUTEX_CHILD);
- }
+ if (pipe1 > pipe2)
+ swap(pipe1, pipe2);
+
+ pipe_lock(pipe1);
+ pipe_lock(pipe2);
}
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
@@ -271,7 +261,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
return 0;
ret = 0;
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
/*
* We only wake up writers if the pipe was full when we started
@@ -368,7 +358,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
ret = -EAGAIN;
break;
}
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
/*
* We only get here if we didn't actually read anything.
@@ -400,13 +390,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
return -ERESTARTSYS;
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
wake_next_reader = true;
}
if (pipe_empty(pipe->head, pipe->tail))
wake_next_reader = false;
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
if (was_full)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
@@ -462,7 +452,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
if (unlikely(total_len == 0))
return 0;
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
@@ -582,19 +572,19 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
* after waiting we need to re-check whether the pipe
* become empty while we dropped the lock.
*/
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
if (was_empty)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
was_empty = pipe_empty(pipe->head, pipe->tail);
wake_next_writer = true;
}
out:
if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
wake_next_writer = false;
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
/*
* If we do do a wakeup event, we do a 'sync' wakeup, because we
@@ -629,7 +619,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
switch (cmd) {
case FIONREAD:
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
count = 0;
head = pipe->head;
tail = pipe->tail;
@@ -639,16 +629,16 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
count += pipe->bufs[tail & mask].len;
tail++;
}
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
return put_user(count, (int __user *)arg);
#ifdef CONFIG_WATCH_QUEUE
case IOC_WATCH_QUEUE_SET_SIZE: {
int ret;
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
ret = watch_queue_set_size(pipe, arg);
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
return ret;
}
@@ -734,7 +724,7 @@ pipe_release(struct inode *inode, struct file *file)
{
struct pipe_inode_info *pipe = file->private_data;
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
if (file->f_mode & FMODE_READ)
pipe->readers--;
if (file->f_mode & FMODE_WRITE)
@@ -747,7 +737,7 @@ pipe_release(struct inode *inode, struct file *file)
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
put_pipe_info(inode, pipe);
return 0;
@@ -759,7 +749,7 @@ pipe_fasync(int fd, struct file *filp, int on)
struct pipe_inode_info *pipe = filp->private_data;
int retval = 0;
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
if (filp->f_mode & FMODE_READ)
retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
@@ -768,7 +758,7 @@ pipe_fasync(int fd, struct file *filp, int on)
/* this can happen only if on == T */
fasync_helper(-1, filp, 0, &pipe->fasync_readers);
}
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
return retval;
}
@@ -834,6 +824,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
pipe->nr_accounted = pipe_bufs;
pipe->user = user;
mutex_init(&pipe->mutex);
+ lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL);
return pipe;
}
@@ -1144,7 +1135,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
filp->private_data = pipe;
/* OK, we have a pipe and it's pinned down */
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
/* We can only do regular read/write on fifos */
stream_open(inode, filp);
@@ -1214,7 +1205,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
}
/* Ok! */
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
return 0;
err_rd:
@@ -1230,7 +1221,7 @@ err_wr:
goto err;
err:
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
put_pipe_info(inode, pipe);
return ret;
@@ -1411,7 +1402,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
if (!pipe)
return -EBADF;
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
switch (cmd) {
case F_SETPIPE_SZ:
@@ -1425,7 +1416,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
break;
}
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
return ret;
}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index e1af20893ebe..6bf587d1a9b8 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -786,12 +786,12 @@ struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns,
return ERR_PTR(count);
if (count == 0)
return NULL;
-
+
acl = posix_acl_alloc(count, GFP_NOFS);
if (!acl)
return ERR_PTR(-ENOMEM);
acl_e = acl->a_entries;
-
+
for (end = entry + count; entry != end; acl_e++, entry++) {
acl_e->e_tag = le16_to_cpu(entry->e_tag);
acl_e->e_perm = le16_to_cpu(entry->e_perm);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ff08a8957552..34a47fb0c57f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -477,13 +477,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
int permitted;
struct mm_struct *mm;
unsigned long long start_time;
- unsigned long cmin_flt = 0, cmaj_flt = 0;
- unsigned long min_flt = 0, maj_flt = 0;
- u64 cutime, cstime, utime, stime;
- u64 cgtime, gtime;
+ unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt;
+ u64 cutime, cstime, cgtime, utime, stime, gtime;
unsigned long rsslim = 0;
unsigned long flags;
int exit_code = task->exit_code;
+ struct signal_struct *sig = task->signal;
+ unsigned int seq = 1;
state = *get_task_state(task);
vsize = eip = esp = 0;
@@ -511,12 +511,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
sigemptyset(&sigign);
sigemptyset(&sigcatch);
- cutime = cstime = utime = stime = 0;
- cgtime = gtime = 0;
if (lock_task_sighand(task, &flags)) {
- struct signal_struct *sig = task->signal;
-
if (sig->tty) {
struct pid *pgrp = tty_get_pgrp(sig->tty);
tty_pgrp = pid_nr_ns(pgrp, ns);
@@ -527,28 +523,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
num_threads = get_nr_threads(task);
collect_sigign_sigcatch(task, &sigign, &sigcatch);
- cmin_flt = sig->cmin_flt;
- cmaj_flt = sig->cmaj_flt;
- cutime = sig->cutime;
- cstime = sig->cstime;
- cgtime = sig->cgtime;
rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
- /* add up live thread stats at the group level */
if (whole) {
- struct task_struct *t;
-
- __for_each_thread(sig, t) {
- min_flt += t->min_flt;
- maj_flt += t->maj_flt;
- gtime += task_gtime(t);
- }
-
- min_flt += sig->min_flt;
- maj_flt += sig->maj_flt;
- thread_group_cputime_adjusted(task, &utime, &stime);
- gtime += sig->gtime;
-
if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
exit_code = sig->group_exit_code;
}
@@ -562,10 +539,41 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
if (permitted && (!whole || num_threads < 2))
wchan = !task_is_running(task);
- if (!whole) {
+
+ do {
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+
+ cmin_flt = sig->cmin_flt;
+ cmaj_flt = sig->cmaj_flt;
+ cutime = sig->cutime;
+ cstime = sig->cstime;
+ cgtime = sig->cgtime;
+
+ if (whole) {
+ struct task_struct *t;
+
+ min_flt = sig->min_flt;
+ maj_flt = sig->maj_flt;
+ gtime = sig->gtime;
+
+ rcu_read_lock();
+ __for_each_thread(sig, t) {
+ min_flt += t->min_flt;
+ maj_flt += t->maj_flt;
+ gtime += task_gtime(t);
+ }
+ rcu_read_unlock();
+ }
+ } while (need_seqretry(&sig->stats_lock, seq));
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+
+ if (whole) {
+ thread_group_cputime_adjusted(task, &utime, &stime);
+ } else {
+ task_cputime_adjusted(task, &utime, &stime);
min_flt = task->min_flt;
maj_flt = task->maj_flt;
- task_cputime_adjusted(task, &utime, &stime);
gtime = task_gtime(task);
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 98a031ac2648..18550c071d71 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1878,8 +1878,6 @@ void proc_pid_evict_inode(struct proc_inode *ei)
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(&pid->lock);
}
-
- put_pid(pid);
}
struct inode *proc_pid_make_inode(struct super_block *sb,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index b33e490e3fd9..dcd513dccf55 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -30,7 +30,6 @@
static void proc_evict_inode(struct inode *inode)
{
- struct proc_dir_entry *de;
struct ctl_table_header *head;
struct proc_inode *ei = PROC_I(inode);
@@ -38,17 +37,8 @@ static void proc_evict_inode(struct inode *inode)
clear_inode(inode);
/* Stop tracking associated processes */
- if (ei->pid) {
+ if (ei->pid)
proc_pid_evict_inode(ei);
- ei->pid = NULL;
- }
-
- /* Let go of any associated proc directory entry */
- de = ei->pde;
- if (de) {
- pde_put(de);
- ei->pde = NULL;
- }
head = ei->sysctl;
if (head) {
@@ -80,6 +70,13 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
static void proc_free_inode(struct inode *inode)
{
+ struct proc_inode *ei = PROC_I(inode);
+
+ if (ei->pid)
+ put_pid(ei->pid);
+ /* Let go of any associated proc directory entry */
+ if (ei->pde)
+ pde_put(ei->pde);
kmem_cache_free(proc_inode_cachep, PROC_I(inode));
}
@@ -95,7 +92,7 @@ void __init proc_init_kmemcache(void)
proc_inode_cachep = kmem_cache_create("proc_inode_cache",
sizeof(struct proc_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+ SLAB_ACCOUNT|
SLAB_PANIC),
init_once);
pde_opener_cache =
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b55dbc70287b..06a297a27ba3 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -271,7 +271,7 @@ static void proc_kill_sb(struct super_block *sb)
kill_anon_super(sb);
put_pid_ns(fs_info->pid_ns);
- kfree(fs_info);
+ kfree_rcu(fs_info, rcu);
}
static struct file_system_type proc_fs_type = {
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 62b16f42d5d2..3f78ebbb795f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2432,7 +2432,6 @@ static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
{
- struct mmu_notifier_range range;
struct pagemap_scan_private p = {0};
unsigned long walk_start;
size_t n_ranges_out = 0;
@@ -2448,15 +2447,9 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
if (ret)
return ret;
- /* Protection change for the range is going to happen. */
- if (p.arg.flags & PM_SCAN_WP_MATCHING) {
- mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
- mm, p.arg.start, p.arg.end);
- mmu_notifier_invalidate_range_start(&range);
- }
-
for (walk_start = p.arg.start; walk_start < p.arg.end;
walk_start = p.arg.walk_end) {
+ struct mmu_notifier_range range;
long n_out;
if (fatal_signal_pending(current)) {
@@ -2467,8 +2460,20 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
ret = mmap_read_lock_killable(mm);
if (ret)
break;
+
+ /* Protection change for the range is going to happen. */
+ if (p.arg.flags & PM_SCAN_WP_MATCHING) {
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
+ mm, walk_start, p.arg.end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+
ret = walk_page_range(mm, walk_start, p.arg.end,
&pagemap_scan_ops, &p);
+
+ if (p.arg.flags & PM_SCAN_WP_MATCHING)
+ mmu_notifier_invalidate_range_end(&range);
+
mmap_read_unlock(mm);
n_out = pagemap_scan_flush_buffer(&p);
@@ -2494,9 +2499,6 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
if (pagemap_scan_writeback_args(&p.arg, uarg))
ret = -EFAULT;
- if (p.arg.flags & PM_SCAN_WP_MATCHING)
- mmu_notifier_invalidate_range_end(&range);
-
kfree(p.vec_buf);
return ret;
}
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 6eb9bb369b57..7b5711f76709 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -21,6 +21,7 @@
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
+#include <linux/fs_context.h>
#include "qnx4.h"
#define QNX4_VERSION 4
@@ -30,28 +31,33 @@ static const struct super_operations qnx4_sops;
static struct inode *qnx4_alloc_inode(struct super_block *sb);
static void qnx4_free_inode(struct inode *inode);
-static int qnx4_remount(struct super_block *sb, int *flags, char *data);
static int qnx4_statfs(struct dentry *, struct kstatfs *);
+static int qnx4_get_tree(struct fs_context *fc);
static const struct super_operations qnx4_sops =
{
.alloc_inode = qnx4_alloc_inode,
.free_inode = qnx4_free_inode,
.statfs = qnx4_statfs,
- .remount_fs = qnx4_remount,
};
-static int qnx4_remount(struct super_block *sb, int *flags, char *data)
+static int qnx4_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
struct qnx4_sb_info *qs;
sync_filesystem(sb);
qs = qnx4_sb(sb);
qs->Version = QNX4_VERSION;
- *flags |= SB_RDONLY;
+ fc->sb_flags |= SB_RDONLY;
return 0;
}
+static const struct fs_context_operations qnx4_context_opts = {
+ .get_tree = qnx4_get_tree,
+ .reconfigure = qnx4_reconfigure,
+};
+
static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_head *bh, int create )
{
unsigned long phys;
@@ -183,12 +189,13 @@ static const char *qnx4_checkroot(struct super_block *sb,
return "bitmap file not found.";
}
-static int qnx4_fill_super(struct super_block *s, void *data, int silent)
+static int qnx4_fill_super(struct super_block *s, struct fs_context *fc)
{
struct buffer_head *bh;
struct inode *root;
const char *errmsg;
struct qnx4_sb_info *qs;
+ int silent = fc->sb_flags & SB_SILENT;
qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL);
if (!qs)
@@ -216,7 +223,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data);
brelse(bh);
if (errmsg != NULL) {
- if (!silent)
+ if (!silent)
printk(KERN_ERR "qnx4: %s\n", errmsg);
return -EINVAL;
}
@@ -235,6 +242,18 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
return 0;
}
+static int qnx4_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, qnx4_fill_super);
+}
+
+static int qnx4_init_fs_context(struct fs_context *fc)
+{
+ fc->ops = &qnx4_context_opts;
+
+ return 0;
+}
+
static void qnx4_kill_sb(struct super_block *sb)
{
struct qnx4_sb_info *qs = qnx4_sb(sb);
@@ -376,18 +395,12 @@ static void destroy_inodecache(void)
kmem_cache_destroy(qnx4_inode_cachep);
}
-static struct dentry *qnx4_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
-}
-
static struct file_system_type qnx4_fs_type = {
- .owner = THIS_MODULE,
- .name = "qnx4",
- .mount = qnx4_mount,
- .kill_sb = qnx4_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "qnx4",
+ .kill_sb = qnx4_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = qnx4_init_fs_context,
};
MODULE_ALIAS_FS("qnx4");
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index a286c545717f..405913f4faff 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -615,7 +615,7 @@ static int init_inodecache(void)
qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache",
sizeof(struct qnx6_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
init_once);
if (!qnx6_inode_cachep)
return -ENOMEM;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 171c912af50f..6474529c4253 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2386,7 +2386,7 @@ static int journal_read(struct super_block *sb)
cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
reiserfs_info(sb, "checking transaction log (%pg)\n",
- journal->j_bdev_handle->bdev);
+ file_bdev(journal->j_bdev_file));
start = ktime_get_seconds();
/*
@@ -2447,7 +2447,7 @@ static int journal_read(struct super_block *sb)
* device and journal device to be the same
*/
d_bh =
- reiserfs_breada(journal->j_bdev_handle->bdev, cur_dblock,
+ reiserfs_breada(file_bdev(journal->j_bdev_file), cur_dblock,
sb->s_blocksize,
SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
SB_ONDISK_JOURNAL_SIZE(sb));
@@ -2588,9 +2588,9 @@ static void journal_list_init(struct super_block *sb)
static void release_journal_dev(struct reiserfs_journal *journal)
{
- if (journal->j_bdev_handle) {
- bdev_release(journal->j_bdev_handle);
- journal->j_bdev_handle = NULL;
+ if (journal->j_bdev_file) {
+ fput(journal->j_bdev_file);
+ journal->j_bdev_file = NULL;
}
}
@@ -2605,7 +2605,7 @@ static int journal_init_dev(struct super_block *super,
result = 0;
- journal->j_bdev_handle = NULL;
+ journal->j_bdev_file = NULL;
jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
@@ -2616,37 +2616,37 @@ static int journal_init_dev(struct super_block *super,
if ((!jdev_name || !jdev_name[0])) {
if (jdev == super->s_dev)
holder = NULL;
- journal->j_bdev_handle = bdev_open_by_dev(jdev, blkdev_mode,
+ journal->j_bdev_file = bdev_file_open_by_dev(jdev, blkdev_mode,
holder, NULL);
- if (IS_ERR(journal->j_bdev_handle)) {
- result = PTR_ERR(journal->j_bdev_handle);
- journal->j_bdev_handle = NULL;
+ if (IS_ERR(journal->j_bdev_file)) {
+ result = PTR_ERR(journal->j_bdev_file);
+ journal->j_bdev_file = NULL;
reiserfs_warning(super, "sh-458",
"cannot init journal device unknown-block(%u,%u): %i",
MAJOR(jdev), MINOR(jdev), result);
return result;
} else if (jdev != super->s_dev)
- set_blocksize(journal->j_bdev_handle->bdev,
+ set_blocksize(file_bdev(journal->j_bdev_file),
super->s_blocksize);
return 0;
}
- journal->j_bdev_handle = bdev_open_by_path(jdev_name, blkdev_mode,
+ journal->j_bdev_file = bdev_file_open_by_path(jdev_name, blkdev_mode,
holder, NULL);
- if (IS_ERR(journal->j_bdev_handle)) {
- result = PTR_ERR(journal->j_bdev_handle);
- journal->j_bdev_handle = NULL;
+ if (IS_ERR(journal->j_bdev_file)) {
+ result = PTR_ERR(journal->j_bdev_file);
+ journal->j_bdev_file = NULL;
reiserfs_warning(super, "sh-457",
"journal_init_dev: Cannot open '%s': %i",
jdev_name, result);
return result;
}
- set_blocksize(journal->j_bdev_handle->bdev, super->s_blocksize);
+ set_blocksize(file_bdev(journal->j_bdev_file), super->s_blocksize);
reiserfs_info(super,
"journal_init_dev: journal device: %pg\n",
- journal->j_bdev_handle->bdev);
+ file_bdev(journal->j_bdev_file));
return 0;
}
@@ -2804,7 +2804,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
"journal header magic %x (device %pg) does "
"not match to magic found in super block %x",
jh->jh_journal.jp_journal_magic,
- journal->j_bdev_handle->bdev,
+ file_bdev(journal->j_bdev_file),
sb_jp_journal_magic(rs));
brelse(bhjh);
goto free_and_return;
@@ -2828,7 +2828,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
reiserfs_info(sb, "journal params: device %pg, size %u, "
"journal first block %u, max trans len %u, max batch %u, "
"max commit age %u, max trans age %u\n",
- journal->j_bdev_handle->bdev,
+ file_bdev(journal->j_bdev_file),
SB_ONDISK_JOURNAL_SIZE(sb),
SB_ONDISK_JOURNAL_1st_BLOCK(sb),
journal->j_trans_max,
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 83cb9402e0f9..5c68a4a52d78 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -354,7 +354,7 @@ static int show_journal(struct seq_file *m, void *unused)
"prepare: \t%12lu\n"
"prepare_retry: \t%12lu\n",
DJP(jp_journal_1st_block),
- SB_JOURNAL(sb)->j_bdev_handle->bdev,
+ file_bdev(SB_JOURNAL(sb)->j_bdev_file),
DJP(jp_journal_dev),
DJP(jp_journal_size),
DJP(jp_journal_trans_max),
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 725667880e62..0554903f42a9 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -299,7 +299,7 @@ struct reiserfs_journal {
/* oldest journal block. start here for traverse */
struct reiserfs_journal_cnode *j_first;
- struct bdev_handle *j_bdev_handle;
+ struct file *j_bdev_file;
/* first block on s_dev of reserved area journal */
int j_1st_reserved_block;
@@ -2810,10 +2810,10 @@ struct reiserfs_journal_header {
/* We need these to make journal.c code more readable */
#define journal_find_get_block(s, block) __find_get_block(\
- SB_JOURNAL(s)->j_bdev_handle->bdev, block, s->s_blocksize)
-#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_bdev_handle->bdev,\
+ file_bdev(SB_JOURNAL(s)->j_bdev_file), block, s->s_blocksize)
+#define journal_getblk(s, block) __getblk(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
block, s->s_blocksize)
-#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_bdev_handle->bdev,\
+#define journal_bread(s, block) __bread(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
block, s->s_blocksize)
enum reiserfs_bh_state_bits {
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 67b5510beded..2cc469d481a2 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -670,7 +670,6 @@ static int __init init_inodecache(void)
sizeof(struct
reiserfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|
SLAB_ACCOUNT),
init_once);
if (reiserfs_inode_cachep == NULL)
diff --git a/fs/remap_range.c b/fs/remap_range.c
index f8c1120b8311..de07f978ce3e 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -373,9 +373,9 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
}
EXPORT_SYMBOL(generic_remap_file_range_prep);
-loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t len, unsigned int remap_flags)
+loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags)
{
loff_t ret;
@@ -391,23 +391,6 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
if (!file_in->f_op->remap_file_range)
return -EOPNOTSUPP;
- ret = file_in->f_op->remap_file_range(file_in, pos_in,
- file_out, pos_out, len, remap_flags);
- if (ret < 0)
- return ret;
-
- fsnotify_access(file_in);
- fsnotify_modify(file_out);
- return ret;
-}
-EXPORT_SYMBOL(do_clone_file_range);
-
-loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t len, unsigned int remap_flags)
-{
- loff_t ret;
-
ret = remap_verify_area(file_in, pos_in, len, false);
if (ret)
return ret;
@@ -417,10 +400,14 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
return ret;
file_start_write(file_out);
- ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
+ ret = file_in->f_op->remap_file_range(file_in, pos_in,
+ file_out, pos_out, len, remap_flags);
file_end_write(file_out);
+ if (ret < 0)
+ return ret;
+ fsnotify_access(file_in);
+ fsnotify_modify(file_out);
return ret;
}
EXPORT_SYMBOL(vfs_clone_file_range);
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 545ad44f96b8..2be227532f39 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -594,7 +594,7 @@ static void romfs_kill_sb(struct super_block *sb)
#ifdef CONFIG_ROMFS_ON_BLOCK
if (sb->s_bdev) {
sync_blockdev(sb->s_bdev);
- bdev_release(sb->s_bdev_handle);
+ fput(sb->s_bdev_file);
}
#endif
}
@@ -630,8 +630,8 @@ static int __init init_romfs_fs(void)
romfs_inode_cachep =
kmem_cache_create("romfs_i",
sizeof(struct romfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
- SLAB_ACCOUNT, romfs_i_init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
+ romfs_i_init_once);
if (!romfs_inode_cachep) {
pr_err("Failed to initialise inode cache\n");
diff --git a/fs/select.c b/fs/select.c
index 0ee55af1a55c..9515c3fa1a03 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -476,7 +476,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in,
wait->_key |= POLLOUT_SET;
}
-static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
+static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
ktime_t expire, *to = NULL;
struct poll_wqueues table;
@@ -839,7 +839,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
struct poll_list {
struct poll_list *next;
- int len;
+ unsigned int len;
struct pollfd entries[];
};
@@ -975,14 +975,15 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec64 *end_time)
{
struct poll_wqueues table;
- int err = -EFAULT, fdcount, len;
+ int err = -EFAULT, fdcount;
/* Allocate small arguments on the stack to save memory and be
faster - use long to make sure the buffer is aligned properly
on 64 bit archs to avoid unaligned access */
long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
struct poll_list *const head = (struct poll_list *)stack_pps;
struct poll_list *walk = head;
- unsigned long todo = nfds;
+ unsigned int todo = nfds;
+ unsigned int len;
if (nfds > rlimit(RLIMIT_NOFILE))
return -EINVAL;
@@ -998,9 +999,9 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
sizeof(struct pollfd) * walk->len))
goto out_fds;
- todo -= walk->len;
- if (!todo)
+ if (walk->len >= todo)
break;
+ todo -= walk->len;
len = min(todo, POLLFD_PER_PAGE);
walk = walk->next = kmalloc(struct_size(walk, entries, len),
@@ -1020,7 +1021,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
for (walk = head; walk; walk = walk->next) {
struct pollfd *fds = walk->entries;
- int j;
+ unsigned int j;
for (j = walk->len; j; fds++, ufds++, j--)
unsafe_put_user(fds->revents, &ufds->revents, Efault);
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index d64a306a414b..3de5047a7ff9 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -145,21 +145,27 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
struct cached_fid *cfid;
struct cached_fids *cfids;
const char *npath;
+ int retries = 0, cur_sleep = 1;
if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache ||
is_smb1_server(tcon->ses->server) || (dir_cache_timeout == 0))
return -EOPNOTSUPP;
ses = tcon->ses;
- server = ses->server;
cfids = tcon->cfids;
- if (!server->ops->new_lease_key)
- return -EIO;
-
if (cifs_sb->root == NULL)
return -ENOENT;
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ oplock = SMB2_OPLOCK_LEVEL_II;
+ server = cifs_pick_channel(ses);
+
+ if (!server->ops->new_lease_key)
+ return -EIO;
+
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (!utf16_path)
return -ENOMEM;
@@ -236,6 +242,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
.desired_access = FILE_READ_DATA | FILE_READ_ATTRIBUTES,
.disposition = FILE_OPEN,
.fid = pfid,
+ .replay = !!(retries),
};
rc = SMB2_open_init(tcon, server,
@@ -268,6 +275,11 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
*/
cfid->has_lease = true;
+ if (retries) {
+ smb2_set_replay(server, &rqst[0]);
+ smb2_set_replay(server, &rqst[1]);
+ }
+
rc = compound_send_recv(xid, ses, server,
flags, 2, rqst,
resp_buftype, rsp_iov);
@@ -367,6 +379,11 @@ out:
atomic_inc(&tcon->num_remote_opens);
}
kfree(utf16_path);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index 60027f5aebe8..3e4209f41c18 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -659,6 +659,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
spin_lock(&tcon->stat_lock);
tcon->bytes_read = 0;
tcon->bytes_written = 0;
+ tcon->stats_from_time = ktime_get_real_seconds();
spin_unlock(&tcon->stat_lock);
if (server->ops->clear_stats)
server->ops->clear_stats(tcon);
@@ -737,8 +738,9 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\n%d) %s", i, tcon->tree_name);
if (tcon->need_reconnect)
seq_puts(m, "\tDISCONNECTED ");
- seq_printf(m, "\nSMBs: %d",
- atomic_read(&tcon->num_smbs_sent));
+ seq_printf(m, "\nSMBs: %d since %ptTs UTC",
+ atomic_read(&tcon->num_smbs_sent),
+ &tcon->stats_from_time);
if (server->ops->print_stats)
server->ops->print_stats(m, tcon);
}
diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
index ef4c2e3c9fa6..6322f0f68a17 100644
--- a/fs/smb/client/cifsencrypt.c
+++ b/fs/smb/client/cifsencrypt.c
@@ -572,7 +572,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp);
UniStrupr(user);
} else {
- memset(user, '\0', 2);
+ *(u16 *)user = 0;
}
rc = crypto_shash_update(ses->server->secmech.hmacmd5,
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 99b0ade833aa..fb368b191eef 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -396,7 +396,7 @@ cifs_alloc_inode(struct super_block *sb)
spin_lock_init(&cifs_inode->writers_lock);
cifs_inode->writers = 0;
cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
- cifs_inode->server_eof = 0;
+ cifs_inode->netfs.remote_i_size = 0;
cifs_inode->uniqueid = 0;
cifs_inode->createtime = 0;
cifs_inode->epoch = 0;
@@ -430,7 +430,7 @@ static void
cifs_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
- if (inode->i_state & I_PINNING_FSCACHE_WB)
+ if (inode->i_state & I_PINNING_NETFS_WB)
cifs_fscache_unuse_inode_cookie(inode, true);
cifs_fscache_release_inode_cookie(inode);
clear_inode(inode);
@@ -681,6 +681,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",rasize=%u", cifs_sb->ctx->rasize);
if (tcon->ses->server->min_offload)
seq_printf(s, ",esize=%u", tcon->ses->server->min_offload);
+ if (tcon->ses->server->retrans)
+ seq_printf(s, ",retrans=%u", tcon->ses->server->retrans);
seq_printf(s, ",echo_interval=%lu",
tcon->ses->server->echo_interval / HZ);
@@ -793,8 +795,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root)
static int cifs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- fscache_unpin_writeback(wbc, cifs_inode_cookie(inode));
- return 0;
+ return netfs_unpin_writeback(inode, wbc);
}
static int cifs_drop_inode(struct inode *inode)
@@ -1084,7 +1085,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
}
static int
-cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv)
+cifs_setlease(struct file *file, int arg, struct file_lease **lease, void **priv)
{
/*
* Note that this is called by vfs setlease with i_lock held to
@@ -1093,9 +1094,6 @@ cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv)
struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
- if (!(S_ISREG(inode->i_mode)))
- return -EINVAL;
-
/* Check if file is oplocked if this is request for new lease */
if (arg == F_UNLCK ||
((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) ||
@@ -1171,6 +1169,9 @@ const char *cifs_get_link(struct dentry *dentry, struct inode *inode,
{
char *target_path;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
target_path = kmalloc(PATH_MAX, GFP_KERNEL);
if (!target_path)
return ERR_PTR(-ENOMEM);
@@ -1222,7 +1223,7 @@ static int cifs_precopy_set_eof(struct inode *src_inode, struct cifsInodeInfo *s
if (rc < 0)
goto set_failed;
- netfs_resize_file(&src_cifsi->netfs, src_end);
+ netfs_resize_file(&src_cifsi->netfs, src_end, true);
fscache_resize_cookie(cifs_inode_cookie(src_inode), src_end);
return 0;
@@ -1353,7 +1354,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
smb_file_src, smb_file_target, off, len, destoff);
if (rc == 0 && new_size > i_size_read(target_inode)) {
truncate_setsize(target_inode, new_size);
- netfs_resize_file(&target_cifsi->netfs, new_size);
+ netfs_resize_file(&target_cifsi->netfs, new_size, true);
fscache_resize_cookie(cifs_inode_cookie(target_inode),
new_size);
}
@@ -1379,6 +1380,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
struct inode *src_inode = file_inode(src_file);
struct inode *target_inode = file_inode(dst_file);
struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode);
+ struct cifsInodeInfo *target_cifsi = CIFS_I(target_inode);
struct cifsFileInfo *smb_file_src;
struct cifsFileInfo *smb_file_target;
struct cifs_tcon *src_tcon;
@@ -1427,7 +1429,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
* Advance the EOF marker after the flush above to the end of the range
* if it's short of that.
*/
- if (src_cifsi->server_eof < off + len) {
+ if (src_cifsi->netfs.remote_i_size < off + len) {
rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len);
if (rc < 0)
goto unlock;
@@ -1451,12 +1453,22 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
/* Discard all the folios that overlap the destination region. */
truncate_inode_pages_range(&target_inode->i_data, fstart, fend);
+ fscache_invalidate(cifs_inode_cookie(target_inode), NULL,
+ i_size_read(target_inode), 0);
+
rc = file_modified(dst_file);
if (!rc) {
rc = target_tcon->ses->server->ops->copychunk_range(xid,
smb_file_src, smb_file_target, off, len, destoff);
- if (rc > 0 && destoff + rc > i_size_read(target_inode))
+ if (rc > 0 && destoff + rc > i_size_read(target_inode)) {
truncate_setsize(target_inode, destoff + rc);
+ netfs_resize_file(&target_cifsi->netfs,
+ i_size_read(target_inode), true);
+ fscache_resize_cookie(cifs_inode_cookie(target_inode),
+ i_size_read(target_inode));
+ }
+ if (rc > 0 && destoff + rc > target_cifsi->netfs.zero_point)
+ target_cifsi->netfs.zero_point = destoff + rc;
}
file_accessed(src_file);
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 879d5ef8a66e..53c75cfb33ab 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -50,6 +50,11 @@
#define CIFS_DEF_ACTIMEO (1 * HZ)
/*
+ * max sleep time before retry to server
+ */
+#define CIFS_MAX_SLEEP 2000
+
+/*
* max attribute cache timeout (jiffies) - 2^30
*/
#define CIFS_MAX_ACTIMEO (1 << 30)
@@ -82,7 +87,7 @@
#define SMB_INTERFACE_POLL_INTERVAL 600
/* maximum number of PDUs in one compound */
-#define MAX_COMPOUND 5
+#define MAX_COMPOUND 7
/*
* Default number of credits to keep available for SMB3.
@@ -204,6 +209,8 @@ struct cifs_open_info_data {
};
} reparse;
char *symlink_target;
+ struct cifs_sid posix_owner;
+ struct cifs_sid posix_group;
union {
struct smb2_file_all_info fi;
struct smb311_posix_qinfo posix_fi;
@@ -751,6 +758,7 @@ struct TCP_Server_Info {
unsigned int max_read;
unsigned int max_write;
unsigned int min_offload;
+ unsigned int retrans;
__le16 compress_algorithm;
__u16 signing_algorithm;
__le16 cipher_type;
@@ -1024,6 +1032,8 @@ struct cifs_chan {
__u8 signkey[SMB3_SIGN_KEY_SIZE];
};
+#define CIFS_SES_FLAG_SCALE_CHANNELS (0x1)
+
/*
* Session structure. One of these for each uid session with a particular host
*/
@@ -1056,6 +1066,7 @@ struct cifs_ses {
enum securityEnum sectype; /* what security flavor was specified? */
bool sign; /* is signing required? */
bool domainAuto:1;
+ unsigned int flags;
__u16 session_flags;
__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
__u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE];
@@ -1207,6 +1218,7 @@ struct cifs_tcon {
__u64 bytes_read;
__u64 bytes_written;
spinlock_t stat_lock; /* protects the two fields above */
+ time64_t stats_from_time;
FILE_SYSTEM_DEVICE_INFO fsDevInfo;
FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */
FILE_SYSTEM_UNIX_INFO fsUnixInfo;
@@ -1366,6 +1378,7 @@ struct cifs_open_parms {
struct cifs_fid *fid;
umode_t mode;
bool reconnect:1;
+ bool replay:1; /* indicates that this open is for a replay */
};
struct cifs_fid {
@@ -1497,6 +1510,7 @@ struct cifs_writedata {
struct smbd_mr *mr;
#endif
struct cifs_credits credits;
+ bool replay;
};
/*
@@ -1557,7 +1571,6 @@ struct cifsInodeInfo {
spinlock_t writers_lock;
unsigned int writers; /* Number of writers on this inode */
unsigned long time; /* jiffies of last update of inode */
- u64 server_eof; /* current file size on server -- protected by i_lock */
u64 uniqueid; /* server inode number */
u64 createtime; /* creation time on server */
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */
@@ -1827,6 +1840,13 @@ static inline bool is_retryable_error(int error)
return false;
}
+static inline bool is_replayable_error(int error)
+{
+ if (error == -EAGAIN || error == -ECONNABORTED)
+ return true;
+ return false;
+}
+
/* cifs_get_writable_file() flags */
#define FIND_WR_ANY 0
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 01e89070df5a..5eb83bafc7fd 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -2066,20 +2066,20 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
parm_data = (struct cifs_posix_lock *)
((char *)&pSMBr->hdr.Protocol + data_offset);
if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK))
- pLockData->fl_type = F_UNLCK;
+ pLockData->c.flc_type = F_UNLCK;
else {
if (parm_data->lock_type ==
cpu_to_le16(CIFS_RDLCK))
- pLockData->fl_type = F_RDLCK;
+ pLockData->c.flc_type = F_RDLCK;
else if (parm_data->lock_type ==
cpu_to_le16(CIFS_WRLCK))
- pLockData->fl_type = F_WRLCK;
+ pLockData->c.flc_type = F_WRLCK;
pLockData->fl_start = le64_to_cpu(parm_data->start);
pLockData->fl_end = pLockData->fl_start +
(le64_to_cpu(parm_data->length) ?
le64_to_cpu(parm_data->length) - 1 : 0);
- pLockData->fl_pid = -le32_to_cpu(parm_data->pid);
+ pLockData->c.flc_pid = -le32_to_cpu(parm_data->pid);
}
}
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 3052a208c6ca..ac9595504f4b 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -233,6 +233,12 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) {
/* check if iface is still active */
spin_lock(&ses->chan_lock);
+ if (cifs_ses_get_chan_index(ses, server) ==
+ CIFS_INVAL_CHAN_INDEX) {
+ spin_unlock(&ses->chan_lock);
+ continue;
+ }
+
if (!cifs_chan_is_iface_active(ses, server)) {
spin_unlock(&ses->chan_lock);
cifs_chan_update_iface(ses, server);
@@ -1574,6 +1580,9 @@ static int match_server(struct TCP_Server_Info *server,
if (server->min_offload != ctx->min_offload)
return 0;
+ if (server->retrans != ctx->retrans)
+ return 0;
+
return 1;
}
@@ -1798,6 +1807,7 @@ smbd_connected:
goto out_err_crypto_release;
}
tcp_ses->min_offload = ctx->min_offload;
+ tcp_ses->retrans = ctx->retrans;
/*
* at this point we are the only ones with the pointer
* to the struct since the kernel thread not created yet
@@ -3434,8 +3444,18 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx)
* the user on mount
*/
if ((cifs_sb->ctx->wsize == 0) ||
- (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx)))
- cifs_sb->ctx->wsize = server->ops->negotiate_wsize(tcon, ctx);
+ (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx))) {
+ cifs_sb->ctx->wsize =
+ round_down(server->ops->negotiate_wsize(tcon, ctx), PAGE_SIZE);
+ /*
+ * in the very unlikely event that the server sent a max write size under PAGE_SIZE,
+ * (which would get rounded down to 0) then reset wsize to absolute minimum eg 4096
+ */
+ if (cifs_sb->ctx->wsize == 0) {
+ cifs_sb->ctx->wsize = PAGE_SIZE;
+ cifs_dbg(VFS, "wsize too small, reset to minimum ie PAGE_SIZE, usually 4096\n");
+ }
+ }
if ((cifs_sb->ctx->rsize == 0) ||
(cifs_sb->ctx->rsize > server->ops->negotiate_rsize(tcon, ctx)))
cifs_sb->ctx->rsize = server->ops->negotiate_rsize(tcon, ctx);
@@ -4224,6 +4244,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
/* only send once per connect */
spin_lock(&tcon->tc_lock);
+
+ /* if tcon is marked for needing reconnect, update state */
+ if (tcon->need_reconnect)
+ tcon->status = TID_NEED_TCON;
+
if (tcon->status == TID_GOOD) {
spin_unlock(&tcon->tc_lock);
return 0;
diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c
index a8a1d386da65..449c59830039 100644
--- a/fs/smb/client/dfs.c
+++ b/fs/smb/client/dfs.c
@@ -565,6 +565,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
/* only send once per connect */
spin_lock(&tcon->tc_lock);
+
+ /* if tcon is marked for needing reconnect, update state */
+ if (tcon->need_reconnect)
+ tcon->status = TID_NEED_TCON;
+
if (tcon->status == TID_GOOD) {
spin_unlock(&tcon->tc_lock);
return 0;
@@ -625,8 +630,8 @@ out:
spin_lock(&tcon->tc_lock);
if (tcon->status == TID_IN_TCON)
tcon->status = TID_GOOD;
- spin_unlock(&tcon->tc_lock);
tcon->need_reconnect = false;
+ spin_unlock(&tcon->tc_lock);
}
return rc;
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 1b4262aff8fa..c3b8e7091a4d 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -87,7 +87,7 @@ void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len
continue;
if (!folio_test_writeback(folio)) {
WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio_index(folio), end);
+ len, start, folio->index, end);
continue;
}
@@ -120,7 +120,7 @@ void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len
continue;
if (!folio_test_writeback(folio)) {
WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio_index(folio), end);
+ len, start, folio->index, end);
continue;
}
@@ -151,7 +151,7 @@ void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int le
xas_for_each(&xas, folio, end) {
if (!folio_test_writeback(folio)) {
WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio_index(folio), end);
+ len, start, folio->index, end);
continue;
}
@@ -175,6 +175,9 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
/* only send once per connect */
spin_lock(&tcon->tc_lock);
+ if (tcon->need_reconnect)
+ tcon->status = TID_NEED_RECON;
+
if (tcon->status != TID_NEED_RECON) {
spin_unlock(&tcon->tc_lock);
return;
@@ -1312,20 +1315,20 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
down_read(&cinode->lock_sem);
exist = cifs_find_lock_conflict(cfile, offset, length, type,
- flock->fl_flags, &conf_lock,
+ flock->c.flc_flags, &conf_lock,
CIFS_LOCK_OP);
if (exist) {
flock->fl_start = conf_lock->offset;
flock->fl_end = conf_lock->offset + conf_lock->length - 1;
- flock->fl_pid = conf_lock->pid;
+ flock->c.flc_pid = conf_lock->pid;
if (conf_lock->type & server->vals->shared_lock_type)
- flock->fl_type = F_RDLCK;
+ flock->c.flc_type = F_RDLCK;
else
- flock->fl_type = F_WRLCK;
+ flock->c.flc_type = F_WRLCK;
} else if (!cinode->can_cache_brlcks)
rc = 1;
else
- flock->fl_type = F_UNLCK;
+ flock->c.flc_type = F_UNLCK;
up_read(&cinode->lock_sem);
return rc;
@@ -1401,16 +1404,16 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
{
int rc = 0;
struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
- unsigned char saved_type = flock->fl_type;
+ unsigned char saved_type = flock->c.flc_type;
- if ((flock->fl_flags & FL_POSIX) == 0)
+ if ((flock->c.flc_flags & FL_POSIX) == 0)
return 1;
down_read(&cinode->lock_sem);
posix_test_lock(file, flock);
- if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) {
- flock->fl_type = saved_type;
+ if (lock_is_unlock(flock) && !cinode->can_cache_brlcks) {
+ flock->c.flc_type = saved_type;
rc = 1;
}
@@ -1431,7 +1434,7 @@ cifs_posix_lock_set(struct file *file, struct file_lock *flock)
struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
int rc = FILE_LOCK_DEFERRED + 1;
- if ((flock->fl_flags & FL_POSIX) == 0)
+ if ((flock->c.flc_flags & FL_POSIX) == 0)
return rc;
cifs_down_write(&cinode->lock_sem);
@@ -1581,7 +1584,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
el = locks_to_send.next;
spin_lock(&flctx->flc_lock);
- list_for_each_entry(flock, &flctx->flc_posix, fl_list) {
+ for_each_file_lock(flock, &flctx->flc_posix) {
+ unsigned char ftype = flock->c.flc_type;
+
if (el == &locks_to_send) {
/*
* The list ended. We don't have enough allocated
@@ -1591,12 +1596,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
break;
}
length = cifs_flock_len(flock);
- if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)
+ if (ftype == F_RDLCK || ftype == F_SHLCK)
type = CIFS_RDLCK;
else
type = CIFS_WRLCK;
lck = list_entry(el, struct lock_to_push, llist);
- lck->pid = hash_lockowner(flock->fl_owner);
+ lck->pid = hash_lockowner(flock->c.flc_owner);
lck->netfid = cfile->fid.netfid;
lck->length = length;
lck->type = type;
@@ -1663,42 +1668,43 @@ static void
cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock,
bool *wait_flag, struct TCP_Server_Info *server)
{
- if (flock->fl_flags & FL_POSIX)
+ if (flock->c.flc_flags & FL_POSIX)
cifs_dbg(FYI, "Posix\n");
- if (flock->fl_flags & FL_FLOCK)
+ if (flock->c.flc_flags & FL_FLOCK)
cifs_dbg(FYI, "Flock\n");
- if (flock->fl_flags & FL_SLEEP) {
+ if (flock->c.flc_flags & FL_SLEEP) {
cifs_dbg(FYI, "Blocking lock\n");
*wait_flag = true;
}
- if (flock->fl_flags & FL_ACCESS)
+ if (flock->c.flc_flags & FL_ACCESS)
cifs_dbg(FYI, "Process suspended by mandatory locking - not implemented yet\n");
- if (flock->fl_flags & FL_LEASE)
+ if (flock->c.flc_flags & FL_LEASE)
cifs_dbg(FYI, "Lease on file - not implemented yet\n");
- if (flock->fl_flags &
+ if (flock->c.flc_flags &
(~(FL_POSIX | FL_FLOCK | FL_SLEEP |
FL_ACCESS | FL_LEASE | FL_CLOSE | FL_OFDLCK)))
- cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags);
+ cifs_dbg(FYI, "Unknown lock flags 0x%x\n",
+ flock->c.flc_flags);
*type = server->vals->large_lock_type;
- if (flock->fl_type == F_WRLCK) {
+ if (lock_is_write(flock)) {
cifs_dbg(FYI, "F_WRLCK\n");
*type |= server->vals->exclusive_lock_type;
*lock = 1;
- } else if (flock->fl_type == F_UNLCK) {
+ } else if (lock_is_unlock(flock)) {
cifs_dbg(FYI, "F_UNLCK\n");
*type |= server->vals->unlock_lock_type;
*unlock = 1;
/* Check if unlock includes more than one lock range */
- } else if (flock->fl_type == F_RDLCK) {
+ } else if (lock_is_read(flock)) {
cifs_dbg(FYI, "F_RDLCK\n");
*type |= server->vals->shared_lock_type;
*lock = 1;
- } else if (flock->fl_type == F_EXLCK) {
+ } else if (flock->c.flc_type == F_EXLCK) {
cifs_dbg(FYI, "F_EXLCK\n");
*type |= server->vals->exclusive_lock_type;
*lock = 1;
- } else if (flock->fl_type == F_SHLCK) {
+ } else if (flock->c.flc_type == F_SHLCK) {
cifs_dbg(FYI, "F_SHLCK\n");
*type |= server->vals->shared_lock_type;
*lock = 1;
@@ -1730,7 +1736,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
else
posix_lock_type = CIFS_WRLCK;
rc = CIFSSMBPosixLock(xid, tcon, netfid,
- hash_lockowner(flock->fl_owner),
+ hash_lockowner(flock->c.flc_owner),
flock->fl_start, length, flock,
posix_lock_type, wait_flag);
return rc;
@@ -1747,7 +1753,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
if (rc == 0) {
rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
type, 0, 1, false);
- flock->fl_type = F_UNLCK;
+ flock->c.flc_type = F_UNLCK;
if (rc != 0)
cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n",
rc);
@@ -1755,7 +1761,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
}
if (type & server->vals->shared_lock_type) {
- flock->fl_type = F_WRLCK;
+ flock->c.flc_type = F_WRLCK;
return 0;
}
@@ -1767,12 +1773,12 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
if (rc == 0) {
rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
type | server->vals->shared_lock_type, 0, 1, false);
- flock->fl_type = F_RDLCK;
+ flock->c.flc_type = F_RDLCK;
if (rc != 0)
cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n",
rc);
} else
- flock->fl_type = F_WRLCK;
+ flock->c.flc_type = F_WRLCK;
return 0;
}
@@ -1940,7 +1946,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
posix_lock_type = CIFS_UNLCK;
rc = CIFSSMBPosixLock(xid, tcon, cfile->fid.netfid,
- hash_lockowner(flock->fl_owner),
+ hash_lockowner(flock->c.flc_owner),
flock->fl_start, length,
NULL, posix_lock_type, wait_flag);
goto out;
@@ -1950,7 +1956,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
struct cifsLockInfo *lock;
lock = cifs_lock_init(flock->fl_start, length, type,
- flock->fl_flags);
+ flock->c.flc_flags);
if (!lock)
return -ENOMEM;
@@ -1989,7 +1995,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
rc = server->ops->mand_unlock_range(cfile, flock, xid);
out:
- if ((flock->fl_flags & FL_POSIX) || (flock->fl_flags & FL_FLOCK)) {
+ if ((flock->c.flc_flags & FL_POSIX) || (flock->c.flc_flags & FL_FLOCK)) {
/*
* If this is a request to remove all locks because we
* are closing the file, it doesn't matter if the
@@ -1998,7 +2004,7 @@ out:
*/
if (rc) {
cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc);
- if (!(flock->fl_flags & FL_CLOSE))
+ if (!(flock->c.flc_flags & FL_CLOSE))
return rc;
}
rc = locks_lock_file_wait(file, flock);
@@ -2019,7 +2025,7 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl)
xid = get_xid();
- if (!(fl->fl_flags & FL_FLOCK)) {
+ if (!(fl->c.flc_flags & FL_FLOCK)) {
rc = -ENOLCK;
free_xid(xid);
return rc;
@@ -2070,7 +2076,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
xid = get_xid();
cifs_dbg(FYI, "%s: %pD2 cmd=0x%x type=0x%x flags=0x%x r=%lld:%lld\n", __func__, file, cmd,
- flock->fl_flags, flock->fl_type, (long long)flock->fl_start,
+ flock->c.flc_flags, flock->c.flc_type,
+ (long long)flock->fl_start,
(long long)flock->fl_end);
cfile = (struct cifsFileInfo *)file->private_data;
@@ -2120,8 +2127,8 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
{
loff_t end_of_write = offset + bytes_written;
- if (end_of_write > cifsi->server_eof)
- cifsi->server_eof = end_of_write;
+ if (end_of_write > cifsi->netfs.remote_i_size)
+ netfs_resize_file(&cifsi->netfs, end_of_write, true);
}
static ssize_t
@@ -2651,7 +2658,7 @@ static void cifs_extend_writeback(struct address_space *mapping,
continue;
if (xa_is_value(folio))
break;
- if (folio_index(folio) != index)
+ if (folio->index != index)
break;
if (!folio_try_get_rcu(folio)) {
xas_reset(&xas);
@@ -2899,7 +2906,7 @@ redo_folio:
goto skip_write;
}
- if (folio_mapping(folio) != mapping ||
+ if (folio->mapping != mapping ||
!folio_test_dirty(folio)) {
start += folio_size(folio);
folio_unlock(folio);
@@ -2951,7 +2958,7 @@ skip_write:
continue;
}
- folio_batch_release(&fbatch);
+ folio_batch_release(&fbatch);
cond_resched();
} while (wbc->nr_to_write > 0);
@@ -3247,8 +3254,8 @@ cifs_uncached_writev_complete(struct work_struct *work)
spin_lock(&inode->i_lock);
cifs_update_eof(cifsi, wdata->offset, wdata->bytes);
- if (cifsi->server_eof > inode->i_size)
- i_size_write(inode, cifsi->server_eof);
+ if (cifsi->netfs.remote_i_size > inode->i_size)
+ i_size_write(inode, cifsi->netfs.remote_i_size);
spin_unlock(&inode->i_lock);
complete(&wdata->done);
@@ -3300,6 +3307,7 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
if (wdata->cfile->invalidHandle)
rc = -EAGAIN;
else {
+ wdata->replay = true;
#ifdef CONFIG_CIFS_SMB_DIRECT
if (wdata->mr) {
wdata->mr->need_invalidate = true;
@@ -5043,27 +5051,13 @@ static void cifs_swap_deactivate(struct file *file)
/* do we need to unpin (or unlock) the file */
}
-/*
- * Mark a page as having been made dirty and thus needing writeback. We also
- * need to pin the cache object to write back to.
- */
-#ifdef CONFIG_CIFS_FSCACHE
-static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio)
-{
- return fscache_dirty_folio(mapping, folio,
- cifs_inode_cookie(mapping->host));
-}
-#else
-#define cifs_dirty_folio filemap_dirty_folio
-#endif
-
const struct address_space_operations cifs_addr_ops = {
.read_folio = cifs_read_folio,
.readahead = cifs_readahead,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
- .dirty_folio = cifs_dirty_folio,
+ .dirty_folio = netfs_dirty_folio,
.release_folio = cifs_release_folio,
.direct_IO = cifs_direct_io,
.invalidate_folio = cifs_invalidate_folio,
@@ -5087,7 +5081,7 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
- .dirty_folio = cifs_dirty_folio,
+ .dirty_folio = netfs_dirty_folio,
.release_folio = cifs_release_folio,
.invalidate_folio = cifs_invalidate_folio,
.launder_folio = cifs_launder_folio,
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index a3493da12ad1..4b2f5aa2ea0e 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -139,6 +139,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_u32("dir_mode", Opt_dirmode),
fsparam_u32("port", Opt_port),
fsparam_u32("min_enc_offload", Opt_min_enc_offload),
+ fsparam_u32("retrans", Opt_retrans),
fsparam_u32("esize", Opt_min_enc_offload),
fsparam_u32("bsize", Opt_blocksize),
fsparam_u32("rasize", Opt_rasize),
@@ -210,7 +211,7 @@ cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_c
switch (match_token(value, cifs_secflavor_tokens, args)) {
case Opt_sec_krb5p:
- cifs_errorf(fc, "sec=krb5p is not supported!\n");
+ cifs_errorf(fc, "sec=krb5p is not supported. Use sec=krb5,seal instead\n");
return 1;
case Opt_sec_krb5i:
ctx->sign = true;
@@ -1064,6 +1065,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_min_enc_offload:
ctx->min_offload = result.uint_32;
break;
+ case Opt_retrans:
+ ctx->retrans = result.uint_32;
+ break;
case Opt_blocksize:
/*
* inode blocksize realistically should never need to be
@@ -1107,6 +1111,17 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_wsize:
ctx->wsize = result.uint_32;
ctx->got_wsize = true;
+ if (ctx->wsize % PAGE_SIZE != 0) {
+ ctx->wsize = round_down(ctx->wsize, PAGE_SIZE);
+ if (ctx->wsize == 0) {
+ ctx->wsize = PAGE_SIZE;
+ cifs_dbg(VFS, "wsize too small, reset to minimum %ld\n", PAGE_SIZE);
+ } else {
+ cifs_dbg(VFS,
+ "wsize rounded down to %d to multiple of PAGE_SIZE %ld\n",
+ ctx->wsize, PAGE_SIZE);
+ }
+ }
break;
case Opt_acregmax:
ctx->acregmax = HZ * result.uint_32;
@@ -1619,6 +1634,8 @@ int smb3_init_fs_context(struct fs_context *fc)
ctx->backupuid_specified = false; /* no backup intent for a user */
ctx->backupgid_specified = false; /* no backup intent for a group */
+ ctx->retrans = 1;
+
/*
* short int override_uid = -1;
* short int override_gid = -1;
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index cf46916286d0..182ce11cbe93 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -118,6 +118,7 @@ enum cifs_param {
Opt_file_mode,
Opt_dirmode,
Opt_min_enc_offload,
+ Opt_retrans,
Opt_blocksize,
Opt_rasize,
Opt_rsize,
@@ -245,6 +246,7 @@ struct smb3_fs_context {
unsigned int rsize;
unsigned int wsize;
unsigned int min_offload;
+ unsigned int retrans;
bool sockopt_tcp_nodelay:1;
/* attribute cache timemout for files and directories in jiffies */
unsigned long acregmax;
diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c
index e5cad149f5a2..c4a3cb736881 100644
--- a/fs/smb/client/fscache.c
+++ b/fs/smb/client/fscache.c
@@ -180,7 +180,7 @@ static int fscache_fallback_write_pages(struct inode *inode, loff_t start, size_
if (ret < 0)
return ret;
- ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode),
+ ret = cres.ops->prepare_write(&cres, &start, &len, len, i_size_read(inode),
no_space_allocated_yet);
if (ret == 0)
ret = fscache_write(&cres, start, &iter, NULL, NULL);
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 9f37c1758f73..d02f8ba29cb5 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -104,7 +104,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode);
mtime = inode_get_mtime(inode);
if (timespec64_equal(&mtime, &fattr->cf_mtime) &&
- cifs_i->server_eof == fattr->cf_eof) {
+ cifs_i->netfs.remote_i_size == fattr->cf_eof) {
cifs_dbg(FYI, "%s: inode %llu is unchanged\n",
__func__, cifs_i->uniqueid);
return;
@@ -194,7 +194,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
else
clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags);
- cifs_i->server_eof = fattr->cf_eof;
+ cifs_i->netfs.remote_i_size = fattr->cf_eof;
/*
* Can't safely change the file size here if the client is writing to
* it due to potential races.
@@ -665,8 +665,6 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
/* Fill a cifs_fattr struct with info from POSIX info struct */
static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr,
struct cifs_open_info_data *data,
- struct cifs_sid *owner,
- struct cifs_sid *group,
struct super_block *sb)
{
struct smb311_posix_qinfo *info = &data->posix_fi;
@@ -722,8 +720,8 @@ out_reparse:
fattr->cf_symlink_target = data->symlink_target;
data->symlink_target = NULL;
}
- sid_to_id(cifs_sb, owner, fattr, SIDOWNER);
- sid_to_id(cifs_sb, group, fattr, SIDGROUP);
+ sid_to_id(cifs_sb, &data->posix_owner, fattr, SIDOWNER);
+ sid_to_id(cifs_sb, &data->posix_group, fattr, SIDGROUP);
cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n",
fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink);
@@ -1070,9 +1068,7 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
const unsigned int xid,
struct cifs_tcon *tcon,
const char *full_path,
- struct cifs_fattr *fattr,
- struct cifs_sid *owner,
- struct cifs_sid *group)
+ struct cifs_fattr *fattr)
{
struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -1117,7 +1113,7 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
}
if (tcon->posix_extensions)
- smb311_posix_info_to_fattr(fattr, data, owner, group, sb);
+ smb311_posix_info_to_fattr(fattr, data, sb);
else
cifs_open_info_to_fattr(fattr, data, sb);
out:
@@ -1171,8 +1167,7 @@ static int cifs_get_fattr(struct cifs_open_info_data *data,
*/
if (cifs_open_data_reparse(data)) {
rc = reparse_info_to_fattr(data, sb, xid, tcon,
- full_path, fattr,
- NULL, NULL);
+ full_path, fattr);
} else {
cifs_open_info_to_fattr(fattr, data, sb);
}
@@ -1317,10 +1312,10 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data,
const unsigned int xid)
{
struct cifs_open_info_data tmp_data = {};
+ struct TCP_Server_Info *server;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon;
struct tcon_link *tlink;
- struct cifs_sid owner, group;
int tmprc;
int rc = 0;
@@ -1328,14 +1323,14 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data,
if (IS_ERR(tlink))
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
/*
* 1. Fetch file metadata if not provided (data)
*/
if (!data) {
- rc = smb311_posix_query_path_info(xid, tcon, cifs_sb,
- full_path, &tmp_data,
- &owner, &group);
+ rc = server->ops->query_path_info(xid, tcon, cifs_sb,
+ full_path, &tmp_data);
data = &tmp_data;
}
@@ -1347,11 +1342,9 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data,
case 0:
if (cifs_open_data_reparse(data)) {
rc = reparse_info_to_fattr(data, sb, xid, tcon,
- full_path, fattr,
- &owner, &group);
+ full_path, fattr);
} else {
- smb311_posix_info_to_fattr(fattr, data,
- &owner, &group, sb);
+ smb311_posix_info_to_fattr(fattr, data, sb);
}
break;
case -EREMOTE:
@@ -2865,7 +2858,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
set_size_out:
if (rc == 0) {
- cifsInode->server_eof = attrs->ia_size;
+ netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true);
cifs_setsize(inode, attrs->ia_size);
/*
* i_blocks is not related to (i_size / i_blksize), but instead
@@ -3018,6 +3011,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
if ((attrs->ia_valid & ATTR_SIZE) &&
attrs->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attrs->ia_size);
+ netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true);
fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size);
}
@@ -3217,6 +3211,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
if ((attrs->ia_valid & ATTR_SIZE) &&
attrs->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attrs->ia_size);
+ netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true);
fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size);
}
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index c2137ea3c253..0748d7b757b9 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -140,6 +140,7 @@ tcon_info_alloc(bool dir_leases_enabled)
spin_lock_init(&ret_buf->stat_lock);
atomic_set(&ret_buf->num_local_opens, 0);
atomic_set(&ret_buf->num_remote_opens, 0);
+ ret_buf->stats_from_time = ktime_get_real_seconds();
#ifdef CONFIG_CIFS_DFS_UPCALL
INIT_LIST_HEAD(&ret_buf->dfs_ses_list);
#endif
diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c
index a6968573b775..4a517b280f2b 100644
--- a/fs/smb/client/namespace.c
+++ b/fs/smb/client/namespace.c
@@ -168,6 +168,21 @@ static char *automount_fullpath(struct dentry *dentry, void *page)
return s;
}
+static void fs_context_set_ids(struct smb3_fs_context *ctx)
+{
+ kuid_t uid = current_fsuid();
+ kgid_t gid = current_fsgid();
+
+ if (ctx->multiuser) {
+ if (!ctx->uid_specified)
+ ctx->linux_uid = uid;
+ if (!ctx->gid_specified)
+ ctx->linux_gid = gid;
+ }
+ if (!ctx->cruid_specified)
+ ctx->cred_uid = uid;
+}
+
/*
* Create a vfsmount that we can automount
*/
@@ -205,6 +220,7 @@ static struct vfsmount *cifs_do_automount(struct path *path)
tmp.leaf_fullpath = NULL;
tmp.UNC = tmp.prepath = NULL;
tmp.dfs_root_ses = NULL;
+ fs_context_set_ids(&tmp);
rc = smb3_fs_context_dup(ctx, &tmp);
if (rc) {
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index 056cae1ddcce..b520eea7bfce 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -133,15 +133,15 @@ retry:
* Query dir responses don't provide enough
* information about reparse points other than
* their reparse tags. Save an invalidation by
- * not clobbering the existing mode, size and
- * symlink target (if any) when reparse tag and
- * ctime haven't changed.
+ * not clobbering some existing attributes when
+ * reparse tag and ctime haven't changed.
*/
rc = 0;
if (fattr->cf_cifsattrs & ATTR_REPARSE) {
if (likely(reparse_inode_match(inode, fattr))) {
fattr->cf_mode = inode->i_mode;
- fattr->cf_eof = CIFS_I(inode)->server_eof;
+ fattr->cf_rdev = inode->i_rdev;
+ fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size;
fattr->cf_symlink_target = NULL;
} else {
CIFS_I(inode)->time = 0;
@@ -307,14 +307,16 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
}
static void cifs_fulldir_info_to_fattr(struct cifs_fattr *fattr,
- SEARCH_ID_FULL_DIR_INFO *info,
+ const void *info,
struct cifs_sb_info *cifs_sb)
{
+ const FILE_FULL_DIRECTORY_INFO *di = info;
+
__dir_info_to_fattr(fattr, info);
- /* See MS-FSCC 2.4.19 FileIdFullDirectoryInformation */
+ /* See MS-FSCC 2.4.14, 2.4.19 */
if (fattr->cf_cifsattrs & ATTR_REPARSE)
- fattr->cf_cifstag = le32_to_cpu(info->EaSize);
+ fattr->cf_cifstag = le32_to_cpu(di->EaSize);
cifs_fill_common_info(fattr, cifs_sb);
}
@@ -396,7 +398,7 @@ ffirst_retry:
} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
} else /* not srvinos - BB fixme add check for backlevel? */ {
- cifsFile->srch_inf.info_level = SMB_FIND_FILE_DIRECTORY_INFO;
+ cifsFile->srch_inf.info_level = SMB_FIND_FILE_FULL_DIRECTORY_INFO;
}
search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME;
@@ -645,10 +647,10 @@ static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode)
static int is_dir_changed(struct file *file)
{
struct inode *inode = file_inode(file);
- struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
+ struct cifsInodeInfo *cifs_inode_info = CIFS_I(inode);
- if (cifsInfo->time == 0)
- return 1; /* directory was changed, perhaps due to unlink */
+ if (cifs_inode_info->time == 0)
+ return 1; /* directory was changed, e.g. unlink or new file */
else
return 0;
@@ -987,10 +989,9 @@ static int cifs_filldir(char *find_entry, struct file *file,
(FIND_FILE_STANDARD_INFO *)find_entry,
cifs_sb);
break;
+ case SMB_FIND_FILE_FULL_DIRECTORY_INFO:
case SMB_FIND_FILE_ID_FULL_DIR_INFO:
- cifs_fulldir_info_to_fattr(&fattr,
- (SEARCH_ID_FULL_DIR_INFO *)find_entry,
- cifs_sb);
+ cifs_fulldir_info_to_fattr(&fattr, find_entry, cifs_sb);
break;
default:
cifs_dir_info_to_fattr(&fattr,
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index cde81042bebd..8f37373fd333 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -75,6 +75,10 @@ cifs_ses_get_chan_index(struct cifs_ses *ses,
{
unsigned int i;
+ /* if the channel is waiting for termination */
+ if (server && server->terminate)
+ return CIFS_INVAL_CHAN_INDEX;
+
for (i = 0; i < ses->chan_count; i++) {
if (ses->chans[i].server == server)
return i;
@@ -84,7 +88,6 @@ cifs_ses_get_chan_index(struct cifs_ses *ses,
if (server)
cifs_dbg(VFS, "unable to get chan index for server: 0x%llx",
server->conn_id);
- WARN_ON(1);
return CIFS_INVAL_CHAN_INDEX;
}
@@ -269,6 +272,8 @@ int cifs_try_adding_channels(struct cifs_ses *ses)
&iface->sockaddr,
rc);
kref_put(&iface->refcount, release_iface);
+ /* failure to add chan should increase weight */
+ iface->weight_fulfilled++;
continue;
}
diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c
index e0ee96d69d49..c23478ab1cf8 100644
--- a/fs/smb/client/smb2file.c
+++ b/fs/smb/client/smb2file.c
@@ -228,7 +228,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
* flock and OFD lock are associated with an open
* file description, not the process.
*/
- if (!(flock->fl_flags & (FL_FLOCK | FL_OFDLCK)))
+ if (!(flock->c.flc_flags & (FL_FLOCK | FL_OFDLCK)))
continue;
if (cinode->can_cache_brlcks) {
/*
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 5053a5550abe..05818cd6d932 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -56,6 +56,35 @@ static inline __u32 file_create_options(struct dentry *dentry)
return 0;
}
+/* Parse owner and group from SMB3.1.1 POSIX query info */
+static int parse_posix_sids(struct cifs_open_info_data *data,
+ struct kvec *rsp_iov)
+{
+ struct smb2_query_info_rsp *qi = rsp_iov->iov_base;
+ unsigned int out_len = le32_to_cpu(qi->OutputBufferLength);
+ unsigned int qi_len = sizeof(data->posix_fi);
+ int owner_len, group_len;
+ u8 *sidsbuf, *sidsbuf_end;
+
+ if (out_len <= qi_len)
+ return -EINVAL;
+
+ sidsbuf = (u8 *)qi + le16_to_cpu(qi->OutputBufferOffset) + qi_len;
+ sidsbuf_end = sidsbuf + out_len - qi_len;
+
+ owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end);
+ if (owner_len == -1)
+ return -EINVAL;
+
+ memcpy(&data->posix_owner, sidsbuf, owner_len);
+ group_len = posix_info_sid_size(sidsbuf + owner_len, sidsbuf_end);
+ if (group_len == -1)
+ return -EINVAL;
+
+ memcpy(&data->posix_group, sidsbuf + owner_len, group_len);
+ return 0;
+}
+
/*
* note: If cfile is passed, the reference to it is dropped here.
* So make sure that you do not reuse cfile after return from this func.
@@ -69,7 +98,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
__u32 desired_access, __u32 create_disposition,
__u32 create_options, umode_t mode, struct kvec *in_iov,
int *cmds, int num_cmds, struct cifsFileInfo *cfile,
- __u8 **extbuf, size_t *extbuflen,
struct kvec *out_iov, int *out_buftype)
{
@@ -92,6 +120,14 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int size[2];
void *data[2];
int len;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ num_rqst = 0;
+ server = cifs_pick_channel(ses);
vars = kzalloc(sizeof(*vars), GFP_ATOMIC);
if (vars == NULL)
@@ -99,8 +135,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
rqst = &vars->rqst[0];
rsp_iov = &vars->rsp_iov[0];
- server = cifs_pick_channel(ses);
-
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -435,15 +469,24 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
num_rqst++;
if (cfile) {
+ if (retries)
+ for (i = 1; i < num_rqst - 2; i++)
+ smb2_set_replay(server, &rqst[i]);
+
rc = compound_send_recv(xid, ses, server,
flags, num_rqst - 2,
&rqst[1], &resp_buftype[1],
&rsp_iov[1]);
- } else
+ } else {
+ if (retries)
+ for (i = 0; i < num_rqst; i++)
+ smb2_set_replay(server, &rqst[i]);
+
rc = compound_send_recv(xid, ses, server,
flags, num_rqst,
rqst, resp_buftype,
rsp_iov);
+ }
finished:
num_rqst = 0;
@@ -494,21 +537,9 @@ finished:
&rsp_iov[i + 1], sizeof(idata->posix_fi) /* add SIDs */,
(char *)&idata->posix_fi);
}
- if (rc == 0) {
- unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength);
-
- if (length > sizeof(idata->posix_fi)) {
- char *base = (char *)rsp_iov[i + 1].iov_base +
- le16_to_cpu(qi_rsp->OutputBufferOffset) +
- sizeof(idata->posix_fi);
- *extbuflen = length - sizeof(idata->posix_fi);
- *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL);
- if (!*extbuf)
- rc = -ENOMEM;
- } else {
- rc = -EINVAL;
- }
- }
+ if (rc == 0)
+ rc = parse_posix_sids(idata, &rsp_iov[i + 1]);
+
SMB2_query_info_free(&rqst[num_rqst++]);
if (rc)
trace_smb3_posix_query_info_compound_err(xid, ses->Suid,
@@ -604,9 +635,6 @@ finished:
}
SMB2_close_free(&rqst[num_rqst]);
- if (cfile)
- cifsFileInfo_put(cfile);
-
num_cmds += 2;
if (out_iov && out_buftype) {
memcpy(out_iov, rsp_iov, num_cmds * sizeof(*out_iov));
@@ -616,7 +644,16 @@ finished:
for (i = 0; i < num_cmds; i++)
free_rsp_buf(resp_buftype[i], rsp_iov[i].iov_base);
}
+ num_cmds -= 2; /* correct num_cmds as there could be a retry */
kfree(vars);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
+ if (cfile)
+ cifsFileInfo_put(cfile);
+
return rc;
}
@@ -662,7 +699,7 @@ int smb2_query_path_info(const unsigned int xid,
struct smb2_hdr *hdr;
struct kvec in_iov[2], out_iov[3] = {};
int out_buftype[3] = {};
- int cmds[2] = { SMB2_OP_QUERY_INFO, };
+ int cmds[2];
bool islink;
int i, num_cmds;
int rc, rc2;
@@ -670,20 +707,36 @@ int smb2_query_path_info(const unsigned int xid,
data->adjust_tz = false;
data->reparse_point = false;
- if (strcmp(full_path, ""))
- rc = -ENOENT;
- else
- rc = open_cached_dir(xid, tcon, full_path, cifs_sb, false, &cfid);
- /* If it is a root and its handle is cached then use it */
- if (!rc) {
- if (cfid->file_all_info_is_valid) {
- memcpy(&data->fi, &cfid->file_all_info, sizeof(data->fi));
+ /*
+ * BB TODO: Add support for using cached root handle in SMB3.1.1 POSIX.
+ * Create SMB2_query_posix_info worker function to do non-compounded
+ * query when we already have an open file handle for this. For now this
+ * is fast enough (always using the compounded version).
+ */
+ if (!tcon->posix_extensions) {
+ if (*full_path) {
+ rc = -ENOENT;
} else {
- rc = SMB2_query_info(xid, tcon, cfid->fid.persistent_fid,
- cfid->fid.volatile_fid, &data->fi);
+ rc = open_cached_dir(xid, tcon, full_path,
+ cifs_sb, false, &cfid);
+ }
+ /* If it is a root and its handle is cached then use it */
+ if (!rc) {
+ if (cfid->file_all_info_is_valid) {
+ memcpy(&data->fi, &cfid->file_all_info,
+ sizeof(data->fi));
+ } else {
+ rc = SMB2_query_info(xid, tcon,
+ cfid->fid.persistent_fid,
+ cfid->fid.volatile_fid,
+ &data->fi);
+ }
+ close_cached_dir(cfid);
+ return rc;
}
- close_cached_dir(cfid);
- return rc;
+ cmds[0] = SMB2_OP_QUERY_INFO;
+ } else {
+ cmds[0] = SMB2_OP_POSIX_QUERY_INFO;
}
in_iov[0].iov_base = data;
@@ -693,9 +746,8 @@ int smb2_query_path_info(const unsigned int xid,
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE,
- in_iov, cmds, 1, cfile,
- NULL, NULL, out_iov, out_buftype);
+ create_options, ACL_NO_MODE, in_iov,
+ cmds, 1, cfile, out_iov, out_buftype);
hdr = out_iov[0].iov_base;
/*
* If first iov is unset, then SMB session was dropped or we've got a
@@ -707,6 +759,10 @@ int smb2_query_path_info(const unsigned int xid,
switch (rc) {
case 0:
case -EOPNOTSUPP:
+ /*
+ * BB TODO: When support for special files added to Samba
+ * re-verify this path.
+ */
rc = parse_create_response(data, cifs_sb, &out_iov[0]);
if (rc || !data->reparse_point)
goto out;
@@ -722,8 +778,8 @@ int smb2_query_path_info(const unsigned int xid,
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE, in_iov, cmds,
- num_cmds, cfile, NULL, NULL, NULL, NULL);
+ create_options, ACL_NO_MODE, in_iov,
+ cmds, num_cmds, cfile, NULL, NULL);
break;
case -EREMOTE:
break;
@@ -746,101 +802,6 @@ out:
return rc;
}
-int smb311_posix_query_path_info(const unsigned int xid,
- struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct cifs_open_info_data *data,
- struct cifs_sid *owner,
- struct cifs_sid *group)
-{
- int rc;
- __u32 create_options = 0;
- struct cifsFileInfo *cfile;
- struct kvec in_iov[2], out_iov[3] = {};
- int out_buftype[3] = {};
- __u8 *sidsbuf = NULL;
- __u8 *sidsbuf_end = NULL;
- size_t sidsbuflen = 0;
- size_t owner_len, group_len;
- int cmds[2] = { SMB2_OP_POSIX_QUERY_INFO, };
- int i, num_cmds;
-
- data->adjust_tz = false;
- data->reparse_point = false;
-
- /*
- * BB TODO: Add support for using the cached root handle.
- * Create SMB2_query_posix_info worker function to do non-compounded query
- * when we already have an open file handle for this. For now this is fast enough
- * (always using the compounded version).
- */
- in_iov[0].iov_base = data;
- in_iov[0].iov_len = sizeof(*data);
- in_iov[1] = in_iov[0];
-
- cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE, in_iov, cmds, 1,
- cfile, &sidsbuf, &sidsbuflen, out_iov, out_buftype);
- /*
- * If first iov is unset, then SMB session was dropped or we've got a
- * cached open file (@cfile).
- */
- if (!out_iov[0].iov_base || out_buftype[0] == CIFS_NO_BUFFER)
- goto out;
-
- switch (rc) {
- case 0:
- case -EOPNOTSUPP:
- /* BB TODO: When support for special files added to Samba re-verify this path */
- rc = parse_create_response(data, cifs_sb, &out_iov[0]);
- if (rc || !data->reparse_point)
- goto out;
-
- if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK) {
- /* symlink already parsed in create response */
- num_cmds = 1;
- } else {
- cmds[1] = SMB2_OP_GET_REPARSE;
- num_cmds = 2;
- }
- create_options |= OPEN_REPARSE_POINT;
- cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE, in_iov, cmds,
- num_cmds, cfile, &sidsbuf, &sidsbuflen, NULL, NULL);
- break;
- }
-
-out:
- if (rc == 0) {
- sidsbuf_end = sidsbuf + sidsbuflen;
-
- owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end);
- if (owner_len == -1) {
- rc = -EINVAL;
- goto out;
- }
- memcpy(owner, sidsbuf, owner_len);
-
- group_len = posix_info_sid_size(
- sidsbuf + owner_len, sidsbuf_end);
- if (group_len == -1) {
- rc = -EINVAL;
- goto out;
- }
- memcpy(group, sidsbuf + owner_len, group_len);
- }
-
- kfree(sidsbuf);
- for (i = 0; i < ARRAY_SIZE(out_buftype); i++)
- free_rsp_buf(out_buftype[i], out_iov[i].iov_base);
- return rc;
-}
-
int
smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
struct cifs_tcon *tcon, const char *name,
@@ -848,9 +809,9 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
{
return smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
- CREATE_NOT_FILE, mode, NULL,
- &(int){SMB2_OP_MKDIR}, 1,
- NULL, NULL, NULL, NULL, NULL);
+ CREATE_NOT_FILE, mode,
+ NULL, &(int){SMB2_OP_MKDIR}, 1,
+ NULL, NULL, NULL);
}
void
@@ -875,7 +836,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, ACL_NO_MODE, &in_iov,
&(int){SMB2_OP_SET_INFO}, 1,
- cfile, NULL, NULL, NULL, NULL);
+ cfile, NULL, NULL);
if (tmprc == 0)
cifs_i->cifsAttrs = dosattrs;
}
@@ -887,8 +848,9 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
drop_cached_dir_by_name(xid, tcon, name, cifs_sb);
return smb2_compound_op(xid, tcon, cifs_sb, name,
DELETE, FILE_OPEN, CREATE_NOT_FILE,
- ACL_NO_MODE, NULL, &(int){SMB2_OP_RMDIR}, 1,
- NULL, NULL, NULL, NULL, NULL);
+ ACL_NO_MODE, NULL,
+ &(int){SMB2_OP_RMDIR}, 1,
+ NULL, NULL, NULL);
}
int
@@ -897,8 +859,9 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
{
return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- ACL_NO_MODE, NULL, &(int){SMB2_OP_DELETE}, 1,
- NULL, NULL, NULL, NULL, NULL);
+ ACL_NO_MODE, NULL,
+ &(int){SMB2_OP_DELETE}, 1,
+ NULL, NULL, NULL);
}
static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
@@ -919,8 +882,8 @@ static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
in_iov.iov_base = smb2_to_name;
in_iov.iov_len = 2 * UniStrnlen((wchar_t *)smb2_to_name, PATH_MAX);
rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
- FILE_OPEN, create_options, ACL_NO_MODE, &in_iov,
- &command, 1, cfile, NULL, NULL, NULL, NULL);
+ FILE_OPEN, create_options, ACL_NO_MODE,
+ &in_iov, &command, 1, cfile, NULL, NULL);
smb2_rename_path:
kfree(smb2_to_name);
return rc;
@@ -971,7 +934,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
FILE_WRITE_DATA, FILE_OPEN,
0, ACL_NO_MODE, &in_iov,
&(int){SMB2_OP_SET_EOF}, 1,
- cfile, NULL, NULL, NULL, NULL);
+ cfile, NULL, NULL);
}
int
@@ -999,8 +962,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_WRITE_ATTRIBUTES, FILE_OPEN,
0, ACL_NO_MODE, &in_iov,
- &(int){SMB2_OP_SET_INFO}, 1, cfile,
- NULL, NULL, NULL, NULL);
+ &(int){SMB2_OP_SET_INFO}, 1,
+ cfile, NULL, NULL);
cifs_put_tlink(tlink);
return rc;
}
@@ -1035,7 +998,7 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
da, cd, co, ACL_NO_MODE, in_iov,
- cmds, 2, cfile, NULL, NULL, NULL, NULL);
+ cmds, 2, cfile, NULL, NULL);
if (!rc) {
rc = smb311_posix_get_inode_info(&new, full_path,
data, sb, xid);
@@ -1045,7 +1008,7 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
da, cd, co, ACL_NO_MODE, in_iov,
- cmds, 2, cfile, NULL, NULL, NULL, NULL);
+ cmds, 2, cfile, NULL, NULL);
if (!rc) {
rc = cifs_get_inode_info(&new, full_path,
data, sb, xid, NULL);
@@ -1072,8 +1035,8 @@ int smb2_query_reparse_point(const unsigned int xid,
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_READ_ATTRIBUTES, FILE_OPEN,
OPEN_REPARSE_POINT, ACL_NO_MODE, &in_iov,
- &(int){SMB2_OP_GET_REPARSE}, 1, cfile,
- NULL, NULL, NULL, NULL);
+ &(int){SMB2_OP_GET_REPARSE}, 1,
+ cfile, NULL, NULL);
if (rc)
goto out;
diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c
index 1a90dd78b238..ac1895358908 100644
--- a/fs/smb/client/smb2maperror.c
+++ b/fs/smb/client/smb2maperror.c
@@ -1210,6 +1210,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_INVALID_TASK_INDEX, -EIO, "STATUS_INVALID_TASK_INDEX"},
{STATUS_THREAD_ALREADY_IN_TASK, -EIO, "STATUS_THREAD_ALREADY_IN_TASK"},
{STATUS_CALLBACK_BYPASS, -EIO, "STATUS_CALLBACK_BYPASS"},
+ {STATUS_SERVER_UNAVAILABLE, -EAGAIN, "STATUS_SERVER_UNAVAILABLE"},
+ {STATUS_FILE_NOT_AVAILABLE, -EAGAIN, "STATUS_FILE_NOT_AVAILABLE"},
{STATUS_PORT_CLOSED, -EIO, "STATUS_PORT_CLOSED"},
{STATUS_MESSAGE_LOST, -EIO, "STATUS_MESSAGE_LOST"},
{STATUS_INVALID_MESSAGE, -EIO, "STATUS_INVALID_MESSAGE"},
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 01a5bd7e6a30..4695433fcf39 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -614,11 +614,12 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
"multichannel not available\n"
"Empty network interface list returned by server %s\n",
ses->server->hostname);
- rc = -EINVAL;
+ rc = -EOPNOTSUPP;
+ ses->iface_last_update = jiffies;
goto out;
}
- while (bytes_left >= sizeof(*p)) {
+ while (bytes_left >= (ssize_t)sizeof(*p)) {
memset(&tmp_iface, 0, sizeof(tmp_iface));
tmp_iface.speed = le64_to_cpu(p->LinkSpeed);
tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
@@ -712,7 +713,6 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
ses->iface_count++;
spin_unlock(&ses->iface_lock);
- ses->iface_last_update = jiffies;
next_iface:
nb_iface++;
next = le32_to_cpu(p->Next);
@@ -734,11 +734,7 @@ next_iface:
if ((bytes_left > 8) || p->Next)
cifs_dbg(VFS, "%s: incomplete interface info\n", __func__);
-
- if (!ses->iface_count) {
- rc = -EINVAL;
- goto out;
- }
+ ses->iface_last_update = jiffies;
out:
/*
@@ -1112,7 +1108,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
{
struct smb2_compound_vars *vars;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
struct smb_rqst *rqst;
struct kvec *rsp_iov;
__le16 *utf16_path = NULL;
@@ -1128,6 +1124,13 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
struct smb2_file_full_ea_info *ea = NULL;
struct smb2_query_info_rsp *rsp;
int rc, used_len = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = CIFS_CP_CREATE_CLOSE_OP;
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ server = cifs_pick_channel(ses);
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -1201,6 +1204,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
.disposition = FILE_OPEN,
.create_options = cifs_create_options(cifs_sb, 0),
.fid = &fid,
+ .replay = !!(retries),
};
rc = SMB2_open_init(tcon, server,
@@ -1248,6 +1252,12 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
goto sea_exit;
smb2_set_related(&rqst[2]);
+ if (retries) {
+ smb2_set_replay(server, &rqst[0]);
+ smb2_set_replay(server, &rqst[1]);
+ smb2_set_replay(server, &rqst[2]);
+ }
+
rc = compound_send_recv(xid, ses, server,
flags, 3, rqst,
resp_buftype, rsp_iov);
@@ -1264,6 +1274,11 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
kfree(vars);
out_free_path:
kfree(utf16_path);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
#endif
@@ -1488,7 +1503,7 @@ smb2_ioctl_query_info(const unsigned int xid,
struct smb_rqst *rqst;
struct kvec *rsp_iov;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
char __user *arg = (char __user *)p;
struct smb_query_info qi;
struct smb_query_info __user *pqi;
@@ -1505,6 +1520,13 @@ smb2_ioctl_query_info(const unsigned int xid,
void *data[2];
int create_options = is_dir ? CREATE_NOT_FILE : CREATE_NOT_DIR;
void (*free_req1_func)(struct smb_rqst *r);
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = CIFS_CP_CREATE_CLOSE_OP;
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ server = cifs_pick_channel(ses);
vars = kzalloc(sizeof(*vars), GFP_ATOMIC);
if (vars == NULL)
@@ -1548,6 +1570,7 @@ smb2_ioctl_query_info(const unsigned int xid,
.disposition = FILE_OPEN,
.create_options = cifs_create_options(cifs_sb, create_options),
.fid = &fid,
+ .replay = !!(retries),
};
if (qi.flags & PASSTHRU_FSCTL) {
@@ -1645,6 +1668,12 @@ smb2_ioctl_query_info(const unsigned int xid,
goto free_req_1;
smb2_set_related(&rqst[2]);
+ if (retries) {
+ smb2_set_replay(server, &rqst[0]);
+ smb2_set_replay(server, &rqst[1]);
+ smb2_set_replay(server, &rqst[2]);
+ }
+
rc = compound_send_recv(xid, ses, server,
flags, 3, rqst,
resp_buftype, rsp_iov);
@@ -1705,6 +1734,11 @@ free_output_buffer:
kfree(buffer);
free_vars:
kfree(vars);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -2231,8 +2265,14 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct smb2_query_directory_rsp *qd_rsp = NULL;
struct smb2_create_rsp *op_rsp = NULL;
- struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
- int retry_count = 0;
+ struct TCP_Server_Info *server;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ server = cifs_pick_channel(tcon->ses);
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (!utf16_path)
@@ -2257,6 +2297,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
.disposition = FILE_OPEN,
.create_options = cifs_create_options(cifs_sb, 0),
.fid = fid,
+ .replay = !!(retries),
};
rc = SMB2_open_init(tcon, server,
@@ -2282,14 +2323,15 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
smb2_set_related(&rqst[1]);
-again:
+ if (retries) {
+ smb2_set_replay(server, &rqst[0]);
+ smb2_set_replay(server, &rqst[1]);
+ }
+
rc = compound_send_recv(xid, tcon->ses, server,
flags, 2, rqst,
resp_buftype, rsp_iov);
- if (rc == -EAGAIN && retry_count++ < 10)
- goto again;
-
/* If the open failed there is nothing to do */
op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
if (op_rsp == NULL || op_rsp->hdr.Status != STATUS_SUCCESS) {
@@ -2337,6 +2379,11 @@ again:
SMB2_query_directory_free(&rqst[1]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -2462,6 +2509,22 @@ smb2_oplock_response(struct cifs_tcon *tcon, __u64 persistent_fid,
}
void
+smb2_set_replay(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+{
+ struct smb2_hdr *shdr;
+
+ if (server->dialect < SMB30_PROT_ID)
+ return;
+
+ shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base);
+ if (shdr == NULL) {
+ cifs_dbg(FYI, "shdr NULL in smb2_set_related\n");
+ return;
+ }
+ shdr->Flags |= SMB2_FLAGS_REPLAY_OPERATION;
+}
+
+void
smb2_set_related(struct smb_rqst *rqst)
{
struct smb2_hdr *shdr;
@@ -2534,6 +2597,27 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
}
/*
+ * helper function for exponential backoff and check if replayable
+ */
+bool smb2_should_replay(struct cifs_tcon *tcon,
+ int *pretries,
+ int *pcur_sleep)
+{
+ if (!pretries || !pcur_sleep)
+ return false;
+
+ if (tcon->retry || (*pretries)++ < tcon->ses->server->retrans) {
+ msleep(*pcur_sleep);
+ (*pcur_sleep) = ((*pcur_sleep) << 1);
+ if ((*pcur_sleep) > CIFS_MAX_SLEEP)
+ (*pcur_sleep) = CIFS_MAX_SLEEP;
+ return true;
+ }
+
+ return false;
+}
+
+/*
* Passes the query info response back to the caller on success.
* Caller need to free this with free_rsp_buf().
*/
@@ -2546,7 +2630,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
{
struct smb2_compound_vars *vars;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
int flags = CIFS_CP_CREATE_CLOSE_OP;
struct smb_rqst *rqst;
int resp_buftype[3];
@@ -2557,6 +2641,13 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
int rc;
__le16 *utf16_path;
struct cached_fid *cfid = NULL;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = CIFS_CP_CREATE_CLOSE_OP;
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ server = cifs_pick_channel(ses);
if (!path)
path = "";
@@ -2593,6 +2684,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
.disposition = FILE_OPEN,
.create_options = cifs_create_options(cifs_sb, 0),
.fid = &fid,
+ .replay = !!(retries),
};
rc = SMB2_open_init(tcon, server,
@@ -2637,6 +2729,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
goto qic_exit;
smb2_set_related(&rqst[2]);
+ if (retries) {
+ if (!cfid) {
+ smb2_set_replay(server, &rqst[0]);
+ smb2_set_replay(server, &rqst[2]);
+ }
+ smb2_set_replay(server, &rqst[1]);
+ }
+
if (cfid) {
rc = compound_send_recv(xid, ses, server,
flags, 1, &rqst[1],
@@ -2669,6 +2769,11 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
kfree(vars);
out_free_path:
kfree(utf16_path);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3217,6 +3322,9 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
cfile->fid.volatile_fid, cfile->pid, new_size);
if (rc >= 0) {
truncate_setsize(inode, new_size);
+ netfs_resize_file(&cifsi->netfs, new_size, true);
+ if (offset < cifsi->netfs.zero_point)
+ cifsi->netfs.zero_point = offset;
fscache_resize_cookie(cifs_inode_cookie(inode), new_size);
}
}
@@ -3440,7 +3548,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, cfile->pid, new_eof);
if (rc == 0) {
- cifsi->server_eof = new_eof;
+ netfs_resize_file(&cifsi->netfs, new_eof, true);
cifs_setsize(inode, new_eof);
cifs_truncate_page(inode->i_mapping, inode->i_size);
truncate_setsize(inode, new_eof);
@@ -3532,8 +3640,9 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
int rc;
unsigned int xid;
struct inode *inode = file_inode(file);
- struct cifsFileInfo *cfile = file->private_data;
struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ struct cifsFileInfo *cfile = file->private_data;
+ struct netfs_inode *ictx = &cifsi->netfs;
loff_t old_eof, new_eof;
xid = get_xid();
@@ -3553,6 +3662,7 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
goto out_2;
truncate_pagecache_range(inode, off, old_eof);
+ ictx->zero_point = old_eof;
rc = smb2_copychunk_range(xid, cfile, cfile, off + len,
old_eof - off - len, off);
@@ -3567,9 +3677,10 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
rc = 0;
- cifsi->server_eof = i_size_read(inode) - len;
- truncate_setsize(inode, cifsi->server_eof);
- fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof);
+ truncate_setsize(inode, new_eof);
+ netfs_resize_file(&cifsi->netfs, new_eof, true);
+ ictx->zero_point = new_eof;
+ fscache_resize_cookie(cifs_inode_cookie(inode), new_eof);
out_2:
filemap_invalidate_unlock(inode->i_mapping);
out:
@@ -3585,6 +3696,7 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon,
unsigned int xid;
struct cifsFileInfo *cfile = file->private_data;
struct inode *inode = file_inode(file);
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
__u64 count, old_eof, new_eof;
xid = get_xid();
@@ -3612,6 +3724,7 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon,
goto out_2;
truncate_setsize(inode, new_eof);
+ netfs_resize_file(&cifsi->netfs, i_size_read(inode), true);
fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode));
rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len);
@@ -5104,7 +5217,7 @@ static int smb2_create_reparse_symlink(const unsigned int xid,
struct inode *new;
struct kvec iov;
__le16 *path;
- char *sym;
+ char *sym, sep = CIFS_DIR_SEP(cifs_sb);
u16 len, plen;
int rc = 0;
@@ -5118,7 +5231,8 @@ static int smb2_create_reparse_symlink(const unsigned int xid,
.symlink_target = sym,
};
- path = cifs_convert_path_to_utf16(symname, cifs_sb);
+ convert_delimiter(sym, sep);
+ path = cifs_convert_path_to_utf16(sym, cifs_sb);
if (!path) {
rc = -ENOMEM;
goto out;
@@ -5141,7 +5255,10 @@ static int smb2_create_reparse_symlink(const unsigned int xid,
buf->PrintNameLength = cpu_to_le16(plen);
memcpy(buf->PathBuffer, path, plen);
buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0);
+ if (*sym != sep)
+ buf->Flags = cpu_to_le32(SYMLINK_FLAG_RELATIVE);
+ convert_delimiter(sym, '/');
iov.iov_base = buf;
iov.iov_len = len;
new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index bd25c34dc398..608ee05491e2 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -156,6 +156,56 @@ out:
return;
}
+/* helper function for code reuse */
+static int
+cifs_chan_skip_or_disable(struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
+ bool from_reconnect)
+{
+ struct TCP_Server_Info *pserver;
+ unsigned int chan_index;
+
+ if (SERVER_IS_CHAN(server)) {
+ cifs_dbg(VFS,
+ "server %s does not support multichannel anymore. Skip secondary channel\n",
+ ses->server->hostname);
+
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
+ spin_unlock(&ses->chan_lock);
+ goto skip_terminate;
+ }
+
+ ses->chans[chan_index].server = NULL;
+ server->terminate = true;
+ spin_unlock(&ses->chan_lock);
+
+ /*
+ * the above reference of server by channel
+ * needs to be dropped without holding chan_lock
+ * as cifs_put_tcp_session takes a higher lock
+ * i.e. cifs_tcp_ses_lock
+ */
+ cifs_put_tcp_session(server, from_reconnect);
+
+ cifs_signal_cifsd_for_reconnect(server, false);
+
+ /* mark primary server as needing reconnect */
+ pserver = server->primary_server;
+ cifs_signal_cifsd_for_reconnect(pserver, false);
+skip_terminate:
+ return -EHOSTDOWN;
+ }
+
+ cifs_server_dbg(VFS,
+ "server does not support multichannel anymore. Disable all other channels\n");
+ cifs_disable_secondary_channels(ses);
+
+
+ return 0;
+}
+
static int
smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
struct TCP_Server_Info *server, bool from_reconnect)
@@ -164,8 +214,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
struct nls_table *nls_codepage = NULL;
struct cifs_ses *ses;
int xid;
- struct TCP_Server_Info *pserver;
- unsigned int chan_index;
/*
* SMB2s NegProt, SessSetup, Logoff do not have tcon yet so
@@ -310,44 +358,11 @@ again:
*/
if (ses->chan_count > 1 &&
!(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
- if (SERVER_IS_CHAN(server)) {
- cifs_dbg(VFS, "server %s does not support " \
- "multichannel anymore. skipping secondary channel\n",
- ses->server->hostname);
-
- spin_lock(&ses->chan_lock);
- chan_index = cifs_ses_get_chan_index(ses, server);
- if (chan_index == CIFS_INVAL_CHAN_INDEX) {
- spin_unlock(&ses->chan_lock);
- goto skip_terminate;
- }
-
- ses->chans[chan_index].server = NULL;
- spin_unlock(&ses->chan_lock);
-
- /*
- * the above reference of server by channel
- * needs to be dropped without holding chan_lock
- * as cifs_put_tcp_session takes a higher lock
- * i.e. cifs_tcp_ses_lock
- */
- cifs_put_tcp_session(server, from_reconnect);
-
- server->terminate = true;
- cifs_signal_cifsd_for_reconnect(server, false);
-
- /* mark primary server as needing reconnect */
- pserver = server->primary_server;
- cifs_signal_cifsd_for_reconnect(pserver, false);
-
-skip_terminate:
+ rc = cifs_chan_skip_or_disable(ses, server,
+ from_reconnect);
+ if (rc) {
mutex_unlock(&ses->session_mutex);
- rc = -EHOSTDOWN;
goto out;
- } else {
- cifs_server_dbg(VFS, "does not support " \
- "multichannel anymore. disabling all other channels\n");
- cifs_disable_secondary_channels(ses);
}
}
@@ -384,6 +399,15 @@ skip_sess_setup:
goto out;
}
+ spin_lock(&ses->ses_lock);
+ if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) {
+ spin_unlock(&ses->ses_lock);
+ mutex_unlock(&ses->session_mutex);
+ goto skip_add_channels;
+ }
+ ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS;
+ spin_unlock(&ses->ses_lock);
+
if (!rc &&
(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
mutex_unlock(&ses->session_mutex);
@@ -395,14 +419,29 @@ skip_sess_setup:
rc = SMB3_request_interfaces(xid, tcon, false);
free_xid(xid);
- if (rc)
+ if (rc == -EOPNOTSUPP && ses->chan_count > 1) {
+ /*
+ * some servers like Azure SMB server do not advertise
+ * that multichannel has been disabled with server
+ * capabilities, rather return STATUS_NOT_IMPLEMENTED.
+ * treat this as server not supporting multichannel
+ */
+
+ rc = cifs_chan_skip_or_disable(ses, server,
+ from_reconnect);
+ goto skip_add_channels;
+ } else if (rc)
cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
__func__, rc);
if (ses->chan_max > ses->chan_count &&
+ ses->iface_count &&
!SERVER_IS_CHAN(server)) {
- if (ses->chan_count == 1)
+ if (ses->chan_count == 1) {
cifs_server_dbg(VFS, "supports multichannel now\n");
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+ }
cifs_try_adding_channels(ses);
}
@@ -410,6 +449,11 @@ skip_sess_setup:
mutex_unlock(&ses->session_mutex);
}
+skip_add_channels:
+ spin_lock(&ses->ses_lock);
+ ses->flags &= ~CIFS_SES_FLAG_SCALE_CHANNELS;
+ spin_unlock(&ses->ses_lock);
+
if (smb2_command != SMB2_INTERNAL_CMD)
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
@@ -1958,10 +2002,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
__le16 *unc_path = NULL;
int flags = 0;
unsigned int total_len;
- struct TCP_Server_Info *server;
-
- /* always use master channel */
- server = ses->server;
+ struct TCP_Server_Info *server = cifs_pick_channel(ses);
cifs_dbg(FYI, "TCON\n");
@@ -2094,6 +2135,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
struct smb2_tree_disconnect_req *req; /* response is trivial */
int rc = 0;
struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *server = cifs_pick_channel(ses);
int flags = 0;
unsigned int total_len;
struct kvec iov[1];
@@ -2116,7 +2158,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
invalidate_all_cached_dirs(tcon);
- rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, ses->server,
+ rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, server,
(void **) &req,
&total_len);
if (rc)
@@ -2134,7 +2176,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
rqst.rq_iov = iov;
rqst.rq_nvec = 1;
- rc = cifs_send_recv(xid, ses, ses->server,
+ rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
if (rc) {
@@ -2279,7 +2321,7 @@ int smb2_parse_contexts(struct TCP_Server_Info *server,
noff = le16_to_cpu(cc->NameOffset);
nlen = le16_to_cpu(cc->NameLength);
- if (noff + nlen >= doff)
+ if (noff + nlen > doff)
return -EINVAL;
name = (char *)cc + noff;
@@ -2362,8 +2404,13 @@ create_durable_v2_buf(struct cifs_open_parms *oparms)
*/
buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout);
buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
- generate_random_uuid(buf->dcontext.CreateGuid);
- memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
+
+ /* for replay, we should not overwrite the existing create guid */
+ if (!oparms->replay) {
+ generate_random_uuid(buf->dcontext.CreateGuid);
+ memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
+ } else
+ memcpy(buf->dcontext.CreateGuid, pfid->create_guid, 16);
/* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */
buf->Name[0] = 'D';
@@ -2736,7 +2783,14 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
int flags = 0;
unsigned int total_len;
__le16 *utf16_path = NULL;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ n_iov = 2;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "mkdir\n");
@@ -2840,6 +2894,10 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
/* no need to inc num_remote_opens because we close it just below */
trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, full_path, CREATE_NOT_FILE,
FILE_WRITE_ATTRIBUTES);
+
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
/* resource #4: response buffer */
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
@@ -2877,6 +2935,11 @@ err_free_req:
cifs_small_buf_release(req);
err_free_path:
kfree(utf16_path);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3072,12 +3135,19 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
struct smb2_create_rsp *rsp = NULL;
struct cifs_tcon *tcon = oparms->tcon;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
struct kvec iov[SMB2_CREATE_IOV_SIZE];
struct kvec rsp_iov = {NULL, 0};
int resp_buftype = CIFS_NO_BUFFER;
int rc = 0;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
+ oparms->replay = !!(retries);
cifs_dbg(FYI, "create/open\n");
if (!ses || !server)
@@ -3099,6 +3169,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid, oparms->path,
oparms->create_options, oparms->desired_access);
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags,
&rsp_iov);
@@ -3152,6 +3225,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
creat_exit:
SMB2_open_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3276,15 +3354,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
int resp_buftype = CIFS_NO_BUFFER;
int rc = 0;
int flags = 0;
-
- cifs_dbg(FYI, "SMB2 IOCTL\n");
-
- if (out_data != NULL)
- *out_data = NULL;
-
- /* zero out returned data len, in case of error */
- if (plen)
- *plen = 0;
+ int retries = 0, cur_sleep = 1;
if (!tcon)
return -EIO;
@@ -3293,10 +3363,23 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
if (!ses)
return -EIO;
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
server = cifs_pick_channel(ses);
+
if (!server)
return -EIO;
+ cifs_dbg(FYI, "SMB2 IOCTL\n");
+
+ if (out_data != NULL)
+ *out_data = NULL;
+
+ /* zero out returned data len, in case of error */
+ if (plen)
+ *plen = 0;
+
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -3311,6 +3394,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
if (rc)
goto ioctl_exit;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags,
&rsp_iov);
@@ -3380,6 +3466,11 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
ioctl_exit:
SMB2_ioctl_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3451,13 +3542,20 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
struct smb_rqst rqst;
struct smb2_close_rsp *rsp = NULL;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
struct kvec iov[1];
struct kvec rsp_iov;
int resp_buftype = CIFS_NO_BUFFER;
int rc = 0;
int flags = 0;
bool query_attrs = false;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ query_attrs = false;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "Close\n");
@@ -3483,6 +3581,9 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
goto close_exit;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
rsp = (struct smb2_close_rsp *)rsp_iov.iov_base;
@@ -3516,6 +3617,11 @@ close_exit:
cifs_dbg(VFS, "handle cancelled close fid 0x%llx returned error %d\n",
persistent_fid, tmp_rc);
}
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3646,12 +3752,19 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
struct TCP_Server_Info *server;
int flags = 0;
bool allocated = false;
+ int retries = 0, cur_sleep = 1;
cifs_dbg(FYI, "Query Info\n");
if (!ses)
return -EIO;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ allocated = false;
server = cifs_pick_channel(ses);
+
if (!server)
return -EIO;
@@ -3673,6 +3786,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid,
ses->Suid, info_class, (__u32)info_type);
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
@@ -3715,6 +3831,11 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
qinf_exit:
SMB2_query_info_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3815,7 +3936,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
u32 *plen /* returned data len */)
{
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
struct smb_rqst rqst;
struct smb2_change_notify_rsp *smb_rsp;
struct kvec iov[1];
@@ -3823,6 +3944,12 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
int resp_buftype = CIFS_NO_BUFFER;
int flags = 0;
int rc = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "change notify\n");
if (!ses || !server)
@@ -3847,6 +3974,10 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
trace_smb3_notify_enter(xid, persistent_fid, tcon->tid, ses->Suid,
(u8)watch_tree, completion_filter);
+
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
@@ -3881,6 +4012,11 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
if (rqst.rq_iov)
cifs_small_buf_release(rqst.rq_iov[0].iov_base); /* request */
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3918,7 +4054,7 @@ void smb2_reconnect_server(struct work_struct *work)
struct cifs_ses *ses, *ses2;
struct cifs_tcon *tcon, *tcon2;
struct list_head tmp_list, tmp_ses_list;
- bool tcon_exist = false, ses_exist = false;
+ bool ses_exist = false;
bool tcon_selected = false;
int rc;
bool resched = false;
@@ -3964,7 +4100,7 @@ void smb2_reconnect_server(struct work_struct *work)
if (tcon->need_reconnect || tcon->need_reopen_files) {
tcon->tc_count++;
list_add_tail(&tcon->rlist, &tmp_list);
- tcon_selected = tcon_exist = true;
+ tcon_selected = true;
}
}
/*
@@ -3973,7 +4109,7 @@ void smb2_reconnect_server(struct work_struct *work)
*/
if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) {
list_add_tail(&ses->tcon_ipc->rlist, &tmp_list);
- tcon_selected = tcon_exist = true;
+ tcon_selected = true;
cifs_smb_ses_inc_refcount(ses);
}
/*
@@ -4123,10 +4259,16 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
struct smb_rqst rqst;
struct kvec iov[1];
struct kvec rsp_iov = {NULL, 0};
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
int resp_buftype = CIFS_NO_BUFFER;
int flags = 0;
int rc = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "flush\n");
if (!ses || !(ses->server))
@@ -4146,6 +4288,10 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
goto flush_exit;
trace_smb3_flush_enter(xid, persistent_fid, tcon->tid, ses->Suid);
+
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
@@ -4160,6 +4306,11 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
flush_exit:
SMB2_flush_free(&rqst);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -4639,7 +4790,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
struct cifs_io_parms *io_parms = NULL;
int credit_request;
- if (!wdata->server)
+ if (!wdata->server || wdata->replay)
server = wdata->server = cifs_pick_channel(tcon->ses);
/*
@@ -4724,6 +4875,8 @@ smb2_async_writev(struct cifs_writedata *wdata,
rqst.rq_nvec = 1;
rqst.rq_iter = wdata->iter;
rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter);
+ if (wdata->replay)
+ smb2_set_replay(server, &rqst);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (wdata->mr)
iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1);
@@ -4797,18 +4950,21 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
int flags = 0;
unsigned int total_len;
struct TCP_Server_Info *server;
+ int retries = 0, cur_sleep = 1;
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
*nbytes = 0;
-
- if (n_vec < 1)
- return rc;
-
if (!io_parms->server)
io_parms->server = cifs_pick_channel(io_parms->tcon->ses);
server = io_parms->server;
if (server == NULL)
return -ECONNABORTED;
+ if (n_vec < 1)
+ return rc;
+
rc = smb2_plain_req_init(SMB2_WRITE, io_parms->tcon, server,
(void **) &req, &total_len);
if (rc)
@@ -4842,6 +4998,9 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
rqst.rq_iov = iov;
rqst.rq_nvec = n_vec + 1;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, io_parms->tcon->ses, server,
&rqst,
&resp_buftype, flags, &rsp_iov);
@@ -4866,6 +5025,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
cifs_small_buf_release(req);
free_rsp_buf(resp_buftype, rsp);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(io_parms->tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5048,6 +5212,9 @@ int SMB2_query_directory_init(const unsigned int xid,
case SMB_FIND_FILE_POSIX_INFO:
req->FileInformationClass = SMB_FIND_FILE_POSIX_INFO;
break;
+ case SMB_FIND_FILE_FULL_DIRECTORY_INFO:
+ req->FileInformationClass = FILE_FULL_DIRECTORY_INFORMATION;
+ break;
default:
cifs_tcon_dbg(VFS, "info level %u isn't supported\n",
info_level);
@@ -5117,6 +5284,9 @@ smb2_parse_query_directory(struct cifs_tcon *tcon,
/* note that posix payload are variable size */
info_buf_size = sizeof(struct smb2_posix_info);
break;
+ case SMB_FIND_FILE_FULL_DIRECTORY_INFO:
+ info_buf_size = sizeof(FILE_FULL_DIRECTORY_INFO);
+ break;
default:
cifs_tcon_dbg(VFS, "info level %u isn't supported\n",
srch_inf->info_level);
@@ -5177,8 +5347,14 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec rsp_iov;
int rc = 0;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
if (!ses || !(ses->server))
return -EIO;
@@ -5198,6 +5374,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
goto qdir_exit;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base;
@@ -5232,6 +5411,11 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
qdir_exit:
SMB2_query_directory_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5298,8 +5482,14 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
if (!ses || !server)
return -EIO;
@@ -5327,6 +5517,8 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
+ if (retries)
+ smb2_set_replay(server, &rqst);
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags,
@@ -5342,6 +5534,11 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
free_rsp_buf(resp_buftype, rsp);
kfree(iov);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5394,12 +5591,18 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
int rc;
struct smb2_oplock_break *req = NULL;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
int flags = CIFS_OBREAK_OP;
unsigned int total_len;
struct kvec iov[1];
struct kvec rsp_iov;
int resp_buf_type;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = CIFS_OBREAK_OP;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "SMB2_oplock_break\n");
rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, server,
@@ -5424,15 +5627,21 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = iov;
rqst.rq_nvec = 1;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
-
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
cifs_dbg(FYI, "Send error in Oplock Break = %d\n", rc);
}
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5518,9 +5727,15 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
FILE_SYSTEM_POSIX_INFO *info = NULL;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
rc = build_qfs_info_req(&iov, tcon, server,
FS_POSIX_INFORMATION,
@@ -5536,6 +5751,9 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = &iov;
rqst.rq_nvec = 1;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
free_qfs_info_req(&iov);
@@ -5555,6 +5773,11 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
posix_qfsinf_exit:
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5569,9 +5792,15 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
struct smb2_fs_full_size_info *info = NULL;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
rc = build_qfs_info_req(&iov, tcon, server,
FS_FULL_SIZE_INFORMATION,
@@ -5587,6 +5816,9 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = &iov;
rqst.rq_nvec = 1;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
free_qfs_info_req(&iov);
@@ -5606,6 +5838,11 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
qfsinf_exit:
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5620,9 +5857,15 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype, max_len, min_len;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
unsigned int rsp_len, offset;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
if (level == FS_DEVICE_INFORMATION) {
max_len = sizeof(FILE_SYSTEM_DEVICE_INFO);
@@ -5654,6 +5897,9 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = &iov;
rqst.rq_nvec = 1;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
free_qfs_info_req(&iov);
@@ -5691,6 +5937,11 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
qfsattr_exit:
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5708,7 +5959,13 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int count;
int flags = CIFS_NO_RSP_BUF;
unsigned int total_len;
- struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
+ struct TCP_Server_Info *server;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = CIFS_NO_RSP_BUF;
+ server = cifs_pick_channel(tcon->ses);
cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock);
@@ -5739,6 +5996,9 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, tcon->ses, server,
&rqst, &resp_buf_type, flags,
&rsp_iov);
@@ -5750,6 +6010,10 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
tcon->ses->Suid, rc);
}
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 343ada691e76..b3069911e9dd 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -122,6 +122,11 @@ extern unsigned long smb_rqst_len(struct TCP_Server_Info *server,
extern void smb2_set_next_command(struct cifs_tcon *tcon,
struct smb_rqst *rqst);
extern void smb2_set_related(struct smb_rqst *rqst);
+extern void smb2_set_replay(struct TCP_Server_Info *server,
+ struct smb_rqst *rqst);
+extern bool smb2_should_replay(struct cifs_tcon *tcon,
+ int *pretries,
+ int *pcur_sleep);
/*
* SMB2 Worker functions - most of protocol specific implementation details
@@ -299,9 +304,7 @@ int smb311_posix_query_path_info(const unsigned int xid,
struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const char *full_path,
- struct cifs_open_info_data *data,
- struct cifs_sid *owner,
- struct cifs_sid *group);
+ struct cifs_open_info_data *data);
int posix_info_parse(const void *beg, const void *end,
struct smb2_posix_info_parsed *out);
int posix_info_sid_size(const void *beg, const void *end);
diff --git a/fs/smb/client/smb2status.h b/fs/smb/client/smb2status.h
index a9e958166fc5..9c6d79b0bd49 100644
--- a/fs/smb/client/smb2status.h
+++ b/fs/smb/client/smb2status.h
@@ -982,6 +982,8 @@ struct ntstatus {
#define STATUS_INVALID_TASK_INDEX cpu_to_le32(0xC0000501)
#define STATUS_THREAD_ALREADY_IN_TASK cpu_to_le32(0xC0000502)
#define STATUS_CALLBACK_BYPASS cpu_to_le32(0xC0000503)
+#define STATUS_SERVER_UNAVAILABLE cpu_to_le32(0xC0000466)
+#define STATUS_FILE_NOT_AVAILABLE cpu_to_le32(0xC0000467)
#define STATUS_PORT_CLOSED cpu_to_le32(0xC0000700)
#define STATUS_MESSAGE_LOST cpu_to_le32(0xC0000701)
#define STATUS_INVALID_MESSAGE cpu_to_le32(0xC0000702)
diff --git a/fs/smb/client/smbencrypt.c b/fs/smb/client/smbencrypt.c
index f0ce26414f17..1d1ee9f18f37 100644
--- a/fs/smb/client/smbencrypt.c
+++ b/fs/smb/client/smbencrypt.c
@@ -26,13 +26,6 @@
#include "cifsproto.h"
#include "../common/md4.h"
-#ifndef false
-#define false 0
-#endif
-#ifndef true
-#define true 1
-#endif
-
/* following came from the other byteorder.h to avoid include conflicts */
#define CVAL(buf,pos) (((unsigned char *)(buf))[pos])
#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 4f717ad7c21b..994d70193432 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -400,10 +400,17 @@ unmask:
server->conn_id, server->hostname);
}
smbd_done:
- if (rc < 0 && rc != -EINTR)
+ /*
+ * there's hardly any use for the layers above to know the
+ * actual error code here. All they should do at this point is
+ * to retry the connection and hope it goes away.
+ */
+ if (rc < 0 && rc != -EINTR && rc != -EAGAIN) {
cifs_server_dbg(VFS, "Error %d sending data on socket to server\n",
rc);
- else if (rc > 0)
+ rc = -ECONNABORTED;
+ cifs_signal_cifsd_for_reconnect(server, false);
+ } else if (rc > 0)
rc = 0;
out:
cifs_in_send_dec(server);
@@ -428,8 +435,8 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
if (!(flags & CIFS_TRANSFORM_REQ))
return __smb_send_rqst(server, num_rqst, rqst);
- if (num_rqst > MAX_COMPOUND - 1)
- return -ENOMEM;
+ if (WARN_ON_ONCE(num_rqst > MAX_COMPOUND - 1))
+ return -EIO;
if (!server->ops->init_transform_rq) {
cifs_server_dbg(VFS, "Encryption requested but transform callback is missing\n");
@@ -1026,6 +1033,9 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
if (!server || server->terminate)
continue;
+ if (CIFS_CHAN_NEEDS_RECONNECT(ses, i))
+ continue;
+
/*
* strictly speaking, we should pick up req_lock to read
* server->in_flight. But it shouldn't matter much here if we
diff --git a/fs/smb/server/asn1.c b/fs/smb/server/asn1.c
index 4a4b2b03ff33..b931a99ab9c8 100644
--- a/fs/smb/server/asn1.c
+++ b/fs/smb/server/asn1.c
@@ -214,10 +214,15 @@ static int ksmbd_neg_token_alloc(void *context, size_t hdrlen,
{
struct ksmbd_conn *conn = context;
+ if (!vlen)
+ return -EINVAL;
+
conn->mechToken = kmemdup_nul(value, vlen, GFP_KERNEL);
if (!conn->mechToken)
return -ENOMEM;
+ conn->mechTokenLen = (unsigned int)vlen;
+
return 0;
}
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index d311c2ee10bd..09e1e7771592 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -416,13 +416,7 @@ static void stop_sessions(void)
again:
down_read(&conn_list_lock);
list_for_each_entry(conn, &conn_list, conns_list) {
- struct task_struct *task;
-
t = conn->transport;
- task = t->handler;
- if (task)
- ksmbd_debug(CONN, "Stop session handler %s/%d\n",
- task->comm, task_pid_nr(task));
ksmbd_conn_set_exiting(conn);
if (t->ops->shutdown) {
up_read(&conn_list_lock);
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 3c005246a32e..0e04cf8b1d89 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -88,6 +88,7 @@ struct ksmbd_conn {
__u16 dialect;
char *mechToken;
+ unsigned int mechTokenLen;
struct ksmbd_conn_ops *conn_ops;
@@ -134,7 +135,6 @@ struct ksmbd_transport_ops {
struct ksmbd_transport {
struct ksmbd_conn *conn;
struct ksmbd_transport_ops *ops;
- struct task_struct *handler;
};
#define KSMBD_TCP_RECV_TIMEOUT (7 * HZ)
diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h
index b7521e41402e..0ebf91ffa236 100644
--- a/fs/smb/server/ksmbd_netlink.h
+++ b/fs/smb/server/ksmbd_netlink.h
@@ -304,7 +304,8 @@ enum ksmbd_event {
KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST,
KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE = 15,
- KSMBD_EVENT_MAX
+ __KSMBD_EVENT_MAX,
+ KSMBD_EVENT_MAX = __KSMBD_EVENT_MAX - 1
};
/*
diff --git a/fs/smb/server/misc.c b/fs/smb/server/misc.c
index 9e8afaa686e3..1a5faa6f6e7b 100644
--- a/fs/smb/server/misc.c
+++ b/fs/smb/server/misc.c
@@ -261,6 +261,7 @@ out_ascii:
/**
* ksmbd_extract_sharename() - get share name from tree connect request
+ * @um: pointer to a unicode_map structure for character encoding handling
* @treename: buffer containing tree name and share name
*
* Return: share name on success, otherwise error
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index 001926d3b348..53dfaac425c6 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -1197,6 +1197,12 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid,
bool prev_op_has_lease;
__le32 prev_op_state = 0;
+ /* Only v2 leases handle the directory */
+ if (S_ISDIR(file_inode(fp->filp)->i_mode)) {
+ if (!lctx || lctx->version != 2)
+ return 0;
+ }
+
opinfo = alloc_opinfo(work, pid, tid);
if (!opinfo)
return -ENOMEM;
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 3143819935dc..089527a8b4ff 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -1414,7 +1414,10 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn,
char *name;
unsigned int name_off, name_len, secbuf_len;
- secbuf_len = le16_to_cpu(req->SecurityBufferLength);
+ if (conn->use_spnego && conn->mechToken)
+ secbuf_len = conn->mechTokenLen;
+ else
+ secbuf_len = le16_to_cpu(req->SecurityBufferLength);
if (secbuf_len < sizeof(struct authenticate_message)) {
ksmbd_debug(SMB, "blob len %d too small\n", secbuf_len);
return NULL;
@@ -1505,7 +1508,10 @@ static int ntlm_authenticate(struct ksmbd_work *work,
struct authenticate_message *authblob;
authblob = user_authblob(conn, req);
- sz = le16_to_cpu(req->SecurityBufferLength);
+ if (conn->use_spnego && conn->mechToken)
+ sz = conn->mechTokenLen;
+ else
+ sz = le16_to_cpu(req->SecurityBufferLength);
rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, conn, sess);
if (rc) {
set_user_flag(sess->user, KSMBD_USER_FLAG_BAD_PASSWORD);
@@ -1778,8 +1784,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
negblob_off = le16_to_cpu(req->SecurityBufferOffset);
negblob_len = le16_to_cpu(req->SecurityBufferLength);
- if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer) ||
- negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) {
+ if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer)) {
rc = -EINVAL;
goto out_err;
}
@@ -1788,8 +1793,15 @@ int smb2_sess_setup(struct ksmbd_work *work)
negblob_off);
if (decode_negotiation_token(conn, negblob, negblob_len) == 0) {
- if (conn->mechToken)
+ if (conn->mechToken) {
negblob = (struct negotiate_message *)conn->mechToken;
+ negblob_len = conn->mechTokenLen;
+ }
+ }
+
+ if (negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) {
+ rc = -EINVAL;
+ goto out_err;
}
if (server_conf.auth_mechs & conn->auth_mechs) {
@@ -6161,8 +6173,10 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work)
err = ksmbd_iov_pin_rsp_read(work, (void *)rsp,
offsetof(struct smb2_read_rsp, Buffer),
aux_payload_buf, nbytes);
- if (err)
+ if (err) {
+ kvfree(aux_payload_buf);
goto out;
+ }
kvfree(rpc_resp);
} else {
err = ksmbd_iov_pin_rsp(work, (void *)rsp,
@@ -6372,8 +6386,10 @@ int smb2_read(struct ksmbd_work *work)
err = ksmbd_iov_pin_rsp_read(work, (void *)rsp,
offsetof(struct smb2_read_rsp, Buffer),
aux_payload_buf, nbytes);
- if (err)
+ if (err) {
+ kvfree(aux_payload_buf);
goto out;
+ }
ksmbd_fd_put(work, fp);
return 0;
@@ -6748,10 +6764,10 @@ struct file_lock *smb_flock_init(struct file *f)
locks_init_lock(fl);
- fl->fl_owner = f;
- fl->fl_pid = current->tgid;
- fl->fl_file = f;
- fl->fl_flags = FL_POSIX;
+ fl->c.flc_owner = f;
+ fl->c.flc_pid = current->tgid;
+ fl->c.flc_file = f;
+ fl->c.flc_flags = FL_POSIX;
fl->fl_ops = NULL;
fl->fl_lmops = NULL;
@@ -6768,30 +6784,30 @@ static int smb2_set_flock_flags(struct file_lock *flock, int flags)
case SMB2_LOCKFLAG_SHARED:
ksmbd_debug(SMB, "received shared request\n");
cmd = F_SETLKW;
- flock->fl_type = F_RDLCK;
- flock->fl_flags |= FL_SLEEP;
+ flock->c.flc_type = F_RDLCK;
+ flock->c.flc_flags |= FL_SLEEP;
break;
case SMB2_LOCKFLAG_EXCLUSIVE:
ksmbd_debug(SMB, "received exclusive request\n");
cmd = F_SETLKW;
- flock->fl_type = F_WRLCK;
- flock->fl_flags |= FL_SLEEP;
+ flock->c.flc_type = F_WRLCK;
+ flock->c.flc_flags |= FL_SLEEP;
break;
case SMB2_LOCKFLAG_SHARED | SMB2_LOCKFLAG_FAIL_IMMEDIATELY:
ksmbd_debug(SMB,
"received shared & fail immediately request\n");
cmd = F_SETLK;
- flock->fl_type = F_RDLCK;
+ flock->c.flc_type = F_RDLCK;
break;
case SMB2_LOCKFLAG_EXCLUSIVE | SMB2_LOCKFLAG_FAIL_IMMEDIATELY:
ksmbd_debug(SMB,
"received exclusive & fail immediately request\n");
cmd = F_SETLK;
- flock->fl_type = F_WRLCK;
+ flock->c.flc_type = F_WRLCK;
break;
case SMB2_LOCKFLAG_UNLOCK:
ksmbd_debug(SMB, "received unlock request\n");
- flock->fl_type = F_UNLCK;
+ flock->c.flc_type = F_UNLCK;
cmd = F_SETLK;
break;
}
@@ -6829,13 +6845,13 @@ static void smb2_remove_blocked_lock(void **argv)
struct file_lock *flock = (struct file_lock *)argv[0];
ksmbd_vfs_posix_lock_unblock(flock);
- wake_up(&flock->fl_wait);
+ locks_wake_up(flock);
}
static inline bool lock_defer_pending(struct file_lock *fl)
{
/* check pending lock waiters */
- return waitqueue_active(&fl->fl_wait);
+ return waitqueue_active(&fl->c.flc_wait);
}
/**
@@ -6926,8 +6942,8 @@ int smb2_lock(struct ksmbd_work *work)
list_for_each_entry(cmp_lock, &lock_list, llist) {
if (cmp_lock->fl->fl_start <= flock->fl_start &&
cmp_lock->fl->fl_end >= flock->fl_end) {
- if (cmp_lock->fl->fl_type != F_UNLCK &&
- flock->fl_type != F_UNLCK) {
+ if (cmp_lock->fl->c.flc_type != F_UNLCK &&
+ flock->c.flc_type != F_UNLCK) {
pr_err("conflict two locks in one request\n");
err = -EINVAL;
locks_free_lock(flock);
@@ -6975,12 +6991,12 @@ int smb2_lock(struct ksmbd_work *work)
list_for_each_entry(conn, &conn_list, conns_list) {
spin_lock(&conn->llist_lock);
list_for_each_entry_safe(cmp_lock, tmp2, &conn->lock_list, clist) {
- if (file_inode(cmp_lock->fl->fl_file) !=
- file_inode(smb_lock->fl->fl_file))
+ if (file_inode(cmp_lock->fl->c.flc_file) !=
+ file_inode(smb_lock->fl->c.flc_file))
continue;
- if (smb_lock->fl->fl_type == F_UNLCK) {
- if (cmp_lock->fl->fl_file == smb_lock->fl->fl_file &&
+ if (lock_is_unlock(smb_lock->fl)) {
+ if (cmp_lock->fl->c.flc_file == smb_lock->fl->c.flc_file &&
cmp_lock->start == smb_lock->start &&
cmp_lock->end == smb_lock->end &&
!lock_defer_pending(cmp_lock->fl)) {
@@ -6997,7 +7013,7 @@ int smb2_lock(struct ksmbd_work *work)
continue;
}
- if (cmp_lock->fl->fl_file == smb_lock->fl->fl_file) {
+ if (cmp_lock->fl->c.flc_file == smb_lock->fl->c.flc_file) {
if (smb_lock->flags & SMB2_LOCKFLAG_SHARED)
continue;
} else {
@@ -7039,7 +7055,7 @@ int smb2_lock(struct ksmbd_work *work)
}
up_read(&conn_list_lock);
out_check_cl:
- if (smb_lock->fl->fl_type == F_UNLCK && nolock) {
+ if (lock_is_unlock(smb_lock->fl) && nolock) {
pr_err("Try to unlock nolocked range\n");
rsp->hdr.Status = STATUS_RANGE_NOT_LOCKED;
goto out;
@@ -7163,7 +7179,7 @@ out:
struct file_lock *rlock = NULL;
rlock = smb_flock_init(filp);
- rlock->fl_type = F_UNLCK;
+ rlock->c.flc_type = F_UNLCK;
rlock->fl_start = smb_lock->start;
rlock->fl_end = smb_lock->end;
diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c
index b49d47bdafc9..f29bb03f0dc4 100644
--- a/fs/smb/server/transport_ipc.c
+++ b/fs/smb/server/transport_ipc.c
@@ -74,7 +74,7 @@ static int handle_unsupported_event(struct sk_buff *skb, struct genl_info *info)
static int handle_generic_event(struct sk_buff *skb, struct genl_info *info);
static int ksmbd_ipc_heartbeat_request(void);
-static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX] = {
+static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX + 1] = {
[KSMBD_EVENT_UNSPEC] = {
.len = 0,
},
@@ -403,7 +403,7 @@ static int handle_generic_event(struct sk_buff *skb, struct genl_info *info)
return -EPERM;
#endif
- if (type >= KSMBD_EVENT_MAX) {
+ if (type > KSMBD_EVENT_MAX) {
WARN_ON(1);
return -EINVAL;
}
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index c5629a68c8b7..8faa25c6e129 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -2039,6 +2039,7 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs)
static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
{
struct smb_direct_transport *t;
+ struct task_struct *handler;
int ret;
if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) {
@@ -2056,11 +2057,11 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
if (ret)
goto out_err;
- KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop,
- KSMBD_TRANS(t)->conn, "ksmbd:r%u",
- smb_direct_port);
- if (IS_ERR(KSMBD_TRANS(t)->handler)) {
- ret = PTR_ERR(KSMBD_TRANS(t)->handler);
+ handler = kthread_run(ksmbd_conn_handler_loop,
+ KSMBD_TRANS(t)->conn, "ksmbd:r%u",
+ smb_direct_port);
+ if (IS_ERR(handler)) {
+ ret = PTR_ERR(handler);
pr_err("Can't start thread\n");
goto out_err;
}
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index eff7a1d793f0..002a3f0dc7c5 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -185,6 +185,7 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk)
struct sockaddr *csin;
int rc = 0;
struct tcp_transport *t;
+ struct task_struct *handler;
t = alloc_transport(client_sk);
if (!t) {
@@ -199,13 +200,13 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk)
goto out_error;
}
- KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop,
- KSMBD_TRANS(t)->conn,
- "ksmbd:%u",
- ksmbd_tcp_get_port(csin));
- if (IS_ERR(KSMBD_TRANS(t)->handler)) {
+ handler = kthread_run(ksmbd_conn_handler_loop,
+ KSMBD_TRANS(t)->conn,
+ "ksmbd:%u",
+ ksmbd_tcp_get_port(csin));
+ if (IS_ERR(handler)) {
pr_err("cannot start conn thread\n");
- rc = PTR_ERR(KSMBD_TRANS(t)->handler);
+ rc = PTR_ERR(handler);
free_transport(t);
}
return rc;
@@ -364,6 +365,7 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig,
* @t: TCP transport instance
* @buf: buffer to store read data from socket
* @to_read: number of bytes to read from socket
+ * @max_retries: number of retries if reading from socket fails
*
* Return: on success return number of bytes read from socket,
* otherwise return error number
@@ -415,6 +417,7 @@ static void tcp_destroy_socket(struct socket *ksmbd_socket)
/**
* create_socket - create socket for ksmbd/0
+ * @iface: interface to bind the created socket to
*
* Return: 0 on success, error number otherwise
*/
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index a6961bfe3e13..c487e834331a 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -337,18 +337,18 @@ static int check_lock_range(struct file *filp, loff_t start, loff_t end,
return 0;
spin_lock(&ctx->flc_lock);
- list_for_each_entry(flock, &ctx->flc_posix, fl_list) {
+ for_each_file_lock(flock, &ctx->flc_posix) {
/* check conflict locks */
if (flock->fl_end >= start && end >= flock->fl_start) {
- if (flock->fl_type == F_RDLCK) {
+ if (lock_is_read(flock)) {
if (type == WRITE) {
pr_err("not allow write by shared lock\n");
error = 1;
goto out;
}
- } else if (flock->fl_type == F_WRLCK) {
+ } else if (lock_is_write(flock)) {
/* check owner in lock */
- if (flock->fl_file != filp) {
+ if (flock->c.flc_file != filp) {
error = 1;
pr_err("not allow rw access by exclusive lock from other opens\n");
goto out;
@@ -1837,13 +1837,13 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work,
void ksmbd_vfs_posix_lock_wait(struct file_lock *flock)
{
- wait_event(flock->fl_wait, !flock->fl_blocker);
+ wait_event(flock->c.flc_wait, !flock->c.flc_blocker);
}
int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout)
{
- return wait_event_interruptible_timeout(flock->fl_wait,
- !flock->fl_blocker,
+ return wait_event_interruptible_timeout(flock->c.flc_wait,
+ !flock->c.flc_blocker,
timeout);
}
diff --git a/fs/super.c b/fs/super.c
index d35e85295489..ee05ab6b37e7 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -274,9 +274,10 @@ static void destroy_super_work(struct work_struct *work)
{
struct super_block *s = container_of(work, struct super_block,
destroy_work);
- int i;
-
- for (i = 0; i < SB_FREEZE_LEVELS; i++)
+ security_sb_free(s);
+ put_user_ns(s->s_user_ns);
+ kfree(s->s_subtype);
+ for (int i = 0; i < SB_FREEZE_LEVELS; i++)
percpu_free_rwsem(&s->s_writers.rw_sem[i]);
kfree(s);
}
@@ -296,9 +297,6 @@ static void destroy_unused_super(struct super_block *s)
super_unlock_excl(s);
list_lru_destroy(&s->s_dentry_lru);
list_lru_destroy(&s->s_inode_lru);
- security_sb_free(s);
- put_user_ns(s->s_user_ns);
- kfree(s->s_subtype);
shrinker_free(s->s_shrink);
/* no delays needed */
destroy_super_work(&s->destroy_work);
@@ -409,9 +407,6 @@ static void __put_super(struct super_block *s)
WARN_ON(s->s_dentry_lru.node);
WARN_ON(s->s_inode_lru.node);
WARN_ON(!list_empty(&s->s_mounts));
- security_sb_free(s);
- put_user_ns(s->s_user_ns);
- kfree(s->s_subtype);
call_rcu(&s->rcu, destroy_super_rcu);
}
}
@@ -1532,16 +1527,16 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
struct fs_context *fc)
{
blk_mode_t mode = sb_open_mode(sb_flags);
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct block_device *bdev;
- bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
- if (IS_ERR(bdev_handle)) {
+ bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
+ if (IS_ERR(bdev_file)) {
if (fc)
errorf(fc, "%s: Can't open blockdev", fc->source);
- return PTR_ERR(bdev_handle);
+ return PTR_ERR(bdev_file);
}
- bdev = bdev_handle->bdev;
+ bdev = file_bdev(bdev_file);
/*
* This really should be in blkdev_get_by_dev, but right now can't due
@@ -1549,7 +1544,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
* writable from userspace even for a read-only block device.
*/
if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
- bdev_release(bdev_handle);
+ fput(bdev_file);
return -EACCES;
}
@@ -1560,11 +1555,11 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
if (atomic_read(&bdev->bd_fsfreeze_count) > 0) {
if (fc)
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
- bdev_release(bdev_handle);
+ fput(bdev_file);
return -EBUSY;
}
spin_lock(&sb_lock);
- sb->s_bdev_handle = bdev_handle;
+ sb->s_bdev_file = bdev_file;
sb->s_bdev = bdev;
sb->s_bdi = bdi_get(bdev->bd_disk->bdi);
if (bdev_stable_writes(bdev))
@@ -1680,7 +1675,7 @@ void kill_block_super(struct super_block *sb)
generic_shutdown_super(sb);
if (bdev) {
sync_blockdev(bdev);
- bdev_release(sb->s_bdev_handle);
+ fput(sb->s_bdev_file);
}
}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index b6b6796e1616..4df2afa551dc 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -81,7 +81,7 @@ void sysfs_remove_dir(struct kobject *kobj)
struct kernfs_node *kn = kobj->sd;
/*
- * In general, kboject owner is responsible for ensuring removal
+ * In general, kobject owner is responsible for ensuring removal
* doesn't race with other operations and sysfs doesn't provide any
* protection; however, when @kobj is used as a symlink target, the
* symlinking entity usually doesn't own @kobj and thus has no
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 5a915b2e68f5..76bc2d5e75a9 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -336,7 +336,7 @@ int __init sysv_init_icache(void)
{
sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
sizeof(struct sysv_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
init_once);
if (!sysv_inode_cachep)
return -ENOMEM;
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 410ab2a44d2f..19bcb51a2203 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -83,9 +83,6 @@ static inline sysv_zone_t *block_end(struct buffer_head *bh)
return (sysv_zone_t*)((char*)bh->b_data + bh->b_size);
}
-/*
- * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock)
- */
static Indirect *get_branch(struct inode *inode,
int depth,
int offsets[],
@@ -105,15 +102,18 @@ static Indirect *get_branch(struct inode *inode,
bh = sb_bread(sb, block);
if (!bh)
goto failure;
+ read_lock(&pointers_lock);
if (!verify_chain(chain, p))
goto changed;
add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets);
+ read_unlock(&pointers_lock);
if (!p->key)
goto no_block;
}
return NULL;
changed:
+ read_unlock(&pointers_lock);
brelse(bh);
*err = -EAGAIN;
goto no_block;
@@ -219,9 +219,7 @@ static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *b
goto out;
reread:
- read_lock(&pointers_lock);
partial = get_branch(inode, depth, offsets, chain, &err);
- read_unlock(&pointers_lock);
/* Simplest case - block found, no allocation needed */
if (!partial) {
@@ -291,9 +289,9 @@ static Indirect *find_shared(struct inode *inode,
*top = 0;
for (k = depth; k > 1 && !offsets[k-1]; k--)
;
+ partial = get_branch(inode, k, offsets, chain, &err);
write_lock(&pointers_lock);
- partial = get_branch(inode, k, offsets, chain, &err);
if (!partial)
partial = chain + k-1;
/*
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index f0677ea0ec24..110e8a272189 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -32,6 +32,18 @@
*/
static DEFINE_MUTEX(eventfs_mutex);
+/* Choose something "unique" ;-) */
+#define EVENTFS_FILE_INODE_INO 0x12c4e37
+
+/* Just try to make something consistent and unique */
+static int eventfs_dir_ino(struct eventfs_inode *ei)
+{
+ if (!ei->ino)
+ ei->ino = get_next_ino();
+
+ return ei->ino;
+}
+
/*
* The eventfs_inode (ei) itself is protected by SRCU. It is released from
* its parent's list and will have is_freed set (under eventfs_mutex).
@@ -45,16 +57,55 @@ enum {
EVENTFS_SAVE_MODE = BIT(16),
EVENTFS_SAVE_UID = BIT(17),
EVENTFS_SAVE_GID = BIT(18),
+ EVENTFS_TOPLEVEL = BIT(19),
};
#define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1)
+/*
+ * eventfs_inode reference count management.
+ *
+ * NOTE! We count only references from dentries, in the
+ * form 'dentry->d_fsdata'. There are also references from
+ * directory inodes ('ti->private'), but the dentry reference
+ * count is always a superset of the inode reference count.
+ */
+static void release_ei(struct kref *ref)
+{
+ struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref);
+
+ WARN_ON_ONCE(!ei->is_freed);
+
+ kfree(ei->entry_attrs);
+ kfree_const(ei->name);
+ kfree_rcu(ei, rcu);
+}
+
+static inline void put_ei(struct eventfs_inode *ei)
+{
+ if (ei)
+ kref_put(&ei->kref, release_ei);
+}
+
+static inline void free_ei(struct eventfs_inode *ei)
+{
+ if (ei) {
+ ei->is_freed = 1;
+ put_ei(ei);
+ }
+}
+
+static inline struct eventfs_inode *get_ei(struct eventfs_inode *ei)
+{
+ if (ei)
+ kref_get(&ei->kref);
+ return ei;
+}
+
static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags);
-static int dcache_dir_open_wrapper(struct inode *inode, struct file *file);
-static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx);
-static int eventfs_release(struct inode *inode, struct file *file);
+static int eventfs_iterate(struct file *file, struct dir_context *ctx);
static void update_attr(struct eventfs_attr *attr, struct iattr *iattr)
{
@@ -94,7 +145,7 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
/* Preallocate the children mode array if necessary */
if (!(dentry->d_inode->i_mode & S_IFDIR)) {
if (!ei->entry_attrs) {
- ei->entry_attrs = kzalloc(sizeof(*ei->entry_attrs) * ei->nr_entries,
+ ei->entry_attrs = kcalloc(ei->nr_entries, sizeof(*ei->entry_attrs),
GFP_NOFS);
if (!ei->entry_attrs) {
ret = -ENOMEM;
@@ -117,10 +168,17 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
* The events directory dentry is never freed, unless its
* part of an instance that is deleted. It's attr is the
* default for its child files and directories.
- * Do not update it. It's not used for its own mode or ownership
+ * Do not update it. It's not used for its own mode or ownership.
*/
- if (!ei->is_events)
+ if (ei->is_events) {
+ /* But it still needs to know if it was modified */
+ if (iattr->ia_valid & ATTR_UID)
+ ei->attr.mode |= EVENTFS_SAVE_UID;
+ if (iattr->ia_valid & ATTR_GID)
+ ei->attr.mode |= EVENTFS_SAVE_GID;
+ } else {
update_attr(&ei->attr, iattr);
+ }
} else {
name = dentry->d_name.name;
@@ -138,9 +196,63 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
return ret;
}
+static void update_top_events_attr(struct eventfs_inode *ei, struct super_block *sb)
+{
+ struct inode *root;
+
+ /* Only update if the "events" was on the top level */
+ if (!ei || !(ei->attr.mode & EVENTFS_TOPLEVEL))
+ return;
+
+ /* Get the tracefs root inode. */
+ root = d_inode(sb->s_root);
+ ei->attr.uid = root->i_uid;
+ ei->attr.gid = root->i_gid;
+}
+
+static void set_top_events_ownership(struct inode *inode)
+{
+ struct tracefs_inode *ti = get_tracefs(inode);
+ struct eventfs_inode *ei = ti->private;
+
+ /* The top events directory doesn't get automatically updated */
+ if (!ei || !ei->is_events || !(ei->attr.mode & EVENTFS_TOPLEVEL))
+ return;
+
+ update_top_events_attr(ei, inode->i_sb);
+
+ if (!(ei->attr.mode & EVENTFS_SAVE_UID))
+ inode->i_uid = ei->attr.uid;
+
+ if (!(ei->attr.mode & EVENTFS_SAVE_GID))
+ inode->i_gid = ei->attr.gid;
+}
+
+static int eventfs_get_attr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
+{
+ struct dentry *dentry = path->dentry;
+ struct inode *inode = d_backing_inode(dentry);
+
+ set_top_events_ownership(inode);
+
+ generic_fillattr(idmap, request_mask, inode, stat);
+ return 0;
+}
+
+static int eventfs_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
+{
+ set_top_events_ownership(inode);
+ return generic_permission(idmap, inode, mask);
+}
+
static const struct inode_operations eventfs_root_dir_inode_operations = {
.lookup = eventfs_root_lookup,
.setattr = eventfs_set_attr,
+ .getattr = eventfs_get_attr,
+ .permission = eventfs_permission,
};
static const struct inode_operations eventfs_file_inode_operations = {
@@ -148,11 +260,9 @@ static const struct inode_operations eventfs_file_inode_operations = {
};
static const struct file_operations eventfs_file_operations = {
- .open = dcache_dir_open_wrapper,
.read = generic_read_dir,
- .iterate_shared = dcache_readdir_wrapper,
+ .iterate_shared = eventfs_iterate,
.llseek = generic_file_llseek,
- .release = eventfs_release,
};
/* Return the evenfs_inode of the "events" directory */
@@ -160,10 +270,11 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
{
struct eventfs_inode *ei;
- mutex_lock(&eventfs_mutex);
do {
- /* The parent always has an ei, except for events itself */
- ei = dentry->d_parent->d_fsdata;
+ // The parent is stable because we do not do renames
+ dentry = dentry->d_parent;
+ // ... and directories always have d_fsdata
+ ei = dentry->d_fsdata;
/*
* If the ei is being freed, the ownership of the children
@@ -173,10 +284,10 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
ei = NULL;
break;
}
-
- dentry = ei->dentry;
+ // Walk upwards until you find the events inode
} while (!ei->is_events);
- mutex_unlock(&eventfs_mutex);
+
+ update_top_events_attr(ei, dentry->d_sb);
return ei;
}
@@ -206,50 +317,11 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode,
inode->i_gid = attr->gid;
}
-static void update_gid(struct eventfs_inode *ei, kgid_t gid, int level)
-{
- struct eventfs_inode *ei_child;
-
- /* at most we have events/system/event */
- if (WARN_ON_ONCE(level > 3))
- return;
-
- ei->attr.gid = gid;
-
- if (ei->entry_attrs) {
- for (int i = 0; i < ei->nr_entries; i++) {
- ei->entry_attrs[i].gid = gid;
- }
- }
-
- /*
- * Only eventfs_inode with dentries are updated, make sure
- * all eventfs_inodes are updated. If one of the children
- * do not have a dentry, this function must traverse it.
- */
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- srcu_read_lock_held(&eventfs_srcu)) {
- if (!ei_child->dentry)
- update_gid(ei_child, gid, level + 1);
- }
-}
-
-void eventfs_update_gid(struct dentry *dentry, kgid_t gid)
-{
- struct eventfs_inode *ei = dentry->d_fsdata;
- int idx;
-
- idx = srcu_read_lock(&eventfs_srcu);
- update_gid(ei, gid, 0);
- srcu_read_unlock(&eventfs_srcu, idx);
-}
-
/**
- * create_file - create a file in the tracefs filesystem
- * @name: the name of the file to create.
+ * lookup_file - look up a file in the tracefs filesystem
+ * @dentry: the dentry to look up
* @mode: the permission that the file should have.
* @attr: saved attributes changed by user
- * @parent: parent dentry for this file.
* @data: something that the caller will want to get to later on.
* @fop: struct file_operations that should be used for this file.
*
@@ -257,30 +329,25 @@ void eventfs_update_gid(struct dentry *dentry, kgid_t gid)
* directory. The inode.i_private pointer will point to @data in the open()
* call.
*/
-static struct dentry *create_file(const char *name, umode_t mode,
+static struct dentry *lookup_file(struct eventfs_inode *parent_ei,
+ struct dentry *dentry,
+ umode_t mode,
struct eventfs_attr *attr,
- struct dentry *parent, void *data,
+ void *data,
const struct file_operations *fop)
{
struct tracefs_inode *ti;
- struct dentry *dentry;
struct inode *inode;
if (!(mode & S_IFMT))
mode |= S_IFREG;
if (WARN_ON_ONCE(!S_ISREG(mode)))
- return NULL;
-
- WARN_ON_ONCE(!parent);
- dentry = eventfs_start_creating(name, parent);
-
- if (IS_ERR(dentry))
- return dentry;
+ return ERR_PTR(-EIO);
inode = tracefs_get_inode(dentry->d_sb);
if (unlikely(!inode))
- return eventfs_failed_creating(dentry);
+ return ERR_PTR(-ENOMEM);
/* If the user updated the directory's attributes, use them */
update_inode_attr(dentry, inode, attr, mode);
@@ -289,34 +356,36 @@ static struct dentry *create_file(const char *name, umode_t mode,
inode->i_fop = fop;
inode->i_private = data;
+ /* All files will have the same inode number */
+ inode->i_ino = EVENTFS_FILE_INODE_INO;
+
ti = get_tracefs(inode);
ti->flags |= TRACEFS_EVENT_INODE;
- d_instantiate(dentry, inode);
- fsnotify_create(dentry->d_parent->d_inode, dentry);
- return eventfs_end_creating(dentry);
+
+ // Files have their parent's ei as their fsdata
+ dentry->d_fsdata = get_ei(parent_ei);
+
+ d_add(dentry, inode);
+ return NULL;
};
/**
- * create_dir - create a dir in the tracefs filesystem
+ * lookup_dir_entry - look up a dir in the tracefs filesystem
+ * @dentry: the directory to look up
* @ei: the eventfs_inode that represents the directory to create
- * @parent: parent dentry for this file.
*
- * This function will create a dentry for a directory represented by
+ * This function will look up a dentry for a directory represented by
* a eventfs_inode.
*/
-static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent)
+static struct dentry *lookup_dir_entry(struct dentry *dentry,
+ struct eventfs_inode *pei, struct eventfs_inode *ei)
{
struct tracefs_inode *ti;
- struct dentry *dentry;
struct inode *inode;
- dentry = eventfs_start_creating(ei->name, parent);
- if (IS_ERR(dentry))
- return dentry;
-
inode = tracefs_get_inode(dentry->d_sb);
if (unlikely(!inode))
- return eventfs_failed_creating(dentry);
+ return ERR_PTR(-ENOMEM);
/* If the user updated the directory's attributes, use them */
update_inode_attr(dentry, inode, &ei->attr,
@@ -325,247 +394,72 @@ static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent
inode->i_op = &eventfs_root_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;
+ /* All directories will have the same inode number */
+ inode->i_ino = eventfs_dir_ino(ei);
+
ti = get_tracefs(inode);
ti->flags |= TRACEFS_EVENT_INODE;
+ /* Only directories have ti->private set to an ei, not files */
+ ti->private = ei;
- inc_nlink(inode);
- d_instantiate(dentry, inode);
- inc_nlink(dentry->d_parent->d_inode);
- fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
- return eventfs_end_creating(dentry);
+ dentry->d_fsdata = get_ei(ei);
+
+ d_add(dentry, inode);
+ return NULL;
}
-static void free_ei(struct eventfs_inode *ei)
+static inline struct eventfs_inode *alloc_ei(const char *name)
{
- kfree_const(ei->name);
- kfree(ei->d_children);
- kfree(ei->entry_attrs);
- kfree(ei);
+ struct eventfs_inode *ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+
+ if (!ei)
+ return NULL;
+
+ ei->name = kstrdup_const(name, GFP_KERNEL);
+ if (!ei->name) {
+ kfree(ei);
+ return NULL;
+ }
+ kref_init(&ei->kref);
+ return ei;
}
/**
- * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode
- * @ti: the tracefs_inode of the dentry
+ * eventfs_d_release - dentry is going away
* @dentry: dentry which has the reference to remove.
*
* Remove the association between a dentry from an eventfs_inode.
*/
-void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
+void eventfs_d_release(struct dentry *dentry)
{
- struct eventfs_inode *ei;
- int i;
-
- mutex_lock(&eventfs_mutex);
-
- ei = dentry->d_fsdata;
- if (!ei)
- goto out;
-
- /* This could belong to one of the files of the ei */
- if (ei->dentry != dentry) {
- for (i = 0; i < ei->nr_entries; i++) {
- if (ei->d_children[i] == dentry)
- break;
- }
- if (WARN_ON_ONCE(i == ei->nr_entries))
- goto out;
- ei->d_children[i] = NULL;
- } else if (ei->is_freed) {
- free_ei(ei);
- } else {
- ei->dentry = NULL;
- }
-
- dentry->d_fsdata = NULL;
- out:
- mutex_unlock(&eventfs_mutex);
+ put_ei(dentry->d_fsdata);
}
/**
- * create_file_dentry - create a dentry for a file of an eventfs_inode
+ * lookup_file_dentry - create a dentry for a file of an eventfs_inode
* @ei: the eventfs_inode that the file will be created under
- * @idx: the index into the d_children[] of the @ei
+ * @idx: the index into the entry_attrs[] of the @ei
* @parent: The parent dentry of the created file.
* @name: The name of the file to create
* @mode: The mode of the file.
* @data: The data to use to set the inode of the file with on open()
* @fops: The fops of the file to be created.
- * @lookup: If called by the lookup routine, in which case, dput() the created dentry.
*
* Create a dentry for a file of an eventfs_inode @ei and place it into the
- * address located at @e_dentry. If the @e_dentry already has a dentry, then
- * just do a dget() on it and return. Otherwise create the dentry and attach it.
+ * address located at @e_dentry.
*/
static struct dentry *
-create_file_dentry(struct eventfs_inode *ei, int idx,
- struct dentry *parent, const char *name, umode_t mode, void *data,
- const struct file_operations *fops, bool lookup)
+lookup_file_dentry(struct dentry *dentry,
+ struct eventfs_inode *ei, int idx,
+ umode_t mode, void *data,
+ const struct file_operations *fops)
{
struct eventfs_attr *attr = NULL;
- struct dentry **e_dentry = &ei->d_children[idx];
- struct dentry *dentry;
-
- WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
- mutex_lock(&eventfs_mutex);
- if (ei->is_freed) {
- mutex_unlock(&eventfs_mutex);
- return NULL;
- }
- /* If the e_dentry already has a dentry, use it */
- if (*e_dentry) {
- /* lookup does not need to up the ref count */
- if (!lookup)
- dget(*e_dentry);
- mutex_unlock(&eventfs_mutex);
- return *e_dentry;
- }
-
- /* ei->entry_attrs are protected by SRCU */
if (ei->entry_attrs)
attr = &ei->entry_attrs[idx];
- mutex_unlock(&eventfs_mutex);
-
- dentry = create_file(name, mode, attr, parent, data, fops);
-
- mutex_lock(&eventfs_mutex);
-
- if (IS_ERR_OR_NULL(dentry)) {
- /*
- * When the mutex was released, something else could have
- * created the dentry for this e_dentry. In which case
- * use that one.
- *
- * If ei->is_freed is set, the e_dentry is currently on its
- * way to being freed, don't return it. If e_dentry is NULL
- * it means it was already freed.
- */
- if (ei->is_freed)
- dentry = NULL;
- else
- dentry = *e_dentry;
- /* The lookup does not need to up the dentry refcount */
- if (dentry && !lookup)
- dget(dentry);
- mutex_unlock(&eventfs_mutex);
- return dentry;
- }
-
- if (!*e_dentry && !ei->is_freed) {
- *e_dentry = dentry;
- dentry->d_fsdata = ei;
- } else {
- /*
- * Should never happen unless we get here due to being freed.
- * Otherwise it means two dentries exist with the same name.
- */
- WARN_ON_ONCE(!ei->is_freed);
- dentry = NULL;
- }
- mutex_unlock(&eventfs_mutex);
-
- if (lookup)
- dput(dentry);
-
- return dentry;
-}
-
-/**
- * eventfs_post_create_dir - post create dir routine
- * @ei: eventfs_inode of recently created dir
- *
- * Map the meta-data of files within an eventfs dir to their parent dentry
- */
-static void eventfs_post_create_dir(struct eventfs_inode *ei)
-{
- struct eventfs_inode *ei_child;
- struct tracefs_inode *ti;
-
- lockdep_assert_held(&eventfs_mutex);
-
- /* srcu lock already held */
- /* fill parent-child relation */
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- srcu_read_lock_held(&eventfs_srcu)) {
- ei_child->d_parent = ei->dentry;
- }
-
- ti = get_tracefs(ei->dentry->d_inode);
- ti->private = ei;
-}
-
-/**
- * create_dir_dentry - Create a directory dentry for the eventfs_inode
- * @pei: The eventfs_inode parent of ei.
- * @ei: The eventfs_inode to create the directory for
- * @parent: The dentry of the parent of this directory
- * @lookup: True if this is called by the lookup code
- *
- * This creates and attaches a directory dentry to the eventfs_inode @ei.
- */
-static struct dentry *
-create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei,
- struct dentry *parent, bool lookup)
-{
- struct dentry *dentry = NULL;
-
- WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
-
- mutex_lock(&eventfs_mutex);
- if (pei->is_freed || ei->is_freed) {
- mutex_unlock(&eventfs_mutex);
- return NULL;
- }
- if (ei->dentry) {
- /* If the dentry already has a dentry, use it */
- dentry = ei->dentry;
- /* lookup does not need to up the ref count */
- if (!lookup)
- dget(dentry);
- mutex_unlock(&eventfs_mutex);
- return dentry;
- }
- mutex_unlock(&eventfs_mutex);
-
- dentry = create_dir(ei, parent);
-
- mutex_lock(&eventfs_mutex);
-
- if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) {
- /*
- * When the mutex was released, something else could have
- * created the dentry for this e_dentry. In which case
- * use that one.
- *
- * If ei->is_freed is set, the e_dentry is currently on its
- * way to being freed.
- */
- dentry = ei->dentry;
- if (dentry && !lookup)
- dget(dentry);
- mutex_unlock(&eventfs_mutex);
- return dentry;
- }
-
- if (!ei->dentry && !ei->is_freed) {
- ei->dentry = dentry;
- eventfs_post_create_dir(ei);
- dentry->d_fsdata = ei;
- } else {
- /*
- * Should never happen unless we get here due to being freed.
- * Otherwise it means two dentries exist with the same name.
- */
- WARN_ON_ONCE(!ei->is_freed);
- dentry = NULL;
- }
- mutex_unlock(&eventfs_mutex);
-
- if (lookup)
- dput(dentry);
-
- return dentry;
+ return lookup_file(ei, dentry, mode, attr, data, fops);
}
/**
@@ -582,250 +476,153 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags)
{
- const struct file_operations *fops;
- const struct eventfs_entry *entry;
struct eventfs_inode *ei_child;
struct tracefs_inode *ti;
struct eventfs_inode *ei;
- struct dentry *ei_dentry = NULL;
- struct dentry *ret = NULL;
const char *name = dentry->d_name.name;
- bool created = false;
- umode_t mode;
- void *data;
- int idx;
- int i;
- int r;
+ struct dentry *result = NULL;
ti = get_tracefs(dir);
if (!(ti->flags & TRACEFS_EVENT_INODE))
- return NULL;
-
- /* Grab srcu to prevent the ei from going away */
- idx = srcu_read_lock(&eventfs_srcu);
+ return ERR_PTR(-EIO);
- /*
- * Grab the eventfs_mutex to consistent value from ti->private.
- * This s
- */
mutex_lock(&eventfs_mutex);
- ei = READ_ONCE(ti->private);
- if (ei && !ei->is_freed)
- ei_dentry = READ_ONCE(ei->dentry);
- mutex_unlock(&eventfs_mutex);
- if (!ei || !ei_dentry)
+ ei = ti->private;
+ if (!ei || ei->is_freed)
goto out;
- data = ei->data;
-
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- srcu_read_lock_held(&eventfs_srcu)) {
+ list_for_each_entry(ei_child, &ei->children, list) {
if (strcmp(ei_child->name, name) != 0)
continue;
- ret = simple_lookup(dir, dentry, flags);
- if (IS_ERR(ret))
+ if (ei_child->is_freed)
goto out;
- create_dir_dentry(ei, ei_child, ei_dentry, true);
- created = true;
- break;
- }
-
- if (created)
+ result = lookup_dir_entry(dentry, ei, ei_child);
goto out;
-
- for (i = 0; i < ei->nr_entries; i++) {
- entry = &ei->entries[i];
- if (strcmp(name, entry->name) == 0) {
- void *cdata = data;
- mutex_lock(&eventfs_mutex);
- /* If ei->is_freed, then the event itself may be too */
- if (!ei->is_freed)
- r = entry->callback(name, &mode, &cdata, &fops);
- else
- r = -1;
- mutex_unlock(&eventfs_mutex);
- if (r <= 0)
- continue;
- ret = simple_lookup(dir, dentry, flags);
- if (IS_ERR(ret))
- goto out;
- create_file_dentry(ei, i, ei_dentry, name, mode, cdata,
- fops, true);
- break;
- }
}
- out:
- srcu_read_unlock(&eventfs_srcu, idx);
- return ret;
-}
-
-struct dentry_list {
- void *cursor;
- struct dentry **dentries;
-};
-/**
- * eventfs_release - called to release eventfs file/dir
- * @inode: inode to be released
- * @file: file to be released (not used)
- */
-static int eventfs_release(struct inode *inode, struct file *file)
-{
- struct tracefs_inode *ti;
- struct dentry_list *dlist = file->private_data;
- void *cursor;
- int i;
+ for (int i = 0; i < ei->nr_entries; i++) {
+ void *data;
+ umode_t mode;
+ const struct file_operations *fops;
+ const struct eventfs_entry *entry = &ei->entries[i];
- ti = get_tracefs(inode);
- if (!(ti->flags & TRACEFS_EVENT_INODE))
- return -EINVAL;
+ if (strcmp(name, entry->name) != 0)
+ continue;
- if (WARN_ON_ONCE(!dlist))
- return -EINVAL;
+ data = ei->data;
+ if (entry->callback(name, &mode, &data, &fops) <= 0)
+ goto out;
- for (i = 0; dlist->dentries && dlist->dentries[i]; i++) {
- dput(dlist->dentries[i]);
+ result = lookup_file_dentry(dentry, ei, i, mode, data, fops);
+ goto out;
}
-
- cursor = dlist->cursor;
- kfree(dlist->dentries);
- kfree(dlist);
- file->private_data = cursor;
- return dcache_dir_close(inode, file);
-}
-
-static int add_dentries(struct dentry ***dentries, struct dentry *d, int cnt)
-{
- struct dentry **tmp;
-
- tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_NOFS);
- if (!tmp)
- return -1;
- tmp[cnt] = d;
- tmp[cnt + 1] = NULL;
- *dentries = tmp;
- return 0;
+ out:
+ mutex_unlock(&eventfs_mutex);
+ return result;
}
-/**
- * dcache_dir_open_wrapper - eventfs open wrapper
- * @inode: not used
- * @file: dir to be opened (to create it's children)
- *
- * Used to dynamic create file/dir with-in @file, all the
- * file/dir will be created. If already created then references
- * will be increased
+/*
+ * Walk the children of a eventfs_inode to fill in getdents().
*/
-static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
+static int eventfs_iterate(struct file *file, struct dir_context *ctx)
{
const struct file_operations *fops;
+ struct inode *f_inode = file_inode(file);
const struct eventfs_entry *entry;
struct eventfs_inode *ei_child;
struct tracefs_inode *ti;
struct eventfs_inode *ei;
- struct dentry_list *dlist;
- struct dentry **dentries = NULL;
- struct dentry *parent = file_dentry(file);
- struct dentry *d;
- struct inode *f_inode = file_inode(file);
- const char *name = parent->d_name.name;
+ const char *name;
umode_t mode;
- void *data;
- int cnt = 0;
int idx;
- int ret;
- int i;
- int r;
+ int ret = -EINVAL;
+ int ino;
+ int i, r, c;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
ti = get_tracefs(f_inode);
if (!(ti->flags & TRACEFS_EVENT_INODE))
return -EINVAL;
- if (WARN_ON_ONCE(file->private_data))
- return -EINVAL;
+ c = ctx->pos - 2;
idx = srcu_read_lock(&eventfs_srcu);
mutex_lock(&eventfs_mutex);
ei = READ_ONCE(ti->private);
+ if (ei && ei->is_freed)
+ ei = NULL;
mutex_unlock(&eventfs_mutex);
- if (!ei) {
- srcu_read_unlock(&eventfs_srcu, idx);
- return -EINVAL;
- }
-
+ if (!ei)
+ goto out;
- data = ei->data;
+ /*
+ * Need to create the dentries and inodes to have a consistent
+ * inode number.
+ */
+ ret = 0;
- dlist = kmalloc(sizeof(*dlist), GFP_KERNEL);
- if (!dlist) {
- srcu_read_unlock(&eventfs_srcu, idx);
- return -ENOMEM;
- }
+ /* Start at 'c' to jump over already read entries */
+ for (i = c; i < ei->nr_entries; i++, ctx->pos++) {
+ void *cdata = ei->data;
- inode_lock(parent->d_inode);
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- srcu_read_lock_held(&eventfs_srcu)) {
- d = create_dir_dentry(ei, ei_child, parent, false);
- if (d) {
- ret = add_dentries(&dentries, d, cnt);
- if (ret < 0)
- break;
- cnt++;
- }
- }
-
- for (i = 0; i < ei->nr_entries; i++) {
- void *cdata = data;
entry = &ei->entries[i];
name = entry->name;
+
mutex_lock(&eventfs_mutex);
- /* If ei->is_freed, then the event itself may be too */
- if (!ei->is_freed)
- r = entry->callback(name, &mode, &cdata, &fops);
- else
- r = -1;
+ /* If ei->is_freed then just bail here, nothing more to do */
+ if (ei->is_freed) {
+ mutex_unlock(&eventfs_mutex);
+ goto out;
+ }
+ r = entry->callback(name, &mode, &cdata, &fops);
mutex_unlock(&eventfs_mutex);
if (r <= 0)
continue;
- d = create_file_dentry(ei, i, parent, name, mode, cdata, fops, false);
- if (d) {
- ret = add_dentries(&dentries, d, cnt);
- if (ret < 0)
- break;
- cnt++;
+
+ ino = EVENTFS_FILE_INODE_INO;
+
+ if (!dir_emit(ctx, name, strlen(name), ino, DT_REG))
+ goto out;
+ }
+
+ /* Subtract the skipped entries above */
+ c -= min((unsigned int)c, (unsigned int)ei->nr_entries);
+
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
+ srcu_read_lock_held(&eventfs_srcu)) {
+
+ if (c > 0) {
+ c--;
+ continue;
}
+
+ ctx->pos++;
+
+ if (ei_child->is_freed)
+ continue;
+
+ name = ei_child->name;
+
+ ino = eventfs_dir_ino(ei_child);
+
+ if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR))
+ goto out_dec;
}
- inode_unlock(parent->d_inode);
+ ret = 1;
+ out:
srcu_read_unlock(&eventfs_srcu, idx);
- ret = dcache_dir_open(inode, file);
- /*
- * dcache_dir_open() sets file->private_data to a dentry cursor.
- * Need to save that but also save all the dentries that were
- * opened by this function.
- */
- dlist->cursor = file->private_data;
- dlist->dentries = dentries;
- file->private_data = dlist;
return ret;
-}
-
-/*
- * This just sets the file->private_data back to the cursor and back.
- */
-static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx)
-{
- struct dentry_list *dlist = file->private_data;
- int ret;
- file->private_data = dlist->cursor;
- ret = dcache_readdir(file, ctx);
- dlist->cursor = file->private_data;
- file->private_data = dlist;
- return ret;
+ out_dec:
+ /* Incremented ctx->pos without adding something, reset it */
+ ctx->pos--;
+ goto out;
}
/**
@@ -872,25 +669,10 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
if (!parent)
return ERR_PTR(-EINVAL);
- ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+ ei = alloc_ei(name);
if (!ei)
return ERR_PTR(-ENOMEM);
- ei->name = kstrdup_const(name, GFP_KERNEL);
- if (!ei->name) {
- kfree(ei);
- return ERR_PTR(-ENOMEM);
- }
-
- if (size) {
- ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
- if (!ei->d_children) {
- kfree_const(ei->name);
- kfree(ei);
- return ERR_PTR(-ENOMEM);
- }
- }
-
ei->entries = entries;
ei->nr_entries = size;
ei->data = data;
@@ -898,10 +680,8 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
INIT_LIST_HEAD(&ei->list);
mutex_lock(&eventfs_mutex);
- if (!parent->is_freed) {
+ if (!parent->is_freed)
list_add_tail(&ei->list, &parent->children);
- ei->d_parent = parent->dentry;
- }
mutex_unlock(&eventfs_mutex);
/* Was the parent freed? */
@@ -941,33 +721,33 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
if (IS_ERR(dentry))
return ERR_CAST(dentry);
- ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+ ei = alloc_ei(name);
if (!ei)
- goto fail_ei;
+ goto fail;
inode = tracefs_get_inode(dentry->d_sb);
if (unlikely(!inode))
goto fail;
- if (size) {
- ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
- if (!ei->d_children)
- goto fail;
- }
-
- ei->dentry = dentry;
+ // Note: we have a ref to the dentry from tracefs_start_creating()
+ ei->events_dir = dentry;
ei->entries = entries;
ei->nr_entries = size;
ei->is_events = 1;
ei->data = data;
- ei->name = kstrdup_const(name, GFP_KERNEL);
- if (!ei->name)
- goto fail;
/* Save the ownership of this directory */
uid = d_inode(dentry->d_parent)->i_uid;
gid = d_inode(dentry->d_parent)->i_gid;
+ /*
+ * If the events directory is of the top instance, then parent
+ * is NULL. Set the attr.mode to reflect this and its permissions will
+ * default to the tracefs root dentry.
+ */
+ if (!parent)
+ ei->attr.mode = EVENTFS_TOPLEVEL;
+
/* This is used as the default ownership of the files and directories */
ei->attr.uid = uid;
ei->attr.gid = gid;
@@ -985,11 +765,19 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
inode->i_op = &eventfs_root_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;
- dentry->d_fsdata = ei;
+ dentry->d_fsdata = get_ei(ei);
- /* directory inodes start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
+ /*
+ * Keep all eventfs directories with i_nlink == 1.
+ * Due to the dynamic nature of the dentry creations and not
+ * wanting to add a pointer to the parent eventfs_inode in the
+ * eventfs_inode structure, keeping the i_nlink in sync with the
+ * number of directories would cause too much complexity for
+ * something not worth much. Keeping directory links at 1
+ * tells userspace not to trust the link number.
+ */
d_instantiate(dentry, inode);
+ /* The dentry of the "events" parent does keep track though */
inc_nlink(dentry->d_parent->d_inode);
fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
tracefs_end_creating(dentry);
@@ -997,72 +785,11 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
return ei;
fail:
- kfree(ei->d_children);
- kfree(ei);
- fail_ei:
+ free_ei(ei);
tracefs_failed_creating(dentry);
return ERR_PTR(-ENOMEM);
}
-static LLIST_HEAD(free_list);
-
-static void eventfs_workfn(struct work_struct *work)
-{
- struct eventfs_inode *ei, *tmp;
- struct llist_node *llnode;
-
- llnode = llist_del_all(&free_list);
- llist_for_each_entry_safe(ei, tmp, llnode, llist) {
- /* This dput() matches the dget() from unhook_dentry() */
- for (int i = 0; i < ei->nr_entries; i++) {
- if (ei->d_children[i])
- dput(ei->d_children[i]);
- }
- /* This should only get here if it had a dentry */
- if (!WARN_ON_ONCE(!ei->dentry))
- dput(ei->dentry);
- }
-}
-
-static DECLARE_WORK(eventfs_work, eventfs_workfn);
-
-static void free_rcu_ei(struct rcu_head *head)
-{
- struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu);
-
- if (ei->dentry) {
- /* Do not free the ei until all references of dentry are gone */
- if (llist_add(&ei->llist, &free_list))
- queue_work(system_unbound_wq, &eventfs_work);
- return;
- }
-
- /* If the ei doesn't have a dentry, neither should its children */
- for (int i = 0; i < ei->nr_entries; i++) {
- WARN_ON_ONCE(ei->d_children[i]);
- }
-
- free_ei(ei);
-}
-
-static void unhook_dentry(struct dentry *dentry)
-{
- if (!dentry)
- return;
- /*
- * Need to add a reference to the dentry that is expected by
- * simple_recursive_removal(), which will include a dput().
- */
- dget(dentry);
-
- /*
- * Also add a reference for the dput() in eventfs_workfn().
- * That is required as that dput() will free the ei after
- * the SRCU grace period is over.
- */
- dget(dentry);
-}
-
/**
* eventfs_remove_rec - remove eventfs dir or file from list
* @ei: eventfs_inode to be removed.
@@ -1075,8 +802,6 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
{
struct eventfs_inode *ei_child;
- if (!ei)
- return;
/*
* Check recursion depth. It should never be greater than 3:
* 0 - events/
@@ -1088,28 +813,11 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
return;
/* search for nested folders or files */
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- lockdep_is_held(&eventfs_mutex)) {
- /* Children only have dentry if parent does */
- WARN_ON_ONCE(ei_child->dentry && !ei->dentry);
+ list_for_each_entry(ei_child, &ei->children, list)
eventfs_remove_rec(ei_child, level + 1);
- }
-
-
- ei->is_freed = 1;
- for (int i = 0; i < ei->nr_entries; i++) {
- if (ei->d_children[i]) {
- /* Children only have dentry if parent does */
- WARN_ON_ONCE(!ei->dentry);
- unhook_dentry(ei->d_children[i]);
- }
- }
-
- unhook_dentry(ei->dentry);
-
- list_del_rcu(&ei->list);
- call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei);
+ list_del(&ei->list);
+ free_ei(ei);
}
/**
@@ -1120,22 +828,12 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
*/
void eventfs_remove_dir(struct eventfs_inode *ei)
{
- struct dentry *dentry;
-
if (!ei)
return;
mutex_lock(&eventfs_mutex);
- dentry = ei->dentry;
eventfs_remove_rec(ei, 0);
mutex_unlock(&eventfs_mutex);
-
- /*
- * If any of the ei children has a dentry, then the ei itself
- * must have a dentry.
- */
- if (dentry)
- simple_recursive_removal(dentry, NULL);
}
/**
@@ -1148,7 +846,11 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei)
{
struct dentry *dentry;
- dentry = ei->dentry;
+ dentry = ei->events_dir;
+ if (!dentry)
+ return;
+
+ ei->events_dir = NULL;
eventfs_remove_dir(ei);
/*
@@ -1158,5 +860,6 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei)
* sticks around while the other ei->dentry are created
* and destroyed dynamically.
*/
+ d_invalidate(dentry);
dput(dentry);
}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index ad20e6af938d..d65ffad4c327 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -38,8 +38,6 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb)
if (!ti)
return NULL;
- ti->flags = 0;
-
return &ti->vfs_inode;
}
@@ -91,6 +89,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
struct inode *inode, struct dentry *dentry,
umode_t mode)
{
+ struct tracefs_inode *ti;
char *name;
int ret;
@@ -99,6 +98,15 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
return -ENOMEM;
/*
+ * This is a new directory that does not take the default of
+ * the rootfs. It becomes the default permissions for all the
+ * files and directories underneath it.
+ */
+ ti = get_tracefs(inode);
+ ti->flags |= TRACEFS_INSTANCE_INODE;
+ ti->private = inode;
+
+ /*
* The mkdir call can call the generic functions that create
* the files within the tracefs system. It is up to the individual
* mkdir routine to handle races.
@@ -141,10 +149,76 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
return ret;
}
-static const struct inode_operations tracefs_dir_inode_operations = {
+static void set_tracefs_inode_owner(struct inode *inode)
+{
+ struct tracefs_inode *ti = get_tracefs(inode);
+ struct inode *root_inode = ti->private;
+
+ /*
+ * If this inode has never been referenced, then update
+ * the permissions to the superblock.
+ */
+ if (!(ti->flags & TRACEFS_UID_PERM_SET))
+ inode->i_uid = root_inode->i_uid;
+
+ if (!(ti->flags & TRACEFS_GID_PERM_SET))
+ inode->i_gid = root_inode->i_gid;
+}
+
+static int tracefs_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
+{
+ set_tracefs_inode_owner(inode);
+ return generic_permission(idmap, inode, mask);
+}
+
+static int tracefs_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
+{
+ struct inode *inode = d_backing_inode(path->dentry);
+
+ set_tracefs_inode_owner(inode);
+ generic_fillattr(idmap, request_mask, inode, stat);
+ return 0;
+}
+
+static int tracefs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ unsigned int ia_valid = attr->ia_valid;
+ struct inode *inode = d_inode(dentry);
+ struct tracefs_inode *ti = get_tracefs(inode);
+
+ if (ia_valid & ATTR_UID)
+ ti->flags |= TRACEFS_UID_PERM_SET;
+
+ if (ia_valid & ATTR_GID)
+ ti->flags |= TRACEFS_GID_PERM_SET;
+
+ return simple_setattr(idmap, dentry, attr);
+}
+
+static const struct inode_operations tracefs_instance_dir_inode_operations = {
.lookup = simple_lookup,
.mkdir = tracefs_syscall_mkdir,
.rmdir = tracefs_syscall_rmdir,
+ .permission = tracefs_permission,
+ .getattr = tracefs_getattr,
+ .setattr = tracefs_setattr,
+};
+
+static const struct inode_operations tracefs_dir_inode_operations = {
+ .lookup = simple_lookup,
+ .permission = tracefs_permission,
+ .getattr = tracefs_getattr,
+ .setattr = tracefs_setattr,
+};
+
+static const struct inode_operations tracefs_file_inode_operations = {
+ .permission = tracefs_permission,
+ .getattr = tracefs_getattr,
+ .setattr = tracefs_setattr,
};
struct inode *tracefs_get_inode(struct super_block *sb)
@@ -183,82 +257,6 @@ struct tracefs_fs_info {
struct tracefs_mount_opts mount_opts;
};
-static void change_gid(struct dentry *dentry, kgid_t gid)
-{
- if (!dentry->d_inode)
- return;
- dentry->d_inode->i_gid = gid;
-}
-
-/*
- * Taken from d_walk, but without he need for handling renames.
- * Nothing can be renamed while walking the list, as tracefs
- * does not support renames. This is only called when mounting
- * or remounting the file system, to set all the files to
- * the given gid.
- */
-static void set_gid(struct dentry *parent, kgid_t gid)
-{
- struct dentry *this_parent, *dentry;
-
- this_parent = parent;
- spin_lock(&this_parent->d_lock);
-
- change_gid(this_parent, gid);
-repeat:
- dentry = d_first_child(this_parent);
-resume:
- hlist_for_each_entry_from(dentry, d_sib) {
- struct tracefs_inode *ti;
-
- /* Note, getdents() can add a cursor dentry with no inode */
- if (!dentry->d_inode)
- continue;
-
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-
- change_gid(dentry, gid);
-
- /* If this is the events directory, update that too */
- ti = get_tracefs(dentry->d_inode);
- if (ti && (ti->flags & TRACEFS_EVENT_INODE))
- eventfs_update_gid(dentry, gid);
-
- if (!hlist_empty(&dentry->d_children)) {
- spin_unlock(&this_parent->d_lock);
- spin_release(&dentry->d_lock.dep_map, _RET_IP_);
- this_parent = dentry;
- spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
- goto repeat;
- }
- spin_unlock(&dentry->d_lock);
- }
- /*
- * All done at this level ... ascend and resume the search.
- */
- rcu_read_lock();
-ascend:
- if (this_parent != parent) {
- dentry = this_parent;
- this_parent = dentry->d_parent;
-
- spin_unlock(&dentry->d_lock);
- spin_lock(&this_parent->d_lock);
-
- /* go into the first sibling still alive */
- hlist_for_each_entry_continue(dentry, d_sib) {
- if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) {
- rcu_read_unlock();
- goto resume;
- }
- }
- goto ascend;
- }
- rcu_read_unlock();
- spin_unlock(&this_parent->d_lock);
- return;
-}
-
static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
{
substring_t args[MAX_OPT_ARGS];
@@ -331,10 +329,8 @@ static int tracefs_apply_options(struct super_block *sb, bool remount)
if (!remount || opts->opts & BIT(Opt_uid))
inode->i_uid = opts->uid;
- if (!remount || opts->opts & BIT(Opt_gid)) {
- /* Set all the group ids to the mount option */
- set_gid(sb->s_root, opts->gid);
- }
+ if (!remount || opts->opts & BIT(Opt_gid))
+ inode->i_gid = opts->gid;
return 0;
}
@@ -381,21 +377,30 @@ static const struct super_operations tracefs_super_operations = {
.show_options = tracefs_show_options,
};
-static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
+/*
+ * It would be cleaner if eventfs had its own dentry ops.
+ *
+ * Note that d_revalidate is called potentially under RCU,
+ * so it can't take the eventfs mutex etc. It's fine - if
+ * we open a file just as it's marked dead, things will
+ * still work just fine, and just see the old stale case.
+ */
+static void tracefs_d_release(struct dentry *dentry)
{
- struct tracefs_inode *ti;
+ if (dentry->d_fsdata)
+ eventfs_d_release(dentry);
+}
- if (!dentry || !inode)
- return;
+static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct eventfs_inode *ei = dentry->d_fsdata;
- ti = get_tracefs(inode);
- if (ti && ti->flags & TRACEFS_EVENT_INODE)
- eventfs_set_ei_status_free(ti, dentry);
- iput(inode);
+ return !(ei && ei->is_freed);
}
static const struct dentry_operations tracefs_dentry_operations = {
- .d_iput = tracefs_dentry_iput,
+ .d_revalidate = tracefs_d_revalidate,
+ .d_release = tracefs_d_release,
};
static int trace_fill_super(struct super_block *sb, void *data, int silent)
@@ -499,73 +504,24 @@ struct dentry *tracefs_end_creating(struct dentry *dentry)
return dentry;
}
-/**
- * eventfs_start_creating - start the process of creating a dentry
- * @name: Name of the file created for the dentry
- * @parent: The parent dentry where this dentry will be created
- *
- * This is a simple helper function for the dynamically created eventfs
- * files. When the directory of the eventfs files are accessed, their
- * dentries are created on the fly. This function is used to start that
- * process.
- */
-struct dentry *eventfs_start_creating(const char *name, struct dentry *parent)
+/* Find the inode that this will use for default */
+static struct inode *instance_inode(struct dentry *parent, struct inode *inode)
{
- struct dentry *dentry;
- int error;
-
- /* Must always have a parent. */
- if (WARN_ON_ONCE(!parent))
- return ERR_PTR(-EINVAL);
-
- error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
- &tracefs_mount_count);
- if (error)
- return ERR_PTR(error);
+ struct tracefs_inode *ti;
- if (unlikely(IS_DEADDIR(parent->d_inode)))
- dentry = ERR_PTR(-ENOENT);
- else
- dentry = lookup_one_len(name, parent, strlen(name));
+ /* If parent is NULL then use root inode */
+ if (!parent)
+ return d_inode(inode->i_sb->s_root);
- if (!IS_ERR(dentry) && dentry->d_inode) {
- dput(dentry);
- dentry = ERR_PTR(-EEXIST);
+ /* Find the inode that is flagged as an instance or the root inode */
+ while (!IS_ROOT(parent)) {
+ ti = get_tracefs(d_inode(parent));
+ if (ti->flags & TRACEFS_INSTANCE_INODE)
+ break;
+ parent = parent->d_parent;
}
- if (IS_ERR(dentry))
- simple_release_fs(&tracefs_mount, &tracefs_mount_count);
-
- return dentry;
-}
-
-/**
- * eventfs_failed_creating - clean up a failed eventfs dentry creation
- * @dentry: The dentry to clean up
- *
- * If after calling eventfs_start_creating(), a failure is detected, the
- * resources created by eventfs_start_creating() needs to be cleaned up. In
- * that case, this function should be called to perform that clean up.
- */
-struct dentry *eventfs_failed_creating(struct dentry *dentry)
-{
- dput(dentry);
- simple_release_fs(&tracefs_mount, &tracefs_mount_count);
- return NULL;
-}
-
-/**
- * eventfs_end_creating - Finish the process of creating a eventfs dentry
- * @dentry: The dentry that has successfully been created.
- *
- * This function is currently just a place holder to match
- * eventfs_start_creating(). In case any synchronization needs to be added,
- * this function will be used to implement that without having to modify
- * the callers of eventfs_start_creating().
- */
-struct dentry *eventfs_end_creating(struct dentry *dentry)
-{
- return dentry;
+ return d_inode(parent);
}
/**
@@ -598,6 +554,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops)
{
+ struct tracefs_inode *ti;
struct dentry *dentry;
struct inode *inode;
@@ -616,7 +573,11 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
if (unlikely(!inode))
return tracefs_failed_creating(dentry);
+ ti = get_tracefs(inode);
+ ti->private = instance_inode(parent, inode);
+
inode->i_mode = mode;
+ inode->i_op = &tracefs_file_inode_operations;
inode->i_fop = fops ? fops : &tracefs_file_operations;
inode->i_private = data;
inode->i_uid = d_inode(dentry->d_parent)->i_uid;
@@ -629,6 +590,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
static struct dentry *__create_dir(const char *name, struct dentry *parent,
const struct inode_operations *ops)
{
+ struct tracefs_inode *ti;
struct dentry *dentry = tracefs_start_creating(name, parent);
struct inode *inode;
@@ -646,6 +608,9 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent,
inode->i_uid = d_inode(dentry->d_parent)->i_uid;
inode->i_gid = d_inode(dentry->d_parent)->i_gid;
+ ti = get_tracefs(inode);
+ ti->private = instance_inode(parent, inode);
+
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
d_instantiate(dentry, inode);
@@ -676,7 +641,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
if (security_locked_down(LOCKDOWN_TRACEFS))
return NULL;
- return __create_dir(name, parent, &simple_dir_inode_operations);
+ return __create_dir(name, parent, &tracefs_dir_inode_operations);
}
/**
@@ -707,7 +672,7 @@ __init struct dentry *tracefs_create_instance_dir(const char *name,
if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir))
return NULL;
- dentry = __create_dir(name, parent, &tracefs_dir_inode_operations);
+ dentry = __create_dir(name, parent, &tracefs_instance_dir_inode_operations);
if (!dentry)
return NULL;
@@ -752,7 +717,11 @@ static void init_once(void *foo)
{
struct tracefs_inode *ti = (struct tracefs_inode *) foo;
+ /* inode_init_once() calls memset() on the vfs_inode portion */
inode_init_once(&ti->vfs_inode);
+
+ /* Zero out the rest */
+ memset_after(ti, 0, vfs_inode);
}
static int __init tracefs_init(void)
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 42bdeb471a07..beb3dcd0e434 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -5,12 +5,16 @@
enum {
TRACEFS_EVENT_INODE = BIT(1),
TRACEFS_EVENT_TOP_INODE = BIT(2),
+ TRACEFS_GID_PERM_SET = BIT(3),
+ TRACEFS_UID_PERM_SET = BIT(4),
+ TRACEFS_INSTANCE_INODE = BIT(5),
};
struct tracefs_inode {
+ struct inode vfs_inode;
+ /* The below gets initialized with memset_after(ti, 0, vfs_inode) */
unsigned long flags;
void *private;
- struct inode vfs_inode;
};
/*
@@ -28,42 +32,37 @@ struct eventfs_attr {
/*
* struct eventfs_inode - hold the properties of the eventfs directories.
* @list: link list into the parent directory
+ * @rcu: Union with @list for freeing
+ * @children: link list into the child eventfs_inode
* @entries: the array of entries representing the files in the directory
* @name: the name of the directory to create
- * @children: link list into the child eventfs_inode
- * @dentry: the dentry of the directory
- * @d_parent: pointer to the parent's dentry
- * @d_children: The array of dentries to represent the files when created
+ * @events_dir: the dentry of the events directory
* @entry_attrs: Saved mode and ownership of the @d_children
- * @attr: Saved mode and ownership of eventfs_inode itself
* @data: The private data to pass to the callbacks
+ * @attr: Saved mode and ownership of eventfs_inode itself
* @is_freed: Flag set if the eventfs is on its way to be freed
* Note if is_freed is set, then dentry is corrupted.
+ * @is_events: Flag set for only the top level "events" directory
* @nr_entries: The number of items in @entries
+ * @ino: The saved inode number
*/
struct eventfs_inode {
- struct list_head list;
+ union {
+ struct list_head list;
+ struct rcu_head rcu;
+ };
+ struct list_head children;
const struct eventfs_entry *entries;
const char *name;
- struct list_head children;
- struct dentry *dentry; /* Check is_freed to access */
- struct dentry *d_parent;
- struct dentry **d_children;
+ struct dentry *events_dir;
struct eventfs_attr *entry_attrs;
- struct eventfs_attr attr;
void *data;
- /*
- * Union - used for deletion
- * @llist: for calling dput() if needed after RCU
- * @rcu: eventfs_inode to delete in RCU
- */
- union {
- struct llist_node llist;
- struct rcu_head rcu;
- };
+ struct eventfs_attr attr;
+ struct kref kref;
unsigned int is_freed:1;
unsigned int is_events:1;
unsigned int nr_entries:30;
+ unsigned int ino;
};
static inline struct tracefs_inode *get_tracefs(const struct inode *inode)
@@ -75,10 +74,7 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent);
struct dentry *tracefs_end_creating(struct dentry *dentry);
struct dentry *tracefs_failed_creating(struct dentry *dentry);
struct inode *tracefs_get_inode(struct super_block *sb);
-struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
-struct dentry *eventfs_failed_creating(struct dentry *dentry);
-struct dentry *eventfs_end_creating(struct dentry *dentry);
-void eventfs_update_gid(struct dentry *dentry, kgid_t gid);
-void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry);
+
+void eventfs_d_release(struct dentry *dentry);
#endif /* _TRACEFS_INTERNAL_H */
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index 0d561ecb6869..a4a0158f712d 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -18,7 +18,7 @@
#include "ubifs.h"
/**
- * ubifs_node_calc_hash - calculate the hash of a UBIFS node
+ * __ubifs_node_calc_hash - calculate the hash of a UBIFS node
* @c: UBIFS file-system description object
* @node: the node to calculate a hash for
* @hash: the returned hash
@@ -507,28 +507,13 @@ out:
*/
int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac)
{
- SHASH_DESC_ON_STACK(shash, c->hmac_tfm);
- int err;
const char well_known_message[] = "UBIFS";
if (!ubifs_authenticated(c))
return 0;
- shash->tfm = c->hmac_tfm;
-
- err = crypto_shash_init(shash);
- if (err)
- return err;
-
- err = crypto_shash_update(shash, well_known_message,
- sizeof(well_known_message) - 1);
- if (err < 0)
- return err;
-
- err = crypto_shash_final(shash, hmac);
- if (err)
- return err;
- return 0;
+ return crypto_shash_tfm_digest(c->hmac_tfm, well_known_message,
+ sizeof(well_known_message) - 1, hmac);
}
/*
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index c4fc1047fc07..5b3a840098b0 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -70,18 +70,29 @@ static int nothing_to_commit(struct ubifs_info *c)
return 0;
/*
+ * Increasing @c->dirty_pn_cnt/@c->dirty_nn_cnt and marking
+ * nnodes/pnodes as dirty in run_gc() could race with following
+ * checking, which leads inconsistent states between @c->nroot
+ * and @c->dirty_pn_cnt/@c->dirty_nn_cnt, holding @c->lp_mutex
+ * to avoid that.
+ */
+ mutex_lock(&c->lp_mutex);
+ /*
* Even though the TNC is clean, the LPT tree may have dirty nodes. For
* example, this may happen if the budgeting subsystem invoked GC to
* make some free space, and the GC found an LEB with only dirty and
* free space. In this case GC would just change the lprops of this
* LEB (by turning all space into free space) and unmap it.
*/
- if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags))
+ if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags)) {
+ mutex_unlock(&c->lp_mutex);
return 0;
+ }
ubifs_assert(c, atomic_long_read(&c->dirty_zn_cnt) == 0);
ubifs_assert(c, c->dirty_pn_cnt == 0);
ubifs_assert(c, c->dirty_nn_cnt == 0);
+ mutex_unlock(&c->lp_mutex);
return 1;
}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 3b13c648d490..551148de66cd 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -205,7 +205,6 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
err = fscrypt_prepare_lookup(dir, dentry, &nm);
- generic_set_encrypted_ci_d_ops(dentry);
if (err == -ENOENT)
return d_splice_alias(NULL, dentry);
if (err)
@@ -1234,6 +1233,8 @@ out_cancel:
dir_ui->ui_size = dir->i_size;
mutex_unlock(&dir_ui->ui_mutex);
out_inode:
+ /* Free inode->i_link before inode is marked as bad. */
+ fscrypt_free_inode(inode);
make_bad_inode(inode);
iput(inode);
out_fname:
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2d2b39f843ce..5029eb3390a5 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -318,8 +318,9 @@ static int write_begin_slow(struct address_space *mapping,
* This is a helper function for 'ubifs_write_begin()' which allocates budget
* for the operation. The budget is allocated differently depending on whether
* this is appending, whether the page is dirty or not, and so on. This
- * function leaves the @ui->ui_mutex locked in case of appending. Returns zero
- * in case of success and %-ENOSPC in case of failure.
+ * function leaves the @ui->ui_mutex locked in case of appending.
+ *
+ * Returns: %0 in case of success and %-ENOSPC in case of failure.
*/
static int allocate_budget(struct ubifs_info *c, struct page *page,
struct ubifs_inode *ui, int appending)
@@ -600,7 +601,7 @@ out:
* @bu: bulk-read information
* @n: next zbranch slot
*
- * This function returns %0 on success and a negative error code on failure.
+ * Returns: %0 on success and a negative error code on failure.
*/
static int populate_page(struct ubifs_info *c, struct page *page,
struct bu_info *bu, int *n)
@@ -711,7 +712,7 @@ out_err:
* @bu: bulk-read information
* @page1: first page to read
*
- * This function returns %1 if the bulk-read is done, otherwise %0 is returned.
+ * Returns: %1 if the bulk-read is done, otherwise %0 is returned.
*/
static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
struct page *page1)
@@ -821,7 +822,9 @@ out_bu_off:
* Some flash media are capable of reading sequentially at faster rates. UBIFS
* bulk-read facility is designed to take advantage of that, by reading in one
* go consecutive data nodes that are also located consecutively in the same
- * LEB. This function returns %1 if a bulk-read is done and %0 otherwise.
+ * LEB.
+ *
+ * Returns: %1 if a bulk-read is done and %0 otherwise.
*/
static int ubifs_bulk_read(struct page *page)
{
@@ -1109,7 +1112,9 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr)
* @attr: inode attribute changes description
*
* This function implements VFS '->setattr()' call when the inode is truncated
- * to a smaller size. Returns zero in case of success and a negative error code
+ * to a smaller size.
+ *
+ * Returns: %0 in case of success and a negative error code
* in case of failure.
*/
static int do_truncation(struct ubifs_info *c, struct inode *inode,
@@ -1215,7 +1220,9 @@ out_budg:
* @attr: inode attribute changes description
*
* This function implements VFS '->setattr()' call for all cases except
- * truncations to smaller size. Returns zero in case of success and a negative
+ * truncations to smaller size.
+ *
+ * Returns: %0 in case of success and a negative
* error code in case of failure.
*/
static int do_setattr(struct ubifs_info *c, struct inode *inode,
@@ -1360,6 +1367,8 @@ out:
* This helper function checks if the inode mtime/ctime should be updated or
* not. If current values of the time-stamps are within the UBIFS inode time
* granularity, they are not updated. This is an optimization.
+ *
+ * Returns: %1 if time update is needed, %0 if not
*/
static inline int mctime_update_needed(const struct inode *inode,
const struct timespec64 *now)
@@ -1375,11 +1384,12 @@ static inline int mctime_update_needed(const struct inode *inode,
/**
* ubifs_update_time - update time of inode.
* @inode: inode to update
- * @time: timespec structure to hold the current time value
* @flags: time updating control flag determines updating
* which time fields of @inode
*
* This function updates time of the inode.
+ *
+ * Returns: %0 for success or a negative error code otherwise.
*/
int ubifs_update_time(struct inode *inode, int flags)
{
@@ -1413,7 +1423,9 @@ int ubifs_update_time(struct inode *inode, int flags)
* @inode: inode to update
*
* This function updates mtime and ctime of the inode if it is not equivalent to
- * current time. Returns zero in case of success and a negative error code in
+ * current time.
+ *
+ * Returns: %0 in case of success and a negative error code in
* case of failure.
*/
static int update_mctime(struct inode *inode)
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index c59d47fe7939..17da28d6247a 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -365,6 +365,7 @@ static void destroy_replay_list(struct ubifs_info *c)
* @lnum: node logical eraseblock number
* @offs: node offset
* @len: node length
+ * @hash: node hash
* @key: node key
* @sqnum: sequence number
* @deletion: non-zero if this is a deletion
@@ -417,6 +418,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
* @lnum: node logical eraseblock number
* @offs: node offset
* @len: node length
+ * @hash: node hash
* @key: node key
* @name: directory entry name
* @nlen: directory entry name length
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 09e270d6ed02..d2881041b393 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2239,13 +2239,14 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
goto out_umount;
}
+ generic_set_sb_d_ops(sb);
sb->s_root = d_make_root(root);
if (!sb->s_root) {
err = -ENOMEM;
goto out_umount;
}
- import_uuid(&sb->s_uuid, c->uuid);
+ super_set_uuid(sb, c->uuid, sizeof(c->uuid));
mutex_unlock(&c->umount_mutex);
return 0;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 4fcefe5ef7cb..959551ff9a95 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1032,7 +1032,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new,
{
int fd;
- fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
+ fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new,
O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
if (fd < 0)
return fd;
@@ -2260,7 +2260,8 @@ static int new_userfaultfd(int flags)
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);
- fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
+ /* Create a new inode so that the LSM can block the creation. */
+ fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
if (fd < 0) {
mmdrop(ctx->mm);
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 9976a00a73f9..e965a48e7db9 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -421,10 +421,10 @@ xfs_attr_complete_op(
bool do_replace = args->op_flags & XFS_DA_OP_REPLACE;
args->op_flags &= ~XFS_DA_OP_REPLACE;
- if (do_replace) {
- args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ if (do_replace)
return replace_state;
- }
+
return XFS_DAS_DONE;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 98aaca933bdd..f362345467fa 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3277,7 +3277,7 @@ xfs_bmap_alloc_account(
struct xfs_bmalloca *ap)
{
bool isrt = XFS_IS_REALTIME_INODE(ap->ip) &&
- (ap->flags & XFS_BMAPI_ATTRFORK);
+ !(ap->flags & XFS_BMAPI_ATTRFORK);
uint fld;
if (ap->flags & XFS_BMAPI_COWFORK) {
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 31100120b2c5..e31663cb7b43 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1119,20 +1119,6 @@ xfs_rtbitmap_blockcount(
}
/*
- * Compute the maximum level number of the realtime summary file, as defined by
- * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct
- * use of rt volumes with more than 2^32 extents.
- */
-uint8_t
-xfs_compute_rextslog(
- xfs_rtbxlen_t rtextents)
-{
- if (!rtextents)
- return 0;
- return xfs_highbit64(rtextents);
-}
-
-/*
* Compute the number of rtbitmap words needed to populate every block of a
* bitmap that is large enough to track the given number of rt extents.
*/
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index 274dc7dae1fa..152a66750af5 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -351,20 +351,6 @@ xfs_rtfree_extent(
int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
xfs_filblks_t rtlen);
-uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
-
-/* Do we support an rt volume having this number of rtextents? */
-static inline bool
-xfs_validate_rtextents(
- xfs_rtbxlen_t rtextents)
-{
- /* No runt rt volumes */
- if (rtextents == 0)
- return false;
-
- return true;
-}
-
xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t
rtextents);
unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp,
@@ -383,8 +369,6 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp,
# define xfs_rtsummary_read_buf(a,b) (-ENOSYS)
# define xfs_rtbuf_cache_relse(a) (0)
# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS)
-# define xfs_compute_rextslog(rtx) (0)
-# define xfs_validate_rtextents(rtx) (false)
static inline xfs_filblks_t
xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
{
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 4a9e8588f4c9..5bb6e2bd6dee 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1377,3 +1377,17 @@ xfs_validate_stripe_geometry(
}
return true;
}
+
+/*
+ * Compute the maximum level number of the realtime summary file, as defined by
+ * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct
+ * use of rt volumes with more than 2^32 extents.
+ */
+uint8_t
+xfs_compute_rextslog(
+ xfs_rtbxlen_t rtextents)
+{
+ if (!rtextents)
+ return 0;
+ return xfs_highbit64(rtextents);
+}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 19134b23c10b..2e8e8d63d4eb 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -38,4 +38,6 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp,
extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
__s64 sunit, __s64 swidth, int sectorsize, bool silent);
+uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
+
#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 20b5375f2d9c..62e02d5380ad 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -251,4 +251,16 @@ bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off);
bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off,
xfs_fileoff_t len);
+/* Do we support an rt volume having this number of rtextents? */
+static inline bool
+xfs_validate_rtextents(
+ xfs_rtbxlen_t rtextents)
+{
+ /* No runt rt volumes */
+ if (rtextents == 0)
+ return false;
+
+ return true;
+}
+
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 441ca9977652..46583517377f 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -15,6 +15,7 @@
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_bit.h"
+#include "xfs_sb.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index fabd0ed9dfa6..b1ff4f33324a 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -16,6 +16,7 @@
#include "xfs_rtbitmap.h"
#include "xfs_bit.h"
#include "xfs_bmap.h"
+#include "xfs_sb.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 813f85156b0c..1698507d1ac7 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -112,7 +112,7 @@ xfs_end_ioend(
* longer dirty. If we don't remove delalloc blocks here, they become
* stale and can corrupt free space accounting on unmount.
*/
- error = blk_status_to_errno(ioend->io_bio->bi_status);
+ error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
if (ioend->io_flags & IOMAP_F_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
@@ -179,7 +179,7 @@ STATIC void
xfs_end_bio(
struct bio *bio)
{
- struct iomap_ioend *ioend = bio->bi_private;
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
struct xfs_inode *ip = XFS_I(ioend->io_inode);
unsigned long flags;
@@ -276,7 +276,8 @@ static int
xfs_map_blocks(
struct iomap_writepage_ctx *wpc,
struct inode *inode,
- loff_t offset)
+ loff_t offset,
+ unsigned int len)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -444,7 +445,7 @@ xfs_prepare_ioend(
/* send ioends that might require a transaction to the completion wq */
if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
(ioend->io_flags & IOMAP_F_SHARED))
- ioend->io_bio->bi_end_io = xfs_end_bio;
+ ioend->io_bio.bi_end_io = xfs_end_bio;
return status;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8e5bd50d29fe..01b41fabbe3c 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1951,7 +1951,7 @@ xfs_free_buftarg(
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
/* the main block device is closed by kill_block_super */
if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
- bdev_release(btp->bt_bdev_handle);
+ fput(btp->bt_bdev_file);
kmem_free(btp);
}
@@ -1994,7 +1994,7 @@ xfs_setsize_buftarg_early(
struct xfs_buftarg *
xfs_alloc_buftarg(
struct xfs_mount *mp,
- struct bdev_handle *bdev_handle)
+ struct file *bdev_file)
{
xfs_buftarg_t *btp;
const struct dax_holder_operations *ops = NULL;
@@ -2005,9 +2005,9 @@ xfs_alloc_buftarg(
btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
btp->bt_mount = mp;
- btp->bt_bdev_handle = bdev_handle;
- btp->bt_dev = bdev_handle->bdev->bd_dev;
- btp->bt_bdev = bdev_handle->bdev;
+ btp->bt_bdev_file = bdev_file;
+ btp->bt_bdev = file_bdev(bdev_file);
+ btp->bt_dev = btp->bt_bdev->bd_dev;
btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b470de08a46c..304e858d04fb 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -98,7 +98,7 @@ typedef unsigned int xfs_buf_flags_t;
*/
typedef struct xfs_buftarg {
dev_t bt_dev;
- struct bdev_handle *bt_bdev_handle;
+ struct file *bt_bdev_file;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
u64 bt_dax_part_off;
@@ -366,7 +366,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
* Handling of buftargs.
*/
struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
- struct bdev_handle *bdev_handle);
+ struct file *bdev_file);
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index aabb25dc3efa..57fa21ad7912 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -62,7 +62,7 @@ xfs_uuid_mount(
int hole, i;
/* Publish UUID in struct super_block */
- uuid_copy(&mp->m_super->s_uuid, uuid);
+ super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid));
if (xfs_has_nouuid(mp))
return 0;
@@ -706,6 +706,8 @@ xfs_mountfs(
/* enable fail_at_unmount as default */
mp->m_fail_unmount = true;
+ super_set_sysfs_name_id(mp->m_super);
+
error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
NULL, mp->m_super->s_id);
if (error)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index aff20ddd4a9f..00fbd5b6e582 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -350,7 +350,6 @@ xfs_setup_dax_always(
return -EINVAL;
}
- xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
return 0;
disable_dax:
@@ -362,16 +361,16 @@ STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
const char *name,
- struct bdev_handle **handlep)
+ struct file **bdev_filep)
{
int error = 0;
- *handlep = bdev_open_by_path(name,
+ *bdev_filep = bdev_file_open_by_path(name,
BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
mp->m_super, &fs_holder_ops);
- if (IS_ERR(*handlep)) {
- error = PTR_ERR(*handlep);
- *handlep = NULL;
+ if (IS_ERR(*bdev_filep)) {
+ error = PTR_ERR(*bdev_filep);
+ *bdev_filep = NULL;
xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
}
@@ -436,26 +435,26 @@ xfs_open_devices(
{
struct super_block *sb = mp->m_super;
struct block_device *ddev = sb->s_bdev;
- struct bdev_handle *logdev_handle = NULL, *rtdev_handle = NULL;
+ struct file *logdev_file = NULL, *rtdev_file = NULL;
int error;
/*
* Open real time and log devices - order is important.
*/
if (mp->m_logname) {
- error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle);
+ error = xfs_blkdev_get(mp, mp->m_logname, &logdev_file);
if (error)
return error;
}
if (mp->m_rtname) {
- error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle);
+ error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_file);
if (error)
goto out_close_logdev;
- if (rtdev_handle->bdev == ddev ||
- (logdev_handle &&
- rtdev_handle->bdev == logdev_handle->bdev)) {
+ if (file_bdev(rtdev_file) == ddev ||
+ (logdev_file &&
+ file_bdev(rtdev_file) == file_bdev(logdev_file))) {
xfs_warn(mp,
"Cannot mount filesystem with identical rtdev and ddev/logdev.");
error = -EINVAL;
@@ -467,25 +466,25 @@ xfs_open_devices(
* Setup xfs_mount buffer target pointers
*/
error = -ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle);
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
- if (rtdev_handle) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle);
+ if (rtdev_file) {
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}
- if (logdev_handle && logdev_handle->bdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle);
+ if (logdev_file && file_bdev(logdev_file) != ddev) {
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
mp->m_logdev_targp = mp->m_ddev_targp;
/* Handle won't be used, drop it */
- if (logdev_handle)
- bdev_release(logdev_handle);
+ if (logdev_file)
+ fput(logdev_file);
}
return 0;
@@ -496,11 +495,11 @@ xfs_open_devices(
out_free_ddev_targ:
xfs_free_buftarg(mp->m_ddev_targp);
out_close_rtdev:
- if (rtdev_handle)
- bdev_release(rtdev_handle);
+ if (rtdev_file)
+ fput(rtdev_file);
out_close_logdev:
- if (logdev_handle)
- bdev_release(logdev_handle);
+ if (logdev_file)
+ fput(logdev_file);
return error;
}
@@ -1496,6 +1495,18 @@ xfs_fs_fill_super(
mp->m_super = sb;
+ /*
+ * Copy VFS mount flags from the context now that all parameter parsing
+ * is guaranteed to have been completed by either the old mount API or
+ * the newer fsopen/fsconfig API.
+ */
+ if (fc->sb_flags & SB_RDONLY)
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+ if (fc->sb_flags & SB_DIRSYNC)
+ mp->m_features |= XFS_FEAT_DIRSYNC;
+ if (fc->sb_flags & SB_SYNCHRONOUS)
+ mp->m_features |= XFS_FEAT_WSYNC;
+
error = xfs_fs_validate_params(mp);
if (error)
return error;
@@ -1965,6 +1976,11 @@ static const struct fs_context_operations xfs_context_ops = {
.free = xfs_fs_free,
};
+/*
+ * WARNING: do not initialise any parameters in this function that depend on
+ * mount option parsing having already been performed as this can be called from
+ * fsopen() before any parameters have been set.
+ */
static int xfs_init_fs_context(
struct fs_context *fc)
{
@@ -1996,16 +2012,6 @@ static int xfs_init_fs_context(
mp->m_logbsize = -1;
mp->m_allocsize_log = 16; /* 64k */
- /*
- * Copy binary VFS mount flags we are interested in.
- */
- if (fc->sb_flags & SB_RDONLY)
- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
- if (fc->sb_flags & SB_DIRSYNC)
- mp->m_features |= XFS_FEAT_DIRSYNC;
- if (fc->sb_flags & SB_SYNCHRONOUS)
- mp->m_features |= XFS_FEAT_WSYNC;
-
fc->s_fs_info = mp;
fc->ops = &xfs_context_ops;
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 6ab2318a9c8e..3b103715acc9 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -125,7 +125,8 @@ static void zonefs_readahead(struct readahead_control *rac)
* which implies that the page range can only be within the fixed inode size.
*/
static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset)
+ struct inode *inode, loff_t offset,
+ unsigned int len)
{
struct zonefs_zone *z = zonefs_inode_zone(inode);
@@ -348,7 +349,12 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
struct zonefs_inode_info *zi = ZONEFS_I(inode);
if (error) {
- zonefs_io_error(inode, true);
+ /*
+ * For Sync IOs, error recovery is called from
+ * zonefs_file_dio_write().
+ */
+ if (!is_sync_kiocb(iocb))
+ zonefs_io_error(inode, true);
return error;
}
@@ -491,6 +497,14 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ret = -EINVAL;
goto inode_unlock;
}
+ /*
+ * Advance the zone write pointer offset. This assumes that the
+ * IO will succeed, which is OK to do because we do not allow
+ * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
+ * fails, the error path will correct the write pointer offset.
+ */
+ z->z_wpoffset += count;
+ zonefs_inode_account_active(inode);
mutex_unlock(&zi->i_truncate_mutex);
}
@@ -504,20 +518,19 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
if (ret == -ENOTBLK)
ret = -EBUSY;
- if (zonefs_zone_is_seq(z) &&
- (ret > 0 || ret == -EIOCBQUEUED)) {
- if (ret > 0)
- count = ret;
-
- /*
- * Update the zone write pointer offset assuming the write
- * operation succeeded. If it did not, the error recovery path
- * will correct it. Also do active seq file accounting.
- */
- mutex_lock(&zi->i_truncate_mutex);
- z->z_wpoffset += count;
- zonefs_inode_account_active(inode);
- mutex_unlock(&zi->i_truncate_mutex);
+ /*
+ * For a failed IO or partial completion, trigger error recovery
+ * to update the zone write pointer offset to a correct value.
+ * For asynchronous IOs, zonefs_file_write_dio_end_io() may already
+ * have executed error recovery if the IO already completed when we
+ * reach here. However, we cannot know that and execute error recovery
+ * again (that will not change anything).
+ */
+ if (zonefs_zone_is_seq(z)) {
+ if (ret > 0 && ret != count)
+ ret = -EIO;
+ if (ret < 0 && ret != -EIOCBQUEUED)
+ zonefs_io_error(inode, true);
}
inode_unlock:
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 93971742613a..aadad16738df 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -113,7 +113,7 @@ static int zonefs_zone_mgmt(struct super_block *sb,
trace_zonefs_zone_mgmt(sb, z, op);
ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector,
- z->z_size >> SECTOR_SHIFT, GFP_NOFS);
+ z->z_size >> SECTOR_SHIFT);
if (ret) {
zonefs_err(sb,
"Zone management operation %s at %llu failed %d\n",
@@ -246,16 +246,18 @@ static void zonefs_inode_update_mode(struct inode *inode)
z->z_mode = inode->i_mode;
}
-struct zonefs_ioerr_data {
- struct inode *inode;
- bool write;
-};
-
static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
void *data)
{
- struct zonefs_ioerr_data *err = data;
- struct inode *inode = err->inode;
+ struct blk_zone *z = data;
+
+ *z = *zone;
+ return 0;
+}
+
+static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone,
+ bool write)
+{
struct zonefs_zone *z = zonefs_inode_zone(inode);
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
@@ -270,8 +272,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
data_size = zonefs_check_zone_condition(sb, z, zone);
isize = i_size_read(inode);
if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) &&
- !err->write && isize == data_size)
- return 0;
+ !write && isize == data_size)
+ return;
/*
* At this point, we detected either a bad zone or an inconsistency
@@ -292,7 +294,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* In all cases, warn about inode size inconsistency and handle the
* IO error according to the zone condition and to the mount options.
*/
- if (zonefs_zone_is_seq(z) && isize != data_size)
+ if (isize != data_size)
zonefs_warn(sb,
"inode %lu: invalid size %lld (should be %lld)\n",
inode->i_ino, isize, data_size);
@@ -352,8 +354,6 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
zonefs_i_size_write(inode, data_size);
z->z_wpoffset = data_size;
zonefs_inode_account_active(inode);
-
- return 0;
}
/*
@@ -367,23 +367,25 @@ void __zonefs_io_error(struct inode *inode, bool write)
{
struct zonefs_zone *z = zonefs_inode_zone(inode);
struct super_block *sb = inode->i_sb;
- struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
unsigned int noio_flag;
- unsigned int nr_zones = 1;
- struct zonefs_ioerr_data err = {
- .inode = inode,
- .write = write,
- };
+ struct blk_zone zone;
int ret;
/*
- * The only files that have more than one zone are conventional zone
- * files with aggregated conventional zones, for which the inode zone
- * size is always larger than the device zone size.
+ * Conventional zone have no write pointer and cannot become read-only
+ * or offline. So simply fake a report for a single or aggregated zone
+ * and let zonefs_handle_io_error() correct the zone inode information
+ * according to the mount options.
*/
- if (z->z_size > bdev_zone_sectors(sb->s_bdev))
- nr_zones = z->z_size >>
- (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+ if (!zonefs_zone_is_seq(z)) {
+ zone.start = z->z_sector;
+ zone.len = z->z_size >> SECTOR_SHIFT;
+ zone.wp = zone.start + zone.len;
+ zone.type = BLK_ZONE_TYPE_CONVENTIONAL;
+ zone.cond = BLK_ZONE_COND_NOT_WP;
+ zone.capacity = zone.len;
+ goto handle_io_error;
+ }
/*
* Memory allocations in blkdev_report_zones() can trigger a memory
@@ -394,12 +396,20 @@ void __zonefs_io_error(struct inode *inode, bool write)
* the GFP_NOIO context avoids both problems.
*/
noio_flag = memalloc_noio_save();
- ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones,
- zonefs_io_error_cb, &err);
- if (ret != nr_zones)
+ ret = blkdev_report_zones(sb->s_bdev, z->z_sector, 1,
+ zonefs_io_error_cb, &zone);
+ memalloc_noio_restore(noio_flag);
+
+ if (ret != 1) {
zonefs_err(sb, "Get inode %lu zone information failed %d\n",
inode->i_ino, ret);
- memalloc_noio_restore(noio_flag);
+ zonefs_warn(sb, "remounting filesystem read-only\n");
+ sb->s_flags |= SB_RDONLY;
+ return;
+ }
+
+handle_io_error:
+ zonefs_handle_io_error(inode, &zone, write);
}
static struct kmem_cache *zonefs_inode_cachep;