summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/aio.c4
-rw-r--r--fs/backing-file.c23
-rw-r--r--fs/bcachefs/acl.c41
-rw-r--r--fs/bcachefs/alloc_background.c337
-rw-r--r--fs/bcachefs/alloc_background.h109
-rw-r--r--fs/bcachefs/alloc_foreground.c304
-rw-r--r--fs/bcachefs/alloc_foreground.h15
-rw-r--r--fs/bcachefs/alloc_types.h7
-rw-r--r--fs/bcachefs/backpointers.c158
-rw-r--r--fs/bcachefs/backpointers.h43
-rw-r--r--fs/bcachefs/bcachefs.h34
-rw-r--r--fs/bcachefs/bcachefs_format.h10
-rw-r--r--fs/bcachefs/bkey.c15
-rw-r--r--fs/bcachefs/bkey.h33
-rw-r--r--fs/bcachefs/bkey_methods.c22
-rw-r--r--fs/bcachefs/bkey_methods.h73
-rw-r--r--fs/bcachefs/bkey_sort.c79
-rw-r--r--fs/bcachefs/bkey_sort.h4
-rw-r--r--fs/bcachefs/bset.c29
-rw-r--r--fs/bcachefs/bset.h6
-rw-r--r--fs/bcachefs/btree_cache.c149
-rw-r--r--fs/bcachefs/btree_cache.h5
-rw-r--r--fs/bcachefs/btree_gc.c1032
-rw-r--r--fs/bcachefs/btree_gc.h44
-rw-r--r--fs/bcachefs/btree_io.c117
-rw-r--r--fs/bcachefs/btree_io.h2
-rw-r--r--fs/bcachefs/btree_iter.c347
-rw-r--r--fs/bcachefs/btree_iter.h95
-rw-r--r--fs/bcachefs/btree_journal_iter.c17
-rw-r--r--fs/bcachefs/btree_journal_iter.h2
-rw-r--r--fs/bcachefs/btree_key_cache.c107
-rw-r--r--fs/bcachefs/btree_key_cache_types.h8
-rw-r--r--fs/bcachefs/btree_locking.c179
-rw-r--r--fs/bcachefs/btree_locking.h4
-rw-r--r--fs/bcachefs/btree_trans_commit.c70
-rw-r--r--fs/bcachefs/btree_types.h127
-rw-r--r--fs/bcachefs/btree_update.c95
-rw-r--r--fs/bcachefs/btree_update.h14
-rw-r--r--fs/bcachefs/btree_update_interior.c95
-rw-r--r--fs/bcachefs/btree_update_interior.h7
-rw-r--r--fs/bcachefs/btree_write_buffer.c8
-rw-r--r--fs/bcachefs/buckets.c693
-rw-r--r--fs/bcachefs/buckets.h70
-rw-r--r--fs/bcachefs/chardev.c66
-rw-r--r--fs/bcachefs/checksum.c17
-rw-r--r--fs/bcachefs/data_update.c54
-rw-r--r--fs/bcachefs/debug.c80
-rw-r--r--fs/bcachefs/dirent.c97
-rw-r--r--fs/bcachefs/dirent.h8
-rw-r--r--fs/bcachefs/disk_groups.c11
-rw-r--r--fs/bcachefs/ec.c369
-rw-r--r--fs/bcachefs/ec.h7
-rw-r--r--fs/bcachefs/error.c59
-rw-r--r--fs/bcachefs/extent_update.c2
-rw-r--r--fs/bcachefs/extents.c151
-rw-r--r--fs/bcachefs/extents.h12
-rw-r--r--fs/bcachefs/eytzinger.c105
-rw-r--r--fs/bcachefs/fs-common.c38
-rw-r--r--fs/bcachefs/fs-io-buffered.c14
-rw-r--r--fs/bcachefs/fs-io-direct.c2
-rw-r--r--fs/bcachefs/fs-io-pagecache.c2
-rw-r--r--fs/bcachefs/fs-io.c9
-rw-r--r--fs/bcachefs/fs-ioctl.c2
-rw-r--r--fs/bcachefs/fs.c109
-rw-r--r--fs/bcachefs/fsck.c212
-rw-r--r--fs/bcachefs/inode.c64
-rw-r--r--fs/bcachefs/inode.h23
-rw-r--r--fs/bcachefs/io_misc.c10
-rw-r--r--fs/bcachefs/io_read.c68
-rw-r--r--fs/bcachefs/io_write.c95
-rw-r--r--fs/bcachefs/io_write_types.h1
-rw-r--r--fs/bcachefs/journal.c131
-rw-r--r--fs/bcachefs/journal.h6
-rw-r--r--fs/bcachefs/journal_io.c163
-rw-r--r--fs/bcachefs/journal_io.h5
-rw-r--r--fs/bcachefs/journal_reclaim.c10
-rw-r--r--fs/bcachefs/journal_sb.c10
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c77
-rw-r--r--fs/bcachefs/journal_seq_blacklist.h2
-rw-r--r--fs/bcachefs/journal_types.h17
-rw-r--r--fs/bcachefs/logged_ops.c2
-rw-r--r--fs/bcachefs/lru.c4
-rw-r--r--fs/bcachefs/lru.h2
-rw-r--r--fs/bcachefs/migrate.c8
-rw-r--r--fs/bcachefs/move.c82
-rw-r--r--fs/bcachefs/movinggc.c4
-rw-r--r--fs/bcachefs/opts.h7
-rw-r--r--fs/bcachefs/printbuf.c232
-rw-r--r--fs/bcachefs/printbuf.h53
-rw-r--r--fs/bcachefs/quota.c123
-rw-r--r--fs/bcachefs/quota.h4
-rw-r--r--fs/bcachefs/rebalance.c10
-rw-r--r--fs/bcachefs/recovery.c142
-rw-r--r--fs/bcachefs/recovery_passes.c8
-rw-r--r--fs/bcachefs/reflink.c72
-rw-r--r--fs/bcachefs/reflink.h16
-rw-r--r--fs/bcachefs/replicas.c20
-rw-r--r--fs/bcachefs/sb-clean.c15
-rw-r--r--fs/bcachefs/sb-counters.c20
-rw-r--r--fs/bcachefs/sb-downgrade.c25
-rw-r--r--fs/bcachefs/sb-errors.c2
-rw-r--r--fs/bcachefs/sb-errors_types.h3
-rw-r--r--fs/bcachefs/sb-members.c149
-rw-r--r--fs/bcachefs/sb-members.h165
-rw-r--r--fs/bcachefs/sb-members_types.h21
-rw-r--r--fs/bcachefs/snapshot.c53
-rw-r--r--fs/bcachefs/snapshot.h16
-rw-r--r--fs/bcachefs/str_hash.h70
-rw-r--r--fs/bcachefs/subvolume.c22
-rw-r--r--fs/bcachefs/subvolume.h7
-rw-r--r--fs/bcachefs/super-io.c117
-rw-r--r--fs/bcachefs/super-io.h3
-rw-r--r--fs/bcachefs/super.c112
-rw-r--r--fs/bcachefs/super_types.h15
-rw-r--r--fs/bcachefs/sysfs.c178
-rw-r--r--fs/bcachefs/tests.c16
-rw-r--r--fs/bcachefs/trace.h103
-rw-r--r--fs/bcachefs/util.c61
-rw-r--r--fs/bcachefs/util.h5
-rw-r--r--fs/bcachefs/xattr.c47
-rw-r--r--fs/bcachefs/xattr.h2
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/btrfs/dev-replace.c2
-rw-r--r--fs/btrfs/disk-io.c6
-rw-r--r--fs/btrfs/volumes.c15
-rw-r--r--fs/btrfs/zoned.c2
-rw-r--r--fs/buffer.c191
-rw-r--r--fs/cachefiles/namei.c3
-rw-r--r--fs/coredump.c4
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/crypto/inline_crypt.c6
-rw-r--r--fs/dax.c14
-rw-r--r--fs/dcache.c15
-rw-r--r--fs/erofs/data.c12
-rw-r--r--fs/erofs/dir.c4
-rw-r--r--fs/erofs/internal.h4
-rw-r--r--fs/erofs/namei.c6
-rw-r--r--fs/erofs/super.c8
-rw-r--r--fs/erofs/xattr.c37
-rw-r--r--fs/erofs/zdata.c6
-rw-r--r--fs/exec.c11
-rw-r--r--fs/ext2/Kconfig1
-rw-r--r--fs/ext2/dir.c1
-rw-r--r--fs/ext2/file.c8
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext4/acl.h5
-rw-r--r--fs/ext4/dir.c2
-rw-r--r--fs/ext4/ext4.h9
-rw-r--r--fs/ext4/ext4_jbd2.c2
-rw-r--r--fs/ext4/extents.c3
-rw-r--r--fs/ext4/file.c5
-rw-r--r--fs/ext4/inode.c11
-rw-r--r--fs/ext4/ioctl.c3
-rw-r--r--fs/ext4/mballoc-test.c76
-rw-r--r--fs/ext4/mballoc.c322
-rw-r--r--fs/ext4/mballoc.h14
-rw-r--r--fs/ext4/move_extent.c4
-rw-r--r--fs/ext4/namei.c2
-rw-r--r--fs/ext4/page-io.c3
-rw-r--r--fs/ext4/readpage.c1
-rw-r--r--fs/ext4/super.c62
-rw-r--r--fs/ext4/sysfs.c174
-rw-r--r--fs/ext4/xattr.c145
-rw-r--r--fs/f2fs/checkpoint.c13
-rw-r--r--fs/f2fs/compress.c96
-rw-r--r--fs/f2fs/data.c236
-rw-r--r--fs/f2fs/f2fs.h57
-rw-r--r--fs/f2fs/file.c256
-rw-r--r--fs/f2fs/gc.c11
-rw-r--r--fs/f2fs/gc.h1
-rw-r--r--fs/f2fs/inline.c36
-rw-r--r--fs/f2fs/inode.c22
-rw-r--r--fs/f2fs/node.c20
-rw-r--r--fs/f2fs/recovery.c3
-rw-r--r--fs/f2fs/segment.c132
-rw-r--r--fs/f2fs/super.c80
-rw-r--r--fs/f2fs/sysfs.c21
-rw-r--r--fs/fat/dir.c12
-rw-r--r--fs/file.c19
-rw-r--r--fs/fuse/dev.c3
-rw-r--r--fs/fuse/file.c10
-rw-r--r--fs/fuse/ioctl.c60
-rw-r--r--fs/fuse/virtio_fs.c74
-rw-r--r--fs/gfs2/glock.c2
-rw-r--r--fs/gfs2/ops_fstype.c2
-rw-r--r--fs/hugetlbfs/inode.c11
-rw-r--r--fs/internal.h3
-rw-r--r--fs/ioctl.c3
-rw-r--r--fs/iomap/Makefile2
-rw-r--r--fs/isofs/Makefile7
-rw-r--r--fs/isofs/compress.c4
-rw-r--r--fs/isofs/inode.c473
-rw-r--r--fs/jbd2/checkpoint.c24
-rw-r--r--fs/jbd2/commit.c3
-rw-r--r--fs/jbd2/journal.c2
-rw-r--r--fs/kernfs/mount.c2
-rw-r--r--fs/lockd/host.c1
-rw-r--r--fs/namei.c6
-rw-r--r--fs/nfs/Kconfig4
-rw-r--r--fs/nfs/dir.c54
-rw-r--r--fs/nfs/filelayout/filelayout.c24
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c12
-rw-r--r--fs/nfs/fs_context.c11
-rw-r--r--fs/nfs/internal.h11
-rw-r--r--fs/nfs/iostat.h5
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs4proc.c2
-rw-r--r--fs/nfs/nfs4state.c12
-rw-r--r--fs/nfs/nfs4trace.h42
-rw-r--r--fs/nfs/nfstrace.h41
-rw-r--r--fs/nfs/pnfs.c29
-rw-r--r--fs/nfs/pnfs.h3
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/super.c10
-rw-r--r--fs/nfsd/export.c16
-rw-r--r--fs/nfsd/filecache.c4
-rw-r--r--fs/nfsd/netlink.c66
-rw-r--r--fs/nfsd/netlink.h10
-rw-r--r--fs/nfsd/netns.h1
-rw-r--r--fs/nfsd/nfs4callback.c31
-rw-r--r--fs/nfsd/nfs4proc.c79
-rw-r--r--fs/nfsd/nfs4state.c188
-rw-r--r--fs/nfsd/nfs4xdr.c83
-rw-r--r--fs/nfsd/nfsctl.c526
-rw-r--r--fs/nfsd/nfsd.h3
-rw-r--r--fs/nfsd/nfsfh.c4
-rw-r--r--fs/nfsd/nfssvc.c11
-rw-r--r--fs/nfsd/state.h6
-rw-r--r--fs/nfsd/stats.c42
-rw-r--r--fs/nfsd/stats.h5
-rw-r--r--fs/nfsd/trace.h140
-rw-r--r--fs/nfsd/vfs.c2
-rw-r--r--fs/nfsd/vfs.h8
-rw-r--r--fs/nfsd/xdr4.h24
-rw-r--r--fs/nilfs2/btree.c23
-rw-r--r--fs/nilfs2/dir.c1
-rw-r--r--fs/nilfs2/gcinode.c1
-rw-r--r--fs/nilfs2/nilfs.h4
-rw-r--r--fs/nilfs2/recovery.c9
-rw-r--r--fs/nilfs2/segment.c10
-rw-r--r--fs/nilfs2/super.c388
-rw-r--r--fs/nilfs2/the_nilfs.c25
-rw-r--r--fs/nilfs2/the_nilfs.h6
-rw-r--r--fs/notify/dnotify/dnotify.c4
-rw-r--r--fs/notify/fanotify/fanotify_user.c141
-rw-r--r--fs/notify/fdinfo.c20
-rw-r--r--fs/notify/fsnotify.c27
-rw-r--r--fs/notify/fsnotify.h39
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/notify/mark.c174
-rw-r--r--fs/ocfs2/aops.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c5
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c12
-rw-r--r--fs/ocfs2/export.c12
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/ocfs2/inode.c2
-rw-r--r--fs/ocfs2/ioctl.c1
-rw-r--r--fs/ocfs2/localalloc.c34
-rw-r--r--fs/ocfs2/move_extents.c2
-rw-r--r--fs/ocfs2/namei.c4
-rw-r--r--fs/ocfs2/ocfs2_fs.h3
-rw-r--r--fs/ocfs2/ocfs2_trace.h60
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/ocfs2/reservations.c2
-rw-r--r--fs/ocfs2/resize.c8
-rw-r--r--fs/ocfs2/suballoc.c117
-rw-r--r--fs/ocfs2/suballoc.h6
-rw-r--r--fs/open.c11
-rw-r--r--fs/overlayfs/dir.c152
-rw-r--r--fs/overlayfs/file.c3
-rw-r--r--fs/overlayfs/inode.c1
-rw-r--r--fs/overlayfs/overlayfs.h3
-rw-r--r--fs/overlayfs/util.c2
-rw-r--r--fs/pidfs.c28
-rw-r--r--fs/proc/fd.c4
-rw-r--r--fs/proc/inode.c10
-rw-r--r--fs/proc/meminfo.c3
-rw-r--r--fs/proc/page.c69
-rw-r--r--fs/proc/task_mmu.c93
-rw-r--r--fs/proc/vmcore.c7
-rw-r--r--fs/quota/dquot.c33
-rw-r--r--fs/ramfs/file-mmu.c2
-rw-r--r--fs/read_write.c13
-rw-r--r--fs/reiserfs/README16
-rw-r--r--fs/reiserfs/inode.c16
-rw-r--r--fs/reiserfs/journal.c5
-rw-r--r--fs/remap_range.c4
-rw-r--r--fs/smb/client/inode.c4
-rw-r--r--fs/smb/client/trace.h18
-rw-r--r--fs/splice.c4
-rw-r--r--fs/squashfs/file.c6
-rw-r--r--fs/squashfs/file_direct.c3
-rw-r--r--fs/squashfs/namei.c14
-rw-r--r--fs/squashfs/symlink.c35
-rw-r--r--fs/super.c1
-rw-r--r--fs/sysfs/file.c27
-rw-r--r--fs/udf/file.c20
-rw-r--r--fs/udf/inode.c65
-rw-r--r--fs/udf/super.c8
-rw-r--r--fs/udf/symlink.c34
-rw-r--r--fs/udf/udftime.c11
-rw-r--r--fs/unicode/Makefile14
-rw-r--r--fs/userfaultfd.c5
-rw-r--r--fs/xfs/Makefile22
-rw-r--r--fs/xfs/libxfs/xfs_ag.c12
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c24
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.h2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c4
-rw-r--r--fs/xfs/libxfs/xfs_attr.c233
-rw-r--r--fs/xfs/libxfs/xfs_attr.h46
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c154
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h4
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c102
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.h8
-rw-r--r--fs/xfs/libxfs/xfs_attr_sf.h1
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c377
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h13
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c189
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h34
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h37
-rw-r--r--fs/xfs/libxfs/xfs_defer.c12
-rw-r--r--fs/xfs/libxfs/xfs_defer.h10
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c281
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h23
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c42
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c18
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c100
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c44
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h15
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h4
-rw-r--r--fs/xfs/libxfs/xfs_exchmaps.c1235
-rw-r--r--fs/xfs/libxfs/xfs_exchmaps.h124
-rw-r--r--fs/xfs/libxfs/xfs_format.h34
-rw-r--r--fs/xfs/libxfs/xfs_fs.h158
-rw-r--r--fs/xfs/libxfs/xfs_health.h4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c56
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h5
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c4
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c8
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c57
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h6
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h89
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h4
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c46
-rw-r--r--fs/xfs/libxfs/xfs_ondisk.h6
-rw-r--r--fs/xfs/libxfs/xfs_parent.c379
-rw-r--r--fs/xfs/libxfs/xfs_parent.h110
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c57
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.h17
-rw-r--r--fs/xfs/libxfs/xfs_sb.c9
-rw-r--r--fs/xfs/libxfs/xfs_shared.h6
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c54
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.h8
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c326
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.c121
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h29
-rw-r--r--fs/xfs/scrub/agheader.c43
-rw-r--r--fs/xfs/scrub/agheader_repair.c879
-rw-r--r--fs/xfs/scrub/agino_bitmap.h49
-rw-r--r--fs/xfs/scrub/alloc_repair.c2
-rw-r--r--fs/xfs/scrub/attr.c214
-rw-r--r--fs/xfs/scrub/attr.h7
-rw-r--r--fs/xfs/scrub/attr_repair.c1663
-rw-r--r--fs/xfs/scrub/attr_repair.h15
-rw-r--r--fs/xfs/scrub/bitmap.c22
-rw-r--r--fs/xfs/scrub/common.c41
-rw-r--r--fs/xfs/scrub/common.h27
-rw-r--r--fs/xfs/scrub/dab_bitmap.h37
-rw-r--r--fs/xfs/scrub/dabtree.c24
-rw-r--r--fs/xfs/scrub/dabtree.h3
-rw-r--r--fs/xfs/scrub/dir.c377
-rw-r--r--fs/xfs/scrub/dir_repair.c1958
-rw-r--r--fs/xfs/scrub/dirtree.c985
-rw-r--r--fs/xfs/scrub/dirtree.h178
-rw-r--r--fs/xfs/scrub/dirtree_repair.c821
-rw-r--r--fs/xfs/scrub/findparent.c454
-rw-r--r--fs/xfs/scrub/findparent.h56
-rw-r--r--fs/xfs/scrub/fscounters.c14
-rw-r--r--fs/xfs/scrub/fscounters.h1
-rw-r--r--fs/xfs/scrub/fscounters_repair.c12
-rw-r--r--fs/xfs/scrub/health.c1
-rw-r--r--fs/xfs/scrub/ino_bitmap.h37
-rw-r--r--fs/xfs/scrub/inode.c19
-rw-r--r--fs/xfs/scrub/inode_repair.c153
-rw-r--r--fs/xfs/scrub/iscan.c67
-rw-r--r--fs/xfs/scrub/iscan.h16
-rw-r--r--fs/xfs/scrub/listxattr.c320
-rw-r--r--fs/xfs/scrub/listxattr.h19
-rw-r--r--fs/xfs/scrub/nlinks.c133
-rw-r--r--fs/xfs/scrub/nlinks.h7
-rw-r--r--fs/xfs/scrub/nlinks_repair.c186
-rw-r--r--fs/xfs/scrub/orphanage.c627
-rw-r--r--fs/xfs/scrub/orphanage.h86
-rw-r--r--fs/xfs/scrub/parent.c700
-rw-r--r--fs/xfs/scrub/parent_repair.c1612
-rw-r--r--fs/xfs/scrub/quota_repair.c6
-rw-r--r--fs/xfs/scrub/readdir.c140
-rw-r--r--fs/xfs/scrub/readdir.h3
-rw-r--r--fs/xfs/scrub/reap.c445
-rw-r--r--fs/xfs/scrub/reap.h21
-rw-r--r--fs/xfs/scrub/repair.c127
-rw-r--r--fs/xfs/scrub/repair.h31
-rw-r--r--fs/xfs/scrub/rmap_repair.c24
-rw-r--r--fs/xfs/scrub/rtbitmap_repair.c2
-rw-r--r--fs/xfs/scrub/rtsummary.c33
-rw-r--r--fs/xfs/scrub/rtsummary.h37
-rw-r--r--fs/xfs/scrub/rtsummary_repair.c175
-rw-r--r--fs/xfs/scrub/scrub.c310
-rw-r--r--fs/xfs/scrub/scrub.h91
-rw-r--r--fs/xfs/scrub/stats.c1
-rw-r--r--fs/xfs/scrub/symlink.c13
-rw-r--r--fs/xfs/scrub/symlink_repair.c509
-rw-r--r--fs/xfs/scrub/tempexch.h22
-rw-r--r--fs/xfs/scrub/tempfile.c851
-rw-r--r--fs/xfs/scrub/tempfile.h48
-rw-r--r--fs/xfs/scrub/trace.c6
-rw-r--r--fs/xfs/scrub/trace.h1317
-rw-r--r--fs/xfs/scrub/xfarray.c27
-rw-r--r--fs/xfs/scrub/xfarray.h6
-rw-r--r--fs/xfs/scrub/xfblob.c168
-rw-r--r--fs/xfs/scrub/xfblob.h50
-rw-r--r--fs/xfs/scrub/xfile.c14
-rw-r--r--fs/xfs/scrub/xfile.h6
-rw-r--r--fs/xfs/scrub/xfs_scrub.h6
-rw-r--r--fs/xfs/xfs_acl.c17
-rw-r--r--fs/xfs/xfs_aops.c60
-rw-r--r--fs/xfs/xfs_attr_item.c554
-rw-r--r--fs/xfs/xfs_attr_item.h10
-rw-r--r--fs/xfs/xfs_attr_list.c120
-rw-r--r--fs/xfs/xfs_bmap_item.c4
-rw-r--r--fs/xfs/xfs_bmap_util.c67
-rw-r--r--fs/xfs/xfs_bmap_util.h2
-rw-r--r--fs/xfs/xfs_buf.c5
-rw-r--r--fs/xfs/xfs_dir2_readdir.c25
-rw-r--r--fs/xfs/xfs_discard.c153
-rw-r--r--fs/xfs/xfs_dquot.c47
-rw-r--r--fs/xfs/xfs_dquot.h1
-rw-r--r--fs/xfs/xfs_error.c3
-rw-r--r--fs/xfs/xfs_exchmaps_item.c614
-rw-r--r--fs/xfs/xfs_exchmaps_item.h64
-rw-r--r--fs/xfs/xfs_exchrange.c804
-rw-r--r--fs/xfs/xfs_exchrange.h38
-rw-r--r--fs/xfs/xfs_export.c4
-rw-r--r--fs/xfs/xfs_export.h2
-rw-r--r--fs/xfs/xfs_extent_busy.c80
-rw-r--r--fs/xfs/xfs_file.c90
-rw-r--r--fs/xfs/xfs_file.h15
-rw-r--r--fs/xfs/xfs_fsmap.c4
-rw-r--r--fs/xfs/xfs_fsops.c29
-rw-r--r--fs/xfs/xfs_fsops.h2
-rw-r--r--fs/xfs/xfs_handle.c952
-rw-r--r--fs/xfs/xfs_handle.h33
-rw-r--r--fs/xfs/xfs_health.c1
-rw-r--r--fs/xfs/xfs_icache.c4
-rw-r--r--fs/xfs/xfs_inode.c496
-rw-r--r--fs/xfs/xfs_inode.h41
-rw-r--r--fs/xfs/xfs_ioctl.c625
-rw-r--r--fs/xfs/xfs_ioctl.h28
-rw-r--r--fs/xfs/xfs_ioctl32.c1
-rw-r--r--fs/xfs/xfs_iomap.c105
-rw-r--r--fs/xfs/xfs_iops.c23
-rw-r--r--fs/xfs/xfs_iops.h7
-rw-r--r--fs/xfs/xfs_itable.c8
-rw-r--r--fs/xfs/xfs_iwalk.c4
-rw-r--r--fs/xfs/xfs_linux.h5
-rw-r--r--fs/xfs/xfs_log.c28
-rw-r--r--fs/xfs/xfs_log.h2
-rw-r--r--fs/xfs/xfs_log_cil.c2
-rw-r--r--fs/xfs/xfs_log_priv.h8
-rw-r--r--fs/xfs/xfs_log_recover.c85
-rw-r--r--fs/xfs/xfs_mount.c109
-rw-r--r--fs/xfs/xfs_mount.h88
-rw-r--r--fs/xfs/xfs_qm.c4
-rw-r--r--fs/xfs/xfs_qm.h2
-rw-r--r--fs/xfs/xfs_quota.h23
-rw-r--r--fs/xfs/xfs_reflink.c48
-rw-r--r--fs/xfs/xfs_rtalloc.c29
-rw-r--r--fs/xfs/xfs_super.c76
-rw-r--r--fs/xfs/xfs_symlink.c91
-rw-r--r--fs/xfs/xfs_trace.c3
-rw-r--r--fs/xfs/xfs_trace.h466
-rw-r--r--fs/xfs/xfs_trans.c72
-rw-r--r--fs/xfs/xfs_trans_dquot.c15
-rw-r--r--fs/xfs/xfs_xattr.c92
-rw-r--r--fs/xfs/xfs_xattr.h3
485 files changed, 33089 insertions, 9493 deletions
diff --git a/fs/aio.c b/fs/aio.c
index 6ed5507cd330..57c9f7c077e6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1608,7 +1608,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
return ret;
ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret)
- aio_rw_done(req, call_read_iter(file, req, &iter));
+ aio_rw_done(req, file->f_op->read_iter(req, &iter));
kfree(iovec);
return ret;
}
@@ -1639,7 +1639,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
if (S_ISREG(file_inode(file)->i_mode))
kiocb_start_write(req);
req->ki_flags |= IOCB_WRITE;
- aio_rw_done(req, call_write_iter(file, req, &iter));
+ aio_rw_done(req, file->f_op->write_iter(req, &iter));
}
kfree(iovec);
return ret;
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 740185198db3..afb557446c27 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -52,6 +52,29 @@ struct file *backing_file_open(const struct path *user_path, int flags,
}
EXPORT_SYMBOL_GPL(backing_file_open);
+struct file *backing_tmpfile_open(const struct path *user_path, int flags,
+ const struct path *real_parentpath,
+ umode_t mode, const struct cred *cred)
+{
+ struct mnt_idmap *real_idmap = mnt_idmap(real_parentpath->mnt);
+ struct file *f;
+ int error;
+
+ f = alloc_empty_backing_file(flags, cred);
+ if (IS_ERR(f))
+ return f;
+
+ path_get(user_path);
+ *backing_file_user_path(f) = *user_path;
+ error = vfs_tmpfile(real_idmap, real_parentpath, f, mode);
+ if (error) {
+ fput(f);
+ f = ERR_PTR(error);
+ }
+ return f;
+}
+EXPORT_SYMBOL(backing_tmpfile_open);
+
struct backing_aio {
struct kiocb iocb;
refcount_t ref;
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 5c180fdc3efb..250d6c6d3a3a 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -282,18 +282,12 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
struct posix_acl *acl = NULL;
- struct bkey_s_c k;
- int ret;
retry:
bch2_trans_begin(trans);
- ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
- &hash, inode_inum(inode), &search, 0);
- if (ret)
- goto err;
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+ &hash, inode_inum(inode), &search, 0);
+ int ret = bkey_err(k);
if (ret)
goto err;
@@ -366,7 +360,7 @@ retry:
ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?:
bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto btree_err;
@@ -414,39 +408,30 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0);
struct btree_iter iter;
- struct bkey_s_c_xattr xattr;
- struct bkey_i_xattr *new;
struct posix_acl *acl = NULL;
- struct bkey_s_c k;
- int ret;
- ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
- &hash_info, inum, &search, BTREE_ITER_INTENT);
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+ &hash_info, inum, &search, BTREE_ITER_intent);
+ int ret = bkey_err(k);
if (ret)
return bch2_err_matches(ret, ENOENT) ? 0 : ret;
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
- xattr = bkey_s_c_to_xattr(k);
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
le16_to_cpu(xattr.v->x_val_len));
ret = PTR_ERR_OR_ZERO(acl);
- if (IS_ERR_OR_NULL(acl))
+ if (ret)
goto err;
- ret = allocate_dropping_locks_errcode(trans,
- __posix_acl_chmod(&acl, _gfp, mode));
+ ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode));
if (ret)
goto err;
- new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
- if (IS_ERR(new)) {
- ret = PTR_ERR(new);
+ struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
goto err;
- }
new->k.p = iter.pos;
ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 534ba2b02bd6..346cd91f91f9 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -195,7 +195,7 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
}
int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
@@ -211,7 +211,7 @@ fsck_err:
}
int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_alloc_unpacked u;
@@ -225,7 +225,7 @@ fsck_err:
}
int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_alloc_unpacked u;
@@ -239,7 +239,7 @@ fsck_err:
}
int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
int ret = 0;
@@ -263,7 +263,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard:
- bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe,
+ bkey_fsck_err_on(bch2_bucket_sectors_total(*a.v) || a.v->stripe,
c, err, alloc_key_empty_but_have_data,
"empty data type free but have data");
break;
@@ -330,27 +330,17 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
bch2_prt_data_type(out, a->data_type);
prt_newline(out);
- prt_printf(out, "journal_seq %llu", a->journal_seq);
- prt_newline(out);
- prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a));
- prt_newline(out);
- prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a));
- prt_newline(out);
- prt_printf(out, "dirty_sectors %u", a->dirty_sectors);
- prt_newline(out);
- prt_printf(out, "cached_sectors %u", a->cached_sectors);
- prt_newline(out);
- prt_printf(out, "stripe %u", a->stripe);
- prt_newline(out);
- prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy);
- prt_newline(out);
- prt_printf(out, "io_time[READ] %llu", a->io_time[READ]);
- prt_newline(out);
- prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]);
- prt_newline(out);
- prt_printf(out, "fragmentation %llu", a->fragmentation_lru);
- prt_newline(out);
- prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
+ prt_printf(out, "journal_seq %llu\n", a->journal_seq);
+ prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
+ prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
+ prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
+ prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
+ prt_printf(out, "stripe %u\n", a->stripe);
+ prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
+ prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
+ prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
+ prt_printf(out, "fragmentation %llu\n", a->fragmentation_lru);
+ prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
printbuf_indent_sub(out, 2);
}
@@ -439,22 +429,18 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct b
}
struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos pos)
+bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos)
{
- struct bkey_s_c k;
- struct bkey_i_alloc_v4 *a;
- int ret;
-
- k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
- BTREE_ITER_WITH_UPDATES|
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
- ret = bkey_err(k);
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
+ BTREE_ITER_with_updates|
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
+ int ret = bkey_err(k);
if (unlikely(ret))
return ERR_PTR(ret);
- a = bch2_alloc_to_v4_mut_inlined(trans, k);
+ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
ret = PTR_ERR_OR_ZERO(a);
if (unlikely(ret))
goto err;
@@ -464,6 +450,20 @@ err:
return ERR_PTR(ret);
}
+__flatten
+struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
+ int ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return unlikely(ret) ? ERR_PTR(ret) : a;
+}
+
static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
{
*offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
@@ -487,7 +487,7 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
}
int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
int ret = 0;
@@ -520,7 +520,7 @@ int bch2_bucket_gens_init(struct bch_fs *c)
int ret;
ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
+ BTREE_ITER_prefetch, k, ({
/*
* Not a fsck error because this is checked/repaired by
* bch2_check_alloc_key() which runs later:
@@ -567,29 +567,31 @@ iter_err:
int bch2_alloc_read(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
+ struct bch_dev *ca = NULL;
int ret;
down_read(&c->gc_lock);
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
+ BTREE_ITER_prefetch, k, ({
u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
if (k.k->type != KEY_TYPE_bucket_gens)
continue;
- const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
-
+ ca = bch2_dev_iterate(c, ca, k.k->p.inode);
/*
* Not a fsck error because this is checked/repaired by
* bch2_check_alloc_key() which runs later:
*/
- if (!bch2_dev_exists2(c, k.k->p.inode))
+ if (!ca) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
continue;
+ }
- struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
for (u64 b = max_t(u64, ca->mi.first_bucket, start);
b < min_t(u64, ca->mi.nbuckets, end);
@@ -599,15 +601,16 @@ int bch2_alloc_read(struct bch_fs *c)
}));
} else {
ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
+ BTREE_ITER_prefetch, k, ({
+ ca = bch2_dev_iterate(c, ca, k.k->p.inode);
/*
* Not a fsck error because this is checked/repaired by
* bch2_check_alloc_key() which runs later:
*/
- if (!bch2_dev_bucket_exists(c, k.k->p))
+ if (!ca) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
continue;
-
- struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ }
struct bch_alloc_v4 a;
*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
@@ -615,6 +618,7 @@ int bch2_alloc_read(struct bch_fs *c)
}));
}
+ bch2_dev_put(ca);
bch2_trans_put(trans);
up_read(&c->gc_lock);
@@ -625,12 +629,12 @@ int bch2_alloc_read(struct bch_fs *c)
/* Free space/discard btree: */
static int bch2_bucket_do_index(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bkey_s_c alloc_k,
const struct bch_alloc_v4 *a,
bool set)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
struct btree_iter iter;
struct bkey_s_c old;
struct bkey_i *k;
@@ -667,7 +671,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans,
old = bch2_bkey_get_iter(trans, &iter, btree,
bkey_start_pos(&k->k),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = bkey_err(old);
if (ret)
return ret;
@@ -711,8 +715,8 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
return ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
- BTREE_ITER_INTENT|
- BTREE_ITER_WITH_UPDATES);
+ BTREE_ITER_intent|
+ BTREE_ITER_with_updates);
ret = bkey_err(k);
if (ret)
return ret;
@@ -734,26 +738,24 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
int bch2_trigger_alloc(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
int ret = 0;
- if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
- "alloc key for invalid device or bucket"))
+ struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
+ if (!ca)
return -EIO;
- struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode);
-
struct bch_alloc_v4 old_a_convert;
const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
- new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
+ alloc_data_type_set(new_a, new_a->data_type);
- if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) {
+ if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
@@ -770,10 +772,10 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if (old_a->data_type != new_a->data_type ||
(new_a->data_type == BCH_DATA_free &&
alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
- ret = bch2_bucket_do_index(trans, old, old_a, false) ?:
- bch2_bucket_do_index(trans, new.s_c, new_a, true);
+ ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
+ bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
if (ret)
- return ret;
+ goto err;
}
if (new_a->data_type == BCH_DATA_cached &&
@@ -787,24 +789,23 @@ int bch2_trigger_alloc(struct btree_trans *trans,
bucket_to_u64(new.k->p),
old_lru, new_lru);
if (ret)
- return ret;
+ goto err;
}
- new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
- bch_dev_bkey_exists(c, new.k->p.inode));
+ new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, ca);
if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
ret = bch2_lru_change(trans,
BCH_LRU_FRAGMENTATION_START,
bucket_to_u64(new.k->p),
old_a->fragmentation_lru, new_a->fragmentation_lru);
if (ret)
- return ret;
+ goto err;
}
if (old_a->gen != new_a->gen) {
ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
if (ret)
- return ret;
+ goto err;
}
/*
@@ -812,21 +813,21 @@ int bch2_trigger_alloc(struct btree_trans *trans,
* not:
*/
- if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
+ if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
old_a->cached_sectors) {
ret = bch2_update_cached_sectors_list(trans, new.k->p.inode,
-((s64) old_a->cached_sectors));
if (ret)
- return ret;
+ goto err;
}
}
- if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
+ if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
u64 journal_seq = trans->journal_res.seq;
u64 bucket_journal_seq = new_a->journal_seq;
- if ((flags & BTREE_TRIGGER_INSERT) &&
+ if ((flags & BTREE_TRIGGER_insert) &&
data_type_is_empty(old_a->data_type) !=
data_type_is_empty(new_a->data_type) &&
new.k->type == KEY_TYPE_alloc_v4) {
@@ -854,7 +855,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if (ret) {
bch2_fs_fatal_error(c,
"setting bucket_needs_journal_commit: %s", bch2_err_str(ret));
- return ret;
+ goto err;
}
}
@@ -884,11 +885,11 @@ int bch2_trigger_alloc(struct btree_trans *trans,
bch2_do_invalidates(c);
if (statechange(a->data_type == BCH_DATA_need_gc_gens))
- bch2_do_gc_gens(c);
+ bch2_gc_gens_async(c);
}
- if ((flags & BTREE_TRIGGER_GC) &&
- (flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) {
+ if ((flags & BTREE_TRIGGER_gc) &&
+ (flags & BTREE_TRIGGER_bucket_invalidate)) {
struct bch_alloc_v4 new_a_convert;
const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert);
@@ -908,12 +909,13 @@ int bch2_trigger_alloc(struct btree_trans *trans,
bucket_unlock(g);
percpu_up_read(&c->mark_lock);
}
-
- return 0;
+err:
+ bch2_dev_put(ca);
+ return ret;
}
/*
- * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for
+ * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
* extents style btrees, but works on non-extents btrees:
*/
static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
@@ -958,35 +960,34 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
}
}
-static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
+static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
{
- struct bch_dev *ca;
+ if (*ca) {
+ if (bucket->offset < (*ca)->mi.first_bucket)
+ bucket->offset = (*ca)->mi.first_bucket;
- if (bch2_dev_bucket_exists(c, *bucket))
- return true;
-
- if (bch2_dev_exists2(c, bucket->inode)) {
- ca = bch_dev_bkey_exists(c, bucket->inode);
-
- if (bucket->offset < ca->mi.first_bucket) {
- bucket->offset = ca->mi.first_bucket;
+ if (bucket->offset < (*ca)->mi.nbuckets)
return true;
- }
+ bch2_dev_put(*ca);
+ *ca = NULL;
bucket->inode++;
bucket->offset = 0;
}
rcu_read_lock();
- ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
- if (ca)
- *bucket = POS(ca->dev_idx, ca->mi.first_bucket);
+ *ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
+ if (*ca) {
+ *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
+ bch2_dev_get(*ca);
+ }
rcu_read_unlock();
- return ca != NULL;
+ return *ca != NULL;
}
-static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole)
+static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
+ struct bch_dev **ca, struct bkey *hole)
{
struct bch_fs *c = iter->trans->c;
struct bkey_s_c k;
@@ -995,22 +996,21 @@ again:
if (bkey_err(k))
return k;
+ *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);
+
if (!k.k->type) {
- struct bpos bucket = bkey_start_pos(k.k);
+ struct bpos hole_start = bkey_start_pos(k.k);
- if (!bch2_dev_bucket_exists(c, bucket)) {
- if (!next_bucket(c, &bucket))
+ if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
+ if (!next_bucket(c, ca, &hole_start))
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter, bucket);
+ bch2_btree_iter_set_pos(iter, hole_start);
goto again;
}
- if (!bch2_dev_bucket_exists(c, k.k->p)) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
-
- bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset);
- }
+ if (k.k->p.offset > (*ca)->mi.nbuckets)
+ bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
}
return k;
@@ -1025,24 +1025,25 @@ int bch2_check_alloc_key(struct btree_trans *trans,
struct btree_iter *bucket_gens_iter)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca;
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a;
unsigned discard_key_type, freespace_key_type;
unsigned gens_offset;
struct bkey_s_c k;
struct printbuf buf = PRINTBUF;
- int ret;
+ int ret = 0;
- if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
- alloc_key_to_missing_dev_bucket,
+ struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
+ if (fsck_err_on(!ca,
+ c, alloc_key_to_missing_dev_bucket,
"alloc key for invalid device:bucket %llu:%llu",
alloc_k.k->p.inode, alloc_k.k->p.offset))
- return bch2_btree_delete_at(trans, alloc_iter, 0);
+ ret = bch2_btree_delete_at(trans, alloc_iter, 0);
+ if (!ca)
+ return ret;
- ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
if (!ca->mi.freespace_initialized)
- return 0;
+ goto out;
a = bch2_alloc_to_v4(alloc_k, &a_convert);
@@ -1141,25 +1142,26 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
}
+out:
err:
fsck_err:
+ bch2_dev_put(ca);
printbuf_exit(&buf);
return ret;
}
static noinline_for_stack
int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bpos start,
struct bpos *end,
struct btree_iter *freespace_iter)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca;
struct bkey_s_c k;
struct printbuf buf = PRINTBUF;
int ret;
- ca = bch_dev_bkey_exists(c, start.inode);
if (!ca->mi.freespace_initialized)
return 0;
@@ -1313,7 +1315,7 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran
goto delete;
out:
fsck_err:
- set_btree_iter_dontneed(&alloc_iter);
+ bch2_set_btree_iter_dontneed(&alloc_iter);
bch2_trans_iter_exit(trans, &alloc_iter);
printbuf_exit(&buf);
return ret;
@@ -1337,30 +1339,25 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_i_bucket_gens g;
- struct bch_dev *ca;
u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
u64 b;
- bool need_update = false, dev_exists;
+ bool need_update = false;
struct printbuf buf = PRINTBUF;
int ret = 0;
BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
bkey_reassemble(&g.k_i, k);
- /* if no bch_dev, skip out whether we repair or not */
- dev_exists = bch2_dev_exists2(c, k.k->p.inode);
- if (!dev_exists) {
- if (fsck_err_on(!dev_exists, c,
- bucket_gens_to_invalid_dev,
- "bucket_gens key for invalid device:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
+ if (!ca) {
+ if (fsck_err(c, bucket_gens_to_invalid_dev,
+ "bucket_gens key for invalid device:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter, 0);
- }
goto out;
}
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
if (fsck_err_on(end <= ca->mi.first_bucket ||
start >= ca->mi.nbuckets, c,
bucket_gens_to_invalid_buckets,
@@ -1398,6 +1395,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
}
out:
fsck_err:
+ bch2_dev_put(ca);
printbuf_exit(&buf);
return ret;
}
@@ -1406,25 +1404,26 @@ int bch2_check_alloc_info(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
+ struct bch_dev *ca = NULL;
struct bkey hole;
struct bkey_s_c k;
int ret = 0;
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
while (1) {
struct bpos next;
bch2_trans_begin(trans);
- k = bch2_get_key_or_real_bucket_hole(&iter, &hole);
+ k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
ret = bkey_err(k);
if (ret)
goto bkey_err;
@@ -1445,7 +1444,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
} else {
next = k.k->p;
- ret = bch2_check_alloc_hole_freespace(trans,
+ ret = bch2_check_alloc_hole_freespace(trans, ca,
bkey_start_pos(k.k),
&next,
&freespace_iter) ?:
@@ -1473,19 +1472,21 @@ bkey_err:
bch2_trans_iter_exit(trans, &freespace_iter);
bch2_trans_iter_exit(trans, &discard_iter);
bch2_trans_iter_exit(trans, &iter);
+ bch2_dev_put(ca);
+ ca = NULL;
if (ret < 0)
goto err;
ret = for_each_btree_key(trans, iter,
BTREE_ID_need_discard, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
bch2_check_discard_freespace_key(trans, &iter));
if (ret)
goto err;
bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
while (1) {
bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
@@ -1515,7 +1516,7 @@ bkey_err:
ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_bucket_gens_key(trans, &iter, k));
err:
@@ -1562,7 +1563,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
ret = bch2_trans_update(trans, alloc_iter,
- &a_mut->k_i, BTREE_TRIGGER_NORUN);
+ &a_mut->k_i, BTREE_TRIGGER_norun);
if (ret)
goto err;
@@ -1601,7 +1602,7 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
- POS_MIN, BTREE_ITER_PREFETCH, k,
+ POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_alloc_to_lru_ref(trans, &iter)));
bch_err_fn(c, ret);
@@ -1657,9 +1658,7 @@ static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_st
bch2_journal_flush_async(&c->journal, NULL);
if (s->ca)
- percpu_ref_put(&s->ca->ref);
- if (ca)
- percpu_ref_get(&ca->ref);
+ percpu_ref_put(&s->ca->io_ref);
s->ca = ca;
s->need_journal_commit_this_dev = 0;
}
@@ -1673,15 +1672,15 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
struct bpos pos = need_discard_iter->pos;
struct btree_iter iter = { NULL };
struct bkey_s_c k;
- struct bch_dev *ca;
struct bkey_i_alloc_v4 *a;
struct printbuf buf = PRINTBUF;
bool discard_locked = false;
int ret = 0;
- ca = bch_dev_bkey_exists(c, pos.inode);
-
- if (!percpu_ref_tryget(&ca->io_ref)) {
+ struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode
+ ? s->ca
+ : bch2_dev_get_ioref(c, pos.inode, WRITE);
+ if (!ca) {
bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
return 0;
}
@@ -1703,7 +1702,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
need_discard_iter->pos,
- BTREE_ITER_CACHED);
+ BTREE_ITER_cached);
ret = bkey_err(k);
if (ret)
goto out;
@@ -1713,7 +1712,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
if (ret)
goto out;
- if (a->v.dirty_sectors) {
+ if (bch2_bucket_sectors_total(a->v)) {
if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
trans, "attempting to discard bucket with dirty data\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
@@ -1771,7 +1770,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
}
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
- a->v.data_type = alloc_data_type(a->v, a->v.data_type);
+ alloc_data_type_set(&a->v, a->v.data_type);
write:
ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
@@ -1787,7 +1786,6 @@ out:
discard_in_flight_remove(c, iter.pos);
s->seen++;
bch2_trans_iter_exit(trans, &iter);
- percpu_ref_put(&ca->io_ref);
printbuf_exit(&buf);
return ret;
}
@@ -1827,7 +1825,7 @@ void bch2_do_discards(struct bch_fs *c)
static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
{
struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent);
struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
int ret = bkey_err(k);
if (ret)
@@ -1840,7 +1838,7 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo
BUG_ON(a->v.dirty_sectors);
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
- a->v.data_type = alloc_data_type(a->v, a->v.data_type);
+ alloc_data_type_set(&a->v, a->v.data_type);
ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
err:
@@ -1862,9 +1860,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
if (i->snapshot)
continue;
- ca = bch_dev_bkey_exists(c, i->inode);
-
- if (!percpu_ref_tryget(&ca->io_ref)) {
+ ca = bch2_dev_get_ioref(c, i->inode, WRITE);
+ if (!ca) {
darray_remove_item(&c->discard_buckets_in_flight, i);
continue;
}
@@ -1903,9 +1900,12 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode);
+ bool dead = !ca || percpu_ref_is_dying(&ca->io_ref);
+ rcu_read_unlock();
- if (!percpu_ref_is_dying(&ca->io_ref) &&
+ if (!dead &&
!discard_in_flight_add(c, bucket) &&
bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) &&
!queue_work(c->write_ref_wq, &c->discard_fast_work))
@@ -1918,7 +1918,6 @@ static int invalidate_one_bucket(struct btree_trans *trans,
s64 *nr_to_invalidate)
{
struct bch_fs *c = trans->c;
- struct btree_iter alloc_iter = { NULL };
struct bkey_i_alloc_v4 *a = NULL;
struct printbuf buf = PRINTBUF;
struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
@@ -1936,7 +1935,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
return 0;
- a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket);
+ a = bch2_trans_start_alloc_update(trans, bucket);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
@@ -1961,18 +1960,15 @@ static int invalidate_one_bucket(struct btree_trans *trans,
a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now);
- ret = bch2_trans_update(trans, &alloc_iter, &a->k_i,
- BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc);
+ ret = bch2_trans_commit(trans, NULL, NULL,
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
--*nr_to_invalidate;
out:
- bch2_trans_iter_exit(trans, &alloc_iter);
printbuf_exit(&buf);
return ret;
err:
@@ -2014,11 +2010,11 @@ static void bch2_do_invalidates_work(struct work_struct *work)
ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
lru_pos(ca->dev_idx, 0, 0),
lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
- BTREE_ITER_INTENT, k,
+ BTREE_ITER_intent, k,
invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));
if (ret < 0) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
break;
}
}
@@ -2051,7 +2047,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
/*
* Scan the alloc btree for every bucket on @ca, and add buckets to the
* freespace/need_discard/need_gc_gens btrees as needed:
@@ -2083,7 +2079,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
- ret = bch2_bucket_do_index(trans, k, a, true) ?:
+ ret = bch2_bucket_do_index(trans, ca, k, a, true) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc);
if (ret)
@@ -2155,7 +2151,7 @@ int bch2_fs_freespace_init(struct bch_fs *c)
ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
if (ret) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
bch_err_fn(c, ret);
return ret;
}
@@ -2182,7 +2178,10 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
u64 now;
int ret = 0;
- a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr));
+ if (bch2_trans_relock(trans))
+ bch2_trans_begin(trans);
+
+ a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
ret = PTR_ERR_OR_ZERO(a);
if (ret)
return ret;
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 2790e516383d..ae31a94be6f9 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -8,21 +8,18 @@
#include "debug.h"
#include "super.h"
-enum bkey_invalid_flags;
+enum bch_validate_flags;
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
{
- struct bch_dev *ca;
-
- if (!bch2_dev_exists2(c, pos.inode))
- return false;
-
- ca = bch_dev_bkey_exists(c, pos.inode);
- return pos.offset >= ca->mi.first_bucket &&
- pos.offset < ca->mi.nbuckets;
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, pos.inode);
+ bool ret = ca && bucket_valid(ca, pos.offset);
+ rcu_read_unlock();
+ return ret;
}
static inline u64 bucket_to_u64(struct bpos bucket)
@@ -40,38 +37,50 @@ static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
return a.gen - a.oldest_gen;
}
-static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors,
- u32 cached_sectors,
- u32 stripe,
- struct bch_alloc_v4 a,
- enum bch_data_type data_type)
+static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src)
{
- if (stripe)
- return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
- if (dirty_sectors)
- return data_type;
- if (cached_sectors)
- return BCH_DATA_cached;
- if (BCH_ALLOC_V4_NEED_DISCARD(&a))
- return BCH_DATA_need_discard;
- if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
- return BCH_DATA_need_gc_gens;
- return BCH_DATA_free;
+ dst->gen = src.gen;
+ dst->data_type = src.data_type;
+ dst->dirty_sectors = src.dirty_sectors;
+ dst->cached_sectors = src.cached_sectors;
+ dst->stripe = src.stripe;
}
-static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
- enum bch_data_type data_type)
+static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src)
{
- return __alloc_data_type(a.dirty_sectors, a.cached_sectors,
- a.stripe, a, data_type);
+ dst->gen = src.gen;
+ dst->data_type = src.data_type;
+ dst->dirty_sectors = src.dirty_sectors;
+ dst->cached_sectors = src.cached_sectors;
+ dst->stripe = src.stripe;
+}
+
+static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
+{
+ struct bch_alloc_v4 ret = {};
+ __bucket_m_to_alloc(&ret, b);
+ return ret;
}
static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
{
- return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type;
+ switch (data_type) {
+ case BCH_DATA_cached:
+ case BCH_DATA_stripe:
+ return BCH_DATA_user;
+ default:
+ return data_type;
+ }
+}
+
+static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
+ enum bch_data_type ptr)
+{
+ return !data_type_is_empty(bucket) &&
+ bucket_data_type(bucket) != bucket_data_type(ptr);
}
-static inline unsigned bch2_bucket_sectors(struct bch_alloc_v4 a)
+static inline unsigned bch2_bucket_sectors_total(struct bch_alloc_v4 a)
{
return a.dirty_sectors + a.cached_sectors;
}
@@ -89,6 +98,27 @@ static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca,
return d ? max(0, ca->mi.bucket_size - d) : 0;
}
+static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
+ enum bch_data_type data_type)
+{
+ if (a.stripe)
+ return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
+ if (a.dirty_sectors)
+ return data_type;
+ if (a.cached_sectors)
+ return BCH_DATA_cached;
+ if (BCH_ALLOC_V4_NEED_DISCARD(&a))
+ return BCH_DATA_need_discard;
+ if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
+ return BCH_DATA_need_gc_gens;
+ return BCH_DATA_free;
+}
+
+static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type)
+{
+ a->data_type = alloc_data_type(*a, data_type);
+}
+
static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
{
return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
@@ -147,7 +177,9 @@ static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
}
struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos);
+bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos);
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *, struct bpos);
void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
@@ -173,13 +205,13 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_alloc_v4_swab(struct bkey_s);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
@@ -213,7 +245,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
})
int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \
@@ -233,7 +265,8 @@ static inline bool bkey_is_alloc(const struct bkey *k)
int bch2_alloc_read(struct bch_fs *);
int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
int bch2_check_alloc_info(struct bch_fs *);
int bch2_check_alloc_to_lru_refs(struct bch_fs *);
void bch2_do_discards(struct bch_fs *);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index a1fc30adf912..927a5f300b30 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -71,7 +71,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *c)
{
rcu_read_lock();
for_each_member_device_rcu(c, ca, NULL)
- ca->alloc_cursor = 0;
+ memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
rcu_read_unlock();
}
@@ -100,7 +100,7 @@ static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *o
void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
if (ob->ec) {
ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
@@ -300,7 +300,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
k = bch2_bkey_get_iter(trans, &iter,
BTREE_ID_alloc, POS(ca->dev_idx, b),
- BTREE_ITER_CACHED);
+ BTREE_ITER_cached);
ret = bkey_err(k);
if (ret) {
ob = ERR_PTR(ret);
@@ -342,9 +342,9 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
struct bch_backpointer bp;
struct bpos bp_pos = POS_MIN;
- ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
+ ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1,
&bp_pos, &bp,
- BTREE_ITER_NOPRESERVE);
+ BTREE_ITER_nopreserve);
if (ret) {
ob = ERR_PTR(ret);
goto err;
@@ -363,10 +363,10 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
if (!ob)
- set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(&iter);
err:
if (iter.path)
- set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(&iter);
bch2_trans_iter_exit(trans, &iter);
printbuf_exit(&buf);
return ob;
@@ -389,7 +389,8 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
struct bkey_s_c k, ck;
struct open_bucket *ob = NULL;
u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
- u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
+ u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
+ u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
u64 alloc_cursor = alloc_start;
int ret;
@@ -404,9 +405,8 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
*/
again:
for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
- BTREE_ITER_SLOTS, k, ret) {
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
+ BTREE_ITER_slots, k, ret) {
+ u64 bucket = k.k->p.offset;
if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
break;
@@ -415,12 +415,29 @@ again:
is_superblock_bucket(ca, k.k->p.offset))
continue;
- a = bch2_alloc_to_v4(k, &a_convert);
+ if (s->btree_bitmap != BTREE_BITMAP_ANY &&
+ s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
+ bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
+ if (s->btree_bitmap == BTREE_BITMAP_YES &&
+ bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
+ break;
+
+ bucket = sector_to_bucket(ca,
+ round_up(bucket_to_sector(ca, bucket) + 1,
+ 1ULL << ca->mi.btree_bitmap_shift));
+ bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket));
+ s->buckets_seen++;
+ s->skipped_mi_btree_bitmap++;
+ continue;
+ }
+
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
if (a->data_type != BCH_DATA_free)
continue;
/* now check the cached key to serialize concurrent allocs of the bucket */
- ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
+ ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached);
ret = bkey_err(ck);
if (ret)
break;
@@ -433,7 +450,7 @@ again:
ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
next:
- set_btree_iter_dontneed(&citer);
+ bch2_set_btree_iter_dontneed(&citer);
bch2_trans_iter_exit(trans, &citer);
if (ob)
break;
@@ -441,7 +458,6 @@ next:
bch2_trans_iter_exit(trans, &iter);
alloc_cursor = iter.pos.offset;
- ca->alloc_cursor = alloc_cursor;
if (!ob && ret)
ob = ERR_PTR(ret);
@@ -451,6 +467,8 @@ next:
goto again;
}
+ *dev_alloc_cursor = alloc_cursor;
+
return ob;
}
@@ -463,7 +481,8 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_s_c k;
struct open_bucket *ob = NULL;
- u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
+ u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
+ u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
u64 alloc_cursor = alloc_start;
int ret;
@@ -485,10 +504,30 @@ again:
s->buckets_seen++;
+ u64 bucket = alloc_cursor & ~(~0ULL << 56);
+ if (s->btree_bitmap != BTREE_BITMAP_ANY &&
+ s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
+ bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
+ if (s->btree_bitmap == BTREE_BITMAP_YES &&
+ bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
+ goto fail;
+
+ bucket = sector_to_bucket(ca,
+ round_up(bucket_to_sector(ca, bucket) + 1,
+ 1ULL << ca->mi.btree_bitmap_shift));
+ u64 genbits = alloc_cursor >> 56;
+ alloc_cursor = bucket | (genbits << 56);
+
+ if (alloc_cursor > k.k->p.offset)
+ bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
+ s->skipped_mi_btree_bitmap++;
+ continue;
+ }
+
ob = try_alloc_bucket(trans, ca, watermark,
alloc_cursor, s, k, cl);
if (ob) {
- set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(&iter);
break;
}
}
@@ -496,10 +535,9 @@ again:
if (ob || ret)
break;
}
+fail:
bch2_trans_iter_exit(trans, &iter);
- ca->alloc_cursor = alloc_cursor;
-
if (!ob && ret)
ob = ERR_PTR(ret);
@@ -508,14 +546,56 @@ again:
goto again;
}
+ *dev_alloc_cursor = alloc_cursor;
+
return ob;
}
+static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
+ enum bch_watermark watermark,
+ enum bch_data_type data_type,
+ struct closure *cl,
+ struct bch_dev_usage *usage,
+ struct bucket_alloc_state *s,
+ struct open_bucket *ob)
+{
+ struct printbuf buf = PRINTBUF;
+
+ printbuf_tabstop_push(&buf, 24);
+
+ prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx);
+ prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]);
+ prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]);
+ prt_printf(&buf, "blocking\t%u\n", cl != NULL);
+ prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets);
+ prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark));
+ prt_printf(&buf, "copygc_wait\t%lu/%lli\n",
+ bch2_copygc_wait_amount(c),
+ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
+ prt_printf(&buf, "seen\t%llu\n", s->buckets_seen);
+ prt_printf(&buf, "open\t%llu\n", s->skipped_open);
+ prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit);
+ prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow);
+ prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse);
+ prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap);
+
+ if (!IS_ERR(ob)) {
+ prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
+ trace_bucket_alloc(c, buf.buf);
+ } else {
+ prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob)));
+ trace_bucket_alloc_fail(c, buf.buf);
+ }
+
+ printbuf_exit(&buf);
+}
+
/**
* bch2_bucket_alloc_trans - allocate a single bucket from a specific device
* @trans: transaction object
* @ca: device to allocate from
* @watermark: how important is this allocation?
+ * @data_type: BCH_DATA_journal, btree, user...
* @cl: if not NULL, closure to be used to wait if buckets not available
* @usage: for secondarily also returning the current device usage
*
@@ -524,6 +604,7 @@ again:
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
struct bch_dev *ca,
enum bch_watermark watermark,
+ enum bch_data_type data_type,
struct closure *cl,
struct bch_dev_usage *usage)
{
@@ -531,7 +612,9 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
struct open_bucket *ob = NULL;
bool freespace = READ_ONCE(ca->mi.freespace_initialized);
u64 avail;
- struct bucket_alloc_state s = { 0 };
+ struct bucket_alloc_state s = {
+ .btree_bitmap = data_type == BCH_DATA_btree,
+ };
bool waiting = false;
again:
bch2_dev_usage_read_fast(ca, usage);
@@ -541,7 +624,7 @@ again:
bch2_do_discards(c);
if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
- bch2_do_gc_gens(c);
+ bch2_gc_gens_async(c);
if (should_invalidate_buckets(ca, *usage))
bch2_do_invalidates(c);
@@ -569,6 +652,11 @@ alloc:
if (s.skipped_need_journal_commit * 2 > avail)
bch2_journal_flush_async(&c->journal, NULL);
+ if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) {
+ s.btree_bitmap = BTREE_BITMAP_ANY;
+ goto alloc;
+ }
+
if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
freespace = false;
goto alloc;
@@ -578,33 +666,24 @@ err:
ob = ERR_PTR(-BCH_ERR_no_buckets_found);
if (!IS_ERR(ob))
- trace_and_count(c, bucket_alloc, ca,
- bch2_watermarks[watermark],
- ob->bucket,
- usage->d[BCH_DATA_free].buckets,
- avail,
- bch2_copygc_wait_amount(c),
- c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
- &s,
- cl == NULL,
- "");
+ ob->data_type = data_type;
+
+ if (!IS_ERR(ob))
+ count_event(c, bucket_alloc);
else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
- trace_and_count(c, bucket_alloc_fail, ca,
- bch2_watermarks[watermark],
- 0,
- usage->d[BCH_DATA_free].buckets,
- avail,
- bch2_copygc_wait_amount(c),
- c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
- &s,
- cl == NULL,
- bch2_err_str(PTR_ERR(ob)));
+ count_event(c, bucket_alloc_fail);
+
+ if (!IS_ERR(ob)
+ ? trace_bucket_alloc_enabled()
+ : trace_bucket_alloc_fail_enabled())
+ trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob);
return ob;
}
struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
enum bch_watermark watermark,
+ enum bch_data_type data_type,
struct closure *cl)
{
struct bch_dev_usage usage;
@@ -612,7 +691,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
bch2_trans_do(c, NULL, NULL, 0,
PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
- cl, &usage)));
+ data_type, cl, &usage)));
return ob;
}
@@ -678,8 +757,7 @@ static int add_new_bucket(struct bch_fs *c,
unsigned flags,
struct open_bucket *ob)
{
- unsigned durability =
- bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+ unsigned durability = ob_dev(c, ob)->mi.durability;
BUG_ON(*nr_effective >= nr_replicas);
@@ -711,37 +789,28 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct dev_alloc_list devs_sorted =
bch2_dev_alloc_list(c, stripe, devs_may_alloc);
- unsigned dev;
- struct bch_dev *ca;
int ret = -BCH_ERR_insufficient_devices;
- unsigned i;
BUG_ON(*nr_effective >= nr_replicas);
- for (i = 0; i < devs_sorted.nr; i++) {
+ for (unsigned i = 0; i < devs_sorted.nr; i++) {
struct bch_dev_usage usage;
struct open_bucket *ob;
- dev = devs_sorted.devs[i];
-
- rcu_read_lock();
- ca = rcu_dereference(c->devs[dev]);
- if (ca)
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
-
+ unsigned dev = devs_sorted.devs[i];
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
if (!ca)
continue;
if (!ca->mi.durability && *have_cache) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
continue;
}
- ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage);
+ ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, cl, &usage);
if (!IS_ERR(ob))
bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
if (IS_ERR(ob)) {
ret = PTR_ERR(ob);
@@ -750,8 +819,6 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
continue;
}
- ob->data_type = data_type;
-
if (add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
have_cache, flags, ob)) {
@@ -836,7 +903,7 @@ static bool want_bucket(struct bch_fs *c,
bool *have_cache, bool ec,
struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
if (!test_bit(ob->dev, devs_may_alloc->d))
return false;
@@ -906,7 +973,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
struct bch_dev_usage usage;
u64 avail;
@@ -1291,7 +1358,7 @@ deallocate_extra_replicas(struct bch_fs *c,
unsigned i;
open_bucket_for_each(c, ptrs, ob, i) {
- unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+ unsigned d = ob_dev(c, ob)->mi.durability;
if (d && d <= extra_replicas) {
extra_replicas -= d;
@@ -1342,6 +1409,10 @@ retry:
*wp_ret = wp = writepoint_find(trans, write_point.v);
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ goto err;
+
/* metadata may not allocate on cache devices: */
if (wp->data_type != BCH_DATA_user)
have_cache = true;
@@ -1444,7 +1515,7 @@ err:
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
return (struct bch_extent_ptr) {
.type = 1 << BCH_EXTENT_ENTRY_ptr,
@@ -1520,7 +1591,7 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c)
static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
unsigned data_type = ob->data_type;
barrier(); /* READ_ONCE() doesn't work on bitfields */
@@ -1622,3 +1693,104 @@ void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
prt_str(out, "Btree write point\n");
bch2_write_point_to_text(out, c, &c->btree_write_point);
}
+
+void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ unsigned nr[BCH_DATA_NR];
+
+ memset(nr, 0, sizeof(nr));
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
+ nr[c->open_buckets[i].data_type]++;
+
+ printbuf_tabstop_push(out, 24);
+
+ percpu_down_read(&c->mark_lock);
+ prt_printf(out, "hidden\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.hidden));
+ prt_printf(out, "btree\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.btree));
+ prt_printf(out, "data\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.data));
+ prt_printf(out, "cached\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.cached));
+ prt_printf(out, "reserved\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.reserved));
+ prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved));
+ prt_printf(out, "nr_inodes\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes));
+ percpu_up_read(&c->mark_lock);
+
+ prt_newline(out);
+ prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty");
+ prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
+ prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT);
+ prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? "waiting" : "empty");
+ prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]);
+ prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]);
+ prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr);
+}
+
+void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bch_dev_usage stats = bch2_dev_usage_read(ca);
+ unsigned nr[BCH_DATA_NR];
+
+ memset(nr, 0, sizeof(nr));
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
+ nr[c->open_buckets[i].data_type]++;
+
+ printbuf_tabstop_push(out, 12);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+
+ bch2_dev_usage_to_text(out, &stats);
+
+ prt_newline(out);
+
+ prt_printf(out, "reserves:\n");
+ for (unsigned i = 0; i < BCH_WATERMARK_NR; i++)
+ prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i));
+
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 12);
+ printbuf_tabstop_push(out, 16);
+
+ prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets);
+ prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats));
+}
+
+void bch2_print_allocator_stuck(struct bch_fs *c)
+{
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "Allocator stuck? Waited for 10 seconds\n");
+
+ prt_printf(&buf, "Allocator debug:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_fs_alloc_debug_to_text(&buf, c);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+
+ for_each_online_member(c, ca) {
+ prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
+ printbuf_indent_add(&buf, 2);
+ bch2_dev_alloc_debug_to_text(&buf, ca);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+ }
+
+ prt_printf(&buf, "Copygc debug:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_copygc_wait_to_text(&buf, c);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "Journal debug:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_journal_debug_to_text(&buf, &c->journal);
+ printbuf_indent_sub(&buf, 2);
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 7aaeec44c746..a42c9730d32a 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -30,8 +30,14 @@ void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
long bch2_bucket_alloc_new_fs(struct bch_dev *);
+static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
+{
+ return bch2_dev_have_ref(c, ob->dev);
+}
+
struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
- enum bch_watermark, struct closure *);
+ enum bch_watermark, enum bch_data_type,
+ struct closure *);
static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
struct open_bucket *ob)
@@ -184,7 +190,7 @@ bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp,
wp->sectors_allocated += sectors;
open_bucket_for_each(c, &wp->ptrs, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev *ca = ob_dev(c, ob);
struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
ptr.cached = cached ||
@@ -221,4 +227,9 @@ void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
+void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *);
+void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *);
+
+void bch2_print_allocator_stuck(struct bch_fs *);
+
#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index c2226e947c41..9bbb28e90b93 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -9,11 +9,18 @@
#include "fifo.h"
struct bucket_alloc_state {
+ enum {
+ BTREE_BITMAP_NO,
+ BTREE_BITMAP_YES,
+ BTREE_BITMAP_ANY,
+ } btree_bitmap;
+
u64 buckets_seen;
u64 skipped_open;
u64 skipped_need_journal_commit;
u64 skipped_nocow;
u64 skipped_nouse;
+ u64 skipped_mi_btree_bitmap;
};
#define BCH_WATERMARKS() \
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index af7a71de1bdf..692b1c7d5018 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -23,6 +23,7 @@ static bool extent_matches_bp(struct bch_fs *c,
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
+ rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
struct bpos bucket2;
struct bch_backpointer bp2;
@@ -30,31 +31,43 @@ static bool extent_matches_bp(struct bch_fs *c,
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bucket2, &bp2);
+ struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+ if (!ca)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2);
if (bpos_eq(bucket, bucket2) &&
- !memcmp(&bp, &bp2, sizeof(bp)))
+ !memcmp(&bp, &bp2, sizeof(bp))) {
+ rcu_read_unlock();
return true;
+ }
}
+ rcu_read_unlock();
return false;
}
int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
- /* these will be caught by fsck */
- if (!bch2_dev_exists2(c, bp.k->p.inode))
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, bp.k->p.inode);
+ if (!ca) {
+ /* these will be caught by fsck */
+ rcu_read_unlock();
return 0;
+ }
- struct bch_dev *ca = bch_dev_bkey_exists(c, bp.k->p.inode);
- struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
+ struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p);
+ struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset);
+ rcu_read_unlock();
int ret = 0;
bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size ||
- !bpos_eq(bp.k->p, bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset)),
+ !bpos_eq(bp.k->p, bp_pos),
c, err,
backpointer_bucket_offset_wrong,
"backpointer bucket_offset wrong");
@@ -75,10 +88,16 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer
void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
- if (bch2_dev_exists2(c, k.k->p.inode)) {
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, k.k->p.inode);
+ if (ca) {
+ struct bpos bucket = bp_pos_to_bucket(ca, k.k->p);
+ rcu_read_unlock();
prt_str(out, "bucket=");
- bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
+ bch2_bpos_to_text(out, bucket);
prt_str(out, " ");
+ } else {
+ rcu_read_unlock();
}
bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
@@ -117,8 +136,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
bch_err(c, "%s", buf.buf);
} else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
- prt_printf(&buf, "backpointer not found when deleting");
- prt_newline(&buf);
+ prt_printf(&buf, "backpointer not found when deleting\n");
printbuf_indent_add(&buf, 2);
prt_printf(&buf, "searching for ");
@@ -145,6 +163,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
}
int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bpos bucket,
struct bch_backpointer bp,
struct bkey_s_c orig_k,
@@ -161,7 +180,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
return ret;
bkey_backpointer_init(&bp_k->k_i);
- bp_k->k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+ bp_k->k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset);
bp_k->v = bp;
if (!insert) {
@@ -171,9 +190,9 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
bp_k->k.p,
- BTREE_ITER_INTENT|
- BTREE_ITER_SLOTS|
- BTREE_ITER_WITH_UPDATES);
+ BTREE_ITER_intent|
+ BTREE_ITER_slots|
+ BTREE_ITER_with_updates);
ret = bkey_err(k);
if (ret)
goto err;
@@ -197,13 +216,13 @@ err:
* Find the next backpointer >= *bp_offset:
*/
int bch2_get_next_backpointer(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bpos bucket, int gen,
struct bpos *bp_pos,
struct bch_backpointer *bp,
unsigned iter_flags)
{
- struct bch_fs *c = trans->c;
- struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0);
+ struct bpos bp_end_pos = bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0);
struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL };
struct bkey_s_c k;
int ret = 0;
@@ -213,7 +232,7 @@ int bch2_get_next_backpointer(struct btree_trans *trans,
if (gen >= 0) {
k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
- bucket, BTREE_ITER_CACHED|iter_flags);
+ bucket, BTREE_ITER_cached|iter_flags);
ret = bkey_err(k);
if (ret)
goto out;
@@ -223,7 +242,7 @@ int bch2_get_next_backpointer(struct btree_trans *trans,
goto done;
}
- *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0));
+ *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(ca, bucket, 0));
for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers,
*bp_pos, iter_flags, k, ret) {
@@ -249,7 +268,6 @@ static void backpointer_not_found(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
- struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
/*
* If we're using the btree write buffer, the backpointer we were
@@ -259,6 +277,10 @@ static void backpointer_not_found(struct btree_trans *trans,
if (likely(!bch2_backpointers_no_use_write_buffer))
return;
+ struct bpos bucket;
+ if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket))
+ return;
+
prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
bp.level ? "btree node" : "extent");
prt_printf(&buf, "bucket: ");
@@ -288,15 +310,17 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
{
if (likely(!bp.level)) {
struct bch_fs *c = trans->c;
- struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
- struct bkey_s_c k;
+
+ struct bpos bucket;
+ if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket))
+ return bkey_s_c_err(-EIO);
bch2_trans_node_iter_init(trans, iter,
bp.btree_id,
bp.pos,
0, 0,
iter_flags);
- k = bch2_btree_iter_peek_slot(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
if (bkey_err(k)) {
bch2_trans_iter_exit(trans, iter);
return k;
@@ -325,18 +349,20 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
struct bch_backpointer bp)
{
struct bch_fs *c = trans->c;
- struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
- struct btree *b;
BUG_ON(!bp.level);
+ struct bpos bucket;
+ if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket))
+ return ERR_PTR(-EIO);
+
bch2_trans_node_iter_init(trans, iter,
bp.btree_id,
bp.pos,
0,
bp.level - 1,
0);
- b = bch2_btree_iter_peek_node(iter);
+ struct btree *b = bch2_btree_iter_peek_node(iter);
if (IS_ERR_OR_NULL(b))
goto err;
@@ -367,16 +393,16 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
struct printbuf buf = PRINTBUF;
int ret = 0;
- if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
- backpointer_to_missing_device,
- "backpointer for missing device:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_delete_at(trans, bp_iter, 0);
+ struct bpos bucket;
+ if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) {
+ if (fsck_err(c, backpointer_to_missing_device,
+ "backpointer for missing device:\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_btree_delete_at(trans, bp_iter, 0);
goto out;
}
- alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
- bp_pos_to_bucket(c, k.k->p), 0);
+ alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0);
ret = bkey_err(alloc_k);
if (ret)
goto out;
@@ -460,8 +486,8 @@ found:
bytes = p.crc.compressed_size << 9;
- struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
- if (!bch2_dev_get_ioref(ca, READ))
+ struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ);
+ if (!ca)
return false;
data_buf = kvmalloc(bytes, GFP_KERNEL);
@@ -511,25 +537,27 @@ static int check_bp_exists(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
struct bkey_s_c bp_k;
struct bkey_buf tmp;
- int ret;
+ int ret = 0;
bch2_bkey_buf_init(&tmp);
- if (!bch2_dev_bucket_exists(c, bucket)) {
+ struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket);
+ if (!ca) {
prt_str(&buf, "extent for nonexistent device:bucket ");
bch2_bpos_to_text(&buf, bucket);
prt_str(&buf, "\n ");
bch2_bkey_val_to_text(&buf, c, orig_k);
bch_err(c, "%s", buf.buf);
- return -BCH_ERR_fsck_repair_unimplemented;
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
}
if (bpos_lt(bucket, s->bucket_start) ||
bpos_gt(bucket, s->bucket_end))
- return 0;
+ goto out;
bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
- bucket_pos_to_bp(c, bucket, bp.bucket_offset),
+ bucket_pos_to_bp(ca, bucket, bp.bucket_offset),
0);
ret = bkey_err(bp_k);
if (ret)
@@ -562,6 +590,7 @@ fsck_err:
bch2_trans_iter_exit(trans, &other_extent_iter);
bch2_trans_iter_exit(trans, &bp_iter);
bch2_bkey_buf_exit(&tmp, c);
+ bch2_dev_put(ca);
printbuf_exit(&buf);
return ret;
check_existing_bp:
@@ -637,13 +666,13 @@ missing:
struct bkey_i_backpointer n_bp_k;
bkey_backpointer_init(&n_bp_k.k_i);
- n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+ n_bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset);
n_bp_k.v = bp;
prt_printf(&buf, "\n want: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i));
if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
- ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
+ ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true);
goto out;
}
@@ -667,7 +696,14 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bucket_pos, &bp);
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+ if (ca)
+ bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp);
+ rcu_read_unlock();
+
+ if (!ca)
+ continue;
ret = check_bp_exists(trans, s, bucket_pos, bp, k);
if (ret)
@@ -760,7 +796,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
__for_each_btree_node(trans, iter, btree,
btree == start.btree ? start.pos : POS_MIN,
- 0, depth, BTREE_ITER_PREFETCH, b, ret) {
+ 0, depth, BTREE_ITER_prefetch, b, ret) {
mem_may_pin -= btree_buf_bytes(b);
if (mem_may_pin <= 0) {
c->btree_cache.pinned_nodes_end = *end =
@@ -794,31 +830,13 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
while (level >= depth) {
struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
- level,
- BTREE_ITER_PREFETCH);
- while (1) {
- bch2_trans_begin(trans);
-
- struct bkey_s_c k = bch2_btree_iter_peek(&iter);
- if (!k.k)
- break;
- ret = bkey_err(k) ?:
- check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- ret = 0;
- continue;
- }
- if (ret)
- break;
- if (bpos_eq(iter.pos, SPOS_MAX))
- break;
- bch2_btree_iter_advance(&iter);
- }
- bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level,
+ BTREE_ITER_prefetch);
+ ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ }));
if (ret)
return ret;
@@ -936,7 +954,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct bpos last_flushed_pos = SPOS_MAX;
return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
- POS_MIN, BTREE_ITER_PREFETCH, k,
+ POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_one_backpointer(trans, start, end,
bkey_s_c_to_backpointer(k),
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index c1b274eadda1..6021de1c5e98 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -6,6 +6,7 @@
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
+#include "error.h"
#include "super.h"
static inline u64 swab40(u64 x)
@@ -18,7 +19,7 @@ static inline u64 swab40(u64 x)
}
int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_backpointer_swab(struct bkey_s);
@@ -36,15 +37,29 @@ void bch2_backpointer_swab(struct bkey_s);
* Convert from pos in backpointer btree to pos of corresponding bucket in alloc
* btree:
*/
-static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c,
- struct bpos bp_pos)
+static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode);
u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
}
+static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, bp_pos.inode);
+ if (ca)
+ *bucket = bp_pos_to_bucket(ca, bp_pos);
+ rcu_read_unlock();
+ return ca != NULL;
+}
+
+static inline bool bp_pos_to_bucket_nodev(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
+{
+ return !bch2_fs_inconsistent_on(!bp_pos_to_bucket_nodev_noerror(c, bp_pos, bucket),
+ c, "backpointer for missing device %llu", bp_pos.inode);
+}
+
static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca,
struct bpos bucket,
u64 bucket_offset)
@@ -57,32 +72,32 @@ static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca,
/*
* Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
*/
-static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
+static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca,
struct bpos bucket,
u64 bucket_offset)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset);
- EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret)));
+ EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret)));
return ret;
}
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos bucket,
- struct bch_backpointer, struct bkey_s_c, bool);
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *,
+ struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool);
static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bpos bucket,
struct bch_backpointer bp,
struct bkey_s_c orig_k,
bool insert)
{
if (unlikely(bch2_backpointers_no_use_write_buffer))
- return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert);
+ return bch2_bucket_backpointer_mod_nowritebuffer(trans, ca, bucket, bp, orig_k, insert);
struct bkey_i_backpointer bp_k;
bkey_backpointer_init(&bp_k.k_i);
- bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+ bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset);
bp_k.v = bp;
if (!insert) {
@@ -120,7 +135,7 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
}
}
-static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
+static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
const union bch_extent_entry *entry,
@@ -130,7 +145,7 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
s64 sectors = level ? btree_sectors(c) : k.k->size;
u32 bucket_offset;
- *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset);
+ *bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset);
*bp = (struct bch_backpointer) {
.btree_id = btree_id,
.level = level,
@@ -142,7 +157,7 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
};
}
-int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int,
+int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int,
struct bpos *, struct bch_backpointer *, unsigned);
struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
struct bpos, struct bch_backpointer,
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 91c3c1fef233..bc0ea2c4efef 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -359,6 +359,8 @@ do { \
#define BCH_DEBUG_PARAMS_ALWAYS() \
BCH_DEBUG_PARAM(key_merging_disabled, \
"Disables merging of extents") \
+ BCH_DEBUG_PARAM(btree_node_merging_disabled, \
+ "Disables merging of btree nodes") \
BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
"Causes mark and sweep to compact and rewrite every " \
"btree node it traverses") \
@@ -468,6 +470,7 @@ enum bch_time_stats {
#include "quota_types.h"
#include "rebalance_types.h"
#include "replicas_types.h"
+#include "sb-members_types.h"
#include "subvolume_types.h"
#include "super_types.h"
#include "thread_with_file_types.h"
@@ -516,8 +519,8 @@ enum gc_phase {
struct gc_pos {
enum gc_phase phase;
+ u16 level;
struct bpos pos;
- unsigned level;
};
struct reflink_gc {
@@ -534,7 +537,13 @@ struct io_count {
struct bch_dev {
struct kobject kobj;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ atomic_long_t ref;
+ bool dying;
+ unsigned long last_put;
+#else
struct percpu_ref ref;
+#endif
struct completion ref_completion;
struct percpu_ref io_ref;
struct completion io_ref_completion;
@@ -560,14 +569,11 @@ struct bch_dev {
struct bch_devs_mask self;
- /* biosets used in cloned bios for writing multiple replicas */
- struct bio_set replica_set;
-
/*
* Buckets:
* Per-bucket arrays are protected by c->mark_lock, bucket_lock and
* gc_lock, for device resize - holding any is sufficient for access:
- * Or rcu_read_lock(), but only for ptr_stale():
+ * Or rcu_read_lock(), but only for dev_ptr_stale():
*/
struct bucket_array __rcu *buckets_gc;
struct bucket_gens __rcu *bucket_gens;
@@ -581,7 +587,7 @@ struct bch_dev {
/* Allocator: */
u64 new_fs_bucket_idx;
- u64 alloc_cursor;
+ u64 alloc_cursor[3];
unsigned nr_open_buckets;
unsigned nr_btree_reserve;
@@ -627,12 +633,12 @@ struct bch_dev {
x(clean_shutdown) \
x(fsck_running) \
x(initial_gc_unfixed) \
- x(need_another_gc) \
x(need_delete_dead_snapshots) \
x(error) \
x(topology_error) \
x(errors_fixed) \
- x(errors_not_fixed)
+ x(errors_not_fixed) \
+ x(no_invalid_checks)
enum bch_fs_flags {
#define x(n) BCH_FS_##n,
@@ -715,6 +721,7 @@ struct btree_trans_buf {
x(discard_fast) \
x(invalidate) \
x(delete_dead_snapshots) \
+ x(gc_gens) \
x(snapshot_delete_pagecache) \
x(sysfs) \
x(btree_write_buffer)
@@ -926,7 +933,6 @@ struct bch_fs {
/* JOURNAL SEQ BLACKLIST */
struct journal_seq_blacklist_table *
journal_seq_blacklist_table;
- struct work_struct journal_seq_blacklist_gc_work;
/* ALLOCATOR */
spinlock_t freelist_lock;
@@ -957,8 +963,7 @@ struct bch_fs {
struct work_struct discard_fast_work;
/* GARBAGE COLLECTION */
- struct task_struct *gc_thread;
- atomic_t kick_gc;
+ struct work_struct gc_gens_work;
unsigned long gc_count;
enum btree_id gc_gens_btree;
@@ -988,6 +993,7 @@ struct bch_fs {
struct bio_set bio_read;
struct bio_set bio_read_split;
struct bio_set bio_write;
+ struct bio_set replica_set;
struct mutex bio_bounce_pages_lock;
mempool_t bio_bounce_pages;
struct bucket_nocow_lock_table
@@ -1115,7 +1121,6 @@ struct bch_fs {
u64 counters_on_mount[BCH_COUNTER_NR];
u64 __percpu *counters;
- unsigned btree_gc_periodic:1;
unsigned copy_gc_enabled:1;
bool promote_whole_extents;
@@ -1250,11 +1255,6 @@ static inline s64 bch2_current_time(const struct bch_fs *c)
return timespec_to_bch2_time(c, now);
}
-static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
-{
- return dev < c->sb.nr_devices && c->devs[dev];
-}
-
static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
{
struct stdio_redirect *stdio = c->stdio;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 2e8b1a489c20..1bebba881d89 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -76,6 +76,7 @@
#include <asm/byteorder.h>
#include <linux/kernel.h>
#include <linux/uuid.h>
+#include <uapi/linux/magic.h>
#include "vstructs.h"
#ifdef __KERNEL__
@@ -589,6 +590,13 @@ struct bch_member {
__le64 errors_reset_time;
__le64 seq;
__le64 btree_allocated_bitmap;
+ /*
+ * On recovery from a clean shutdown we don't normally read the journal,
+ * but we still want to resume writing from where we left off so we
+ * don't overwrite more than is necessary, for list journal debugging:
+ */
+ __le32 last_journal_bucket;
+ __le32 last_journal_bucket_offset;
};
/*
@@ -1283,7 +1291,7 @@ enum bch_compression_opts {
UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \
0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
-#define BCACHEFS_STATFS_MAGIC 0xca451a4e
+#define BCACHEFS_STATFS_MAGIC BCACHEFS_SUPER_MAGIC
#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL)
#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL)
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index 76e79a15ba08..f46978e5cb7c 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -640,7 +640,7 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
int bch2_bkey_format_invalid(struct bch_fs *c,
struct bkey_format *f,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
unsigned i, bits = KEY_PACKED_BITS_START;
@@ -656,20 +656,17 @@ int bch2_bkey_format_invalid(struct bch_fs *c,
* unpacked format:
*/
for (i = 0; i < f->nr_fields; i++) {
- if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) {
+ if ((!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) &&
+ bch2_bkey_format_field_overflows(f, i)) {
unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
u64 packed_max = f->bits_per_field[i]
? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
: 0;
- u64 field_offset = le64_to_cpu(f->field_offset[i]);
- if (packed_max + field_offset < packed_max ||
- packed_max + field_offset > unpacked_max) {
- prt_printf(err, "field %u too large: %llu + %llu > %llu",
- i, packed_max, field_offset, unpacked_max);
- return -BCH_ERR_invalid;
- }
+ prt_printf(err, "field %u too large: %llu + %llu > %llu",
+ i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max);
+ return -BCH_ERR_invalid;
}
bits += f->bits_per_field[i];
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 3a45d128f608..fcd43915df07 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -9,10 +9,10 @@
#include "util.h"
#include "vstructs.h"
-enum bkey_invalid_flags {
- BKEY_INVALID_WRITE = (1U << 0),
- BKEY_INVALID_COMMIT = (1U << 1),
- BKEY_INVALID_JOURNAL = (1U << 2),
+enum bch_validate_flags {
+ BCH_VALIDATE_write = (1U << 0),
+ BCH_VALIDATE_commit = (1U << 1),
+ BCH_VALIDATE_journal = (1U << 2),
};
#if 0
@@ -574,8 +574,31 @@ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const s
void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
+
+static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i)
+{
+ unsigned f_bits = f->bits_per_field[i];
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (f_bits > unpacked_bits)
+ return true;
+
+ if ((f_bits == unpacked_bits) && field_offset)
+ return true;
+
+ u64 f_mask = f_bits
+ ? ~((~0ULL << (f_bits - 1)) << 1)
+ : 0;
+
+ if (((field_offset + f_mask) & unpacked_mask) < field_offset)
+ return true;
+ return false;
+}
+
int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
#endif /* _BCACHEFS_BKEY_H */
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index a275a9e8e341..c2c3dae52186 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -27,7 +27,7 @@ const char * const bch2_bkey_types[] = {
};
static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
return 0;
}
@@ -41,7 +41,7 @@ static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k,
})
static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
int ret = 0;
@@ -58,7 +58,7 @@ fsck_err:
})
static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
return 0;
}
@@ -82,7 +82,7 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
})
static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
return 0;
}
@@ -123,9 +123,12 @@ const struct bkey_ops bch2_bkey_null_ops = {
};
int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
+ if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
+ return 0;
+
const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
int ret = 0;
@@ -159,9 +162,12 @@ const char *bch2_btree_node_type_str(enum btree_node_type type)
int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
+ if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
+ return 0;
+
int ret = 0;
bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err,
@@ -172,7 +178,7 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
return 0;
bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
- (type == BKEY_TYPE_btree || (flags & BKEY_INVALID_COMMIT)) &&
+ (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) &&
!(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
bkey_invalid_type_for_btree,
"invalid key type for btree %s (%s)",
@@ -224,7 +230,7 @@ fsck_err:
int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
return __bch2_bkey_invalid(c, k, type, flags, err) ?:
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 03efe8ee565a..726ef7483763 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -22,14 +22,15 @@ extern const struct bkey_ops bch2_bkey_null_ops;
*/
struct bkey_ops {
int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err);
+ enum bch_validate_flags flags, struct printbuf *err);
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
void (*swab)(struct bkey_s);
bool (*key_normalize)(struct bch_fs *, struct bkey_s);
bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
int (*trigger)(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
void (*compat)(enum btree_id id, unsigned version,
unsigned big_endian, int write,
struct bkey_s);
@@ -48,11 +49,11 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type)
}
int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *,
struct bkey_s_c, struct printbuf *);
@@ -76,56 +77,10 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b
bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-enum btree_update_flags {
- __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
- __BTREE_UPDATE_NOJOURNAL,
- __BTREE_UPDATE_KEY_CACHE_RECLAIM,
-
- __BTREE_TRIGGER_NORUN,
- __BTREE_TRIGGER_TRANSACTIONAL,
- __BTREE_TRIGGER_ATOMIC,
- __BTREE_TRIGGER_GC,
- __BTREE_TRIGGER_INSERT,
- __BTREE_TRIGGER_OVERWRITE,
- __BTREE_TRIGGER_BUCKET_INVALIDATE,
-};
-
-#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
-#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL)
-#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
-
-/* Don't run triggers at all */
-#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
-
-/*
- * If set, we're running transactional triggers as part of a transaction commit:
- * triggers may generate new updates
- *
- * If cleared, and either BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE are set,
- * we're running atomic triggers during a transaction commit: we have our
- * journal reservation, we're holding btree node write locks, and we know the
- * transaction is going to commit (returning an error here is a fatal error,
- * causing us to go emergency read-only)
- */
-#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL)
-#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC)
-
-/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */
-#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
-
-/* @new is entering the btree */
-#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
-
-/* @old is leaving the btree */
-#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
-
-/* signal from bucket invalidate path to alloc trigger */
-#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
-
static inline int bch2_key_trigger(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
@@ -135,8 +90,9 @@ static inline int bch2_key_trigger(struct btree_trans *trans,
}
static inline int bch2_key_trigger_old(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, unsigned flags)
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_i deleted;
@@ -144,12 +100,13 @@ static inline int bch2_key_trigger_old(struct btree_trans *trans,
deleted.k.p = old.k->p;
return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted),
- BTREE_TRIGGER_OVERWRITE|flags);
+ BTREE_TRIGGER_overwrite|flags);
}
static inline int bch2_key_trigger_new(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s new, unsigned flags)
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s new,
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_i deleted;
@@ -157,7 +114,7 @@ static inline int bch2_key_trigger_new(struct btree_trans *trans,
deleted.k.p = new.k->p;
return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
- BTREE_TRIGGER_INSERT|flags);
+ BTREE_TRIGGER_insert|flags);
}
void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
index bcca9e76a0b4..4536eb50fc40 100644
--- a/fs/bcachefs/bkey_sort.c
+++ b/fs/bcachefs/bkey_sort.c
@@ -6,9 +6,9 @@
#include "bset.h"
#include "extents.h"
-typedef int (*sort_cmp_fn)(struct btree *,
- struct bkey_packed *,
- struct bkey_packed *);
+typedef int (*sort_cmp_fn)(const struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
static inline bool sort_iter_end(struct sort_iter *iter)
{
@@ -70,9 +70,9 @@ static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
/*
* If keys compare equal, compare by pointer order:
*/
-static inline int key_sort_fix_overlapping_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
+static inline int key_sort_fix_overlapping_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
{
return bch2_bkey_cmp_packed(b, l, r) ?:
cmp_int((unsigned long) l, (unsigned long) r);
@@ -154,46 +154,59 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
return nr;
}
-static inline int sort_keys_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
+static inline int keep_unwritten_whiteouts_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
{
return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
(int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
- (int) l->needs_whiteout - (int) r->needs_whiteout;
+ (long) l - (long) r;
}
-unsigned bch2_sort_keys(struct bkey_packed *dst,
- struct sort_iter *iter,
- bool filter_whiteouts)
+#include "btree_update_interior.h"
+
+/*
+ * For sorting in the btree node write path: whiteouts not in the unwritten
+ * whiteouts area are dropped, whiteouts in the unwritten whiteouts area are
+ * dropped if overwritten by real keys:
+ */
+unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter)
{
- const struct bkey_format *f = &iter->b->format;
struct bkey_packed *in, *next, *out = dst;
- sort_iter_sort(iter, sort_keys_cmp);
+ sort_iter_sort(iter, keep_unwritten_whiteouts_cmp);
- while ((in = sort_iter_next(iter, sort_keys_cmp))) {
- bool needs_whiteout = false;
+ while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) {
+ if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b))
+ continue;
- if (bkey_deleted(in) &&
- (filter_whiteouts || !in->needs_whiteout))
+ if ((next = sort_iter_peek(iter)) &&
+ !bch2_bkey_cmp_packed_inlined(iter->b, in, next))
continue;
- while ((next = sort_iter_peek(iter)) &&
- !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) {
- BUG_ON(in->needs_whiteout &&
- next->needs_whiteout);
- needs_whiteout |= in->needs_whiteout;
- in = sort_iter_next(iter, sort_keys_cmp);
- }
+ bkey_p_copy(out, in);
+ out = bkey_p_next(out);
+ }
- if (bkey_deleted(in)) {
- memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in));
- set_bkeyp_val_u64s(f, out, 0);
- } else {
- bkey_p_copy(out, in);
- }
- out->needs_whiteout |= needs_whiteout;
+ return (u64 *) out - (u64 *) dst;
+}
+
+/*
+ * Main sort routine for compacting a btree node in memory: we always drop
+ * whiteouts because any whiteouts that need to be written are in the unwritten
+ * whiteouts area:
+ */
+unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter)
+{
+ struct bkey_packed *in, *out = dst;
+
+ sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined);
+
+ while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) {
+ if (bkey_deleted(in))
+ continue;
+
+ bkey_p_copy(out, in);
out = bkey_p_next(out);
}
diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h
index 7c0f0b160f18..9be969d46890 100644
--- a/fs/bcachefs/bkey_sort.h
+++ b/fs/bcachefs/bkey_sort.h
@@ -48,7 +48,7 @@ bch2_sort_repack(struct bset *, struct btree *,
struct btree_node_iter *,
struct bkey_format *, bool);
-unsigned bch2_sort_keys(struct bkey_packed *,
- struct sort_iter *, bool);
+unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *);
+unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *);
#endif /* _BCACHEFS_BKEY_SORT_H */
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 3bb477840eab..575e1d0b6eeb 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -103,8 +103,6 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
{
- struct bset_tree *t;
-
console_lock();
for_each_bset(b, t)
bch2_dump_bset(c, b, bset(b, t), t - b->set);
@@ -136,7 +134,6 @@ void bch2_dump_btree_node_iter(struct btree *b,
struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
{
- struct bset_tree *t;
struct bkey_packed *k;
struct btree_nr_keys nr = {};
@@ -198,7 +195,6 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
{
struct btree_node_iter_set *set, *s2;
struct bkey_packed *k, *p;
- struct bset_tree *t;
if (bch2_btree_node_iter_end(iter))
return;
@@ -213,12 +209,14 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
/* Verify that set->end is correct: */
btree_node_iter_for_each(iter, set) {
for_each_bset(b, t)
- if (set->end == t->end_offset)
+ if (set->end == t->end_offset) {
+ BUG_ON(set->k < btree_bkey_first_offset(t) ||
+ set->k >= t->end_offset);
goto found;
+ }
BUG();
found:
- BUG_ON(set->k < btree_bkey_first_offset(t) ||
- set->k >= t->end_offset);
+ do {} while (0);
}
/* Verify iterator is sorted: */
@@ -377,11 +375,9 @@ static struct bkey_float *bkey_float(const struct btree *b,
return ro_aux_tree_base(b, t)->f + idx;
}
-static void bset_aux_tree_verify(const struct btree *b)
+static void bset_aux_tree_verify(struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
- const struct bset_tree *t;
-
for_each_bset(b, t) {
if (t->aux_data_offset == U16_MAX)
continue;
@@ -685,20 +681,20 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
}
/* bytes remaining - only valid for last bset: */
-static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
+static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t)
{
bset_aux_tree_verify(b);
return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
}
-static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t)
+static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t)
{
return __bset_tree_capacity(b, t) /
(sizeof(struct bkey_float) + sizeof(u8));
}
-static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t)
+static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t)
{
return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
}
@@ -1374,8 +1370,6 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
struct btree *b)
{
- struct bset_tree *t;
-
memset(iter, 0, sizeof(*iter));
for_each_bset(b, t)
@@ -1481,7 +1475,6 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
{
struct bkey_packed *k, *prev = NULL;
struct btree_node_iter_set *set;
- struct bset_tree *t;
unsigned end = 0;
if (bch2_expensive_debug_checks)
@@ -1550,9 +1543,7 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
{
- const struct bset_tree *t;
-
- for_each_bset(b, t) {
+ for_each_bset_c(b, t) {
enum bset_aux_tree_type type = bset_aux_tree_type(t);
size_t j;
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 120a79fd456b..5c6c7a14fa0f 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -206,7 +206,10 @@ static inline size_t btree_aux_data_u64s(const struct btree *b)
}
#define for_each_bset(_b, _t) \
- for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
+ for (struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
+
+#define for_each_bset_c(_b, _t) \
+ for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
#define bset_tree_for_each_key(_b, _t, _k) \
for (_k = btree_bkey_first(_b, _t); \
@@ -294,7 +297,6 @@ static inline struct bset_tree *
bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k)
{
unsigned offset = __btree_node_key_to_offset(b, k);
- struct bset_tree *t;
for_each_bset(b, t)
if (offset <= t->end_offset) {
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 02c70e813fac..9e4ed75d3675 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -16,6 +16,12 @@
#include <linux/prefetch.h>
#include <linux/sched/mm.h>
+#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
+do { \
+ if (shrinker_counter) \
+ bc->not_freed_##counter++; \
+} while (0)
+
const char * const bch2_btree_node_flags[] = {
#define x(f) #f,
BTREE_FLAGS()
@@ -162,6 +168,9 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
/* Cause future lookups for this node to fail: */
b->hash_val = 0;
+
+ if (b->c.btree_id < BTREE_ID_NR)
+ --bc->used_by_btree[b->c.btree_id];
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
@@ -169,8 +178,11 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
BUG_ON(b->hash_val);
b->hash_val = btree_ptr_hash_val(&b->key);
- return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
- bch_btree_cache_params);
+ int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
+ bch_btree_cache_params);
+ if (!ret && b->c.btree_id < BTREE_ID_NR)
+ bc->used_by_btree[b->c.btree_id]++;
+ return ret;
}
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
@@ -190,6 +202,35 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
return ret;
}
+void bch2_btree_node_update_key_early(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b;
+ struct bkey_buf tmp;
+ int ret;
+
+ bch2_bkey_buf_init(&tmp);
+ bch2_bkey_buf_reassemble(&tmp, c, old);
+
+ b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
+ if (!IS_ERR_OR_NULL(b)) {
+ mutex_lock(&c->btree_cache.lock);
+
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, new);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+
+ mutex_unlock(&c->btree_cache.lock);
+ six_unlock_read(&b->c.lock);
+ }
+
+ bch2_bkey_buf_exit(&tmp, c);
+}
+
__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
const struct bkey_i *k)
@@ -203,7 +244,7 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc,
* this version is for btree nodes that have already been freed (we're not
* reaping a real btree node)
*/
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
struct btree_cache *bc = &c->btree_cache;
int ret = 0;
@@ -225,38 +266,64 @@ wait_on_io:
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
- if (!flush)
+ if (!flush) {
+ if (btree_node_dirty(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
+ else if (btree_node_read_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ else if (btree_node_write_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ }
/* XXX: waiting on IO with btree cache lock held */
bch2_btree_node_wait_on_read(b);
bch2_btree_node_wait_on_write(b);
}
- if (!six_trylock_intent(&b->c.lock))
+ if (!six_trylock_intent(&b->c.lock)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ }
- if (!six_trylock_write(&b->c.lock))
+ if (!six_trylock_write(&b->c.lock)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
goto out_unlock_intent;
+ }
/* recheck under lock */
if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
- if (!flush)
+ if (!flush) {
+ if (btree_node_read_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ else if (btree_node_write_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
goto out_unlock;
+ }
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
goto wait_on_io;
}
- if (btree_node_noevict(b) ||
- btree_node_write_blocked(b) ||
- btree_node_will_make_reachable(b))
+ if (btree_node_noevict(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
+ goto out_unlock;
+ }
+ if (btree_node_write_blocked(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
goto out_unlock;
+ }
+ if (btree_node_will_make_reachable(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
+ goto out_unlock;
+ }
if (btree_node_dirty(b)) {
- if (!flush)
+ if (!flush) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
goto out_unlock;
+ }
/*
* Using the underscore version because we don't want to compact
* bsets after the write, since this node is about to be evicted
@@ -286,14 +353,14 @@ out_unlock_intent:
goto out;
}
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
- return __btree_node_reclaim(c, b, false);
+ return __btree_node_reclaim(c, b, false, shrinker_counter);
}
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
- return __btree_node_reclaim(c, b, true);
+ return __btree_node_reclaim(c, b, true, false);
}
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
@@ -341,11 +408,12 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
if (touched >= nr)
goto out;
- if (!btree_node_reclaim(c, b)) {
+ if (!btree_node_reclaim(c, b, true)) {
btree_node_data_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
freed++;
+ bc->freed++;
}
}
restart:
@@ -354,9 +422,11 @@ restart:
if (btree_node_accessed(b)) {
clear_btree_node_accessed(b);
- } else if (!btree_node_reclaim(c, b)) {
+ bc->not_freed_access_bit++;
+ } else if (!btree_node_reclaim(c, b, true)) {
freed++;
btree_node_data_free(c, b);
+ bc->freed++;
bch2_btree_node_hash_remove(bc, b);
six_unlock_write(&b->c.lock);
@@ -564,7 +634,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
struct btree *b;
list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_reclaim(c, b))
+ if (!btree_node_reclaim(c, b, false))
return b;
while (1) {
@@ -600,7 +670,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
* disk node. Check the freed list before allocating a new one:
*/
list_for_each_entry(b, freed, list)
- if (!btree_node_reclaim(c, b)) {
+ if (!btree_node_reclaim(c, b, false)) {
list_del_init(&b->list);
goto got_node;
}
@@ -626,7 +696,7 @@ got_node:
* the list. Check if there's any freed nodes there:
*/
list_for_each_entry(b2, &bc->freeable, list)
- if (!btree_node_reclaim(c, b2)) {
+ if (!btree_node_reclaim(c, b2, false)) {
swap(b->data, b2->data);
swap(b->aux_data, b2->aux_data);
btree_node_to_freedlist(bc, b2);
@@ -846,7 +916,6 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- struct bset_tree *t;
bool need_relock = false;
int ret;
@@ -966,7 +1035,6 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
{
struct bch_fs *c = trans->c;
struct btree *b;
- struct bset_tree *t;
int ret;
EBUG_ON(level >= BTREE_MAX_DEPTH);
@@ -1043,7 +1111,6 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- struct bset_tree *t;
int ret;
EBUG_ON(level >= BTREE_MAX_DEPTH);
@@ -1240,9 +1307,39 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
stats.failed);
}
-void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
+static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
+ const char *label, unsigned nr)
{
- prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
- prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
- prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
+ prt_printf(out, "%s\t", label);
+ prt_human_readable_u64(out, nr * c->opts.btree_node_size);
+ prt_printf(out, " (%u)\n", nr);
+}
+
+void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
+
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ prt_btree_cache_line(out, c, "total:", bc->used);
+ prt_btree_cache_line(out, c, "nr dirty:", atomic_read(&bc->dirty));
+ prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
+ prt_newline(out);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++)
+ prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]);
+
+ prt_newline(out);
+ prt_printf(out, "freed:\t%u\n", bc->freed);
+ prt_printf(out, "not freed:\n");
+ prt_printf(out, " dirty\t%u\n", bc->not_freed_dirty);
+ prt_printf(out, " write in flight\t%u\n", bc->not_freed_write_in_flight);
+ prt_printf(out, " read in flight\t%u\n", bc->not_freed_read_in_flight);
+ prt_printf(out, " lock intent failed\t%u\n", bc->not_freed_lock_intent);
+ prt_printf(out, " lock write failed\t%u\n", bc->not_freed_lock_write);
+ prt_printf(out, " access bit\t%u\n", bc->not_freed_access_bit);
+ prt_printf(out, " no evict failed\t%u\n", bc->not_freed_noevict);
+ prt_printf(out, " write blocked\t%u\n", bc->not_freed_write_blocked);
+ prt_printf(out, " will make reachable\t%u\n", bc->not_freed_will_make_reachable);
}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 6d33885fdbde..fed35de3e4de 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -17,6 +17,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
+void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *);
+
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
@@ -131,6 +134,6 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
const char *bch2_btree_id_str(enum btree_id);
void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
-void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *);
+void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 791470b0c654..8035c8b797ab 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -52,12 +52,6 @@ static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
}}};
}
-static bool should_restart_for_topology_repair(struct bch_fs *c)
-{
- return c->opts.fix_errors != FSCK_FIX_no &&
- !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
-}
-
static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
preempt_disable();
@@ -69,7 +63,7 @@ static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
- BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
+ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
__gc_pos_set(c, new_pos);
}
@@ -97,35 +91,6 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
}
}
-static void bch2_btree_node_update_key_early(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_i *new)
-{
- struct bch_fs *c = trans->c;
- struct btree *b;
- struct bkey_buf tmp;
- int ret;
-
- bch2_bkey_buf_init(&tmp);
- bch2_bkey_buf_reassemble(&tmp, c, old);
-
- b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
- if (!IS_ERR_OR_NULL(b)) {
- mutex_lock(&c->btree_cache.lock);
-
- bch2_btree_node_hash_remove(&c->btree_cache, b);
-
- bkey_copy(&b->key, new);
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
-
- mutex_unlock(&c->btree_cache.lock);
- six_unlock_read(&b->c.lock);
- }
-
- bch2_bkey_buf_exit(&tmp, c);
-}
-
static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
{
struct bkey_i_btree_ptr_v2 *new;
@@ -546,9 +511,9 @@ reconstruct_root:
if (!bch2_btree_has_scanned_nodes(c, i)) {
mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing,
"no nodes found for btree %s, continue?", bch2_btree_id_str(i));
- bch2_btree_root_alloc_fake(c, i, 0);
+ bch2_btree_root_alloc_fake_trans(trans, i, 0);
} else {
- bch2_btree_root_alloc_fake(c, i, 1);
+ bch2_btree_root_alloc_fake_trans(trans, i, 1);
bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
if (ret)
@@ -576,7 +541,7 @@ reconstruct_root:
goto reconstruct_root;
bch_err(c, "empty btree root %s", bch2_btree_id_str(i));
- bch2_btree_root_alloc_fake(c, i, 0);
+ bch2_btree_root_alloc_fake_trans(trans, i, 0);
r->alive = false;
ret = 0;
}
@@ -586,495 +551,123 @@ fsck_err:
return ret;
}
-static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
- unsigned level, bool is_root,
- struct bkey_s_c *k)
+/* marking of btree keys/nodes: */
+
+static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
+ unsigned level, struct btree **prev,
+ struct btree_iter *iter, struct bkey_s_c k,
+ bool initial)
{
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k);
- const union bch_extent_entry *entry_c;
- struct extent_ptr_decoded p = { 0 };
- bool do_update = false;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- /*
- * XXX
- * use check_bucket_ref here
- */
- bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, p, entry_c);
-
- if (fsck_err_on(!g->gen_valid,
- c, ptr_to_missing_alloc_key,
- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
- if (!p.ptr.cached) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- } else {
- do_update = true;
- }
- }
-
- if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
- c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
- if (!p.ptr.cached) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- g->data_type = 0;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- set_bit(BCH_FS_need_another_gc, &c->flags);
- } else {
- do_update = true;
- }
- }
-
- if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
- c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
- do_update = true;
-
- if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
- c, stale_dirty_ptr,
- "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
- do_update = true;
-
- if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
- continue;
-
- if (fsck_err_on(bucket_data_type(g->data_type) &&
- bucket_data_type(g->data_type) !=
- bucket_data_type(data_type), c,
- ptr_bucket_data_type_mismatch,
- "bucket %u:%zu different types of data in same bucket: %s, %s\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
- if (data_type == BCH_DATA_btree) {
- g->data_type = data_type;
- set_bit(BCH_FS_need_another_gc, &c->flags);
- } else {
- do_update = true;
- }
- }
-
- if (p.has_ec) {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
-
- if (fsck_err_on(!m || !m->alive, c,
- ptr_to_missing_stripe,
- "pointer to nonexistent stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
- do_update = true;
-
- if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
- ptr_to_incorrect_stripe,
- "pointer does not match stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
- do_update = true;
- }
- }
-
- if (do_update) {
- if (is_root) {
- bch_err(c, "cannot update btree roots yet");
- ret = -EINVAL;
- goto err;
- }
-
- struct bkey_i *new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
- if (!new) {
- ret = -BCH_ERR_ENOMEM_gc_repair_key;
- bch_err_msg(c, ret, "allocating new key");
- goto err;
- }
-
- bkey_reassemble(new, *k);
-
- if (level) {
- /*
- * We don't want to drop btree node pointers - if the
- * btree node isn't there anymore, the read path will
- * sort it out:
- */
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_GC_BUCKET(ca, ptr);
-
- ptr->gen = g->gen;
- }
- } else {
- struct bkey_ptrs ptrs;
- union bch_extent_entry *entry;
-restart_drop_ptrs:
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);
-
- if ((p.ptr.cached &&
- (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
- (!p.ptr.cached &&
- gen_cmp(p.ptr.gen, g->gen) < 0) ||
- gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
- (g->data_type &&
- g->data_type != data_type)) {
- bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
- goto restart_drop_ptrs;
- }
- }
-again:
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_extent_entry_for_each(ptrs, entry) {
- if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
- entry->stripe_ptr.idx);
- union bch_extent_entry *next_ptr;
-
- bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
- if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
- goto found;
- next_ptr = NULL;
-found:
- if (!next_ptr) {
- bch_err(c, "aieee, found stripe ptr with no data ptr");
- continue;
- }
-
- if (!m || !m->alive ||
- !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
- &next_ptr->ptr,
- m->sectors)) {
- bch2_bkey_extent_entry_drop(new, entry);
- goto again;
- }
- }
- }
- }
- if (level)
- bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
+ if (iter) {
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct btree *b = path_l(path)->b;
- if (0) {
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, *k);
- bch_info(c, "updated %s", buf.buf);
-
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
- bch_info(c, "new key %s", buf.buf);
- }
-
- ret = bch2_journal_key_insert_take(c, btree_id, level, new);
- if (ret) {
- kfree(new);
- goto err;
+ if (*prev != b) {
+ int ret = bch2_btree_node_check_topology(trans, b);
+ if (ret)
+ return ret;
}
-
- *k = bkey_i_to_s_c(new);
+ *prev = b;
}
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-/* marking of btree keys/nodes: */
-
-static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
- unsigned level, bool is_root,
- struct bkey_s_c *k,
- bool initial)
-{
- struct bch_fs *c = trans->c;
struct bkey deleted = KEY(0, 0, 0);
struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
struct printbuf buf = PRINTBUF;
int ret = 0;
- deleted.p = k->k->p;
+ deleted.p = k.k->p;
if (initial) {
BUG_ON(bch2_journal_seq_verify &&
- k->k->version.lo > atomic64_read(&c->journal.seq));
+ k.k->version.lo > atomic64_read(&c->journal.seq));
- if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
+ if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
bkey_version_in_future,
"key version number higher than recorded: %llu > %llu",
- k->k->version.lo,
+ k.k->version.lo,
atomic64_read(&c->key_version)))
- atomic64_set(&c->key_version, k->k->version.lo);
+ atomic64_set(&c->key_version, k.k->version.lo);
}
- ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
- if (ret)
- goto err;
-
- if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, *k),
+ if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
c, btree_bitmap_not_marked,
"btree ptr not marked in member info btree allocated bitmap\n %s",
- (bch2_bkey_val_to_text(&buf, c, *k),
+ (bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
mutex_lock(&c->sb_lock);
- bch2_dev_btree_bitmap_mark(c, *k);
+ bch2_dev_btree_bitmap_mark(c, k);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_key_trigger(trans, btree_id, level, old,
- unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
-fsck_err:
-err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
-{
- struct btree_node_iter iter;
- struct bkey unpacked;
- struct bkey_s_c k;
- int ret = 0;
+ /*
+ * We require a commit before key_trigger() because
+ * key_trigger(BTREE_TRIGGER_GC) is not idempotant; we'll calculate the
+ * wrong result if we run it multiple times.
+ */
+ unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;
- ret = bch2_btree_node_check_topology(trans, b);
+ ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
+ BTREE_TRIGGER_check_repair|flags);
if (ret)
- return ret;
-
- if (!btree_node_type_needs_gc(btree_node_type(b)))
- return 0;
-
- bch2_btree_node_iter_init_from_start(&iter, b);
-
- while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
- &k, initial);
- if (ret)
- return ret;
+ goto out;
- bch2_btree_node_iter_advance(&iter, b);
+ if (trans->nr_updates) {
+ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
}
- return 0;
+ ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
+ BTREE_TRIGGER_gc|flags);
+out:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
}
-static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
- bool initial, bool metadata_only)
+static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct btree *b;
- unsigned depth = metadata_only ? 1 : 0;
+ int level = 0, target_depth = btree_node_type_needs_gc(__btree_node_type(0, btree)) ? 0 : 1;
int ret = 0;
- gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
-
- __for_each_btree_node(trans, iter, btree_id, POS_MIN,
- 0, depth, BTREE_ITER_PREFETCH, b, ret) {
- bch2_verify_btree_nr_keys(b);
-
- gc_pos_set(c, gc_pos_btree_node(b));
-
- ret = btree_gc_mark_node(trans, b, initial);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- return ret;
+ /* We need to make sure every leaf node is readable before going RW */
+ if (initial)
+ target_depth = 0;
+ /* root */
mutex_lock(&c->btree_root_lock);
- b = bch2_btree_id_root(c, btree_id)->b;
+ struct btree *b = bch2_btree_id_root(c, btree)->b;
if (!btree_node_fake(b)) {
- struct bkey_s_c k = bkey_i_to_s_c(&b->key);
-
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
- true, &k, initial);
+ gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
+ ret = lockrestart_do(trans,
+ bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
+ NULL, NULL, bkey_i_to_s_c(&b->key), initial));
+ level = b->c.level;
}
- gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
mutex_unlock(&c->btree_root_lock);
- return ret;
-}
-
-static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b,
- unsigned target_depth)
-{
- struct bch_fs *c = trans->c;
- struct btree_and_journal_iter iter;
- struct bkey_s_c k;
- struct bkey_buf cur;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = bch2_btree_node_check_topology(trans, b);
if (ret)
return ret;
- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
- bch2_bkey_buf_init(&cur);
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- BUG_ON(bpos_lt(k.k->p, b->data->min_key));
- BUG_ON(bpos_gt(k.k->p, b->data->max_key));
+ for (; level >= target_depth; --level) {
+ struct btree *prev = NULL;
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
+ BTREE_ITER_prefetch);
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
- false, &k, true);
+ ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
+ bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
+ }));
if (ret)
- goto fsck_err;
-
- bch2_btree_and_journal_iter_advance(&iter);
- }
-
- if (b->c.level > target_depth) {
- bch2_btree_and_journal_iter_exit(&iter);
- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
- iter.prefetch = true;
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- struct btree *child;
-
- bch2_bkey_buf_reassemble(&cur, c, k);
- bch2_btree_and_journal_iter_advance(&iter);
-
- child = bch2_btree_node_get_noiter(trans, cur.k,
- b->c.btree_id, b->c.level - 1,
- false);
- ret = PTR_ERR_OR_ZERO(child);
-
- if (bch2_err_matches(ret, EIO)) {
- bch2_topology_error(c);
-
- if (__fsck_err(c,
- FSCK_CAN_FIX|
- FSCK_CAN_IGNORE|
- FSCK_NO_RATELIMIT,
- btree_node_read_error,
- "Unreadable btree node at btree %s level %u:\n"
- " %s",
- bch2_btree_id_str(b->c.btree_id),
- b->c.level - 1,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
- should_restart_for_topology_repair(c)) {
- bch_info(c, "Halting mark and sweep to start topology repair pass");
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
- goto fsck_err;
- } else {
- /* Continue marking when opted to not
- * fix the error: */
- ret = 0;
- set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
- continue;
- }
- } else if (ret) {
- bch_err_msg(c, ret, "getting btree node");
- break;
- }
-
- ret = bch2_gc_btree_init_recurse(trans, child,
- target_depth);
- six_unlock_read(&child->c.lock);
-
- if (ret)
- break;
- }
- }
-fsck_err:
- bch2_bkey_buf_exit(&cur, c);
- bch2_btree_and_journal_iter_exit(&iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int bch2_gc_btree_init(struct btree_trans *trans,
- enum btree_id btree_id,
- bool metadata_only)
-{
- struct bch_fs *c = trans->c;
- struct btree *b;
- unsigned target_depth = metadata_only ? 1 : 0;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- b = bch2_btree_id_root(c, btree_id)->b;
-
- six_lock_read(&b->c.lock, NULL, NULL);
- printbuf_reset(&buf);
- bch2_bpos_to_text(&buf, b->data->min_key);
- if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
- btree_root_bad_min_key,
- "btree root with incorrect min_key: %s", buf.buf)) {
- bch_err(c, "repair unimplemented");
- ret = -BCH_ERR_fsck_repair_unimplemented;
- goto fsck_err;
- }
-
- printbuf_reset(&buf);
- bch2_bpos_to_text(&buf, b->data->max_key);
- if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
- btree_root_bad_max_key,
- "btree root with incorrect max_key: %s", buf.buf)) {
- bch_err(c, "repair unimplemented");
- ret = -BCH_ERR_fsck_repair_unimplemented;
- goto fsck_err;
- }
-
- if (b->c.level >= target_depth)
- ret = bch2_gc_btree_init_recurse(trans, b, target_depth);
-
- if (!ret) {
- struct bkey_s_c k = bkey_i_to_s_c(&b->key);
-
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true,
- &k, true);
+ break;
}
-fsck_err:
- six_unlock_read(&b->c.lock);
- bch_err_fn(c, ret);
- printbuf_exit(&buf);
return ret;
}
@@ -1084,7 +677,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
(int) btree_id_to_gc_phase(r);
}
-static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
+static int bch2_gc_btrees(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
enum btree_id ids[BTREE_ID_NR];
@@ -1095,98 +688,36 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
ids[i] = i;
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
- for (i = 0; i < BTREE_ID_NR && !ret; i++)
- ret = initial
- ? bch2_gc_btree_init(trans, ids[i], metadata_only)
- : bch2_gc_btree(trans, ids[i], initial, metadata_only);
+ for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
- for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) {
- if (!bch2_btree_id_root(c, i)->alive)
+ if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
continue;
- ret = initial
- ? bch2_gc_btree_init(trans, i, metadata_only)
- : bch2_gc_btree(trans, i, initial, metadata_only);
- }
+ ret = bch2_gc_btree(trans, btree, true);
+ if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
+ c, btree_node_read_error,
+ "btree node read error for %s",
+ bch2_btree_id_str(btree)))
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
+ }
+fsck_err:
bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
}
-static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
- u64 start, u64 end,
- enum bch_data_type type,
- unsigned flags)
-{
- u64 b = sector_to_bucket(ca, start);
-
- do {
- unsigned sectors =
- min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
-
- bch2_mark_metadata_bucket(c, ca, b, type, sectors,
- gc_phase(GC_PHASE_SB), flags);
- b++;
- start += sectors;
- } while (start < end);
-}
-
-static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
- unsigned flags)
-{
- struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
- unsigned i;
- u64 b;
-
- for (i = 0; i < layout->nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout->sb_offset[i]);
-
- if (offset == BCH_SB_SECTOR)
- mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
- BCH_DATA_sb, flags);
-
- mark_metadata_sectors(c, ca, offset,
- offset + (1 << layout->sb_max_size_bits),
- BCH_DATA_sb, flags);
- }
-
- for (i = 0; i < ca->journal.nr; i++) {
- b = ca->journal.buckets[i];
- bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB), flags);
- }
-}
-
-static void bch2_mark_superblocks(struct bch_fs *c)
+static int bch2_mark_superblocks(struct bch_fs *c)
{
mutex_lock(&c->sb_lock);
gc_pos_set(c, gc_phase(GC_PHASE_SB));
- for_each_online_member(c, ca)
- bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
+ int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
mutex_unlock(&c->sb_lock);
+ return ret;
}
-#if 0
-/* Also see bch2_pending_btree_node_free_insert_done() */
-static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
-{
- struct btree_update *as;
- struct pending_btree_node_free *d;
-
- mutex_lock(&c->btree_interior_update_lock);
- gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
-
- for_each_pending_btree_node_free(c, as, d)
- if (d->index_update_done)
- bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC);
-
- mutex_unlock(&c->btree_interior_update_lock);
-}
-#endif
-
static void bch2_gc_free(struct bch_fs *c)
{
genradix_free(&c->reflink_gc_table);
@@ -1204,28 +735,23 @@ static void bch2_gc_free(struct bch_fs *c)
c->usage_gc = NULL;
}
-static int bch2_gc_done(struct bch_fs *c,
- bool initial, bool metadata_only)
+static int bch2_gc_done(struct bch_fs *c)
{
struct bch_dev *ca = NULL;
struct printbuf buf = PRINTBUF;
- bool verify = !metadata_only &&
- !c->opts.reconstruct_alloc &&
- (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
unsigned i;
int ret = 0;
percpu_down_write(&c->mark_lock);
-#define copy_field(_err, _f, _msg, ...) \
- if (dst->_f != src->_f && \
- (!verify || \
- fsck_err(c, _err, _msg ": got %llu, should be %llu" \
- , ##__VA_ARGS__, dst->_f, src->_f))) \
+#define copy_field(_err, _f, _msg, ...) \
+ if (fsck_err_on(dst->_f != src->_f, c, _err, \
+ _msg ": got %llu, should be %llu" , ##__VA_ARGS__, \
+ dst->_f, src->_f)) \
dst->_f = src->_f
-#define copy_dev_field(_err, _f, _msg, ...) \
+#define copy_dev_field(_err, _f, _msg, ...) \
copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__)
-#define copy_fs_field(_err, _f, _msg, ...) \
+#define copy_fs_field(_err, _f, _msg, ...) \
copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
@@ -1258,31 +784,24 @@ static int bch2_gc_done(struct bch_fs *c,
copy_fs_field(fs_usage_btree_wrong,
b.btree, "btree");
- if (!metadata_only) {
- copy_fs_field(fs_usage_data_wrong,
- b.data, "data");
- copy_fs_field(fs_usage_cached_wrong,
- b.cached, "cached");
- copy_fs_field(fs_usage_reserved_wrong,
- b.reserved, "reserved");
- copy_fs_field(fs_usage_nr_inodes_wrong,
- b.nr_inodes,"nr_inodes");
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- copy_fs_field(fs_usage_persistent_reserved_wrong,
- persistent_reserved[i],
- "persistent_reserved[%i]", i);
- }
+ copy_fs_field(fs_usage_data_wrong,
+ b.data, "data");
+ copy_fs_field(fs_usage_cached_wrong,
+ b.cached, "cached");
+ copy_fs_field(fs_usage_reserved_wrong,
+ b.reserved, "reserved");
+ copy_fs_field(fs_usage_nr_inodes_wrong,
+ b.nr_inodes,"nr_inodes");
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++)
+ copy_fs_field(fs_usage_persistent_reserved_wrong,
+ persistent_reserved[i],
+ "persistent_reserved[%i]", i);
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
- if (metadata_only &&
- (e->data_type == BCH_DATA_user ||
- e->data_type == BCH_DATA_cached))
- continue;
-
printbuf_reset(&buf);
bch2_replicas_entry_to_text(&buf, e);
@@ -1296,10 +815,8 @@ static int bch2_gc_done(struct bch_fs *c,
#undef copy_stripe_field
#undef copy_field
fsck_err:
- if (ca)
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
bch_err_fn(c, ret);
-
percpu_up_write(&c->mark_lock);
printbuf_exit(&buf);
return ret;
@@ -1322,7 +839,7 @@ static int bch2_gc_start(struct bch_fs *c)
ca->usage_gc = alloc_percpu(struct bch_dev_usage);
if (!ca->usage_gc) {
bch_err(c, "error allocating ca->usage_gc");
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return -BCH_ERR_ENOMEM_gc_start;
}
@@ -1333,19 +850,6 @@ static int bch2_gc_start(struct bch_fs *c)
return 0;
}
-static int bch2_gc_reset(struct bch_fs *c)
-{
- for_each_member_device(c, ca) {
- free_percpu(ca->usage_gc);
- ca->usage_gc = NULL;
- }
-
- free_percpu(c->usage_gc);
- c->usage_gc = NULL;
-
- return bch2_gc_start(c);
-}
-
/* returns true if not equal */
static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
struct bch_alloc_v4 r)
@@ -1361,56 +865,41 @@ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
static int bch2_alloc_write_key(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_s_c k,
- bool metadata_only)
+ struct bch_dev *ca,
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
- struct bucket old_gc, gc, *b;
struct bkey_i_alloc_v4 *a;
- struct bch_alloc_v4 old_convert, new;
+ struct bch_alloc_v4 old_gc, gc, old_convert, new;
const struct bch_alloc_v4 *old;
int ret;
old = bch2_alloc_to_v4(k, &old_convert);
- new = *old;
+ gc = new = *old;
percpu_down_read(&c->mark_lock);
- b = gc_bucket(ca, iter->pos.offset);
- old_gc = *b;
+ __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));
+
+ old_gc = gc;
if ((old->data_type == BCH_DATA_sb ||
old->data_type == BCH_DATA_journal) &&
!bch2_dev_is_online(ca)) {
- b->data_type = old->data_type;
- b->dirty_sectors = old->dirty_sectors;
+ gc.data_type = old->data_type;
+ gc.dirty_sectors = old->dirty_sectors;
}
/*
- * b->data_type doesn't yet include need_discard & need_gc_gen states -
+ * gc.data_type doesn't yet include need_discard & need_gc_gen states -
* fix that here:
*/
- b->data_type = __alloc_data_type(b->dirty_sectors,
- b->cached_sectors,
- b->stripe,
- *old,
- b->data_type);
- gc = *b;
+ alloc_data_type_set(&gc, gc.data_type);
if (gc.data_type != old_gc.data_type ||
gc.dirty_sectors != old_gc.dirty_sectors)
- bch2_dev_usage_update_m(c, ca, &old_gc, &gc);
+ bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true);
percpu_up_read(&c->mark_lock);
- if (metadata_only &&
- gc.data_type != BCH_DATA_sb &&
- gc.data_type != BCH_DATA_journal &&
- gc.data_type != BCH_DATA_btree)
- return 0;
-
- if (gen_after(old->gen, gc.gen))
- return 0;
-
if (fsck_err_on(new.data_type != gc.data_type, c,
alloc_key_data_type_wrong,
"bucket %llu:%llu gen %u has wrong data_type"
@@ -1460,12 +949,12 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
- ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
+ ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
fsck_err:
return ret;
}
-static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
+static int bch2_gc_alloc_done(struct bch_fs *c)
{
int ret = 0;
@@ -1474,11 +963,11 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc,
POS(ca->dev_idx, ca->mi.first_bucket),
POS(ca->dev_idx, ca->mi.nbuckets - 1),
- BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_slots|BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
- bch2_alloc_write_key(trans, &iter, k, metadata_only)));
+ bch2_alloc_write_key(trans, &iter, ca, k)));
if (ret) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
break;
}
}
@@ -1487,14 +976,14 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
return ret;
}
-static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
+static int bch2_gc_alloc_start(struct bch_fs *c)
{
for_each_member_device(c, ca) {
struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!buckets) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
bch_err(c, "error allocating ca->buckets[gc]");
return -BCH_ERR_ENOMEM_gc_alloc_start;
}
@@ -1504,54 +993,29 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
rcu_assign_pointer(ca->buckets_gc, buckets);
}
+ struct bch_dev *ca = NULL;
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
- struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
- struct bucket *g = gc_bucket(ca, k.k->p.offset);
+ BTREE_ITER_prefetch, k, ({
+ ca = bch2_dev_iterate(c, ca, k.k->p.inode);
+ if (!ca) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+ struct bucket *g = gc_bucket(ca, k.k->p.offset);
g->gen_valid = 1;
g->gen = a->gen;
-
- if (metadata_only &&
- (a->data_type == BCH_DATA_user ||
- a->data_type == BCH_DATA_cached ||
- a->data_type == BCH_DATA_parity)) {
- g->data_type = a->data_type;
- g->dirty_sectors = a->dirty_sectors;
- g->cached_sectors = a->cached_sectors;
- g->stripe = a->stripe;
- g->stripe_redundancy = a->stripe_redundancy;
- }
-
0;
})));
+ bch2_dev_put(ca);
bch_err_fn(c, ret);
return ret;
}
-static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
-{
- for_each_member_device(c, ca) {
- struct bucket_array *buckets = gc_bucket_array(ca);
- struct bucket *g;
-
- for_each_bucket(g, buckets) {
- if (metadata_only &&
- (g->data_type == BCH_DATA_user ||
- g->data_type == BCH_DATA_cached ||
- g->data_type == BCH_DATA_parity))
- continue;
- g->data_type = 0;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- }
- }
-}
-
static int bch2_gc_write_reflink_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -1601,35 +1065,27 @@ fsck_err:
return ret;
}
-static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
+static int bch2_gc_reflink_done(struct bch_fs *c)
{
size_t idx = 0;
- if (metadata_only)
- return 0;
-
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
c->reflink_gc_nr = 0;
return ret;
}
-static int bch2_gc_reflink_start(struct bch_fs *c,
- bool metadata_only)
+static int bch2_gc_reflink_start(struct bch_fs *c)
{
-
- if (metadata_only)
- return 0;
-
c->reflink_gc_nr = 0;
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
+ BTREE_ITER_prefetch, k, ({
const __le64 *refcount = bkey_refcount_c(k);
if (!refcount)
@@ -1652,15 +1108,6 @@ static int bch2_gc_reflink_start(struct bch_fs *c,
return ret;
}
-static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only)
-{
- struct genradix_iter iter;
- struct reflink_gc *r;
-
- genradix_for_each(&c->reflink_gc_table, iter, r)
- r->refcount = 0;
-}
-
static int bch2_gc_write_stripes_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
@@ -1714,30 +1161,20 @@ fsck_err:
return ret;
}
-static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
+static int bch2_gc_stripes_done(struct bch_fs *c)
{
- if (metadata_only)
- return 0;
-
return bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_gc_write_stripes_key(trans, &iter, k)));
}
-static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
-{
- genradix_free(&c->gc_stripes);
-}
-
/**
- * bch2_gc - walk _all_ references to buckets, and recompute them:
+ * bch2_check_allocations - walk all references to buckets, and recompute them:
*
* @c: filesystem object
- * @initial: are we in recovery?
- * @metadata_only: are we just checking metadata references, or everything?
*
* Returns: 0 on success, or standard errcode on failure
*
@@ -1756,9 +1193,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
-int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
+int bch2_check_allocations(struct bch_fs *c)
{
- unsigned iter = 0;
int ret;
lockdep_assert_held(&c->state_lock);
@@ -1768,62 +1204,30 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
bch2_btree_interior_updates_flush(c);
ret = bch2_gc_start(c) ?:
- bch2_gc_alloc_start(c, metadata_only) ?:
- bch2_gc_reflink_start(c, metadata_only);
+ bch2_gc_alloc_start(c) ?:
+ bch2_gc_reflink_start(c);
if (ret)
goto out;
-again:
- gc_pos_set(c, gc_phase(GC_PHASE_START));
- bch2_mark_superblocks(c);
+ gc_pos_set(c, gc_phase(GC_PHASE_START));
- ret = bch2_gc_btrees(c, initial, metadata_only);
+ ret = bch2_mark_superblocks(c);
+ BUG_ON(ret);
+ ret = bch2_gc_btrees(c);
if (ret)
goto out;
-#if 0
- bch2_mark_pending_btree_node_frees(c);
-#endif
c->gc_count++;
- if (test_bit(BCH_FS_need_another_gc, &c->flags) ||
- (!iter && bch2_test_restart_gc)) {
- if (iter++ > 2) {
- bch_info(c, "Unable to fix bucket gens, looping");
- ret = -EINVAL;
- goto out;
- }
-
- /*
- * XXX: make sure gens we fixed got saved
- */
- bch_info(c, "Second GC pass needed, restarting:");
- clear_bit(BCH_FS_need_another_gc, &c->flags);
- __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
-
- bch2_gc_stripes_reset(c, metadata_only);
- bch2_gc_alloc_reset(c, metadata_only);
- bch2_gc_reflink_reset(c, metadata_only);
- ret = bch2_gc_reset(c);
- if (ret)
- goto out;
-
- /* flush fsck errors, reset counters */
- bch2_flush_fsck_errs(c);
- goto again;
- }
+ bch2_journal_block(&c->journal);
out:
- if (!ret) {
- bch2_journal_block(&c->journal);
+ ret = bch2_gc_alloc_done(c) ?:
+ bch2_gc_done(c) ?:
+ bch2_gc_stripes_done(c) ?:
+ bch2_gc_reflink_done(c);
- ret = bch2_gc_alloc_done(c, metadata_only) ?:
- bch2_gc_done(c, initial, metadata_only) ?:
- bch2_gc_stripes_done(c, metadata_only) ?:
- bch2_gc_reflink_done(c, metadata_only);
-
- bch2_journal_unblock(&c->journal);
- }
+ bch2_journal_unblock(&c->journal);
percpu_down_write(&c->mark_lock);
/* Indicates that gc is no longer in progress: */
@@ -1852,23 +1256,33 @@ static int gc_btree_gens_key(struct btree_trans *trans,
struct bkey_i *u;
int ret;
+ if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
+ return -EROFS;
+
percpu_down_read(&c->mark_lock);
+ rcu_read_lock();
bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
- if (ptr_stale(ca, ptr) > 16) {
+ if (dev_ptr_stale(ca, ptr) > 16) {
+ rcu_read_unlock();
percpu_up_read(&c->mark_lock);
goto update;
}
}
bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
+ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
if (gen_after(*gen, ptr->gen))
*gen = ptr->gen;
}
+ rcu_read_unlock();
percpu_up_read(&c->mark_lock);
return 0;
update:
@@ -1881,10 +1295,9 @@ update:
return 0;
}
-static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
+ struct btree_iter *iter, struct bkey_s_c k)
{
- struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
struct bkey_i_alloc_v4 *a_mut;
@@ -1899,7 +1312,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i
return ret;
a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
- a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type);
+ alloc_data_type_set(&a_mut->v, a_mut->v.data_type);
return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
}
@@ -1927,7 +1340,7 @@ int bch2_gc_gens(struct bch_fs *c)
ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
if (!ca->oldest_gen) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
ret = -BCH_ERR_ENOMEM_gc_gens;
goto err;
}
@@ -1945,7 +1358,7 @@ int bch2_gc_gens(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, i,
POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
k,
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
@@ -1954,14 +1367,23 @@ int bch2_gc_gens(struct bch_fs *c)
goto err;
}
+ struct bch_dev *ca = NULL;
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN,
- BTREE_ITER_PREFETCH,
+ BTREE_ITER_prefetch,
k,
NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_alloc_write_oldest_gen(trans, &iter, k)));
+ BCH_TRANS_COMMIT_no_enospc, ({
+ ca = bch2_dev_iterate(c, ca, k.k->p.inode);
+ if (!ca) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
+ bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
+ })));
+ bch2_dev_put(ca);
+
if (ret)
goto err;
@@ -1985,87 +1407,23 @@ err:
return ret;
}
-static int bch2_gc_thread(void *arg)
+static void bch2_gc_gens_work(struct work_struct *work)
{
- struct bch_fs *c = arg;
- struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last = atomic64_read(&clock->now);
- unsigned last_kick = atomic_read(&c->kick_gc);
-
- set_freezable();
-
- while (1) {
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- return 0;
- }
-
- if (atomic_read(&c->kick_gc) != last_kick)
- break;
-
- if (c->btree_gc_periodic) {
- unsigned long next = last + c->capacity / 16;
-
- if (atomic64_read(&clock->now) >= next)
- break;
-
- bch2_io_clock_schedule_timeout(clock, next);
- } else {
- schedule();
- }
-
- try_to_freeze();
- }
- __set_current_state(TASK_RUNNING);
-
- last = atomic64_read(&clock->now);
- last_kick = atomic_read(&c->kick_gc);
-
- /*
- * Full gc is currently incompatible with btree key cache:
- */
-#if 0
- ret = bch2_gc(c, false, false);
-#else
- bch2_gc_gens(c);
-#endif
- debug_check_no_locks_held();
- }
-
- return 0;
+ struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
+ bch2_gc_gens(c);
+ bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}
-void bch2_gc_thread_stop(struct bch_fs *c)
+void bch2_gc_gens_async(struct bch_fs *c)
{
- struct task_struct *p;
-
- p = c->gc_thread;
- c->gc_thread = NULL;
-
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) &&
+ !queue_work(c->write_ref_wq, &c->gc_gens_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}
-int bch2_gc_thread_start(struct bch_fs *c)
+void bch2_fs_gc_init(struct bch_fs *c)
{
- struct task_struct *p;
+ seqcount_init(&c->gc_pos_lock);
- if (c->gc_thread)
- return 0;
-
- p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
- if (IS_ERR(p)) {
- bch_err_fn(c, PTR_ERR(p));
- return PTR_ERR(p);
- }
-
- get_task_struct(p);
- c->gc_thread = p;
- wake_up_process(p);
- return 0;
+ INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 607575f83a00..1b6489d8e0f4 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -6,10 +6,7 @@
#include "btree_types.h"
int bch2_check_topology(struct bch_fs *);
-int bch2_gc(struct bch_fs *, bool, bool);
-int bch2_gc_gens(struct bch_fs *);
-void bch2_gc_thread_stop(struct bch_fs *);
-int bch2_gc_thread_start(struct bch_fs *);
+int bch2_check_allocations(struct bch_fs *);
/*
* For concurrent mark and sweep (with other index updates), we define a total
@@ -37,16 +34,16 @@ static inline struct gc_pos gc_phase(enum gc_phase phase)
{
return (struct gc_pos) {
.phase = phase,
- .pos = POS_MIN,
.level = 0,
+ .pos = POS_MIN,
};
}
static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
- return cmp_int(l.phase, r.phase) ?:
- bpos_cmp(l.pos, r.pos) ?:
- cmp_int(l.level, r.level);
+ return cmp_int(l.phase, r.phase) ?:
+ -cmp_int(l.level, r.level) ?:
+ bpos_cmp(l.pos, r.pos);
}
static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
@@ -60,13 +57,13 @@ static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
}
}
-static inline struct gc_pos gc_pos_btree(enum btree_id id,
- struct bpos pos, unsigned level)
+static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level,
+ struct bpos pos)
{
return (struct gc_pos) {
- .phase = btree_id_to_gc_phase(id),
- .pos = pos,
+ .phase = btree_id_to_gc_phase(btree),
.level = level,
+ .pos = pos,
};
}
@@ -76,19 +73,7 @@ static inline struct gc_pos gc_pos_btree(enum btree_id id,
*/
static inline struct gc_pos gc_pos_btree_node(struct btree *b)
{
- return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level);
-}
-
-/*
- * GC position of the pointer to a btree root: we don't use
- * gc_pos_pointer_to_btree_node() here to avoid a potential race with
- * btree_split() increasing the tree depth - the new root will have level > the
- * old root and thus have a greater gc position than the old root, but that
- * would be incorrect since once gc has marked the root it's not coming back.
- */
-static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
-{
- return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH);
+ return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p);
}
static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
@@ -104,11 +89,8 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
return ret;
}
-static inline void bch2_do_gc_gens(struct bch_fs *c)
-{
- atomic_inc(&c->kick_gc);
- if (c->gc_thread)
- wake_up_process(c->gc_thread);
-}
+int bch2_gc_gens(struct bch_fs *);
+void bch2_gc_gens_async(struct bch_fs *);
+void bch2_fs_gc_init(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index debb0edc3455..cbf8f5d90602 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -23,6 +23,18 @@
#include <linux/sched/mm.h>
+static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
+{
+ prt_printf(out, "btree=%s l=%u seq %llux\n",
+ bch2_btree_id_str(BTREE_NODE_ID(bn)),
+ (unsigned) BTREE_NODE_LEVEL(bn), bn->keys.seq);
+ prt_str(out, "min: ");
+ bch2_bpos_to_text(out, bn->min_key);
+ prt_newline(out);
+ prt_str(out, "max: ");
+ bch2_bpos_to_text(out, bn->max_key);
+}
+
void bch2_btree_node_io_unlock(struct btree *b)
{
EBUG_ON(!btree_node_write_in_flight(b));
@@ -217,7 +229,6 @@ static bool should_compact_bset(struct btree *b, struct bset_tree *t,
static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
{
- struct bset_tree *t;
bool ret = false;
for_each_bset(b, t) {
@@ -288,8 +299,7 @@ bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
static void btree_node_sort(struct bch_fs *c, struct btree *b,
unsigned start_idx,
- unsigned end_idx,
- bool filter_whiteouts)
+ unsigned end_idx)
{
struct btree_node *out;
struct sort_iter_stack sort_iter;
@@ -320,7 +330,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
start_time = local_clock();
- u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts);
+ u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter);
out->keys.u64s = cpu_to_le16(u64s);
@@ -426,13 +436,12 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b)
break;
if (b->nsets - unwritten_idx > 1) {
- btree_node_sort(c, b, unwritten_idx,
- b->nsets, false);
+ btree_node_sort(c, b, unwritten_idx, b->nsets);
ret = true;
}
if (unwritten_idx > 1) {
- btree_node_sort(c, b, 0, unwritten_idx, false);
+ btree_node_sort(c, b, 0, unwritten_idx);
ret = true;
}
@@ -441,8 +450,6 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b)
void bch2_btree_build_aux_trees(struct btree *b)
{
- struct bset_tree *t;
-
for_each_bset(b, t)
bch2_bset_build_aux_tree(b, t,
!bset_written(b, bset(b, t)) &&
@@ -524,7 +531,9 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
prt_printf(out, "at btree ");
bch2_btree_pos_to_text(out, c, b);
- prt_printf(out, "\n node offset %u/%u",
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "\nnode offset %u/%u",
b->written, btree_ptr_sectors_written(&b->key));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
@@ -543,6 +552,7 @@ static int __btree_err(int ret,
const char *fmt, ...)
{
struct printbuf out = PRINTBUF;
+ bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes;
va_list args;
btree_err_msg(&out, c, ca, b, i, b->written, write);
@@ -564,12 +574,14 @@ static int __btree_err(int ret,
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
ret = -BCH_ERR_btree_node_read_err_bad_node;
- if (ret != -BCH_ERR_btree_node_read_err_fixable)
+ if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable)
bch2_sb_error_count(c, err_type);
switch (ret) {
case -BCH_ERR_btree_node_read_err_fixable:
- ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf);
+ ret = !silent
+ ? bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf)
+ : -BCH_ERR_fsck_fix;
if (ret != -BCH_ERR_fsck_fix &&
ret != -BCH_ERR_fsck_ignore)
goto fsck_err;
@@ -577,14 +589,17 @@ static int __btree_err(int ret,
break;
case -BCH_ERR_btree_node_read_err_want_retry:
case -BCH_ERR_btree_node_read_err_must_retry:
- bch2_print_string_as_lines(KERN_ERR, out.buf);
+ if (!silent)
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
break;
case -BCH_ERR_btree_node_read_err_bad_node:
- bch2_print_string_as_lines(KERN_ERR, out.buf);
+ if (!silent)
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
ret = bch2_topology_error(c);
break;
case -BCH_ERR_btree_node_read_err_incompatible:
- bch2_print_string_as_lines(KERN_ERR, out.buf);
+ if (!silent)
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
ret = -BCH_ERR_fsck_errors_not_fixed;
break;
default:
@@ -619,8 +634,6 @@ fsck_err:
__cold
void bch2_btree_node_drop_keys_outside_node(struct btree *b)
{
- struct bset_tree *t;
-
for_each_bset(b, t) {
struct bset *i = bset(b, t);
struct bkey_packed *k;
@@ -1021,18 +1034,19 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, NULL,
btree_node_bad_seq,
- "got wrong btree node (want %llx got %llx)\n"
- "got btree %s level %llu pos %s",
- bp->seq, b->data->keys.seq,
- bch2_btree_id_str(BTREE_NODE_ID(b->data)),
- BTREE_NODE_LEVEL(b->data),
- buf.buf);
+ "got wrong btree node: got\n%s",
+ (printbuf_reset(&buf),
+ bch2_btree_node_header_to_text(&buf, b->data),
+ buf.buf));
} else {
btree_err_on(!b->data->keys.seq,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, NULL,
btree_node_bad_seq,
- "bad btree header: seq 0");
+ "bad btree header: seq 0\n%s",
+ (printbuf_reset(&buf),
+ bch2_btree_node_header_to_text(&buf, b->data),
+ buf.buf));
}
while (b->written < (ptr_written ?: btree_sectors(c))) {
@@ -1095,7 +1109,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
nonce = btree_nonce(i, b->written << 9);
struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
csum_bad = bch2_crc_cmp(bne->csum, csum);
- if (csum_bad)
+ if (ca && csum_bad)
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
btree_err_on(csum_bad,
@@ -1249,12 +1263,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_node_reset_sib_u64s(b);
+ rcu_read_lock();
bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
- if (ca2->mi.state != BCH_MEMBER_STATE_rw)
+ if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw)
set_btree_node_need_rewrite(b);
}
+ rcu_read_unlock();
if (!ptr_written)
set_btree_node_need_rewrite(b);
@@ -1279,8 +1295,8 @@ static void btree_node_read_work(struct work_struct *work)
struct btree_read_bio *rb =
container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
+ struct bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
struct btree *b = rb->b;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
struct printbuf buf = PRINTBUF;
@@ -1292,8 +1308,8 @@ static void btree_node_read_work(struct work_struct *work)
while (1) {
retry = true;
bch_info(c, "retrying read");
- ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
- rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
+ rb->have_ioref = ca != NULL;
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_buf_bytes(b);
@@ -1307,7 +1323,7 @@ static void btree_node_read_work(struct work_struct *work)
start:
printbuf_reset(&buf);
bch2_btree_pos_to_text(&buf, c, b);
- bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
"btree read error %s for %s",
bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
@@ -1363,7 +1379,7 @@ static void btree_node_read_endio(struct bio *bio)
struct bch_fs *c = rb->c;
if (rb->have_ioref) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
bch2_latency_acct(ca, rb->start_time, READ);
}
@@ -1560,7 +1576,7 @@ static void btree_node_read_all_replicas_endio(struct bio *bio)
struct btree_node_read_all *ra = rb->ra;
if (rb->have_ioref) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
bch2_latency_acct(ca, rb->start_time, READ);
}
@@ -1602,14 +1618,14 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
struct btree_read_bio *rb =
container_of(ra->bio[i], struct btree_read_bio, bio);
rb->c = c;
rb->b = b;
rb->ra = ra;
rb->start_time = local_clock();
- rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ rb->have_ioref = ca != NULL;
rb->idx = i;
rb->pick = pick;
rb->bio.bi_iter.bi_sector = pick.ptr.offset;
@@ -1679,7 +1695,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
return;
}
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
bio = bio_alloc_bioset(NULL,
buf_pages(b->data, btree_buf_bytes(b)),
@@ -1691,7 +1707,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
rb->b = b;
rb->ra = NULL;
rb->start_time = local_clock();
- rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ rb->have_ioref = ca != NULL;
rb->pick = pick;
INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_iter.bi_sector = pick.ptr.offset;
@@ -1846,7 +1862,6 @@ static void btree_node_write_work(struct work_struct *work)
container_of(work, struct btree_write_bio, work);
struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->wbio.bio.bi_private;
- struct bch_extent_ptr *ptr;
int ret = 0;
btree_bounce_free(c,
@@ -1896,13 +1911,14 @@ static void btree_node_write_endio(struct bio *bio)
struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio);
struct bch_fs *c = wbio->c;
struct btree *b = wbio->bio.bi_private;
- struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
+ struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
unsigned long flags;
if (wbio->have_ioref)
bch2_latency_acct(ca, wbio->submit_time, WRITE);
- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ if (!ca ||
+ bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
"btree write error: %s",
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("btree")) {
@@ -1969,7 +1985,6 @@ static void btree_write_submit(struct work_struct *work)
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
{
struct btree_write_bio *wbio;
- struct bset_tree *t;
struct bset *i;
struct btree_node *bn = NULL;
struct btree_node_entry *bne = NULL;
@@ -2095,11 +2110,11 @@ do_write:
unwritten_whiteouts_end(b));
SET_BSET_SEPARATE_WHITEOUTS(i, false);
- b->whiteout_u64s = 0;
-
- u64s = bch2_sort_keys(i->start, &sort_iter.iter, false);
+ u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter);
le16_add_cpu(&i->u64s, u64s);
+ b->whiteout_u64s = 0;
+
BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
set_needs_whiteout(i, false);
@@ -2226,7 +2241,6 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
{
bool invalidated_iter = false;
struct btree_node_entry *bne;
- struct bset_tree *t;
if (!btree_node_just_written(b))
return false;
@@ -2249,7 +2263,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
* single bset:
*/
if (b->nsets > 1) {
- btree_node_sort(c, b, 0, b->nsets, true);
+ btree_node_sort(c, b, 0, b->nsets);
invalidated_iter = true;
} else {
invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
@@ -2346,20 +2360,13 @@ void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
printbuf_tabstop_push(out, 20);
printbuf_tabstop_push(out, 10);
- prt_tab(out);
- prt_str(out, "nr");
- prt_tab(out);
- prt_str(out, "size");
- prt_newline(out);
+ prt_printf(out, "\tnr\tsize\n");
for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
u64 nr = atomic64_read(&c->btree_write_stats[i].nr);
u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes);
- prt_printf(out, "%s:", bch2_btree_write_types[i]);
- prt_tab(out);
- prt_u64(out, nr);
- prt_tab(out);
+ prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr);
prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
prt_newline(out);
}
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index e251cb6b965f..2b8b564fc560 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -81,8 +81,6 @@ static inline bool should_compact_bset_lazy(struct btree *b,
static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
{
- struct bset_tree *t;
-
for_each_bset(b, t)
if (should_compact_bset_lazy(b, t))
return bch2_compact_whiteouts(c, b, COMPACT_LAZY);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 2a211a4bebd1..5bf98cb8b15d 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -61,7 +61,7 @@ static inline int btree_path_cmp(const struct btree_path *l,
static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
{
/* Are we iterating over keys in all snapshots? */
- if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ if (iter->flags & BTREE_ITER_all_snapshots) {
p = bpos_successor(p);
} else {
p = bpos_nosnap_successor(p);
@@ -74,7 +74,7 @@ static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
{
/* Are we iterating over keys in all snapshots? */
- if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ if (iter->flags & BTREE_ITER_all_snapshots) {
p = bpos_predecessor(p);
} else {
p = bpos_nosnap_predecessor(p);
@@ -88,7 +88,7 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
{
struct bpos pos = iter->pos;
- if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ if ((iter->flags & BTREE_ITER_is_extents) &&
!bkey_eq(pos, POS_MAX))
pos = bkey_successor(iter, pos);
return pos;
@@ -253,13 +253,13 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
BUG_ON(iter->btree_id >= BTREE_ID_NR);
- BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != btree_iter_path(trans, iter)->cached);
+ BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
- BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
- (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+ BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
+ (iter->flags & BTREE_ITER_all_snapshots));
- BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
- (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) &&
+ (iter->flags & BTREE_ITER_all_snapshots) &&
!btree_type_has_snapshot_field(iter->btree_id));
if (iter->update_path)
@@ -269,10 +269,10 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
- BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
!iter->pos.snapshot);
- BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
iter->pos.snapshot != iter->snapshot);
BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
@@ -289,7 +289,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
if (!bch2_debug_check_iterators)
return 0;
- if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
+ if (!(iter->flags & BTREE_ITER_filter_snapshots))
return 0;
if (bkey_err(k) || !k.k)
@@ -300,8 +300,8 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
k.k->p.snapshot));
bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
- BTREE_ITER_NOPRESERVE|
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_nopreserve|
+ BTREE_ITER_all_snapshots);
prev = bch2_btree_iter_prev(&copy);
if (!prev.k)
goto out;
@@ -897,7 +897,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
bch2_bkey_buf_reassemble(out, c, k);
- if ((flags & BTREE_ITER_PREFETCH) &&
+ if ((flags & BTREE_ITER_prefetch) &&
c->opts.btree_node_prefetch)
ret = btree_path_prefetch_j(trans, path, &jiter);
@@ -944,7 +944,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
bch2_bkey_buf_unpack(&tmp, c, l->b, k);
- if ((flags & BTREE_ITER_PREFETCH) &&
+ if ((flags & BTREE_ITER_prefetch) &&
c->opts.btree_node_prefetch) {
ret = btree_path_prefetch(trans, path);
if (ret)
@@ -999,6 +999,7 @@ retry_all:
bch2_trans_unlock(trans);
cond_resched();
+ trans->locked = true;
if (unlikely(trans->memory_allocation_failure)) {
struct closure cl;
@@ -1162,6 +1163,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
goto out_uptodate;
path->level = btree_path_up_until_good_node(trans, path, 0);
+ unsigned max_level = path->level;
EBUG_ON(btree_path_node(path, path->level) &&
!btree_node_locked(path, path->level));
@@ -1192,6 +1194,16 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
goto out;
}
}
+
+ if (unlikely(max_level > path->level)) {
+ struct btree_path *linked;
+ unsigned iter;
+
+ trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter)
+ for (unsigned j = path->level + 1; j < max_level; j++)
+ linked->l[j] = path->l[j];
+ }
+
out_uptodate:
path->uptodate = BTREE_ITER_UPTODATE;
out:
@@ -1221,11 +1233,14 @@ static inline void btree_path_copy(struct btree_trans *trans, struct btree_path
}
static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
- bool intent)
+ bool intent, unsigned long ip)
{
btree_path_idx_t new = btree_path_alloc(trans, src);
btree_path_copy(trans, trans->paths + new, trans->paths + src);
__btree_path_get(trans->paths + new, intent);
+#ifdef TRACK_PATH_ALLOCATED
+ trans->paths[new].ip_allocated = ip;
+#endif
return new;
}
@@ -1234,7 +1249,7 @@ btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
btree_path_idx_t path, bool intent, unsigned long ip)
{
__btree_path_put(trans->paths + path, intent);
- path = btree_path_clone(trans, path, intent);
+ path = btree_path_clone(trans, path, intent, ip);
trans->paths[path].preserve = false;
return path;
}
@@ -1334,6 +1349,26 @@ static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t
__clear_bit(path, trans->paths_allocated);
}
+static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path)
+{
+ unsigned l = path->level;
+
+ do {
+ if (!btree_path_node(path, l))
+ break;
+
+ if (!is_btree_node(path, l))
+ return false;
+
+ if (path->l[l].lock_seq != path->l[l].b->c.lock.seq)
+ return false;
+
+ l++;
+ } while (l < path->locks_want);
+
+ return true;
+}
+
void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
{
struct btree_path *path = trans->paths + path_idx, *dup;
@@ -1348,10 +1383,15 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in
if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
return;
- if (path->should_be_locked &&
- !trans->restarted &&
- (!dup || !bch2_btree_path_relock_norestart(trans, dup)))
- return;
+ if (path->should_be_locked && !trans->restarted) {
+ if (!dup)
+ return;
+
+ if (!(trans->locked
+ ? bch2_btree_path_relock_norestart(trans, dup)
+ : bch2_btree_path_can_relock(trans, dup)))
+ return;
+ }
if (dup) {
dup->preserve |= path->preserve;
@@ -1384,22 +1424,26 @@ void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
(void *) trans->last_restarted_ip);
}
+void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans)
+{
+ panic("trans should be locked, unlocked by %pS\n",
+ (void *) trans->last_unlock_ip);
+}
+
noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
- prt_printf(buf, "transaction updates for %s journal seq %llu",
+ prt_printf(buf, "transaction updates for %s journal seq %llu\n",
trans->fn, trans->journal_res.seq);
- prt_newline(buf);
printbuf_indent_add(buf, 2);
trans_for_each_update(trans, i) {
struct bkey_s_c old = { &i->old_k, i->old_v };
- prt_printf(buf, "update: btree=%s cached=%u %pS",
+ prt_printf(buf, "update: btree=%s cached=%u %pS\n",
bch2_btree_id_str(i->btree_id),
i->cached,
(void *) i->ip_allocated);
- prt_newline(buf);
prt_printf(buf, " old ");
bch2_bkey_val_to_text(buf, trans->c, old);
@@ -1428,23 +1472,63 @@ void bch2_dump_trans_updates(struct btree_trans *trans)
printbuf_exit(&buf);
}
-static void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
+static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
{
struct btree_path *path = trans->paths + path_idx;
- prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
+ prt_printf(out, "path: idx %2u ref %u:%u %c %c %c btree=%s l=%u pos ",
path_idx, path->ref, path->intent_ref,
path->preserve ? 'P' : ' ',
path->should_be_locked ? 'S' : ' ',
+ path->cached ? 'C' : 'B',
bch2_btree_id_str(path->btree_id),
path->level);
bch2_bpos_to_text(out, path->pos);
- prt_printf(out, " locks %u", path->nodes_locked);
#ifdef TRACK_PATH_ALLOCATED
prt_printf(out, " %pS", (void *) path->ip_allocated);
#endif
+}
+
+static const char *btree_node_locked_str(enum btree_node_locked_type t)
+{
+ switch (t) {
+ case BTREE_NODE_UNLOCKED:
+ return "unlocked";
+ case BTREE_NODE_READ_LOCKED:
+ return "read";
+ case BTREE_NODE_INTENT_LOCKED:
+ return "intent";
+ case BTREE_NODE_WRITE_LOCKED:
+ return "write";
+ default:
+ return NULL;
+ }
+}
+
+void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
+{
+ bch2_btree_path_to_text_short(out, trans, path_idx);
+
+ struct btree_path *path = trans->paths + path_idx;
+
+ prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want);
prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
+ prt_printf(out, "l=%u locks %s seq %u node ", l,
+ btree_node_locked_str(btree_node_locked_type(path, l)),
+ path->l[l].lock_seq);
+
+ int ret = PTR_ERR_OR_ZERO(path->l[l].b);
+ if (ret)
+ prt_str(out, bch2_err_str(ret));
+ else
+ prt_printf(out, "%px", path->l[l].b);
+ prt_newline(out);
+ }
+ printbuf_indent_sub(out, 2);
}
static noinline __cold
@@ -1456,8 +1540,10 @@ void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
if (!nosort)
btree_trans_sort_paths(trans);
- trans_for_each_path_idx_inorder(trans, iter)
- bch2_btree_path_to_text(out, trans, iter.path_idx);
+ trans_for_each_path_idx_inorder(trans, iter) {
+ bch2_btree_path_to_text_short(out, trans, iter.path_idx);
+ prt_newline(out);
+ }
}
noinline __cold
@@ -1608,11 +1694,12 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
unsigned flags, unsigned long ip)
{
struct btree_path *path;
- bool cached = flags & BTREE_ITER_CACHED;
- bool intent = flags & BTREE_ITER_INTENT;
+ bool cached = flags & BTREE_ITER_cached;
+ bool intent = flags & BTREE_ITER_intent;
struct trans_for_each_path_inorder_iter iter;
btree_path_idx_t path_pos = 0, path_idx;
+ bch2_trans_verify_not_unlocked(trans);
bch2_trans_verify_not_in_restart(trans);
bch2_trans_verify_locks(trans);
@@ -1657,7 +1744,7 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
trans->paths_sorted = false;
}
- if (!(flags & BTREE_ITER_NOPRESERVE))
+ if (!(flags & BTREE_ITER_nopreserve))
path->preserve = true;
if (path->intent_ref)
@@ -1678,6 +1765,22 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
return path_idx;
}
+btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans,
+ enum btree_id btree_id,
+ unsigned level,
+ struct bpos pos)
+{
+ btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
+ BTREE_ITER_nopreserve|
+ BTREE_ITER_intent, _RET_IP_);
+ path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
+
+ struct btree_path *path = trans->paths + path_idx;
+ bch2_btree_path_downgrade(trans, path);
+ __bch2_btree_path_unlock(trans, path);
+ return path_idx;
+}
+
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
{
@@ -1719,6 +1822,19 @@ hole:
return (struct bkey_s_c) { u, NULL };
}
+
+void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+
+ if (!iter->path || trans->restarted)
+ return;
+
+ struct btree_path *path = btree_iter_path(trans, iter);
+ path->preserve = false;
+ if (path->ref == 1)
+ path->should_be_locked = false;
+}
/* Btree iterators: */
int __must_check
@@ -1733,9 +1849,11 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
struct btree_trans *trans = iter->trans;
int ret;
+ bch2_trans_verify_not_unlocked(trans);
+
iter->path = bch2_btree_path_set_pos(trans, iter->path,
btree_iter_search_key(iter),
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
@@ -1774,7 +1892,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
iter->k.p = iter->pos = b->key.k.p;
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
btree_path_set_should_be_locked(btree_iter_path(trans, iter));
out:
@@ -1835,13 +1953,16 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
if (bpos_eq(iter->pos, b->key.k.p)) {
__btree_path_set_level_up(trans, path, path->level++);
} else {
+ if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED)
+ btree_node_unlock(trans, path, path->level + 1);
+
/*
* Haven't gotten to the end of the parent node: go back down to
* the next child node
*/
iter->path = bch2_btree_path_set_pos(trans, iter->path,
bpos_successor(iter->pos),
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
path = btree_iter_path(trans, iter);
@@ -1859,7 +1980,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
iter->k.p = iter->pos = b->key.k.p;
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
btree_path_set_should_be_locked(btree_iter_path(trans, iter));
EBUG_ON(btree_iter_path(trans, iter)->uptodate);
@@ -1878,11 +1999,11 @@ err:
inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
struct bpos pos = iter->k.p;
- bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ bool ret = !(iter->flags & BTREE_ITER_all_snapshots
? bpos_eq(pos, SPOS_MAX)
: bkey_eq(pos, SPOS_MAX));
- if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ if (ret && !(iter->flags & BTREE_ITER_is_extents))
pos = bkey_successor(iter, pos);
bch2_btree_iter_set_pos(iter, pos);
return ret;
@@ -1891,11 +2012,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter)
inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
{
struct bpos pos = bkey_start_pos(&iter->k);
- bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ bool ret = !(iter->flags & BTREE_ITER_all_snapshots
? bpos_eq(pos, POS_MIN)
: bkey_eq(pos, POS_MIN));
- if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ if (ret && !(iter->flags & BTREE_ITER_is_extents))
pos = bkey_predecessor(iter, pos);
bch2_btree_iter_set_pos(iter, pos);
return ret;
@@ -2006,7 +2127,10 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
struct bkey_s_c k;
int ret;
- if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) &&
+ bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked(trans);
+
+ if ((iter->flags & BTREE_ITER_key_cache_fill) &&
bpos_eq(iter->pos, pos))
return bkey_s_c_null;
@@ -2015,17 +2139,17 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
if (!iter->key_cache_path)
iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
- iter->flags & BTREE_ITER_INTENT, 0,
- iter->flags|BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL,
+ iter->flags & BTREE_ITER_intent, 0,
+ iter->flags|BTREE_ITER_cached|
+ BTREE_ITER_cached_nofill,
_THIS_IP_);
iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
- iter->flags|BTREE_ITER_CACHED) ?:
+ iter->flags|BTREE_ITER_cached) ?:
bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
if (unlikely(ret))
return bkey_s_c_err(ret);
@@ -2053,7 +2177,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
struct btree_path_level *l;
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
@@ -2078,7 +2202,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
k = btree_path_level_peek_all(trans->c, l, &iter->k);
- if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
k.k &&
(k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
k = k2;
@@ -2089,10 +2213,10 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
}
}
- if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
+ if (unlikely(iter->flags & BTREE_ITER_with_journal))
k = btree_trans_peek_journal(trans, iter, k);
- if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
bch2_btree_trans_peek_updates(trans, iter, &k);
@@ -2144,11 +2268,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
struct bpos iter_pos;
int ret;
- EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
+ bch2_trans_verify_not_unlocked(trans);
+ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
if (iter->update_path) {
bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
iter->update_path = 0;
}
@@ -2171,7 +2296,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* isn't monotonically increasing before FILTER_SNAPSHOTS, and
* that's what we check against in extents mode:
*/
- if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
+ if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
? bkey_gt(k.k->p, end)
: k.k->p.inode > end.inode))
goto end;
@@ -2179,13 +2304,13 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
if (iter->update_path &&
!bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
iter->update_path = 0;
}
- if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
- (iter->flags & BTREE_ITER_INTENT) &&
- !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ if ((iter->flags & BTREE_ITER_filter_snapshots) &&
+ (iter->flags & BTREE_ITER_intent) &&
+ !(iter->flags & BTREE_ITER_is_extents) &&
!iter->update_path) {
struct bpos pos = k.k->p;
@@ -2200,12 +2325,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* advance, same as on exit for iter->path, but only up
* to snapshot
*/
- __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_INTENT);
+ __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
iter->update_path = iter->path;
iter->update_path = bch2_btree_path_set_pos(trans,
iter->update_path, pos,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
_THIS_IP_);
ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
if (unlikely(ret)) {
@@ -2218,7 +2343,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* We can never have a key in a leaf node at POS_MAX, so
* we don't have to check these successor() calls:
*/
- if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ if ((iter->flags & BTREE_ITER_filter_snapshots) &&
!bch2_snapshot_is_ancestor(trans->c,
iter->snapshot,
k.k->p.snapshot)) {
@@ -2227,7 +2352,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
}
if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+ !(iter->flags & BTREE_ITER_all_snapshots)) {
search_key = bkey_successor(iter, k.k->p);
continue;
}
@@ -2237,12 +2362,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* equal to the key we just returned - except extents can
* straddle iter->pos:
*/
- if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
+ if (!(iter->flags & BTREE_ITER_is_extents))
iter_pos = k.k->p;
else
iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
- if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
+ if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
? bkey_gt(iter_pos, end)
: bkey_ge(iter_pos, end)))
goto end;
@@ -2253,7 +2378,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
iter->pos = iter_pos;
iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
btree_path_set_should_be_locked(btree_iter_path(trans, iter));
@@ -2266,7 +2391,7 @@ out_no_locked:
btree_path_set_should_be_locked(trans->paths + iter->update_path);
}
- if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ if (!(iter->flags & BTREE_ITER_all_snapshots))
iter->pos.snapshot = iter->snapshot;
ret = bch2_btree_iter_verify_ret(iter, k);
@@ -2316,21 +2441,22 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
btree_path_idx_t saved_path = 0;
int ret;
+ bch2_trans_verify_not_unlocked(trans);
EBUG_ON(btree_iter_path(trans, iter)->cached ||
btree_iter_path(trans, iter)->level);
- if (iter->flags & BTREE_ITER_WITH_JOURNAL)
+ if (iter->flags & BTREE_ITER_with_journal)
return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported);
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
- if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ if (iter->flags & BTREE_ITER_filter_snapshots)
search_key.snapshot = U32_MAX;
while (1) {
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
@@ -2345,17 +2471,17 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
k = btree_path_level_peek(trans, path, &path->l[0], &iter->k);
if (!k.k ||
- ((iter->flags & BTREE_ITER_IS_EXTENTS)
+ ((iter->flags & BTREE_ITER_is_extents)
? bpos_ge(bkey_start_pos(k.k), search_key)
: bpos_gt(k.k->p, search_key)))
k = btree_path_level_prev(trans, path, &path->l[0], &iter->k);
- if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
bch2_btree_trans_peek_prev_updates(trans, iter, &k);
if (likely(k.k)) {
- if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
+ if (iter->flags & BTREE_ITER_filter_snapshots) {
if (k.k->p.snapshot == iter->snapshot)
goto got_key;
@@ -2366,7 +2492,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
*/
if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
bch2_path_put_nokeep(trans, iter->path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
iter->path = saved_path;
saved_path = 0;
iter->k = saved_k;
@@ -2379,9 +2505,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
k.k->p.snapshot)) {
if (saved_path)
bch2_path_put_nokeep(trans, saved_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
saved_path = btree_path_clone(trans, iter->path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent,
+ _THIS_IP_);
path = btree_iter_path(trans, iter);
saved_k = *k.k;
saved_v = k.v;
@@ -2392,9 +2519,9 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
}
got_key:
if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+ !(iter->flags & BTREE_ITER_all_snapshots)) {
search_key = bkey_predecessor(iter, k.k->p);
- if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ if (iter->flags & BTREE_ITER_filter_snapshots)
search_key.snapshot = U32_MAX;
continue;
}
@@ -2418,11 +2545,11 @@ got_key:
if (bkey_lt(k.k->p, iter->pos))
iter->pos = k.k->p;
- if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ if (iter->flags & BTREE_ITER_filter_snapshots)
iter->pos.snapshot = iter->snapshot;
out_no_locked:
if (saved_path)
- bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
+ bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent);
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -2452,12 +2579,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
struct bkey_s_c k;
int ret;
+ bch2_trans_verify_not_unlocked(trans);
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
+ EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
/* extents can't span inode numbers: */
- if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ if ((iter->flags & BTREE_ITER_is_extents) &&
unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
if (iter->pos.inode == KEY_INODE_MAX)
return bkey_s_c_null;
@@ -2467,7 +2595,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
search_key = btree_iter_search_key(iter);
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
@@ -2476,22 +2604,22 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
goto out_no_locked;
}
- if ((iter->flags & BTREE_ITER_CACHED) ||
- !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
+ if ((iter->flags & BTREE_ITER_cached) ||
+ !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
k = bkey_s_c_null;
- if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates)) {
bch2_btree_trans_peek_slot_updates(trans, iter, &k);
if (k.k)
goto out;
}
- if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
+ if (unlikely(iter->flags & BTREE_ITER_with_journal) &&
(k = btree_trans_peek_slot_journal(trans, iter)).k)
goto out;
- if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
(k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
if (!bkey_err(k))
iter->k = *k.k;
@@ -2506,12 +2634,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
struct bpos next;
struct bpos end = iter->pos;
- if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ if (iter->flags & BTREE_ITER_is_extents)
end.offset = U64_MAX;
EBUG_ON(btree_iter_path(trans, iter)->level);
- if (iter->flags & BTREE_ITER_INTENT) {
+ if (iter->flags & BTREE_ITER_intent) {
struct btree_iter iter2;
bch2_trans_copy_iter(&iter2, iter);
@@ -2542,7 +2670,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
bkey_init(&iter->k);
iter->k.p = iter->pos;
- if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+ if (iter->flags & BTREE_ITER_is_extents) {
bch2_key_resize(&iter->k,
min_t(u64, KEY_SIZE_MAX,
(next.inode == iter->pos.inode
@@ -2726,13 +2854,13 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
{
if (iter->update_path)
bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
if (iter->path)
bch2_path_put(trans, iter->path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
if (iter->key_cache_path)
bch2_path_put(trans, iter->key_cache_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
iter->path = 0;
iter->update_path = 0;
iter->key_cache_path = 0;
@@ -2757,9 +2885,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
unsigned depth,
unsigned flags)
{
- flags |= BTREE_ITER_NOT_EXTENTS;
- flags |= __BTREE_ITER_ALL_SNAPSHOTS;
- flags |= BTREE_ITER_ALL_SNAPSHOTS;
+ flags |= BTREE_ITER_not_extents;
+ flags |= BTREE_ITER_snapshot_field;
+ flags |= BTREE_ITER_all_snapshots;
bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
__bch2_btree_iter_flags(trans, btree_id, flags),
@@ -2782,9 +2910,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
dst->ip_allocated = _RET_IP_;
#endif
if (src->path)
- __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT);
+ __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_intent);
if (src->update_path)
- __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_INTENT);
+ __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
dst->key_cache_path = 0;
}
@@ -2953,7 +3081,8 @@ u32 bch2_trans_begin(struct btree_trans *trans)
if (!trans->restarted &&
(need_resched() ||
time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
- drop_locks_do(trans, (cond_resched(), 0));
+ bch2_trans_unlock(trans);
+ cond_resched();
now = local_clock();
}
trans->last_begin_time = now;
@@ -2963,11 +3092,14 @@ u32 bch2_trans_begin(struct btree_trans *trans)
bch2_trans_srcu_unlock(trans);
trans->last_begin_ip = _RET_IP_;
+ trans->locked = true;
+
if (trans->restarted) {
bch2_btree_path_traverse_all(trans);
trans->notrace_relock_fail = false;
}
+ bch2_trans_verify_not_unlocked(trans);
return trans->restart_count;
}
@@ -3020,7 +3152,7 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
*/
BUG_ON(pos_task &&
pid == pos_task->pid &&
- bch2_trans_locked(pos));
+ pos->locked);
if (pos_task && pid < pos_task->pid) {
list_add_tail(&trans->list, &pos->list);
@@ -3036,8 +3168,9 @@ got_trans:
trans->last_begin_time = local_clock();
trans->fn_idx = fn_idx;
trans->locking_wait.task = current;
+ trans->locked = true;
trans->journal_replay_not_finished =
- unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
+ unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) &&
atomic_inc_not_zero(&c->journal_keys.ref);
trans->nr_paths = ARRAY_SIZE(trans->_paths);
trans->paths_allocated = trans->_paths_allocated;
@@ -3166,13 +3299,11 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
pid = owner ? owner->pid : 0;
rcu_read_unlock();
- prt_tab(out);
- prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
+ prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
b->level, bch2_btree_id_str(b->btree_id));
bch2_bpos_to_text(out, btree_node_pos(b));
- prt_tab(out);
- prt_printf(out, " locks %u:%u:%u held by pid %u",
+ prt_printf(out, "\t locks %u:%u:%u held by pid %u",
c.n[0], c.n[1], c.n[2], pid);
}
@@ -3229,10 +3360,8 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
b = READ_ONCE(trans->locking);
if (b) {
- prt_printf(out, " blocked for %lluus on",
- div_u64(local_clock() - trans->locking_wait.start_time,
- 1000));
- prt_newline(out);
+ prt_printf(out, " blocked for %lluus on\n",
+ div_u64(local_clock() - trans->locking_wait.start_time, 1000));
prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]);
bch2_btree_bkey_cached_common_to_text(out, b);
prt_newline(out);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 1c70836dd7cc..eab2a25bdc7a 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -216,9 +216,13 @@ int __must_check bch2_btree_path_traverse_one(struct btree_trans *,
btree_path_idx_t,
unsigned, unsigned long);
+static inline void bch2_trans_verify_not_unlocked(struct btree_trans *);
+
static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
btree_path_idx_t path, unsigned flags)
{
+ bch2_trans_verify_not_unlocked(trans);
+
if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK)
return 0;
@@ -227,6 +231,9 @@ static inline int __must_check bch2_btree_path_traverse(struct btree_trans *tran
btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
unsigned, unsigned, unsigned, unsigned long);
+btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id,
+ unsigned, struct bpos);
+
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
/*
@@ -283,7 +290,6 @@ int bch2_trans_relock(struct btree_trans *);
int bch2_trans_relock_notrace(struct btree_trans *);
void bch2_trans_unlock(struct btree_trans *);
void bch2_trans_unlock_long(struct btree_trans *);
-bool bch2_trans_locked(struct btree_trans *);
static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
{
@@ -309,6 +315,14 @@ static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans)
bch2_trans_in_restart_error(trans);
}
+void __noreturn bch2_trans_unlocked_error(struct btree_trans *);
+
+static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans)
+{
+ if (!trans->locked)
+ bch2_trans_unlocked_error(trans);
+}
+
__always_inline
static int btree_trans_restart_nounlock(struct btree_trans *trans, int err)
{
@@ -386,10 +400,10 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos
if (unlikely(iter->update_path))
bch2_path_put(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
+ iter->flags & BTREE_ITER_intent);
iter->update_path = 0;
- if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ if (!(iter->flags & BTREE_ITER_all_snapshots))
new_pos.snapshot = iter->snapshot;
__bch2_btree_iter_set_pos(iter, new_pos);
@@ -397,7 +411,7 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos
static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
{
- BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS));
+ BUG_ON(!(iter->flags & BTREE_ITER_is_extents));
iter->pos = bkey_start_pos(&iter->k);
}
@@ -416,20 +430,20 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
unsigned btree_id,
unsigned flags)
{
- if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
+ if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) &&
btree_id_is_extents(btree_id))
- flags |= BTREE_ITER_IS_EXTENTS;
+ flags |= BTREE_ITER_is_extents;
- if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
+ if (!(flags & BTREE_ITER_snapshot_field) &&
!btree_type_has_snapshot_field(btree_id))
- flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+ flags &= ~BTREE_ITER_all_snapshots;
- if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ if (!(flags & BTREE_ITER_all_snapshots) &&
btree_type_has_snapshots(btree_id))
- flags |= BTREE_ITER_FILTER_SNAPSHOTS;
+ flags |= BTREE_ITER_filter_snapshots;
if (trans->journal_replay_not_finished)
- flags |= BTREE_ITER_WITH_JOURNAL;
+ flags |= BTREE_ITER_with_journal;
return flags;
}
@@ -439,10 +453,10 @@ static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
unsigned flags)
{
if (!btree_id_cached(trans->c, btree_id)) {
- flags &= ~BTREE_ITER_CACHED;
- flags &= ~BTREE_ITER_WITH_KEY_CACHE;
- } else if (!(flags & BTREE_ITER_CACHED))
- flags |= BTREE_ITER_WITH_KEY_CACHE;
+ flags &= ~BTREE_ITER_cached;
+ flags &= ~BTREE_ITER_with_key_cache;
+ } else if (!(flags & BTREE_ITER_cached))
+ flags |= BTREE_ITER_with_key_cache;
return __bch2_btree_iter_flags(trans, btree_id, flags);
}
@@ -494,18 +508,7 @@ void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
unsigned, unsigned, unsigned);
void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
-static inline void set_btree_iter_dontneed(struct btree_iter *iter)
-{
- struct btree_trans *trans = iter->trans;
-
- if (!iter->path || trans->restarted)
- return;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- path->preserve = false;
- if (path->ref == 1)
- path->should_be_locked = false;
-}
+void bch2_set_btree_iter_dontneed(struct btree_iter *);
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
@@ -619,14 +622,14 @@ u32 bch2_trans_begin(struct btree_trans *);
static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
bch2_btree_iter_peek_prev(iter);
}
static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
bch2_btree_iter_peek(iter);
}
@@ -634,7 +637,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *
struct bpos end,
unsigned flags)
{
- if (!(flags & BTREE_ITER_SLOTS))
+ if (!(flags & BTREE_ITER_slots))
return bch2_btree_iter_peek_upto(iter, end);
if (bkey_gt(iter->pos, end))
@@ -699,16 +702,12 @@ transaction_restart: \
_ret2 ?: trans_was_restarted(_trans, _restart_count); \
})
-#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
- _start, _end, _flags, _k, _do) \
+#define for_each_btree_key_upto_continue(_trans, _iter, \
+ _end, _flags, _k, _do) \
({ \
- struct btree_iter _iter; \
struct bkey_s_c _k; \
int _ret3 = 0; \
\
- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- \
do { \
_ret3 = lockrestart_do(_trans, ({ \
(_k) = bch2_btree_iter_peek_upto_type(&(_iter), \
@@ -724,6 +723,21 @@ transaction_restart: \
_ret3; \
})
+#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \
+ for_each_btree_key_upto_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do)
+
+#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _do) \
+({ \
+ bch2_trans_begin(trans); \
+ \
+ struct btree_iter _iter; \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ for_each_btree_key_upto_continue(_trans, _iter, _end, _flags, _k, _do);\
+})
+
#define for_each_btree_key(_trans, _iter, _btree_id, \
_start, _flags, _k, _do) \
for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \
@@ -794,14 +808,6 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
return k;
}
-#define for_each_btree_key_old(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
-
#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \
_start, _end, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
@@ -861,6 +867,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
})
void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
+void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
void bch2_dump_trans_updates(struct btree_trans *);
void bch2_dump_trans_paths_updates(struct btree_trans *);
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 1e8cf49a6935..332dbf164929 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -623,3 +623,20 @@ void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
keys->data[dst++] = *i;
keys->nr = keys->gap = dst;
}
+
+void bch2_journal_keys_dump(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct printbuf buf = PRINTBUF;
+
+ pr_info("%zu keys:", keys->nr);
+
+ move_gap(keys, keys->nr);
+
+ darray_for_each(*keys, i) {
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
+ pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf);
+ }
+ printbuf_exit(&buf);
+}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
index af25046ebcaa..1ba4a79b0ef9 100644
--- a/fs/bcachefs/btree_journal_iter.h
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -70,4 +70,6 @@ void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
unsigned, unsigned,
struct bpos, struct bpos);
+void bch2_journal_keys_dump(struct bch_fs *);
+
#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 7dafa1accec2..75f5e6fe4634 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -383,9 +383,9 @@ static int btree_key_cache_fill(struct btree_trans *trans,
int ret;
bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
- BTREE_ITER_KEY_CACHE_FILL|
- BTREE_ITER_CACHED_NOFILL);
- iter.flags &= ~BTREE_ITER_WITH_JOURNAL;
+ BTREE_ITER_key_cache_fill|
+ BTREE_ITER_cached_nofill);
+ iter.flags &= ~BTREE_ITER_with_journal;
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
@@ -456,7 +456,7 @@ static int btree_key_cache_fill(struct btree_trans *trans,
bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
/* We're not likely to need this iterator again: */
- set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(&iter);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -515,23 +515,10 @@ retry:
fill:
path->uptodate = BTREE_ITER_UPTODATE;
- if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
- /*
- * Using the underscore version because we haven't set
- * path->uptodate yet:
- */
- if (!path->locks_want &&
- !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
- trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
- goto err;
- }
-
- ret = btree_key_cache_fill(trans, path, ck);
- if (ret)
- goto err;
-
- ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
+ if (!ck->valid && !(flags & BTREE_ITER_cached_nofill)) {
+ ret = bch2_btree_path_upgrade(trans, path, 1) ?:
+ btree_key_cache_fill(trans, path, ck) ?:
+ bch2_btree_path_relock(trans, path, _THIS_IP_);
if (ret)
goto err;
@@ -622,13 +609,13 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
int ret;
bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
- BTREE_ITER_SLOTS|
- BTREE_ITER_INTENT|
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_slots|
+ BTREE_ITER_intent|
+ BTREE_ITER_all_snapshots);
bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
- b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
+ b_iter.flags &= ~BTREE_ITER_with_key_cache;
ret = bch2_btree_iter_traverse(&c_iter);
if (ret)
@@ -661,14 +648,14 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
commit_flags |= BCH_WATERMARK_reclaim;
if (ck->journal.seq != journal_last_seq(j) ||
- !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags))
+ !test_bit(JOURNAL_space_low, &c->journal.flags))
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
ret = bch2_btree_iter_traverse(&b_iter) ?:
bch2_trans_update(trans, &b_iter, ck->k,
- BTREE_UPDATE_KEY_CACHE_RECLAIM|
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
- BTREE_TRIGGER_NORUN) ?:
+ BTREE_UPDATE_key_cache_reclaim|
+ BTREE_UPDATE_internal_snapshot_node|
+ BTREE_TRIGGER_norun) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
@@ -790,7 +777,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
* flushing. The flush callback will not proceed unless ->seq matches
* the latest pin, so make sure it starts with a consistent value.
*/
- if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
+ if (!(insert_entry->flags & BTREE_UPDATE_nojournal) ||
!journal_pin_active(&ck->journal)) {
ck->seq = trans->journal_res.seq;
}
@@ -835,6 +822,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
int srcu_idx;
mutex_lock(&bc->lock);
+ bc->requested_to_free += sc->nr_to_scan;
+
srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
flags = memalloc_nofs_save();
@@ -853,6 +842,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
atomic_long_dec(&bc->nr_freed);
freed++;
bc->nr_freed_nonpcpu--;
+ bc->freed++;
}
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
@@ -866,6 +856,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
atomic_long_dec(&bc->nr_freed);
freed++;
bc->nr_freed_pcpu--;
+ bc->freed++;
}
rcu_read_lock();
@@ -884,13 +875,18 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
ck = container_of(pos, struct bkey_cached, hash);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ bc->skipped_dirty++;
goto next;
} else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+ bc->skipped_accessed++;
goto next;
} else if (bkey_cached_lock_for_evict(ck)) {
bkey_cached_evict(bc, ck);
bkey_cached_free(bc, ck);
+ bc->moved_to_freelist++;
+ } else {
+ bc->skipped_lock_fail++;
}
scanned++;
@@ -1037,14 +1033,47 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
return 0;
}
-void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
+void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
{
- prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed));
- prt_newline(out);
- prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys));
- prt_newline(out);
- prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty));
- prt_newline(out);
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+ printbuf_tabstop_push(out, 24);
+ printbuf_tabstop_push(out, 12);
+
+ unsigned flags = memalloc_nofs_save();
+ mutex_lock(&bc->lock);
+ prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys));
+ prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty));
+ prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed));
+ prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu);
+ prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu);
+
+ prt_printf(out, "\nshrinker:\n");
+ prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free);
+ prt_printf(out, "freed:\t%lu\r\n", bc->freed);
+ prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist);
+ prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty);
+ prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed);
+ prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail);
+
+ prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier));
+
+ struct bkey_cached *ck;
+ unsigned iter = 0;
+ list_for_each_entry(ck, &bc->freed_nonpcpu, list) {
+ prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
+ if (++iter > 10)
+ break;
+ }
+
+ iter = 0;
+ list_for_each_entry(ck, &bc->freed_pcpu, list) {
+ prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
+ if (++iter > 10)
+ break;
+ }
+ mutex_unlock(&bc->lock);
+ memalloc_flags_restore(flags);
}
void bch2_btree_key_cache_exit(void)
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
index 290e4e57df5b..237e8bb3ac40 100644
--- a/fs/bcachefs/btree_key_cache_types.h
+++ b/fs/bcachefs/btree_key_cache_types.h
@@ -24,6 +24,14 @@ struct btree_key_cache {
atomic_long_t nr_freed;
atomic_long_t nr_keys;
atomic_long_t nr_dirty;
+
+ /* shrinker stats */
+ unsigned long requested_to_free;
+ unsigned long freed;
+ unsigned long moved_to_freelist;
+ unsigned long skipped_dirty;
+ unsigned long skipped_accessed;
+ unsigned long skipped_lock_fail;
};
struct bkey_cached_key {
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index f2caf491957e..c3e9b0cc7bbd 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -83,8 +83,7 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
{
struct trans_waiting_for_lock *i;
- prt_printf(out, "Found lock cycle (%u entries):", g->nr);
- prt_newline(out);
+ prt_printf(out, "Found lock cycle (%u entries):\n", g->nr);
for (i = g->g; i < g->g + g->nr; i++) {
struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
@@ -224,8 +223,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
bch2_btree_trans_to_text(&buf, trans);
- prt_printf(&buf, "backtrace:");
- prt_newline(&buf);
+ prt_printf(&buf, "backtrace:\n");
printbuf_indent_add(&buf, 2);
bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
printbuf_indent_sub(&buf, 2);
@@ -492,8 +490,6 @@ static inline bool btree_path_get_locks(struct btree_trans *trans,
if (path->uptodate == BTREE_ITER_NEED_RELOCK)
path->uptodate = BTREE_ITER_UPTODATE;
- bch2_trans_verify_locks(trans);
-
return path->uptodate < BTREE_ITER_NEED_RELOCK;
}
@@ -609,7 +605,9 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_pa
{
struct get_locks_fail f;
- return btree_path_get_locks(trans, path, false, &f);
+ bool ret = btree_path_get_locks(trans, path, false, &f);
+ bch2_trans_verify_locks(trans);
+ return ret;
}
int __bch2_btree_path_relock(struct btree_trans *trans,
@@ -632,7 +630,9 @@ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
path->locks_want = new_locks_want;
- return btree_path_get_locks(trans, path, true, f);
+ bool ret = btree_path_get_locks(trans, path, true, f);
+ bch2_trans_verify_locks(trans);
+ return ret;
}
bool __bch2_btree_path_upgrade(struct btree_trans *trans,
@@ -640,8 +640,9 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
unsigned new_locks_want,
struct get_locks_fail *f)
{
- if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
- return true;
+ bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f);
+ if (ret)
+ goto out;
/*
* XXX: this is ugly - we'd prefer to not be mucking with other
@@ -675,8 +676,9 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
btree_path_get_locks(trans, linked, true, NULL);
}
}
-
- return false;
+out:
+ bch2_trans_verify_locks(trans);
+ return ret;
}
void __bch2_btree_path_downgrade(struct btree_trans *trans,
@@ -725,82 +727,100 @@ void bch2_trans_downgrade(struct btree_trans *trans)
bch2_btree_path_downgrade(trans, path);
}
-int bch2_trans_relock(struct btree_trans *trans)
+static inline void __bch2_trans_unlock(struct btree_trans *trans)
{
struct btree_path *path;
unsigned i;
- if (unlikely(trans->restarted))
- return -((int) trans->restarted);
+ trans_for_each_path(trans, path, i)
+ __bch2_btree_path_unlock(trans, path);
+}
- trans_for_each_path(trans, path, i) {
- struct get_locks_fail f;
+static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
+ struct get_locks_fail *f, bool trace)
+{
+ if (!trace)
+ goto out;
- if (path->should_be_locked &&
- !btree_path_get_locks(trans, path, false, &f)) {
- if (trace_trans_restart_relock_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bpos_to_text(&buf, path->pos);
- prt_printf(&buf, " l=%u seq=%u node seq=",
- f.l, path->l[f.l].lock_seq);
- if (IS_ERR_OR_NULL(f.b)) {
- prt_str(&buf, bch2_err_str(PTR_ERR(f.b)));
- } else {
- prt_printf(&buf, "%u", f.b->c.lock.seq);
-
- struct six_lock_count c =
- bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l);
- prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
-
- c = six_lock_counts(&f.b->c.lock);
- prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
- }
+ if (trace_trans_restart_relock_enabled()) {
+ struct printbuf buf = PRINTBUF;
- trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
- printbuf_exit(&buf);
- }
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq);
+ if (IS_ERR_OR_NULL(f->b)) {
+ prt_str(&buf, bch2_err_str(PTR_ERR(f->b)));
+ } else {
+ prt_printf(&buf, "%u", f->b->c.lock.seq);
- count_event(trans->c, trans_restart_relock);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ struct six_lock_count c =
+ bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l);
+ prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+
+ c = six_lock_counts(&f->b->c.lock);
+ prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
}
+
+ trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
+ printbuf_exit(&buf);
}
- return 0;
+ count_event(trans->c, trans_restart_relock);
+out:
+ __bch2_trans_unlock(trans);
+ bch2_trans_verify_locks(trans);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
-int bch2_trans_relock_notrace(struct btree_trans *trans)
+static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
{
- struct btree_path *path;
- unsigned i;
+ bch2_trans_verify_locks(trans);
if (unlikely(trans->restarted))
return -((int) trans->restarted);
+ if (unlikely(trans->locked))
+ goto out;
+
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path(trans, path, i) {
+ struct get_locks_fail f;
- trans_for_each_path(trans, path, i)
if (path->should_be_locked &&
- !bch2_btree_path_relock_norestart(trans, path)) {
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
- }
+ !btree_path_get_locks(trans, path, false, &f))
+ return bch2_trans_relock_fail(trans, path, &f, trace);
+ }
+
+ trans->locked = true;
+out:
+ bch2_trans_verify_locks(trans);
return 0;
}
+int bch2_trans_relock(struct btree_trans *trans)
+{
+ return __bch2_trans_relock(trans, true);
+}
+
+int bch2_trans_relock_notrace(struct btree_trans *trans)
+{
+ return __bch2_trans_relock(trans, false);
+}
+
void bch2_trans_unlock_noassert(struct btree_trans *trans)
{
- struct btree_path *path;
- unsigned i;
+ __bch2_trans_unlock(trans);
- trans_for_each_path(trans, path, i)
- __bch2_btree_path_unlock(trans, path);
+ trans->locked = false;
+ trans->last_unlock_ip = _RET_IP_;
}
void bch2_trans_unlock(struct btree_trans *trans)
{
- struct btree_path *path;
- unsigned i;
+ __bch2_trans_unlock(trans);
- trans_for_each_path(trans, path, i)
- __bch2_btree_path_unlock(trans, path);
+ trans->locked = false;
+ trans->last_unlock_ip = _RET_IP_;
}
void bch2_trans_unlock_long(struct btree_trans *trans)
@@ -809,17 +829,6 @@ void bch2_trans_unlock_long(struct btree_trans *trans)
bch2_trans_srcu_unlock(trans);
}
-bool bch2_trans_locked(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- if (path->nodes_locked)
- return true;
- return false;
-}
-
int __bch2_trans_mutex_lock(struct btree_trans *trans,
struct mutex *lock)
{
@@ -836,15 +845,19 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans,
void bch2_btree_path_verify_locks(struct btree_path *path)
{
- unsigned l;
+ /*
+ * A path may be uptodate and yet have nothing locked if and only if
+ * there is no node at path->level, which generally means we were
+ * iterating over all nodes and got to the end of the btree
+ */
+ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
+ btree_path_node(path, path->level) &&
+ !path->nodes_locked);
- if (!path->nodes_locked) {
- BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
- btree_path_node(path, path->level));
+ if (!path->nodes_locked)
return;
- }
- for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+ for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
int want = btree_lock_want(path, l);
int have = btree_node_locked_type(path, l);
@@ -857,8 +870,24 @@ void bch2_btree_path_verify_locks(struct btree_path *path)
}
}
+static bool bch2_trans_locked(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path(trans, path, i)
+ if (path->nodes_locked)
+ return true;
+ return false;
+}
+
void bch2_trans_verify_locks(struct btree_trans *trans)
{
+ if (!trans->locked) {
+ BUG_ON(bch2_trans_locked(trans));
+ return;
+ }
+
struct btree_path *path;
unsigned i;
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 4bd72c855da1..7f41545b9147 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -364,14 +364,14 @@ static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
- struct get_locks_fail f;
+ struct get_locks_fail f = {};
unsigned old_locks_want = path->locks_want;
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
if (path->locks_want < new_locks_want
? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
- : path->uptodate == BTREE_ITER_UPTODATE)
+ : path->nodes_locked)
return 0;
trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index bbec91e8e650..74e1ff225674 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "alloc_foreground.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
@@ -19,6 +20,26 @@
#include <linux/prefetch.h>
+static const char * const trans_commit_flags_strs[] = {
+#define x(n, ...) #n,
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
+ NULL
+};
+
+void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags)
+{
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
+
+ prt_printf(out, "watermark=%s", bch2_watermarks[watermark]);
+
+ flags >>= BCH_WATERMARK_BITS;
+ if (flags) {
+ prt_char(out, ' ');
+ bch2_prt_bitflags(out, trans_commit_flags_strs, flags);
+ }
+}
+
static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
{
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -315,8 +336,8 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
BUG_ON(i->btree_id != path->btree_id);
EBUG_ON(!i->level &&
btree_type_has_snapshots(i->btree_id) &&
- !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
- test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
+ !(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
+ test_bit(JOURNAL_replay_done, &trans->c->journal.flags) &&
i->k->k.p.snapshot &&
bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
}
@@ -443,13 +464,13 @@ static int run_one_mem_trigger(struct btree_trans *trans,
verify_update_old_key(trans, i);
- if (unlikely(flags & BTREE_TRIGGER_NORUN))
+ if (unlikely(flags & BTREE_TRIGGER_norun))
return 0;
if (old_ops->trigger == new_ops->trigger) {
ret = bch2_key_trigger(trans, i->btree_id, i->level,
old, bkey_i_to_s(new),
- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
+ BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags);
} else {
ret = bch2_key_trigger_new(trans, i->btree_id, i->level,
bkey_i_to_s(new), flags) ?:
@@ -472,11 +493,11 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
struct bkey_s_c old = { &old_k, i->old_v };
const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
- unsigned flags = i->flags|BTREE_TRIGGER_TRANSACTIONAL;
+ unsigned flags = i->flags|BTREE_TRIGGER_transactional;
verify_update_old_key(trans, i);
- if ((i->flags & BTREE_TRIGGER_NORUN) ||
+ if ((i->flags & BTREE_TRIGGER_norun) ||
!(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
return 0;
@@ -486,8 +507,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
i->overwrite_trigger_run = true;
i->insert_trigger_run = true;
return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
- BTREE_TRIGGER_INSERT|
- BTREE_TRIGGER_OVERWRITE|flags) ?: 1;
+ BTREE_TRIGGER_insert|
+ BTREE_TRIGGER_overwrite|flags) ?: 1;
} else if (overwrite && !i->overwrite_trigger_run) {
i->overwrite_trigger_run = true;
return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
@@ -572,7 +593,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
#ifdef CONFIG_BCACHEFS_DEBUG
trans_for_each_update(trans, i)
- BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
+ BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
(!i->insert_trigger_run || !i->overwrite_trigger_run));
#endif
@@ -590,7 +611,7 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) &&
gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) {
- int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
+ int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc);
if (ret)
return ret;
}
@@ -609,6 +630,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
unsigned u64s = 0;
int ret;
+ bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_in_restart(trans);
+
if (race_fault()) {
trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
@@ -686,7 +710,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
trans_for_each_update(trans, i)
if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) {
- ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags);
+ ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags);
if (ret)
goto fatal_err;
}
@@ -705,7 +729,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
if (i->key_cache_already_flushed)
continue;
- if (i->flags & BTREE_UPDATE_NOJOURNAL)
+ if (i->flags & BTREE_UPDATE_nojournal)
continue;
verify_update_old_key(trans, i);
@@ -766,16 +790,15 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans
}
static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct btree_insert_entry *i,
struct printbuf *err)
{
struct bch_fs *c = trans->c;
printbuf_reset(err);
- prt_printf(err, "invalid bkey on insert from %s -> %ps",
+ prt_printf(err, "invalid bkey on insert from %s -> %ps\n",
trans->fn, (void *) i->ip_allocated);
- prt_newline(err);
printbuf_indent_add(err, 2);
bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
@@ -796,8 +819,7 @@ static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "invalid bkey on insert from %s", trans->fn);
- prt_newline(&buf);
+ prt_printf(&buf, "invalid bkey on insert from %s\n", trans->fn);
printbuf_indent_add(&buf, 2);
bch2_journal_entry_to_text(&buf, c, i);
@@ -988,6 +1010,9 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
struct bch_fs *c = trans->c;
int ret = 0;
+ bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_in_restart(trans);
+
if (!trans->nr_updates &&
!trans->journal_entries_u64s)
goto out_reset;
@@ -1000,10 +1025,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
trans_for_each_update(trans, i) {
struct printbuf buf = PRINTBUF;
- enum bkey_invalid_flags invalid_flags = 0;
+ enum bch_validate_flags invalid_flags = 0;
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
+ invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
i->bkey_type, invalid_flags, &buf)))
@@ -1018,10 +1043,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
for (struct jset_entry *i = trans->journal_entries;
i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
i = vstruct_next(i)) {
- enum bkey_invalid_flags invalid_flags = 0;
+ enum bch_validate_flags invalid_flags = 0;
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
+ invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
if (unlikely(bch2_journal_entry_validate(c, NULL, i,
bcachefs_metadata_version_current,
@@ -1065,7 +1090,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (i->key_cache_already_flushed)
continue;
- if (i->flags & BTREE_UPDATE_NOJOURNAL)
+ if (i->flags & BTREE_UPDATE_nojournal)
continue;
/* we're going to journal the key being updated: */
@@ -1086,6 +1111,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
}
retry:
errored_at = NULL;
+ bch2_trans_verify_not_unlocked(trans);
bch2_trans_verify_not_in_restart(trans);
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index c69b233c41bb..d63db4fefe73 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -163,9 +163,21 @@ struct btree_cache {
/* Number of elements in live + freeable lists */
unsigned used;
unsigned reserve;
+ unsigned freed;
+ unsigned not_freed_lock_intent;
+ unsigned not_freed_lock_write;
+ unsigned not_freed_dirty;
+ unsigned not_freed_read_in_flight;
+ unsigned not_freed_write_in_flight;
+ unsigned not_freed_noevict;
+ unsigned not_freed_write_blocked;
+ unsigned not_freed_will_make_reachable;
+ unsigned not_freed_access_bit;
atomic_t dirty;
struct shrinker *shrink;
+ unsigned used_by_btree[BTREE_ID_NR];
+
/*
* If we need to allocate memory for a new btree node and that
* allocation fails, we can cannibalize another node in the btree cache
@@ -187,36 +199,89 @@ struct btree_node_iter {
} data[MAX_BSETS];
};
+#define BTREE_ITER_FLAGS() \
+ x(slots) \
+ x(intent) \
+ x(prefetch) \
+ x(is_extents) \
+ x(not_extents) \
+ x(cached) \
+ x(with_key_cache) \
+ x(with_updates) \
+ x(with_journal) \
+ x(snapshot_field) \
+ x(all_snapshots) \
+ x(filter_snapshots) \
+ x(nopreserve) \
+ x(cached_nofill) \
+ x(key_cache_fill) \
+
+#define STR_HASH_FLAGS() \
+ x(must_create) \
+ x(must_replace)
+
+#define BTREE_UPDATE_FLAGS() \
+ x(internal_snapshot_node) \
+ x(nojournal) \
+ x(key_cache_reclaim)
+
+
/*
- * Iterate over all possible positions, synthesizing deleted keys for holes:
- */
-static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0;
-/*
- * Indicates that intent locks should be taken on leaf nodes, because we expect
- * to be doing updates:
- */
-static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 1;
-/*
- * Causes the btree iterator code to prefetch additional btree nodes from disk:
- */
-static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 2;
-/*
- * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
- * @pos or the first key strictly greater than @pos
+ * BTREE_TRIGGER_norun - don't run triggers at all
+ *
+ * BTREE_TRIGGER_transactional - we're running transactional triggers as part of
+ * a transaction commit: triggers may generate new updates
+ *
+ * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction
+ * commit: we have our journal reservation, we're holding btree node write
+ * locks, and we know the transaction is going to commit (returning an error
+ * here is a fatal error, causing us to go emergency read-only)
+ *
+ * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. disk usage
+ *
+ * BTREE_TRIGGER_insert - @new is entering the btree
+ * BTREE_TRIGGER_overwrite - @old is leaving the btree
+ *
+ * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc
+ * trigger
*/
-static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 3;
-static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 4;
-static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 5;
-static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 6;
-static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 7;
-static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 8;
-static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 9;
-static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 10;
-static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 11;
-static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 12;
-static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 13;
-static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 14;
-#define __BTREE_ITER_FLAGS_END 15
+#define BTREE_TRIGGER_FLAGS() \
+ x(norun) \
+ x(transactional) \
+ x(atomic) \
+ x(check_repair) \
+ x(gc) \
+ x(insert) \
+ x(overwrite) \
+ x(is_root) \
+ x(bucket_invalidate)
+
+enum {
+#define x(n) BTREE_ITER_FLAG_BIT_##n,
+ BTREE_ITER_FLAGS()
+ STR_HASH_FLAGS()
+ BTREE_UPDATE_FLAGS()
+ BTREE_TRIGGER_FLAGS()
+#undef x
+};
+
+/* iter flags must fit in a u16: */
+//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15);
+
+enum btree_iter_update_trigger_flags {
+#define x(n) BTREE_ITER_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ BTREE_ITER_FLAGS()
+#undef x
+#define x(n) STR_HASH_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ STR_HASH_FLAGS()
+#undef x
+#define x(n) BTREE_UPDATE_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ BTREE_UPDATE_FLAGS()
+#undef x
+#define x(n) BTREE_TRIGGER_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ BTREE_TRIGGER_FLAGS()
+#undef x
+};
enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
@@ -307,7 +372,7 @@ struct btree_iter {
*/
struct bkey k;
- /* BTREE_ITER_WITH_JOURNAL: */
+ /* BTREE_ITER_with_journal: */
size_t journal_idx;
#ifdef TRACK_PATH_ALLOCATED
unsigned long ip_allocated;
@@ -418,6 +483,8 @@ struct btree_trans {
u8 lock_must_abort;
bool lock_may_not_fail:1;
bool srcu_held:1;
+ bool locked:1;
+ bool write_locked:1;
bool used_mempool:1;
bool in_traverse_all:1;
bool paths_sorted:1;
@@ -425,13 +492,13 @@ struct btree_trans {
bool journal_transaction_names:1;
bool journal_replay_not_finished:1;
bool notrace_relock_fail:1;
- bool write_locked:1;
enum bch_errcode restarted:16;
u32 restart_count;
u64 last_begin_time;
unsigned long last_begin_ip;
unsigned long last_restarted_ip;
+ unsigned long last_unlock_ip;
unsigned long srcu_lock_time;
const char *fn;
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 8e47e260eba5..f3c645a43dcb 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -25,14 +25,14 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
static int __must_check
bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t,
- struct bkey_i *, enum btree_update_flags,
+ struct bkey_i *, enum btree_iter_update_trigger_flags,
unsigned long ip);
static noinline int extent_front_merge(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
struct bkey_i **insert,
- enum btree_update_flags flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
struct bkey_i *update;
@@ -104,8 +104,8 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
pos.snapshot++;
for_each_btree_key_norestart(trans, iter, btree_id, pos,
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_NOPRESERVE, k, ret) {
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_nopreserve, k, ret) {
if (!bkey_eq(k.k->p, pos))
break;
@@ -138,8 +138,8 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
darray_init(&s);
bch2_trans_iter_init(trans, &old_iter, id, old_pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_not_extents|
+ BTREE_ITER_all_snapshots);
while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
!(ret = bkey_err(old_k)) &&
bkey_eq(old_pos, old_k.k->p)) {
@@ -151,8 +151,8 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
continue;
new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_INTENT);
+ BTREE_ITER_not_extents|
+ BTREE_ITER_intent);
ret = bkey_err(new_k);
if (ret)
break;
@@ -168,7 +168,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
update->k.type = KEY_TYPE_whiteout;
ret = bch2_trans_update(trans, &new_iter, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
}
bch2_trans_iter_exit(trans, &new_iter);
@@ -185,7 +185,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
struct btree_iter *iter,
- enum btree_update_flags flags,
+ enum btree_iter_update_trigger_flags flags,
struct bkey_s_c old,
struct bkey_s_c new)
{
@@ -218,7 +218,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
old.k->p, update->k.p) ?:
bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+ BTREE_UPDATE_internal_snapshot_node|flags);
if (ret)
return ret;
}
@@ -235,7 +235,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
old.k->p, update->k.p) ?:
bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+ BTREE_UPDATE_internal_snapshot_node|flags);
if (ret)
return ret;
}
@@ -260,7 +260,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
}
ret = bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+ BTREE_UPDATE_internal_snapshot_node|flags);
if (ret)
return ret;
}
@@ -273,7 +273,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
bch2_cut_front(new.k->p, update);
ret = bch2_trans_update_by_path(trans, iter->path, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ BTREE_UPDATE_internal_snapshot_node|
flags, _RET_IP_);
if (ret)
return ret;
@@ -285,7 +285,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
static int bch2_trans_update_extent(struct btree_trans *trans,
struct btree_iter *orig_iter,
struct bkey_i *insert,
- enum btree_update_flags flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter;
struct bkey_s_c k;
@@ -293,9 +293,9 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
int ret = 0;
bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
- BTREE_ITER_INTENT|
- BTREE_ITER_WITH_UPDATES|
- BTREE_ITER_NOT_EXTENTS);
+ BTREE_ITER_intent|
+ BTREE_ITER_with_updates|
+ BTREE_ITER_not_extents);
k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
@@ -346,7 +346,7 @@ err:
static noinline int flush_new_cached_update(struct btree_trans *trans,
struct btree_insert_entry *i,
- enum btree_update_flags flags,
+ enum btree_iter_update_trigger_flags flags,
unsigned long ip)
{
struct bkey k;
@@ -354,7 +354,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans,
btree_path_idx_t path_idx =
bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0,
- BTREE_ITER_INTENT, _THIS_IP_);
+ BTREE_ITER_intent, _THIS_IP_);
ret = bch2_btree_path_traverse(trans, path_idx, 0);
if (ret)
goto out;
@@ -372,7 +372,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans,
goto out;
i->key_cache_already_flushed = true;
- i->flags |= BTREE_TRIGGER_NORUN;
+ i->flags |= BTREE_TRIGGER_norun;
btree_path_set_should_be_locked(btree_path);
ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
@@ -383,7 +383,7 @@ out:
static int __must_check
bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
- struct bkey_i *k, enum btree_update_flags flags,
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
unsigned long ip)
{
struct bch_fs *c = trans->c;
@@ -479,15 +479,15 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
if (!iter->key_cache_path)
iter->key_cache_path =
bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
- BTREE_ITER_INTENT|
- BTREE_ITER_CACHED, _THIS_IP_);
+ BTREE_ITER_intent|
+ BTREE_ITER_cached, _THIS_IP_);
iter->key_cache_path =
bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
- iter->flags & BTREE_ITER_INTENT,
+ iter->flags & BTREE_ITER_intent,
_THIS_IP_);
- ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_CACHED);
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached);
if (unlikely(ret))
return ret;
@@ -505,17 +505,17 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
}
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_i *k, enum btree_update_flags flags)
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
{
btree_path_idx_t path_idx = iter->update_path ?: iter->path;
int ret;
- if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ if (iter->flags & BTREE_ITER_is_extents)
return bch2_trans_update_extent(trans, iter, k, flags);
if (bkey_deleted(&k->k) &&
- !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
- (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+ !(flags & BTREE_UPDATE_key_cache_reclaim) &&
+ (iter->flags & BTREE_ITER_filter_snapshots)) {
ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
if (unlikely(ret < 0))
return ret;
@@ -528,7 +528,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
* Ensure that updates to cached btrees go to the key cache:
*/
struct btree_path *path = trans->paths + path_idx;
- if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+ if (!(flags & BTREE_UPDATE_key_cache_reclaim) &&
!path->cached &&
!path->level &&
btree_id_cached(trans->c, path->btree_id)) {
@@ -587,7 +587,7 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k;
int ret = 0;
- bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent);
k = bch2_btree_iter_prev(iter);
ret = bkey_err(k);
if (ret)
@@ -621,15 +621,15 @@ void bch2_trans_commit_hook(struct btree_trans *trans,
int bch2_btree_insert_nonextent(struct btree_trans *trans,
enum btree_id btree, struct bkey_i *k,
- enum btree_update_flags flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter;
int ret;
bch2_trans_iter_init(trans, &iter, btree, k->k.p,
- BTREE_ITER_CACHED|
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_INTENT);
+ BTREE_ITER_cached|
+ BTREE_ITER_not_extents|
+ BTREE_ITER_intent);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(trans, &iter, k, flags);
bch2_trans_iter_exit(trans, &iter);
@@ -637,16 +637,13 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans,
}
int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
- struct bkey_i *k, enum btree_update_flags flags)
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter;
- int ret;
-
bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, k, flags);
+ BTREE_ITER_intent|flags);
+ int ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -698,8 +695,8 @@ int bch2_btree_delete(struct btree_trans *trans,
int ret;
bch2_trans_iter_init(trans, &iter, btree, pos,
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_btree_delete_at(trans, &iter, update_flags);
bch2_trans_iter_exit(trans, &iter);
@@ -717,7 +714,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
struct bkey_s_c k;
int ret = 0;
- bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent);
while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(trans->c, 0);
@@ -745,7 +742,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
*/
delete.k.p = iter.pos;
- if (iter.flags & BTREE_ITER_IS_EXTENTS)
+ if (iter.flags & BTREE_ITER_is_extents)
bch2_key_resize(&delete.k,
bpos_min(end, k.k->p).offset -
iter.pos.offset);
@@ -804,7 +801,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
k->k.p = pos;
struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(trans, &iter, k, 0);
@@ -852,7 +849,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
if (ret)
goto err;
- if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
+ if (!test_bit(JOURNAL_running, &c->journal.flags)) {
ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s));
if (ret)
goto err;
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index cc7c53e83f89..b4894e4d5447 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -44,16 +44,18 @@ enum bch_trans_commit_flags {
#undef x
};
+void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags);
+
int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
unsigned, unsigned);
int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
- struct bkey_i *, enum btree_update_flags);
+ struct bkey_i *, enum btree_iter_update_trigger_flags);
int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
- enum btree_update_flags);
+ enum btree_iter_update_trigger_flags);
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *, int flags);
@@ -94,14 +96,14 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
}
int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
- enum btree_update_flags,
+ enum btree_iter_update_trigger_flags,
struct bkey_s_c, struct bkey_s_c);
int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
enum btree_id, struct bpos);
int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, enum btree_update_flags);
+ struct bkey_i *, enum btree_iter_update_trigger_flags);
struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned);
@@ -276,7 +278,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr
unsigned flags, unsigned type, unsigned min_bytes)
{
struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
- btree_id, pos, flags|BTREE_ITER_INTENT, type);
+ btree_id, pos, flags|BTREE_ITER_intent, type);
struct bkey_i *ret = IS_ERR(k.k)
? ERR_CAST(k.k)
: __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes);
@@ -299,7 +301,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
unsigned flags, unsigned type, unsigned min_bytes)
{
struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter,
- btree_id, pos, flags|BTREE_ITER_INTENT, type, min_bytes);
+ btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes);
int ret;
if (IS_ERR(mut))
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index b4efd8cc4d1a..60b8544cea48 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -38,22 +38,6 @@ static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
btree_path_idx_t, struct btree *, struct keylist *);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
- enum btree_id btree_id,
- unsigned level,
- struct bpos pos)
-{
- btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
- BTREE_ITER_NOPRESERVE|
- BTREE_ITER_INTENT, _RET_IP_);
- path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
-
- struct btree_path *path = trans->paths + path_idx;
- bch2_btree_path_downgrade(trans, path);
- __bch2_btree_path_unlock(trans, path);
- return path_idx;
-}
-
/*
* Verify that child nodes correctly span parent node's range:
*/
@@ -73,6 +57,24 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
!bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
b->data->min_key));
+ if (b == btree_node_root(c, b)) {
+ if (!bpos_eq(b->data->min_key, POS_MIN)) {
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->min_key);
+ need_fsck_err(c, btree_root_bad_min_key,
+ "btree root with incorrect min_key: %s", buf.buf);
+ goto topology_repair;
+ }
+
+ if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->max_key);
+ need_fsck_err(c, btree_root_bad_max_key,
+ "btree root with incorrect max_key: %s", buf.buf);
+ goto topology_repair;
+ }
+ }
+
if (!b->c.level)
return 0;
@@ -158,7 +160,6 @@ topology_repair:
static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
{
struct bkey_packed *k;
- struct bset_tree *t;
struct bkey uk;
for_each_bset(b, t)
@@ -646,7 +647,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
- BTREE_TRIGGER_TRANSACTIONAL);
+ BTREE_TRIGGER_transactional);
if (ret)
return ret;
}
@@ -655,7 +656,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
- BTREE_TRIGGER_TRANSACTIONAL);
+ BTREE_TRIGGER_transactional);
if (ret)
return ret;
}
@@ -735,9 +736,6 @@ err:
*/
b = READ_ONCE(as->b);
if (b) {
- btree_path_idx_t path_idx = get_unlocked_mut_path(trans,
- as->btree_id, b->c.level, b->key.k.p);
- struct btree_path *path = trans->paths + path_idx;
/*
* @b is the node we did the final insert into:
*
@@ -755,12 +753,16 @@ err:
* btree_node_lock_nopath() (the use of which is always suspect,
* we need to work on removing this in the future)
*
- * It should be, but get_unlocked_mut_path() -> bch2_path_get()
+ * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
* calls bch2_path_upgrade(), before we call path_make_mut(), so
* we may rarely end up with a locked path besides the one we
* have here:
*/
bch2_trans_unlock(trans);
+ bch2_trans_begin(trans);
+ btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
+ as->btree_id, b->c.level, b->key.k.p);
+ struct btree_path *path = trans->paths + path_idx;
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
@@ -1154,13 +1156,12 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags |= watermark;
if (watermark < BCH_WATERMARK_reclaim &&
- test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) {
+ test_bit(JOURNAL_space_low, &c->journal.flags)) {
if (flags & BCH_TRANS_COMMIT_journal_reclaim)
return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
- bch2_trans_unlock(trans);
- wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags));
- ret = bch2_trans_relock(trans);
+ ret = drop_locks_do(trans,
+ ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; }));
if (ret)
return ERR_PTR(ret);
}
@@ -1206,7 +1207,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as->start_time = start_time;
as->ip_started = _RET_IP_;
as->mode = BTREE_UPDATE_none;
- as->watermark = watermark;
+ as->flags = flags;
as->took_gc_lock = true;
as->btree_id = path->btree_id;
as->update_level_start = level_start;
@@ -1360,7 +1361,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
!btree_ptr_sectors_written(insert));
- if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+ if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
@@ -1619,12 +1620,12 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
six_unlock_write(&n2->c.lock);
six_unlock_write(&n1->c.lock);
- path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
+ path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, trans->paths + path1, n1);
- path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p);
+ path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p);
six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, trans->paths + path2, n2);
@@ -1669,7 +1670,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n1);
six_unlock_write(&n1->c.lock);
- path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
+ path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, trans->paths + path1, n1);
@@ -1947,6 +1948,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
u64 start_time = local_clock();
int ret = 0;
+ bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked(trans);
BUG_ON(!trans->paths[path].should_be_locked);
BUG_ON(!btree_node_locked(&trans->paths[path], level));
@@ -1979,7 +1982,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
: bpos_successor(b->data->max_key);
sib_path = bch2_path_get(trans, btree, sib_pos,
- U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_);
+ U8_MAX, level, BTREE_ITER_intent, _THIS_IP_);
ret = bch2_btree_path_traverse(trans, sib_path, false);
if (ret)
goto err;
@@ -2072,7 +2075,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
- new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p);
+ new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p);
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, trans->paths + new_path, n);
@@ -2150,7 +2153,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
- new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
+ new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p);
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, trans->paths + new_path, n);
@@ -2333,10 +2336,10 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
if (!skip_triggers) {
ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
bkey_i_to_s_c(&b->key),
- BTREE_TRIGGER_TRANSACTIONAL) ?:
+ BTREE_TRIGGER_transactional) ?:
bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
bkey_i_to_s(new_key),
- BTREE_TRIGGER_TRANSACTIONAL);
+ BTREE_TRIGGER_transactional);
if (ret)
return ret;
}
@@ -2353,7 +2356,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
bch2_trans_copy_iter(&iter2, iter);
iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
- iter2.flags & BTREE_ITER_INTENT,
+ iter2.flags & BTREE_ITER_intent,
_THIS_IP_);
struct btree_path *path2 = btree_iter_path(trans, &iter2);
@@ -2365,7 +2368,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
trans->paths_sorted = false;
ret = bch2_btree_iter_traverse(&iter2) ?:
- bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
+ bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
if (ret)
goto err;
} else {
@@ -2473,7 +2476,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
BTREE_MAX_DEPTH, b->c.level,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto out;
@@ -2487,7 +2490,6 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
BUG_ON(!btree_node_hashed(b));
- struct bch_extent_ptr *ptr;
bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
!bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
@@ -2511,7 +2513,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
bch2_btree_set_root_inmem(c, b);
}
-static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level)
+int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level)
{
struct bch_fs *c = trans->c;
struct closure cl;
@@ -2559,17 +2561,18 @@ static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id
void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
{
- bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level));
+ bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level));
}
static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
{
- prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
- (void *) as->ip_started,
+ prt_printf(out, "%ps: ", (void *) as->ip_started);
+ bch2_trans_commit_flags_to_text(out, as->flags);
+
+ prt_printf(out, " btree=%s l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
bch2_btree_id_str(as->btree_id),
as->update_level_start,
as->update_level_end,
- bch2_watermarks[as->watermark],
bch2_btree_update_modes[as->mode],
as->nodes_written,
closure_nr_remaining(&as->cl),
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index c1a479ebaad1..b5b76ce01cfc 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -52,7 +52,7 @@ struct btree_update {
struct list_head unwritten_list;
enum btree_update_mode mode;
- enum bch_watermark watermark;
+ enum bch_trans_commit_flags flags;
unsigned nodes_written:1;
unsigned took_gc_lock:1;
@@ -144,6 +144,9 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
EBUG_ON(!btree_node_locked(path, level));
+ if (bch2_btree_node_merging_disabled)
+ return 0;
+
b = path->l[level].b;
if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
return 0;
@@ -172,6 +175,8 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
struct bkey_i *, unsigned, bool);
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
+
+int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned);
void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);
static inline unsigned btree_update_reserve_required(struct bch_fs *c,
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 36a6f42aba5e..75c8a196b3f6 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -122,7 +122,7 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
trans->journal_res.seq = wb->journal_seq;
return bch2_trans_update(trans, iter, &wb->k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw|
@@ -191,13 +191,13 @@ btree_write_buffered_insert(struct btree_trans *trans,
int ret;
bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
- BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+ BTREE_ITER_cached|BTREE_ITER_intent);
trans->journal_res.seq = wb->journal_seq;
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(trans, &iter, &wb->k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -332,7 +332,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
if (!iter.path || iter.btree_id != k->btree) {
bch2_trans_iter_exit(trans, &iter);
bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
- BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_intent|BTREE_ITER_all_snapshots);
}
bch2_btree_iter_set_pos(&iter, k->k.k.p);
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 82f179258867..e28d28ac2a13 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -274,25 +274,14 @@ void bch2_dev_usage_init(struct bch_dev *ca)
void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
{
- prt_tab(out);
- prt_str(out, "buckets");
- prt_tab_rjust(out);
- prt_str(out, "sectors");
- prt_tab_rjust(out);
- prt_str(out, "fragmented");
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
for (unsigned i = 0; i < BCH_DATA_NR; i++) {
bch2_prt_data_type(out, i);
- prt_tab(out);
- prt_u64(out, usage->d[i].buckets);
- prt_tab_rjust(out);
- prt_u64(out, usage->d[i].sectors);
- prt_tab_rjust(out);
- prt_u64(out, usage->d[i].fragmented);
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "\t%llu\r%llu\r%llu\r\n",
+ usage->d[i].buckets,
+ usage->d[i].sectors,
+ usage->d[i].fragmented);
}
}
@@ -329,26 +318,6 @@ void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
preempt_enable();
}
-static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
-{
- return (struct bch_alloc_v4) {
- .gen = b.gen,
- .data_type = b.data_type,
- .dirty_sectors = b.dirty_sectors,
- .cached_sectors = b.cached_sectors,
- .stripe = b.stripe,
- };
-}
-
-void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
- struct bucket *old, struct bucket *new)
-{
- struct bch_alloc_v4 old_a = bucket_m_to_alloc(*old);
- struct bch_alloc_v4 new_a = bucket_m_to_alloc(*new);
-
- bch2_dev_usage_update(c, ca, &old_a, &new_a, 0, true);
-}
-
static inline int __update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct bch_replicas_entry_v1 *r,
@@ -496,78 +465,276 @@ int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64
return bch2_update_replicas_list(trans, &r.e, sectors);
}
-int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, enum bch_data_type data_type,
- unsigned sectors, struct gc_pos pos,
- unsigned flags)
+int bch2_check_fix_ptrs(struct btree_trans *trans,
+ enum btree_id btree, unsigned level, struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
{
- struct bucket old, new, *g;
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry_c;
+ struct extent_ptr_decoded p = { 0 };
+ bool do_update = false;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
- BUG_ON(data_type != BCH_DATA_sb &&
- data_type != BCH_DATA_journal);
+ percpu_down_read(&c->mark_lock);
- /*
- * Backup superblock might be past the end of our normal usable space:
- */
- if (b >= ca->mi.nbuckets)
- return 0;
+ rcu_read_lock();
+ bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
+ struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+ if (!ca) {
+ if (fsck_err(c, ptr_to_invalid_device,
+ "pointer to missing device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ do_update = true;
+ continue;
+ }
- percpu_down_read(&c->mark_lock);
- g = gc_bucket(ca, b);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry_c);
+
+ if (fsck_err_on(!g->gen_valid,
+ c, ptr_to_missing_alloc_key,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (!p.ptr.cached) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ } else {
+ do_update = true;
+ }
+ }
- bucket_lock(g);
- old = *g;
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (!p.ptr.cached &&
+ (g->data_type != BCH_DATA_btree ||
+ data_type == BCH_DATA_btree)) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ } else {
+ do_update = true;
+ }
+ }
- if (bch2_fs_inconsistent_on(g->data_type &&
- g->data_type != data_type, c,
- "different types of data in same bucket: %s, %s",
- bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type))) {
- ret = -EIO;
- goto err;
- }
+ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ do_update = true;
+
+ if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
+ c, stale_dirty_ptr,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ do_update = true;
+
+ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
+ continue;
+
+ if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
+ c, ptr_bucket_data_type_mismatch,
+ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (data_type == BCH_DATA_btree) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = data_type;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ } else {
+ do_update = true;
+ }
+ }
- if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
- "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
- ca->dev_idx, b, g->gen,
- bch2_data_type_str(g->data_type ?: data_type),
- g->dirty_sectors, sectors)) {
- ret = -EIO;
- goto err;
+ if (p.has_ec) {
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
+
+ if (fsck_err_on(!m || !m->alive, c,
+ ptr_to_missing_stripe,
+ "pointer to nonexistent stripe %llu\n"
+ "while marking %s",
+ (u64) p.ec.idx,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ do_update = true;
+
+ if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
+ ptr_to_incorrect_stripe,
+ "pointer does not match stripe %llu\n"
+ "while marking %s",
+ (u64) p.ec.idx,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ do_update = true;
+ }
}
+ rcu_read_unlock();
- g->data_type = data_type;
- g->dirty_sectors += sectors;
- new = *g;
+ if (do_update) {
+ if (flags & BTREE_TRIGGER_is_root) {
+ bch_err(c, "cannot update btree roots yet");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto err;
+
+ rcu_read_lock();
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_rcu(c, ptr->dev));
+ rcu_read_unlock();
+
+ if (level) {
+ /*
+ * We don't want to drop btree node pointers - if the
+ * btree node isn't there anymore, the read path will
+ * sort it out:
+ */
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ rcu_read_lock();
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, ptr);
+
+ ptr->gen = g->gen;
+ }
+ rcu_read_unlock();
+ } else {
+ struct bkey_ptrs ptrs;
+ union bch_extent_entry *entry;
+restart_drop_ptrs:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ rcu_read_lock();
+ bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
+ struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);
+
+ if ((p.ptr.cached &&
+ (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
+ (!p.ptr.cached &&
+ gen_cmp(p.ptr.gen, g->gen) < 0) ||
+ gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
+ (g->data_type &&
+ g->data_type != data_type)) {
+ bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
+ goto restart_drop_ptrs;
+ }
+ }
+ rcu_read_unlock();
+again:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
+ entry->stripe_ptr.idx);
+ union bch_extent_entry *next_ptr;
+
+ bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
+ if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
+ goto found;
+ next_ptr = NULL;
+found:
+ if (!next_ptr) {
+ bch_err(c, "aieee, found stripe ptr with no data ptr");
+ continue;
+ }
+
+ if (!m || !m->alive ||
+ !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
+ &next_ptr->ptr,
+ m->sectors)) {
+ bch2_bkey_extent_entry_drop(new, entry);
+ goto again;
+ }
+ }
+ }
+ }
+
+ if (0) {
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_info(c, "updated %s", buf.buf);
+
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
+ bch_info(c, "new key %s", buf.buf);
+ }
+
+ percpu_up_read(&c->mark_lock);
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
+ BTREE_ITER_intent|BTREE_ITER_all_snapshots);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, new,
+ BTREE_UPDATE_internal_snapshot_node|
+ BTREE_TRIGGER_norun);
+ bch2_trans_iter_exit(trans, &iter);
+ percpu_down_read(&c->mark_lock);
+
+ if (ret)
+ goto err;
+
+ if (level)
+ bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
+ }
err:
- bucket_unlock(g);
- if (!ret)
- bch2_dev_usage_update_m(c, ca, &old, &new);
+fsck_err:
percpu_up_read(&c->mark_lock);
+ printbuf_exit(&buf);
return ret;
}
-int bch2_check_bucket_ref(struct btree_trans *trans,
- struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
- s64 sectors, enum bch_data_type ptr_data_type,
- u8 b_gen, u8 bucket_data_type,
- u32 bucket_sectors)
+int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 b_gen, u8 bucket_data_type,
+ u32 *bucket_sectors)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
struct printbuf buf = PRINTBUF;
+ bool inserting = sectors > 0;
int ret = 0;
- if (bucket_data_type == BCH_DATA_cached)
- bucket_data_type = BCH_DATA_user;
-
- if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) ||
- (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe))
- bucket_data_type = ptr_data_type = BCH_DATA_stripe;
+ BUG_ON(!sectors);
if (gen_after(ptr->gen, b_gen)) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
@@ -578,8 +745,9 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- goto err;
+ if (inserting)
+ goto err;
+ goto out;
}
if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
@@ -592,11 +760,17 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- goto err;
+ if (inserting)
+ goto err;
+ goto out;
+ }
+
+ if (b_gen != ptr->gen && ptr->cached) {
+ ret = 1;
+ goto out;
}
- if (b_gen != ptr->gen && !ptr->cached) {
+ if (b_gen != ptr->gen) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
BCH_FSCK_ERR_stale_dirty_ptr,
"bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
@@ -607,18 +781,12 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- goto err;
- }
-
- if (b_gen != ptr->gen) {
- ret = 1;
+ if (inserting)
+ goto err;
goto out;
}
- if (!data_type_is_empty(bucket_data_type) &&
- ptr_data_type &&
- bucket_data_type != ptr_data_type) {
+ if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
BCH_FSCK_ERR_ptr_bucket_data_type_mismatch,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
@@ -628,28 +796,33 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
bch2_data_type_str(ptr_data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- goto err;
+ if (inserting)
+ goto err;
+ goto out;
}
- if ((u64) bucket_sectors + sectors > U32_MAX) {
+ if ((u64) *bucket_sectors + sectors > U32_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
BCH_FSCK_ERR_bucket_sector_count_overflow,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- bucket_sectors, sectors,
+ *bucket_sectors, sectors,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
- goto err;
+ if (inserting)
+ goto err;
+ sectors = -*bucket_sectors;
}
+
+ *bucket_sectors += sectors;
out:
printbuf_exit(&buf);
return ret;
err:
bch2_dump_trans_updates(trans);
+ ret = -EIO;
goto out;
}
@@ -786,29 +959,22 @@ need_mark:
/* KEY_TYPE_extent: */
-static int __mark_pointer(struct btree_trans *trans,
+static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
struct bkey_s_c k,
const struct bch_extent_ptr *ptr,
s64 sectors, enum bch_data_type ptr_data_type,
- u8 bucket_gen, u8 *bucket_data_type,
- u32 *dirty_sectors, u32 *cached_sectors)
+ struct bch_alloc_v4 *a)
{
u32 *dst_sectors = !ptr->cached
- ? dirty_sectors
- : cached_sectors;
- int ret = bch2_check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
- bucket_gen, *bucket_data_type, *dst_sectors);
+ ? &a->dirty_sectors
+ : &a->cached_sectors;
+ int ret = bch2_bucket_ref_update(trans, ca, k, ptr, sectors, ptr_data_type,
+ a->gen, a->data_type, dst_sectors);
if (ret)
return ret;
- *dst_sectors += sectors;
-
- if (!*dirty_sectors && !*cached_sectors)
- *bucket_data_type = 0;
- else if (*bucket_data_type != BCH_DATA_stripe)
- *bucket_data_type = ptr_data_type;
-
+ alloc_data_type_set(a, ptr_data_type);
return 0;
}
@@ -816,81 +982,69 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
const union bch_extent_entry *entry,
- s64 *sectors, unsigned flags)
+ s64 *sectors,
+ enum btree_iter_update_trigger_flags flags)
{
- bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
+ bool insert = !(flags & BTREE_TRIGGER_overwrite);
+ int ret = 0;
+
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
+ if (unlikely(!ca)) {
+ if (insert)
+ ret = -EIO;
+ goto err;
+ }
+
struct bpos bucket;
struct bch_backpointer bp;
-
- bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, entry, &bucket, &bp);
+ bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp);
*sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, &iter, bucket);
- int ret = PTR_ERR_OR_ZERO(a);
+ if (flags & BTREE_TRIGGER_transactional) {
+ struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket);
+ ret = PTR_ERR_OR_ZERO(a) ?:
+ __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &a->v);
if (ret)
- return ret;
-
- ret = __mark_pointer(trans, k, &p.ptr, *sectors, bp.data_type,
- a->v.gen, &a->v.data_type,
- &a->v.dirty_sectors, &a->v.cached_sectors) ?:
- bch2_trans_update(trans, &iter, &a->k_i, 0);
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- return ret;
+ goto err;
if (!p.ptr.cached) {
- ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
+ ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, k, insert);
if (ret)
- return ret;
+ goto err;
}
}
- if (flags & BTREE_TRIGGER_GC) {
- struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
-
+ if (flags & BTREE_TRIGGER_gc) {
percpu_down_read(&c->mark_lock);
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ struct bucket *g = gc_bucket(ca, bucket.offset);
bucket_lock(g);
- struct bucket old = *g;
-
- u8 bucket_data_type = g->data_type;
- int ret = __mark_pointer(trans, k, &p.ptr, *sectors,
- data_type, g->gen,
- &bucket_data_type,
- &g->dirty_sectors,
- &g->cached_sectors);
- if (ret) {
- bucket_unlock(g);
- percpu_up_read(&c->mark_lock);
- return ret;
+ struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
+ ret = __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &new);
+ if (!ret) {
+ alloc_to_bucket(g, new);
+ bch2_dev_usage_update(c, ca, &old, &new, 0, true);
}
-
- g->data_type = bucket_data_type;
- struct bucket new = *g;
bucket_unlock(g);
- bch2_dev_usage_update_m(c, ca, &old, &new);
percpu_up_read(&c->mark_lock);
}
-
- return 0;
+err:
+ bch2_dev_put(ca);
+ return ret;
}
static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
struct bkey_s_c k,
struct extent_ptr_decoded p,
enum bch_data_type data_type,
- s64 sectors, unsigned flags)
+ s64 sectors,
+ enum btree_iter_update_trigger_flags flags)
{
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
struct btree_iter iter;
struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_stripes, POS(0, p.ec.idx),
- BTREE_ITER_WITH_UPDATES, stripe);
+ BTREE_ITER_with_updates, stripe);
int ret = PTR_ERR_OR_ZERO(s);
if (unlikely(ret)) {
bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
@@ -920,10 +1074,10 @@ err:
return ret;
}
- if (flags & BTREE_TRIGGER_GC) {
+ if (flags & BTREE_TRIGGER_gc) {
struct bch_fs *c = trans->c;
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
+ BUG_ON(!(flags & BTREE_TRIGGER_gc));
struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
if (!m) {
@@ -959,9 +1113,10 @@ err:
static int __trigger_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+ struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
{
- bool gc = flags & BTREE_TRIGGER_GC;
+ bool gc = flags & BTREE_TRIGGER_gc;
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@@ -970,7 +1125,7 @@ static int __trigger_extent(struct btree_trans *trans,
enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
? BCH_DATA_btree
: BCH_DATA_user;
- s64 dirty_sectors = 0;
+ s64 replicas_sectors = 0;
int ret = 0;
r.e.data_type = data_type;
@@ -996,7 +1151,7 @@ static int __trigger_extent(struct btree_trans *trans,
return ret;
}
} else if (!p.has_ec) {
- dirty_sectors += disk_sectors;
+ replicas_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
} else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
@@ -1014,8 +1169,8 @@ static int __trigger_extent(struct btree_trans *trans,
if (r.e.nr_devs) {
ret = !gc
- ? bch2_update_replicas_list(trans, &r.e, dirty_sectors)
- : bch2_update_replicas(c, k, &r.e, dirty_sectors, 0, true);
+ ? bch2_update_replicas_list(trans, &r.e, replicas_sectors)
+ : bch2_update_replicas(c, k, &r.e, replicas_sectors, 0, true);
if (unlikely(ret && gc)) {
struct printbuf buf = PRINTBUF;
@@ -1031,15 +1186,18 @@ static int __trigger_extent(struct btree_trans *trans,
}
int bch2_trigger_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
+ enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;
+ if (unlikely(flags & BTREE_TRIGGER_check_repair))
+ return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags);
+
/* if pointers aren't changing - nothing to do: */
if (new_ptrs_bytes == old_ptrs_bytes &&
!memcmp(new_ptrs.start,
@@ -1047,7 +1205,7 @@ int bch2_trigger_extent(struct btree_trans *trans,
new_ptrs_bytes))
return 0;
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
struct bch_fs *c = trans->c;
int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
(int) bch2_bkey_needs_rebalance(c, old);
@@ -1060,8 +1218,8 @@ int bch2_trigger_extent(struct btree_trans *trans,
}
}
- if (flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC))
- return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree_id, level, old, new, flags);
+ if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc))
+ return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags);
return 0;
}
@@ -1069,17 +1227,17 @@ int bch2_trigger_extent(struct btree_trans *trans,
/* KEY_TYPE_reservation */
static int __trigger_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+ enum btree_id btree_id, unsigned level, struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size * replicas;
- if (flags & BTREE_TRIGGER_OVERWRITE)
+ if (flags & BTREE_TRIGGER_overwrite)
sectors = -sectors;
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
int ret = bch2_replicas_deltas_realloc(trans, 0);
if (ret)
return ret;
@@ -1090,7 +1248,7 @@ static int __trigger_reservation(struct btree_trans *trans,
d->persistent_reserved[replicas - 1] += sectors;
}
- if (flags & BTREE_TRIGGER_GC) {
+ if (flags & BTREE_TRIGGER_gc) {
percpu_down_read(&c->mark_lock);
preempt_disable();
@@ -1110,7 +1268,7 @@ static int __trigger_reservation(struct btree_trans *trans,
int bch2_trigger_reservation(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags);
}
@@ -1118,22 +1276,16 @@ int bch2_trigger_reservation(struct btree_trans *trans,
/* Mark superblocks: */
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
- struct bch_dev *ca, size_t b,
+ struct bch_dev *ca, u64 b,
enum bch_data_type type,
unsigned sectors)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
int ret = 0;
- /*
- * Backup superblock might be past the end of our normal usable space:
- */
- if (b >= ca->mi.nbuckets)
- return 0;
-
- a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b));
+ struct bkey_i_alloc_v4 *a =
+ bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b));
if (IS_ERR(a))
return PTR_ERR(a);
@@ -1161,20 +1313,75 @@ err:
return ret;
}
+static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+ u64 b, enum bch_data_type data_type, unsigned sectors,
+ enum btree_iter_update_trigger_flags flags)
+{
+ int ret = 0;
+
+ percpu_down_read(&c->mark_lock);
+ struct bucket *g = gc_bucket(ca, b);
+
+ bucket_lock(g);
+ struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
+
+ if (bch2_fs_inconsistent_on(g->data_type &&
+ g->data_type != data_type, c,
+ "different types of data in same bucket: %s, %s",
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type))) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
+ "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
+ ca->dev_idx, b, g->gen,
+ bch2_data_type_str(g->data_type ?: data_type),
+ g->dirty_sectors, sectors)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ g->data_type = data_type;
+ g->dirty_sectors += sectors;
+ struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
+err:
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update(c, ca, &old, &new, 0, true);
+ percpu_up_read(&c->mark_lock);
+ return ret;
+}
+
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
- struct bch_dev *ca, size_t b,
- enum bch_data_type type,
- unsigned sectors)
+ struct bch_dev *ca, u64 b,
+ enum bch_data_type type, unsigned sectors,
+ enum btree_iter_update_trigger_flags flags)
{
- return commit_do(trans, NULL, NULL, 0,
- __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
+ BUG_ON(type != BCH_DATA_free &&
+ type != BCH_DATA_sb &&
+ type != BCH_DATA_journal);
+
+ /*
+ * Backup superblock might be past the end of our normal usable space:
+ */
+ if (b >= ca->mi.nbuckets)
+ return 0;
+
+ if (flags & BTREE_TRIGGER_gc)
+ return bch2_mark_metadata_bucket(trans->c, ca, b, type, sectors, flags);
+ else if (flags & BTREE_TRIGGER_transactional)
+ return commit_do(trans, NULL, NULL, 0,
+ __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
+ else
+ BUG();
}
static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
- struct bch_dev *ca,
- u64 start, u64 end,
- enum bch_data_type type,
- u64 *bucket, unsigned *bucket_sectors)
+ struct bch_dev *ca, u64 start, u64 end,
+ enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors,
+ enum btree_iter_update_trigger_flags flags)
{
do {
u64 b = sector_to_bucket(ca, start);
@@ -1183,7 +1390,7 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
if (b != *bucket && *bucket_sectors) {
int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
- type, *bucket_sectors);
+ type, *bucket_sectors, flags);
if (ret)
return ret;
@@ -1198,8 +1405,8 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
return 0;
}
-static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
- struct bch_dev *ca)
+static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca,
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
u64 bucket = 0;
@@ -1212,21 +1419,21 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
if (offset == BCH_SB_SECTOR) {
ret = bch2_trans_mark_metadata_sectors(trans, ca,
0, BCH_SB_SECTOR,
- BCH_DATA_sb, &bucket, &bucket_sectors);
+ BCH_DATA_sb, &bucket, &bucket_sectors, flags);
if (ret)
return ret;
}
ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
offset + (1 << layout->sb_max_size_bits),
- BCH_DATA_sb, &bucket, &bucket_sectors);
+ BCH_DATA_sb, &bucket, &bucket_sectors, flags);
if (ret)
return ret;
}
if (bucket_sectors) {
ret = bch2_trans_mark_metadata_bucket(trans, ca,
- bucket, BCH_DATA_sb, bucket_sectors);
+ bucket, BCH_DATA_sb, bucket_sectors, flags);
if (ret)
return ret;
}
@@ -1234,7 +1441,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
for (i = 0; i < ca->journal.nr; i++) {
ret = bch2_trans_mark_metadata_bucket(trans, ca,
ca->journal.buckets[i],
- BCH_DATA_journal, ca->mi.bucket_size);
+ BCH_DATA_journal, ca->mi.bucket_size, flags);
if (ret)
return ret;
}
@@ -1242,20 +1449,22 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
return 0;
}
-int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
+int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca,
+ enum btree_iter_update_trigger_flags flags)
{
- int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));
-
+ int ret = bch2_trans_run(c,
+ __bch2_trans_mark_dev_sb(trans, ca, flags));
bch_err_fn(c, ret);
return ret;
}
-int bch2_trans_mark_dev_sbs(struct bch_fs *c)
+int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
+ enum btree_iter_update_trigger_flags flags)
{
for_each_online_member(c, ca) {
- int ret = bch2_trans_mark_dev_sb(c, ca);
+ int ret = bch2_trans_mark_dev_sb(c, ca, flags);
if (ret) {
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
}
@@ -1263,6 +1472,11 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c)
return 0;
}
+int bch2_trans_mark_dev_sbs(struct bch_fs *c)
+{
+ return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional);
+}
+
/* Disk reservations: */
#define SECTORS_CACHE 1024
@@ -1331,6 +1545,31 @@ recalculate:
/* Startup/shutdown: */
+void bch2_buckets_nouse_free(struct bch_fs *c)
+{
+ for_each_member_device(c, ca) {
+ kvfree_rcu_mightsleep(ca->buckets_nouse);
+ ca->buckets_nouse = NULL;
+ }
+}
+
+int bch2_buckets_nouse_alloc(struct bch_fs *c)
+{
+ for_each_member_device(c, ca) {
+ BUG_ON(ca->buckets_nouse);
+
+ ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
+ sizeof(unsigned long),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!ca->buckets_nouse) {
+ bch2_dev_put(ca);
+ return -BCH_ERR_ENOMEM_buckets_nouse;
+ }
+ }
+
+ return 0;
+}
+
static void bucket_gens_free_rcu(struct rcu_head *rcu)
{
struct bucket_gens *buckets =
@@ -1342,24 +1581,17 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu)
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
- unsigned long *buckets_nouse = NULL;
bool resize = ca->bucket_gens != NULL;
int ret;
+ BUG_ON(resize && ca->buckets_nouse);
+
if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets,
GFP_KERNEL|__GFP_ZERO))) {
ret = -BCH_ERR_ENOMEM_bucket_gens;
goto err;
}
- if ((c->opts.buckets_nouse &&
- !(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) *
- sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO)))) {
- ret = -BCH_ERR_ENOMEM_buckets_nouse;
- goto err;
- }
-
bucket_gens->first_bucket = ca->mi.first_bucket;
bucket_gens->nbuckets = nbuckets;
@@ -1377,17 +1609,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
memcpy(bucket_gens->b,
old_bucket_gens->b,
n);
- if (buckets_nouse)
- memcpy(buckets_nouse,
- ca->buckets_nouse,
- BITS_TO_LONGS(n) * sizeof(unsigned long));
}
rcu_assign_pointer(ca->bucket_gens, bucket_gens);
bucket_gens = old_bucket_gens;
- swap(ca->buckets_nouse, buckets_nouse);
-
nbuckets = ca->mi.nbuckets;
if (resize) {
@@ -1398,7 +1624,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
ret = 0;
err:
- kvfree(buckets_nouse);
if (bucket_gens)
call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index f9af5adabe83..617ffde2fb7a 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -12,7 +12,7 @@
#include "extents.h"
#include "sb-members.h"
-static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
+static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s)
{
return div_u64(s, ca->mi.bucket_size);
}
@@ -30,8 +30,7 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
return remainder;
}
-static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
- u32 *offset)
+static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset)
{
return div_u64_rem(s, ca->mi.bucket_size, offset);
}
@@ -94,7 +93,7 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
struct bucket_array *buckets = gc_bucket_array(ca);
- BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
+ BUG_ON(!bucket_valid(ca, b));
return buckets->b + b;
}
@@ -111,7 +110,7 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
{
struct bucket_gens *gens = bucket_gens(ca);
- BUG_ON(b < gens->first_bucket || b >= gens->nbuckets);
+ BUG_ON(!bucket_valid(ca, b));
return gens->b + b;
}
@@ -121,20 +120,16 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
return sector_to_bucket(ca, ptr->offset);
}
-static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c,
- const struct bch_extent_ptr *ptr)
+static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-
return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
}
-static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c,
+static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca,
const struct bch_extent_ptr *ptr,
u32 *bucket_offset)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-
return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
}
@@ -175,17 +170,19 @@ static inline int gen_after(u8 a, u8 b)
return r > 0 ? r : 0;
}
+static inline u8 dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
+{
+ return gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen);
+}
+
/**
- * ptr_stale() - check if a pointer points into a bucket that has been
+ * dev_ptr_stale() - check if a pointer points into a bucket that has been
* invalidated.
*/
-static inline u8 ptr_stale(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
+static inline u8 dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
{
- u8 ret;
-
rcu_read_lock();
- ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen);
+ u8 ret = dev_ptr_stale_rcu(ca, ptr);
rcu_read_unlock();
return ret;
@@ -306,8 +303,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *,
const struct bch_alloc_v4 *,
const struct bch_alloc_v4 *, u64, bool);
-void bch2_dev_usage_update_m(struct bch_fs *, struct bch_dev *,
- struct bucket *, struct bucket *);
/* key/bucket marking: */
@@ -333,27 +328,29 @@ int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
void bch2_fs_usage_initialize(struct bch_fs *);
-int bch2_check_bucket_ref(struct btree_trans *, struct bkey_s_c,
- const struct bch_extent_ptr *,
- s64, enum bch_data_type, u8, u8, u32);
+int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *,
+ struct bkey_s_c, const struct bch_extent_ptr *,
+ s64, enum bch_data_type, u8, u8, u32 *);
-int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
- size_t, enum bch_data_type, unsigned,
- struct gc_pos, unsigned);
+int bch2_check_fix_ptrs(struct btree_trans *,
+ enum btree_id, unsigned, struct bkey_s_c,
+ enum btree_iter_update_trigger_flags);
int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
({ \
int ret = 0; \
\
if (_old.k->type) \
- ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \
+ ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert); \
if (!ret && _new.k->type) \
- ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_OVERWRITE);\
+ ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\
ret; \
})
@@ -362,9 +359,13 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *);
void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
-int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
- size_t, enum bch_data_type, unsigned);
-int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
+int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64,
+ enum bch_data_type, unsigned,
+ enum btree_iter_update_trigger_flags);
+int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *,
+ enum btree_iter_update_trigger_flags);
+int bch2_trans_mark_dev_sbs_flags(struct bch_fs *,
+ enum btree_iter_update_trigger_flags);
int bch2_trans_mark_dev_sbs(struct bch_fs *);
static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
@@ -464,6 +465,9 @@ static inline u64 avail_factor(u64 r)
return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
}
+void bch2_buckets_nouse_free(struct bch_fs *);
+int bch2_buckets_nouse_alloc(struct bch_fs *);
+
int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
void bch2_dev_buckets_free(struct bch_dev *);
int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 4d14f19f5185..9e54323f0f5f 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -32,12 +32,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
if (dev >= c->sb.nr_devices)
return ERR_PTR(-EINVAL);
- rcu_read_lock();
- ca = rcu_dereference(c->devs[dev]);
- if (ca)
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
-
+ ca = bch2_dev_tryget_noerror(c, dev);
if (!ca)
return ERR_PTR(-EINVAL);
} else {
@@ -391,7 +386,7 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
return PTR_ERR(ca);
ret = bch2_dev_offline(c, ca, arg.flags);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
@@ -420,7 +415,7 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c,
if (ret)
bch_err(c, "Error setting device state: %s", bch2_err_str(ret));
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
@@ -615,7 +610,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
arg.d[i].fragmented = src.d[i].fragmented;
}
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return copy_to_user_errcode(user_arg, &arg, sizeof(arg));
}
@@ -667,7 +662,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
goto err;
}
err:
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
@@ -689,11 +684,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
if (arg.flags & BCH_READ_DEV) {
ca = bch2_device_lookup(c, arg.dev, arg.flags);
-
- if (IS_ERR(ca)) {
- ret = PTR_ERR(ca);
- goto err;
- }
+ ret = PTR_ERR_OR_ZERO(ca);
+ if (ret)
+ goto err_unlock;
sb = ca->disk_sb.sb;
} else {
@@ -708,8 +701,8 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb,
vstruct_bytes(sb));
err:
- if (!IS_ERR_OR_NULL(ca))
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
+err_unlock:
mutex_unlock(&c->sb_lock);
return ret;
}
@@ -753,7 +746,7 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c,
ret = bch2_dev_resize(c, ca, arg.nbuckets);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
@@ -779,7 +772,7 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
@@ -961,7 +954,9 @@ static const struct file_operations bch_chardev_fops = {
};
static int bch_chardev_major;
-static struct class *bch_chardev_class;
+static const struct class bch_chardev_class = {
+ .name = "bcachefs",
+};
static struct device *bch_chardev;
void bch2_fs_chardev_exit(struct bch_fs *c)
@@ -978,7 +973,7 @@ int bch2_fs_chardev_init(struct bch_fs *c)
if (c->minor < 0)
return c->minor;
- c->chardev = device_create(bch_chardev_class, NULL,
+ c->chardev = device_create(&bch_chardev_class, NULL,
MKDEV(bch_chardev_major, c->minor), c,
"bcachefs%u-ctl", c->minor);
if (IS_ERR(c->chardev))
@@ -989,32 +984,39 @@ int bch2_fs_chardev_init(struct bch_fs *c)
void bch2_chardev_exit(void)
{
- if (!IS_ERR_OR_NULL(bch_chardev_class))
- device_destroy(bch_chardev_class,
- MKDEV(bch_chardev_major, U8_MAX));
- if (!IS_ERR_OR_NULL(bch_chardev_class))
- class_destroy(bch_chardev_class);
+ device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX));
+ class_unregister(&bch_chardev_class);
if (bch_chardev_major > 0)
unregister_chrdev(bch_chardev_major, "bcachefs");
}
int __init bch2_chardev_init(void)
{
+ int ret;
+
bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
if (bch_chardev_major < 0)
return bch_chardev_major;
- bch_chardev_class = class_create("bcachefs");
- if (IS_ERR(bch_chardev_class))
- return PTR_ERR(bch_chardev_class);
+ ret = class_register(&bch_chardev_class);
+ if (ret)
+ goto major_out;
- bch_chardev = device_create(bch_chardev_class, NULL,
+ bch_chardev = device_create(&bch_chardev_class, NULL,
MKDEV(bch_chardev_major, U8_MAX),
NULL, "bcachefs-ctl");
- if (IS_ERR(bch_chardev))
- return PTR_ERR(bch_chardev);
+ if (IS_ERR(bch_chardev)) {
+ ret = PTR_ERR(bch_chardev);
+ goto class_out;
+ }
return 0;
+
+class_out:
+ class_unregister(&bch_chardev_class);
+major_out:
+ unregister_chrdev(bch_chardev_major, "bcachefs-ctl");
+ return ret;
}
#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 088fd2e7bdf1..85198f391e9c 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -469,9 +469,8 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
/* BCH_SB_FIELD_crypt: */
-static int bch2_sb_crypt_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
@@ -494,14 +493,10 @@ static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
{
struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
- prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt));
- prt_newline(out);
- prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt));
- prt_newline(out);
- prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt));
- prt_newline(out);
- prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt));
- prt_newline(out);
+ prt_printf(out, "KFD: %llu\n", BCH_CRYPT_KDF_TYPE(crypt));
+ prt_printf(out, "scrypt n: %llu\n", BCH_KDF_SCRYPT_N(crypt));
+ prt_printf(out, "scrypt r: %llu\n", BCH_KDF_SCRYPT_R(crypt));
+ prt_printf(out, "scrypt p: %llu\n", BCH_KDF_SCRYPT_P(crypt));
}
const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 0022b51ce3c0..0d807c2ce9c6 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -106,7 +106,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, m->btree_id,
bkey_start_pos(&bch2_keylist_front(keys)->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
while (1) {
struct bkey_s_c k;
@@ -203,6 +203,8 @@ restart_drop_conflicting_replicas:
/* Now, drop excess replicas: */
restart_drop_extra_replicas:
+
+ rcu_read_lock();
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
@@ -214,6 +216,7 @@ restart_drop_extra_replicas:
goto restart_drop_extra_replicas;
}
}
+ rcu_read_unlock();
/* Finally, add the pointers we just wrote: */
extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
@@ -288,7 +291,7 @@ restart_drop_extra_replicas:
k.k->p, insert->k.p) ?:
bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?:
bch2_trans_update(trans, &iter, insert,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, &op->res,
NULL,
BCH_TRANS_COMMIT_no_check_rw|
@@ -357,10 +360,11 @@ void bch2_data_update_exit(struct data_update *update)
bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
if (c->opts.nocow_enabled)
bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, ptr), 0);
- percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref);
+ PTR_BUCKET_POS(ca, ptr), 0);
+ bch2_dev_put(ca);
}
bch2_bkey_buf_exit(&update->k, c);
@@ -386,8 +390,10 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
while (bio_sectors(bio)) {
unsigned sectors = bio_sectors(bio);
+ bch2_trans_begin(trans);
+
bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
- BTREE_ITER_SLOTS);
+ BTREE_ITER_slots);
ret = lockrestart_do(trans, ({
k = bch2_btree_iter_peek_slot(&iter);
bkey_err(k);
@@ -465,7 +471,6 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
while (data_opts.kill_ptrs) {
unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
- struct bch_extent_ptr *ptr;
bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
data_opts.kill_ptrs ^= 1U << drop;
@@ -480,15 +485,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
/*
* Since we're not inserting through an extent iterator
- * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
* we aren't using the extent overwrite path to delete, we're
* just using the normal key deletion path:
*/
- if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents))
n->k.size = 0;
return bch2_trans_relock(trans) ?:
- bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
@@ -539,15 +544,26 @@ int bch2_data_update_init(struct btree_trans *trans,
m->op.compression_opt = background_compression(io_opts);
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
- bkey_for_each_ptr(ptrs, ptr)
- percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref);
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!bch2_dev_tryget(c, ptr->dev)) {
+ bkey_for_each_ptr(ptrs, ptr2) {
+ if (ptr2 == ptr)
+ break;
+ bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
+ }
+ return -BCH_ERR_data_update_done;
+ }
+ }
unsigned durability_have = 0, durability_removing = 0;
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
bool locked;
+ rcu_read_lock();
if (((1U << i) & m->data_opts.rewrite_ptrs)) {
BUG_ON(p.ptr.cached);
@@ -561,6 +577,7 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
durability_have += bch2_extent_ptr_durability(c, &p);
}
+ rcu_read_unlock();
/*
* op->csum_type is normally initialized from the fs/file's
@@ -579,15 +596,13 @@ int bch2_data_update_init(struct btree_trans *trans,
if (ctxt) {
move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
- PTR_BUCKET_POS(c, &p.ptr), 0)) ||
+ bucket, 0)) ||
list_empty(&ctxt->ios));
if (!locked)
- bch2_bucket_nocow_lock(&c->nocow_locks,
- PTR_BUCKET_POS(c, &p.ptr), 0);
+ bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
} else {
- if (!bch2_bucket_nocow_trylock(&c->nocow_locks,
- PTR_BUCKET_POS(c, &p.ptr), 0)) {
+ if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
ret = -BCH_ERR_nocow_lock_blocked;
goto err;
}
@@ -649,10 +664,11 @@ int bch2_data_update_init(struct btree_trans *trans,
err:
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
if ((1U << i) & ptrs_locked)
- bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, &p.ptr), 0);
- percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref);
+ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
+ bch2_dev_put(ca);
i++;
}
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index cd99b7399414..51cbf3928361 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -37,11 +37,11 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
struct btree_node *n_ondisk = c->verify_ondisk;
struct btree_node *n_sorted = c->verify_data->data;
struct bset *sorted, *inmemory = &b->data->keys;
- struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
struct bio *bio;
bool failed = false, saw_error = false;
- if (!bch2_dev_get_ioref(ca, READ))
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ if (!ca)
return false;
bio = bio_alloc_bioset(ca->disk_sb.bdev,
@@ -194,8 +194,8 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
- if (!bch2_dev_get_ioref(ca, READ)) {
+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ if (!ca) {
prt_printf(out, "error getting device to read from: not online\n");
return;
}
@@ -375,8 +375,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
return flush_buf(i) ?:
bch2_trans_run(i->c,
for_each_btree_key(trans, iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
bch2_bkey_val_to_text(&i->buf, i->c, k);
prt_newline(&i->buf);
bch2_trans_unlock(trans);
@@ -459,8 +459,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
return flush_buf(i) ?:
bch2_trans_run(i->c,
for_each_btree_key(trans, iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
struct btree_path_level *l =
&btree_iter_path(trans, &iter)->l[0];
struct bkey_packed *_k =
@@ -492,51 +492,26 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
- prt_printf(out, "%px btree=%s l=%u ",
- b,
- bch2_btree_id_str(b->c.btree_id),
- b->c.level);
- prt_newline(out);
+ prt_printf(out, "%px btree=%s l=%u\n", b, bch2_btree_id_str(b->c.btree_id), b->c.level);
printbuf_indent_add(out, 2);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
prt_newline(out);
- prt_printf(out, "flags: ");
- prt_tab(out);
+ prt_printf(out, "flags:\t");
prt_bitflags(out, bch2_btree_node_flags, b->flags);
prt_newline(out);
- prt_printf(out, "pcpu read locks: ");
- prt_tab(out);
- prt_printf(out, "%u", b->c.lock.readers != NULL);
- prt_newline(out);
-
- prt_printf(out, "written:");
- prt_tab(out);
- prt_printf(out, "%u", b->written);
- prt_newline(out);
-
- prt_printf(out, "writes blocked:");
- prt_tab(out);
- prt_printf(out, "%u", !list_empty_careful(&b->write_blocked));
- prt_newline(out);
-
- prt_printf(out, "will make reachable:");
- prt_tab(out);
- prt_printf(out, "%lx", b->will_make_reachable);
- prt_newline(out);
-
- prt_printf(out, "journal pin %px:", &b->writes[0].journal);
- prt_tab(out);
- prt_printf(out, "%llu", b->writes[0].journal.seq);
- prt_newline(out);
+ prt_printf(out, "pcpu read locks:\t%u\n", b->c.lock.readers != NULL);
+ prt_printf(out, "written:\t%u\n", b->written);
+ prt_printf(out, "writes blocked:\t%u\n", !list_empty_careful(&b->write_blocked));
+ prt_printf(out, "will make reachable:\t%lx\n", b->will_make_reachable);
- prt_printf(out, "journal pin %px:", &b->writes[1].journal);
- prt_tab(out);
- prt_printf(out, "%llu", b->writes[1].journal.seq);
- prt_newline(out);
+ prt_printf(out, "journal pin %px:\t%llu\n",
+ &b->writes[0].journal, b->writes[0].journal.seq);
+ prt_printf(out, "journal pin %px:\t%llu\n",
+ &b->writes[1].journal, b->writes[1].journal.seq);
printbuf_indent_sub(out, 2);
}
@@ -625,8 +600,7 @@ restart:
bch2_btree_trans_to_text(&i->buf, trans);
- prt_printf(&i->buf, "backtrace:");
- prt_newline(&i->buf);
+ prt_printf(&i->buf, "backtrace:\n");
printbuf_indent_add(&i->buf, 2);
bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL);
printbuf_indent_sub(&i->buf, 2);
@@ -782,25 +756,20 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
!bch2_btree_transaction_fns[i->iter])
break;
- prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]);
- prt_newline(&i->buf);
+ prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]);
printbuf_indent_add(&i->buf, 2);
mutex_lock(&s->lock);
- prt_printf(&i->buf, "Max mem used: %u", s->max_mem);
- prt_newline(&i->buf);
-
- prt_printf(&i->buf, "Transaction duration:");
- prt_newline(&i->buf);
+ prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem);
+ prt_printf(&i->buf, "Transaction duration:\n");
printbuf_indent_add(&i->buf, 2);
bch2_time_stats_to_text(&i->buf, &s->duration);
printbuf_indent_sub(&i->buf, 2);
if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
- prt_printf(&i->buf, "Lock hold times:");
- prt_newline(&i->buf);
+ prt_printf(&i->buf, "Lock hold times:\n");
printbuf_indent_add(&i->buf, 2);
bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
@@ -808,8 +777,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
}
if (s->max_paths_text) {
- prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths);
- prt_newline(&i->buf);
+ prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths);
printbuf_indent_add(&i->buf, 2);
prt_str_indented(&i->buf, s->max_paths_text);
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index d37bd07afbfe..6bbf9a7d9e4d 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -98,7 +98,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
};
int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
@@ -118,7 +118,7 @@ int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k,
* Check new keys don't exceed the max length
* (older keys may be larger.)
*/
- bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err,
+ bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, c, err,
dirent_name_too_long,
"dirent name too big (%u > %u)",
d_name.len, BCH_NAME_MAX);
@@ -205,7 +205,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset,
- bch_str_hash_flags_t str_hash_flags)
+ enum btree_iter_update_trigger_flags flags)
{
subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir };
struct bkey_i_dirent *dirent;
@@ -220,9 +220,8 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
dirent->k.p.snapshot = snapshot;
ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, snapshot,
- &dirent->k_i, str_hash_flags,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ dir_inum, snapshot, &dirent->k_i,
+ flags|BTREE_UPDATE_internal_snapshot_node);
*dir_offset = dirent->k.p.offset;
return ret;
@@ -232,7 +231,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset,
- bch_str_hash_flags_t str_hash_flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_i_dirent *dirent;
int ret;
@@ -243,7 +242,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
return ret;
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir, &dirent->k_i, str_hash_flags);
+ dir, &dirent->k_i, flags);
*dir_offset = dirent->k.p.offset;
return ret;
@@ -272,7 +271,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
} else {
target->subvol = le32_to_cpu(d.v->d_child_subvol);
- ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s);
+ ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_cached, &s);
target->inum = le64_to_cpu(s.inode);
}
@@ -301,13 +300,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
memset(dst_inum, 0, sizeof(*dst_inum));
/* Lookup src: */
- ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
- src_hash, src_dir, src_name,
- BTREE_ITER_INTENT);
- if (ret)
- goto out;
-
- old_src = bch2_btree_iter_peek_slot(&src_iter);
+ old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
+ src_hash, src_dir, src_name,
+ BTREE_ITER_intent);
ret = bkey_err(old_src);
if (ret)
goto out;
@@ -329,13 +324,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
if (ret)
goto out;
} else {
- ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
- dst_hash, dst_dir, dst_name,
- BTREE_ITER_INTENT);
- if (ret)
- goto out;
-
- old_dst = bch2_btree_iter_peek_slot(&dst_iter);
+ old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
+ dst_hash, dst_dir, dst_name,
+ BTREE_ITER_intent);
ret = bkey_err(old_dst);
if (ret)
goto out;
@@ -450,7 +441,7 @@ out_set_src:
if (delete_src) {
bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
ret = bch2_btree_iter_traverse(&src_iter) ?:
- bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node);
if (ret)
goto out;
}
@@ -458,7 +449,7 @@ out_set_src:
if (delete_dst) {
bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
ret = bch2_btree_iter_traverse(&dst_iter) ?:
- bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node);
if (ret)
goto out;
}
@@ -479,13 +470,9 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans,
const struct qstr *name, subvol_inum *inum,
unsigned flags)
{
- int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
- hash_info, dir, name, flags);
- if (ret)
- return ret;
-
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
+ struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+ hash_info, dir, name, flags);
+ int ret = bkey_err(k);
if (ret)
goto err;
@@ -541,16 +528,26 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot);
}
+static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target)
+{
+ struct qstr name = bch2_dirent_get_name(d);
+ bool ret = dir_emit(ctx, name.name,
+ name.len,
+ target.inum,
+ vfs_d_type(d.v->d_type));
+ if (ret)
+ ctx->pos = d.k->p.offset + 1;
+ return ret;
+}
+
int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
{
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_s_c_dirent dirent;
subvol_inum target;
u32 snapshot;
struct bkey_buf sk;
- struct qstr name;
int ret;
bch2_bkey_buf_init(&sk);
@@ -567,7 +564,9 @@ retry:
if (k.k->type != KEY_TYPE_dirent)
continue;
- dirent = bkey_s_c_to_dirent(k);
+ /* dir_emit() can fault and block: */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
ret = bch2_dirent_read_target(trans, inum, dirent, &target);
if (ret < 0)
@@ -575,28 +574,22 @@ retry:
if (ret)
continue;
- /* dir_emit() can fault and block: */
- bch2_bkey_buf_reassemble(&sk, c, k);
- dirent = bkey_i_to_s_c_dirent(sk.k);
- bch2_trans_unlock(trans);
-
- name = bch2_dirent_get_name(dirent);
-
- ctx->pos = dirent.k->p.offset;
- if (!dir_emit(ctx, name.name,
- name.len,
- target.inum,
- vfs_d_type(dirent.v->d_type)))
- break;
- ctx->pos = dirent.k->p.offset + 1;
-
/*
* read_target looks up subvolumes, we can overflow paths if the
* directory has many subvolumes in it
+ *
+ * XXX: btree_trans_too_many_iters() is something we'd like to
+ * get rid of, and there's no good reason to be using it here
+ * except that we don't yet have a for_each_btree_key() helper
+ * that does subvolume_get_snapshot().
*/
- ret = btree_trans_too_many_iters(trans);
- if (ret)
+ ret = drop_locks_do(trans,
+ bch2_dir_emit(ctx, dirent, target)) ?:
+ btree_trans_too_many_iters(trans);
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
break;
+ }
}
bch2_trans_iter_exit(trans, &iter);
err:
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index bee55cca2aa0..24037e6e0a09 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -4,11 +4,11 @@
#include "str_hash.h"
-enum bkey_invalid_flags;
+enum bch_validate_flags;
extern const struct bch_hash_desc bch2_dirent_hash_desc;
int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_dirent ((struct bkey_ops) { \
@@ -38,11 +38,11 @@ int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
- bch_str_hash_flags_t);
+ enum btree_iter_update_trigger_flags);
int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
- bch_str_hash_flags_t);
+ enum btree_iter_update_trigger_flags);
static inline unsigned vfs_d_type(unsigned type)
{
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index 06a7df529b40..521a86df5e52 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -18,9 +18,8 @@ static int group_cmp(const void *_l, const void *_r)
strncmp(l->label, r->label, sizeof(l->label));
}
-static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
@@ -177,7 +176,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i);
struct bch_disk_group_cpu *dst;
- if (!bch2_member_exists(&m))
+ if (!bch2_member_alive(&m))
continue;
g = BCH_MEMBER_GROUP(&m);
@@ -523,7 +522,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
ca = bch2_dev_lookup(c, val);
if (!IS_ERR(ca)) {
*res = dev_to_target(ca->dev_idx);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return 0;
}
@@ -588,7 +587,7 @@ static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsi
case TARGET_DEV: {
struct bch_member m = bch2_sb_member_get(sb, t.dev);
- if (bch2_dev_exists(sb, t.dev)) {
+ if (bch2_member_exists(sb, t.dev)) {
prt_printf(out, "Device ");
pr_uuid(out, m.uuid.b);
prt_printf(out, " (%u)", t.dev);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 556a217108d3..b26dc7424662 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -107,7 +107,7 @@ struct ec_bio {
/* Stripes btree keys: */
int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
@@ -163,146 +163,189 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
/* Triggers: */
-static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
- struct bkey_s_c_stripe s,
- unsigned idx, bool deleting)
+static int __mark_stripe_bucket(struct btree_trans *trans,
+ struct bch_dev *ca,
+ struct bkey_s_c_stripe s,
+ unsigned ptr_idx, bool deleting,
+ struct bpos bucket,
+ struct bch_alloc_v4 *a,
+ enum btree_iter_update_trigger_flags flags)
{
- struct bch_fs *c = trans->c;
- const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
- enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
- ? BCH_DATA_parity : 0;
- s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
+ const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
+ unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant;
+ bool parity = ptr_idx >= nr_data;
+ enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
+ s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
+ struct bch_fs *c = trans->c;
if (deleting)
sectors = -sectors;
- a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
- if (IS_ERR(a))
- return PTR_ERR(a);
-
- ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
- a->v.gen, a->v.data_type,
- a->v.dirty_sectors);
- if (ret)
- goto err;
-
if (!deleting) {
- if (bch2_trans_inconsistent_on(a->v.stripe ||
- a->v.stripe_redundancy, trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_type_str(a->v.data_type),
- a->v.dirty_sectors,
- a->v.stripe, s.k->p.offset)) {
+ if (bch2_trans_inconsistent_on(a->stripe ||
+ a->stripe_redundancy, trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ bch2_data_type_str(a->data_type),
+ a->dirty_sectors,
+ a->stripe, s.k->p.offset,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
ret = -EIO;
goto err;
}
- if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_type_str(a->v.data_type),
- a->v.dirty_sectors,
- s.k->p.offset)) {
+ if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ bch2_data_type_str(a->data_type),
+ a->dirty_sectors,
+ a->cached_sectors,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
ret = -EIO;
goto err;
}
-
- a->v.stripe = s.k->p.offset;
- a->v.stripe_redundancy = s.v->nr_redundant;
- a->v.data_type = BCH_DATA_stripe;
} else {
- if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
- a->v.stripe_redundancy != s.v->nr_redundant, trans,
- "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- s.k->p.offset, a->v.stripe)) {
+ if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset ||
+ a->stripe_redundancy != s.v->nr_redundant, trans,
+ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ a->stripe,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
ret = -EIO;
goto err;
}
- a->v.stripe = 0;
- a->v.stripe_redundancy = 0;
- a->v.data_type = alloc_data_type(a->v, BCH_DATA_user);
+ if (bch2_trans_inconsistent_on(a->data_type != data_type, trans,
+ "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ bch2_data_type_str(a->data_type),
+ bch2_data_type_str(data_type),
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (bch2_trans_inconsistent_on(parity &&
+ (a->dirty_sectors != -sectors ||
+ a->cached_sectors), trans,
+ "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ a->dirty_sectors,
+ a->cached_sectors,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -EIO;
+ goto err;
+ }
}
- a->v.dirty_sectors += sectors;
- if (data_type)
- a->v.data_type = !deleting ? data_type : 0;
+ if (sectors) {
+ ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type,
+ a->gen, a->data_type, &a->dirty_sectors);
+ if (ret)
+ goto err;
+ }
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
- if (ret)
- goto err;
+ if (!deleting) {
+ a->stripe = s.k->p.offset;
+ a->stripe_redundancy = s.v->nr_redundant;
+ } else {
+ a->stripe = 0;
+ a->stripe_redundancy = 0;
+ }
+
+ alloc_data_type_set(a, data_type);
err:
- bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
return ret;
}
static int mark_stripe_bucket(struct btree_trans *trans,
- struct bkey_s_c k,
- unsigned ptr_idx,
- unsigned flags)
+ struct bkey_s_c_stripe s,
+ unsigned ptr_idx, bool deleting,
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- unsigned nr_data = s->nr_blocks - s->nr_redundant;
- bool parity = ptr_idx >= nr_data;
- enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
- s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
- const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket old, new, *g;
- struct printbuf buf = PRINTBUF;
+ const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
int ret = 0;
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
+ struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
+ if (unlikely(!ca)) {
+ if (!(flags & BTREE_TRIGGER_overwrite))
+ ret = -EIO;
+ goto err;
+ }
- /* * XXX doesn't handle deletion */
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
- percpu_down_read(&c->mark_lock);
- g = PTR_GC_BUCKET(ca, ptr);
+ if (flags & BTREE_TRIGGER_transactional) {
+ struct bkey_i_alloc_v4 *a =
+ bch2_trans_start_alloc_update(trans, bucket);
+ ret = PTR_ERR_OR_ZERO(a) ?:
+ __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
+ }
- if (g->dirty_sectors ||
- (g->stripe && g->stripe != k.k->p.offset)) {
- bch2_fs_inconsistent(c,
- "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
- ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EINVAL;
- goto err;
+ if (flags & BTREE_TRIGGER_gc) {
+ percpu_down_read(&c->mark_lock);
+ struct bucket *g = gc_bucket(ca, bucket.offset);
+ bucket_lock(g);
+ struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
+ ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
+ if (!ret) {
+ alloc_to_bucket(g, new);
+ bch2_dev_usage_update(c, ca, &old, &new, 0, true);
+ }
+ bucket_unlock(g);
+ percpu_up_read(&c->mark_lock);
}
+err:
+ bch2_dev_put(ca);
+ return ret;
+}
- bucket_lock(g);
- old = *g;
+static int mark_stripe_buckets(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(old).v : NULL;
+ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(new).v : NULL;
- ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type,
- g->gen, g->data_type,
- g->dirty_sectors);
- if (ret)
- goto err;
+ BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks);
- g->data_type = data_type;
- g->dirty_sectors += sectors;
+ unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
- g->stripe = k.k->p.offset;
- g->stripe_redundancy = s->nr_redundant;
- new = *g;
-err:
- bucket_unlock(g);
- if (!ret)
- bch2_dev_usage_update_m(c, ca, &old, &new);
- percpu_up_read(&c->mark_lock);
- printbuf_exit(&buf);
- return ret;
+ for (unsigned i = 0; i < nr_blocks; i++) {
+ if (new_s && old_s &&
+ !memcmp(&new_s->ptrs[i],
+ &old_s->ptrs[i],
+ sizeof(new_s->ptrs[i])))
+ continue;
+
+ if (new_s) {
+ int ret = mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(new), i, false, flags);
+ if (ret)
+ return ret;
+ }
+
+ if (old_s) {
+ int ret = mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(old), i, true, flags);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
}
int bch2_trigger_stripe(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
+ enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s _new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_s_c new = _new.s_c;
struct bch_fs *c = trans->c;
@@ -312,7 +355,10 @@ int bch2_trigger_stripe(struct btree_trans *trans,
const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
? bkey_s_c_to_stripe(new).v : NULL;
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (unlikely(flags & BTREE_TRIGGER_check_repair))
+ return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);
+
+ if (flags & BTREE_TRIGGER_transactional) {
/*
* If the pointers aren't changing, we don't need to do anything:
*/
@@ -347,31 +393,12 @@ int bch2_trigger_stripe(struct btree_trans *trans,
return ret;
}
- unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
- for (unsigned i = 0; i < nr_blocks; i++) {
- if (new_s && old_s &&
- !memcmp(&new_s->ptrs[i],
- &old_s->ptrs[i],
- sizeof(new_s->ptrs[i])))
- continue;
-
- if (new_s) {
- int ret = bch2_trans_mark_stripe_bucket(trans,
- bkey_s_c_to_stripe(new), i, false);
- if (ret)
- return ret;
- }
-
- if (old_s) {
- int ret = bch2_trans_mark_stripe_bucket(trans,
- bkey_s_c_to_stripe(old), i, true);
- if (ret)
- return ret;
- }
- }
+ int ret = mark_stripe_buckets(trans, old, new, flags);
+ if (ret)
+ return ret;
}
- if (flags & BTREE_TRIGGER_ATOMIC) {
+ if (flags & BTREE_TRIGGER_atomic) {
struct stripe *m = genradix_ptr(&c->stripes, idx);
if (!m) {
@@ -410,7 +437,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
}
}
- if (flags & BTREE_TRIGGER_GC) {
+ if (flags & BTREE_TRIGGER_gc) {
struct gc_stripe *m =
genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
@@ -439,13 +466,11 @@ int bch2_trigger_stripe(struct btree_trans *trans,
*/
memset(m->block_sectors, 0, sizeof(m->block_sectors));
- for (unsigned i = 0; i < new_s->nr_blocks; i++) {
- int ret = mark_stripe_bucket(trans, new, i, flags);
- if (ret)
- return ret;
- }
+ int ret = mark_stripe_buckets(trans, old, new, flags);
+ if (ret)
+ return ret;
- int ret = bch2_update_replicas(c, new, &m->r.e,
+ ret = bch2_update_replicas(c, new, &m->r.e,
((s64) m->sectors * m->nr_redundant),
0, true);
if (ret) {
@@ -608,19 +633,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) {
- struct printbuf err = PRINTBUF;
- struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
+ struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev);
+ if (ca) {
+ struct printbuf err = PRINTBUF;
- prt_str(&err, "stripe ");
- bch2_csum_err_msg(&err, v->csum_type, want, got);
- prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
- bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
- bch_err_ratelimited(ca, "%s", err.buf);
- printbuf_exit(&err);
+ prt_str(&err, "stripe ");
+ bch2_csum_err_msg(&err, v->csum_type, want, got);
+ prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
+ bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
+ bch_err_ratelimited(ca, "%s", err.buf);
+ printbuf_exit(&err);
- clear_bit(i, buf->valid);
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ }
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ clear_bit(i, buf->valid);
break;
}
@@ -687,7 +714,7 @@ static void ec_block_endio(struct bio *bio)
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
- if (ptr_stale(ca, ptr)) {
+ if (dev_ptr_stale(ca, ptr)) {
bch_err_ratelimited(ca->fs,
"error %s stripe: stale pointer after io",
bio_data_dir(bio) == READ ? "reading from" : "writing to");
@@ -705,25 +732,26 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
unsigned offset = 0, bytes = buf->size << 9;
struct bch_extent_ptr *ptr = &v->ptrs[idx];
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
? BCH_DATA_user
: BCH_DATA_parity;
int rw = op_is_write(opf);
- if (ptr_stale(ca, ptr)) {
- bch_err_ratelimited(c,
- "error %s stripe: stale pointer",
- rw == READ ? "reading from" : "writing to");
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
+ if (!ca) {
clear_bit(idx, buf->valid);
return;
}
- if (!bch2_dev_get_ioref(ca, rw)) {
+ if (dev_ptr_stale(ca, ptr)) {
+ bch_err_ratelimited(c,
+ "error %s stripe: stale pointer",
+ rw == READ ? "reading from" : "writing to");
clear_bit(idx, buf->valid);
return;
}
+
this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
while (offset < bytes) {
@@ -769,7 +797,7 @@ static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
int ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- POS(0, idx), BTREE_ITER_SLOTS);
+ POS(0, idx), BTREE_ITER_slots);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1060,7 +1088,7 @@ static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
int ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1131,7 +1159,7 @@ static int ec_stripe_key_update(struct btree_trans *trans,
int ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- new->k.p, BTREE_ITER_INTENT);
+ new->k.p, BTREE_ITER_intent);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1173,6 +1201,7 @@ err:
}
static int ec_stripe_update_extent(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bpos bucket, u8 gen,
struct ec_stripe_buf *s,
struct bpos *bp_pos)
@@ -1183,13 +1212,13 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_s_c k;
const struct bch_extent_ptr *ptr_c;
- struct bch_extent_ptr *ptr, *ec_ptr = NULL;
+ struct bch_extent_ptr *ec_ptr = NULL;
struct bch_extent_stripe_ptr stripe_ptr;
struct bkey_i *n;
int ret, dev, block;
- ret = bch2_get_next_backpointer(trans, bucket, gen,
- bp_pos, &bp, BTREE_ITER_CACHED);
+ ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
+ bp_pos, &bp, BTREE_ITER_cached);
if (ret)
return ret;
if (bpos_eq(*bp_pos, SPOS_MAX))
@@ -1214,7 +1243,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
return -EIO;
}
- k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT);
+ k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent);
ret = bkey_err(k);
if (ret)
return ret;
@@ -1272,17 +1301,21 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
{
struct bch_fs *c = trans->c;
struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- struct bch_extent_ptr bucket = v->ptrs[block];
- struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket);
+ struct bch_extent_ptr ptr = v->ptrs[block];
struct bpos bp_pos = POS_MIN;
int ret = 0;
+ struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
+ if (!ca)
+ return -EIO;
+
+ struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
+
while (1) {
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
- s, &bp_pos));
+ ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos));
if (ret)
break;
if (bkey_eq(bp_pos, POS_MAX))
@@ -1291,6 +1324,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
bp_pos = bpos_nosnap_successor(bp_pos);
}
+ bch2_dev_put(ca);
return ret;
}
@@ -1321,20 +1355,18 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
unsigned block,
struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
- unsigned offset = ca->mi.bucket_size - ob->sectors_free;
- int ret;
-
- if (!bch2_dev_get_ioref(ca, WRITE)) {
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
+ if (!ca) {
s->err = -BCH_ERR_erofs_no_writes;
return;
}
+ unsigned offset = ca->mi.bucket_size - ob->sectors_free;
memset(s->new_stripe.data[block] + (offset << 9),
0,
ob->sectors_free << 9);
- ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
+ int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
ob->bucket * ca->mi.bucket_size + offset,
ob->sectors_free,
GFP_KERNEL, 0);
@@ -1519,16 +1551,13 @@ void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
{
struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
- struct bch_dev *ca;
- unsigned offset;
-
if (!ob)
return NULL;
BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
- ca = bch_dev_bkey_exists(c, ob->dev);
- offset = ca->mi.bucket_size - ob->sectors_free;
+ struct bch_dev *ca = ob_dev(c, ob);
+ unsigned offset = ca->mi.bucket_size - ob->sectors_free;
return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}
@@ -1937,7 +1966,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
}
for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
if (start_pos.offset) {
start_pos = min_pos;
@@ -2127,7 +2156,7 @@ int bch2_stripes_read(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
+ BTREE_ITER_prefetch, k, ({
if (k.k->type != KEY_TYPE_stripe)
continue;
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index f042616888b0..84a23eeb6249 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -6,14 +6,15 @@
#include "buckets_types.h"
#include "extents_types.h"
-enum bkey_invalid_flags;
+enum bch_validate_flags;
int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_stripe ((struct bkey_ops) { \
.key_invalid = bch2_stripe_invalid, \
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 82a6656c941c..c66eeffcd7f2 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -176,6 +176,21 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
return s;
}
+/* s/fix?/fixing/ s/recreate?/recreating/ */
+static void prt_actioning(struct printbuf *out, const char *action)
+{
+ unsigned len = strlen(action);
+
+ BUG_ON(action[len - 1] != '?');
+ --len;
+
+ if (action[len - 1] == 'e')
+ --len;
+
+ prt_bytes(out, action, len);
+ prt_str(out, "ing");
+}
+
int bch2_fsck_err(struct bch_fs *c,
enum bch_fsck_flags flags,
enum bch_sb_error_id err,
@@ -186,6 +201,7 @@ int bch2_fsck_err(struct bch_fs *c,
bool print = true, suppressing = false, inconsistent = false;
struct printbuf buf = PRINTBUF, *out = &buf;
int ret = -BCH_ERR_fsck_ignore;
+ const char *action_orig = "fix?", *action = action_orig;
if ((flags & FSCK_CAN_FIX) &&
test_bit(err, c->sb.errors_silent))
@@ -197,6 +213,19 @@ int bch2_fsck_err(struct bch_fs *c,
prt_vprintf(out, fmt, args);
va_end(args);
+ /* Custom fix/continue/recreate/etc.? */
+ if (out->buf[out->pos - 1] == '?') {
+ const char *p = strrchr(out->buf, ',');
+ if (p) {
+ out->pos = p - out->buf;
+ action = kstrdup(p + 2, GFP_KERNEL);
+ if (!action) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+ }
+
mutex_lock(&c->fsck_error_msgs_lock);
s = fsck_err_get(c, fmt);
if (s) {
@@ -208,12 +237,16 @@ int bch2_fsck_err(struct bch_fs *c,
if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
ret = s->ret;
mutex_unlock(&c->fsck_error_msgs_lock);
- printbuf_exit(&buf);
- return ret;
+ goto err;
}
kfree(s->last_msg);
s->last_msg = kstrdup(buf.buf, GFP_KERNEL);
+ if (!s->last_msg) {
+ mutex_unlock(&c->fsck_error_msgs_lock);
+ ret = -ENOMEM;
+ goto err;
+ }
if (c->opts.ratelimit_errors &&
!(flags & FSCK_NO_RATELIMIT) &&
@@ -239,7 +272,8 @@ int bch2_fsck_err(struct bch_fs *c,
inconsistent = true;
ret = -BCH_ERR_fsck_errors_not_fixed;
} else if (flags & FSCK_CAN_FIX) {
- prt_str(out, ", fixing");
+ prt_str(out, ", ");
+ prt_actioning(out, action);
ret = -BCH_ERR_fsck_fix;
} else {
prt_str(out, ", continuing");
@@ -254,16 +288,16 @@ int bch2_fsck_err(struct bch_fs *c,
: c->opts.fix_errors;
if (fix == FSCK_FIX_ask) {
- int ask;
+ prt_str(out, ", ");
+ prt_str(out, action);
- prt_str(out, ": fix?");
if (bch2_fs_stdio_redirect(c))
bch2_print(c, "%s", out->buf);
else
bch2_print_string_as_lines(KERN_ERR, out->buf);
print = false;
- ask = bch2_fsck_ask_yn(c);
+ int ask = bch2_fsck_ask_yn(c);
if (ask >= YN_ALLNO && s)
s->fix = ask == YN_ALLNO
@@ -276,10 +310,12 @@ int bch2_fsck_err(struct bch_fs *c,
} else if (fix == FSCK_FIX_yes ||
(c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) {
- prt_str(out, ", fixing");
+ prt_str(out, ", ");
+ prt_actioning(out, action);
ret = -BCH_ERR_fsck_fix;
} else {
- prt_str(out, ", not fixing");
+ prt_str(out, ", not ");
+ prt_actioning(out, action);
}
} else if (flags & FSCK_NEED_FSCK) {
prt_str(out, " (run fsck to correct)");
@@ -311,8 +347,6 @@ int bch2_fsck_err(struct bch_fs *c,
mutex_unlock(&c->fsck_error_msgs_lock);
- printbuf_exit(&buf);
-
if (inconsistent)
bch2_inconsistent_error(c);
@@ -322,7 +356,10 @@ int bch2_fsck_err(struct bch_fs *c,
set_bit(BCH_FS_errors_not_fixed, &c->flags);
set_bit(BCH_FS_error, &c->flags);
}
-
+err:
+ if (action != action_orig)
+ kfree(action);
+ printbuf_exit(&buf);
return ret;
}
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index b9033bb4f11c..5f4fecb358da 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -72,7 +72,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
for_each_btree_key_norestart(trans, iter,
BTREE_ID_reflink, POS(0, idx + offset),
- BTREE_ITER_SLOTS, r_k, ret2) {
+ BTREE_ITER_slots, r_k, ret2) {
if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors)))
break;
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 1a331e539204..469037929685 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -71,6 +71,12 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
}
}
+static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
+{
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
+}
+
/*
* returns true if p1 is better than p2:
*/
@@ -79,11 +85,8 @@ static inline bool ptr_better(struct bch_fs *c,
const struct extent_ptr_decoded p2)
{
if (likely(!p1.idx && !p2.idx)) {
- struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
- struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
-
- u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
- u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
+ u64 l1 = dev_latency(c, p1.ptr.dev);
+ u64 l2 = dev_latency(c, p2.ptr.dev);
/* Pick at random, biased in favor of the faster device: */
@@ -109,21 +112,21 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
struct bch_dev_io_failures *f;
- struct bch_dev *ca;
int ret = 0;
if (k.k->type == KEY_TYPE_error)
return -EIO;
+ rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
/*
* Unwritten extent: no need to actually read, treat it as a
* hole and return 0s:
*/
- if (p.ptr.unwritten)
- return 0;
-
- ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ if (p.ptr.unwritten) {
+ ret = 0;
+ break;
+ }
/*
* If there are any dirty pointers it's an error if we can't
@@ -132,7 +135,9 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
if (!ret && !p.ptr.cached)
ret = -EIO;
- if (p.ptr.cached && ptr_stale(ca, &p.ptr))
+ struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+
+ if (p.ptr.cached && (!ca || dev_ptr_stale(ca, &p.ptr)))
continue;
f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
@@ -141,12 +146,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
? f->idx
: f->idx + 1;
- if (!p.idx &&
- !bch2_dev_is_readable(ca))
+ if (!p.idx && !ca)
p.idx++;
- if (bch2_force_reconstruct_read &&
- !p.idx && p.has_ec)
+ if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
+ p.idx++;
+
+ if (!p.idx && !bch2_dev_is_readable(ca))
p.idx++;
if (p.idx >= (unsigned) p.has_ec + 1)
@@ -158,6 +164,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
*pick = p;
ret = 1;
}
+ rcu_read_unlock();
return ret;
}
@@ -165,7 +172,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
/* KEY_TYPE_btree_ptr: */
int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
int ret = 0;
@@ -186,7 +193,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
}
int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
@@ -201,6 +208,11 @@ int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
c, err, btree_ptr_v2_min_key_bad,
"min_key > key");
+ if (flags & BCH_VALIDATE_write)
+ bkey_fsck_err_on(!bp.v->sectors_written,
+ c, err, btree_ptr_v2_written_0,
+ "sectors_written == 0");
+
ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
return ret;
@@ -247,7 +259,6 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
const union bch_extent_entry *en_r;
struct extent_ptr_decoded lp, rp;
bool use_right_ptr;
- struct bch_dev *ca;
en_l = l_ptrs.start;
en_r = r_ptrs.start;
@@ -278,8 +289,12 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
return false;
/* Extents may not straddle buckets: */
- ca = bch_dev_bkey_exists(c, lp.ptr.dev);
- if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr))
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
+ bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
+ rcu_read_unlock();
+
+ if (!same_bucket)
return false;
if (lp.has_ec != rp.has_ec ||
@@ -385,7 +400,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
/* KEY_TYPE_reservation: */
int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
@@ -667,16 +682,16 @@ static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent
unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
+ struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
- return __extent_ptr_durability(ca, p);
+ return ca ? __extent_ptr_durability(ca, p) : 0;
}
unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
+ struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
- if (ca->mi.state == BCH_MEMBER_STATE_failed)
+ if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
return 0;
return __extent_ptr_durability(ca, p);
@@ -689,8 +704,10 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
struct extent_ptr_decoded p;
unsigned durability = 0;
+ rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
durability += bch2_extent_ptr_durability(c, &p);
+ rcu_read_unlock();
return durability;
}
@@ -702,9 +719,11 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
struct extent_ptr_decoded p;
unsigned durability = 0;
+ rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
durability += bch2_extent_ptr_durability(c, &p);
+ rcu_read_unlock();
return durability;
}
@@ -833,8 +852,6 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
{
- struct bch_extent_ptr *ptr;
-
bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
}
@@ -860,14 +877,21 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned
bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_dev *ca;
+ bool ret = false;
+ rcu_read_lock();
bkey_for_each_ptr(ptrs, ptr)
if (bch2_dev_in_target(c, ptr->dev, target) &&
+ (ca = bch2_dev_rcu(c, ptr->dev)) &&
(!ptr->cached ||
- !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
- return true;
+ !dev_ptr_stale_rcu(ca, ptr))) {
+ ret = true;
+ break;
+ }
+ rcu_read_unlock();
- return false;
+ return ret;
}
bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
@@ -969,21 +993,23 @@ void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
*/
bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
{
- struct bch_extent_ptr *ptr;
+ struct bch_dev *ca;
+ rcu_read_lock();
bch2_bkey_drop_ptrs(k, ptr,
ptr->cached &&
- ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
+ (ca = bch2_dev_rcu(c, ptr->dev)) &&
+ dev_ptr_stale_rcu(ca, ptr));
+ rcu_read_unlock();
return bkey_deleted(k.k);
}
void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
{
- struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
- ? bch_dev_bkey_exists(c, ptr->dev)
- : NULL;
-
+ out->atomic++;
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
if (!ca) {
prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
@@ -998,11 +1024,11 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc
prt_str(out, " cached");
if (ptr->unwritten)
prt_str(out, " unwritten");
- if (b >= ca->mi.first_bucket &&
- b < ca->mi.nbuckets &&
- ptr_stale(ca, ptr))
+ if (bucket_valid(ca, b) && dev_ptr_stale_rcu(ca, ptr))
prt_printf(out, " stale");
}
+ rcu_read_unlock();
+ --out->atomic;
}
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
@@ -1069,55 +1095,50 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
static int extent_ptr_invalid(struct bch_fs *c,
struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
const struct bch_extent_ptr *ptr,
unsigned size_ondisk,
bool metadata,
struct printbuf *err)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- u64 bucket;
- u32 bucket_offset;
- struct bch_dev *ca;
int ret = 0;
- if (!bch2_dev_exists2(c, ptr->dev)) {
- /*
- * If we're in the write path this key might have already been
- * overwritten, and we could be seeing a device that doesn't
- * exist anymore due to racing with device removal:
- */
- if (flags & BKEY_INVALID_WRITE)
- return 0;
-
- bkey_fsck_err(c, err, ptr_to_invalid_device,
- "pointer to invalid device (%u)", ptr->dev);
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca) {
+ rcu_read_unlock();
+ return 0;
}
+ u32 bucket_offset;
+ u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
+ unsigned first_bucket = ca->mi.first_bucket;
+ u64 nbuckets = ca->mi.nbuckets;
+ unsigned bucket_size = ca->mi.bucket_size;
+ rcu_read_unlock();
- ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr2)
bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err,
ptr_to_duplicate_device,
"multiple pointers to same device (%u)", ptr->dev);
- bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
- bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err,
+ bkey_fsck_err_on(bucket >= nbuckets, c, err,
ptr_after_last_bucket,
- "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets);
- bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err,
+ "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
+ bkey_fsck_err_on(bucket < first_bucket, c, err,
ptr_before_first_bucket,
- "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket);
- bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err,
+ "pointer before first bucket (%llu < %u)", bucket, first_bucket);
+ bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, c, err,
ptr_spans_multiple_buckets,
"pointer spans multiple buckets (%u + %u > %u)",
- bucket_offset, size_ondisk, ca->mi.bucket_size);
+ bucket_offset, size_ondisk, bucket_size);
fsck_err:
return ret;
}
int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -1193,7 +1214,7 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
bkey_fsck_err_on(crc_is_encoded(crc) &&
(crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
- (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err,
+ (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), c, err,
ptr_crc_uncompressed_size_too_big,
"too large encoded extent");
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 528e817eacbd..1ade959652b2 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -8,7 +8,7 @@
struct bch_fs;
struct btree_trans;
-enum bkey_invalid_flags;
+enum bch_validate_flags;
/* extent entries: */
@@ -406,12 +406,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
/* KEY_TYPE_btree_ptr: */
int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
int, struct bkey_s);
@@ -448,7 +448,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
/* KEY_TYPE_reservation: */
int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
@@ -654,7 +654,7 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
do { \
struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \
\
- _ptr = &_ptrs.start->ptr; \
+ struct bch_extent_ptr *_ptr = &_ptrs.start->ptr; \
\
while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \
if (_cond) { \
@@ -680,7 +680,7 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_ptr_swab(struct bkey_s);
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
index 0f955c3c76a7..2eaffe37b5e7 100644
--- a/fs/bcachefs/eytzinger.c
+++ b/fs/bcachefs/eytzinger.c
@@ -171,7 +171,7 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size,
swap_r_func_t swap_func,
const void *priv)
{
- int i, c, r;
+ int i, j, k;
/* called from 'sort' without swap function, let's pick the default */
if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
@@ -188,17 +188,22 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size,
/* heapify */
for (i = n / 2 - 1; i >= 0; --i) {
- for (r = i; r * 2 + 1 < n; r = c) {
- c = r * 2 + 1;
+ /* Find the sift-down path all the way to the leaves. */
+ for (j = i; k = j * 2 + 1, k + 1 < n;)
+ j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
- if (c + 1 < n &&
- eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
- c++;
+ /* Special case for the last leaf with no sibling. */
+ if (j * 2 + 2 == n)
+ j = j * 2 + 1;
- if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
- break;
+ /* Backtrack to the correct location. */
+ while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0)
+ j = (j - 1) / 2;
- eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+ /* Shift the element into its correct place. */
+ for (k = j; j != i;) {
+ j = (j - 1) / 2;
+ eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
}
}
@@ -206,17 +211,22 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size,
for (i = n - 1; i > 0; --i) {
eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
- for (r = 0; r * 2 + 1 < i; r = c) {
- c = r * 2 + 1;
+ /* Find the sift-down path all the way to the leaves. */
+ for (j = 0; k = j * 2 + 1, k + 1 < i;)
+ j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
- if (c + 1 < i &&
- eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
- c++;
+ /* Special case for the last leaf with no sibling. */
+ if (j * 2 + 2 == i)
+ j = j * 2 + 1;
- if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
- break;
+ /* Backtrack to the correct location. */
+ while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0)
+ j = (j - 1) / 2;
- eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+ /* Shift the element into its correct place. */
+ for (k = j; j;) {
+ j = (j - 1) / 2;
+ eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
}
}
}
@@ -232,3 +242,64 @@ void eytzinger0_sort(void *base, size_t n, size_t size,
return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
}
+
+#if 0
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/ktime.h>
+
+static u64 cmp_count;
+
+static int mycmp(const void *a, const void *b)
+{
+ u32 _a = *(u32 *)a;
+ u32 _b = *(u32 *)b;
+
+ cmp_count++;
+ if (_a < _b)
+ return -1;
+ else if (_a > _b)
+ return 1;
+ else
+ return 0;
+}
+
+static int test(void)
+{
+ size_t N, i;
+ ktime_t start, end;
+ s64 delta;
+ u32 *arr;
+
+ for (N = 10000; N <= 100000; N += 10000) {
+ arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL);
+ cmp_count = 0;
+
+ for (i = 0; i < N; i++)
+ arr[i] = get_random_u32();
+
+ start = ktime_get();
+ eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL);
+ end = ktime_get();
+
+ delta = ktime_us_delta(end, start);
+ printk(KERN_INFO "time: %lld\n", delta);
+ printk(KERN_INFO "comparisons: %lld\n", cmp_count);
+
+ u32 prev = 0;
+
+ eytzinger0_for_each(i, N) {
+ if (prev > arr[i])
+ goto err;
+ prev = arr[i];
+ }
+
+ kfree(arr);
+ }
+ return 0;
+
+err:
+ kfree(arr);
+ return -1;
+}
+#endif
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 624e6f963240..508d029ac53d 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -42,7 +42,7 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
if (ret)
goto err;
@@ -70,7 +70,7 @@ int bch2_create_trans(struct btree_trans *trans,
struct bch_subvolume s;
ret = bch2_subvolume_get(trans, snapshot_src.subvol, true,
- BTREE_ITER_CACHED, &s);
+ BTREE_ITER_cached, &s);
if (ret)
goto err;
@@ -78,7 +78,7 @@ int bch2_create_trans(struct btree_trans *trans,
}
ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
@@ -163,7 +163,7 @@ int bch2_create_trans(struct btree_trans *trans,
name,
dir_target,
&dir_offset,
- BCH_HASH_SET_MUST_CREATE);
+ STR_HASH_must_create);
if (ret)
goto err;
@@ -171,7 +171,7 @@ int bch2_create_trans(struct btree_trans *trans,
new_inode->bi_dir_offset = dir_offset;
}
- inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+ inode_iter.flags &= ~BTREE_ITER_all_snapshots;
bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
ret = bch2_btree_iter_traverse(&inode_iter) ?:
@@ -198,16 +198,16 @@ int bch2_link_trans(struct btree_trans *trans,
if (dir.subvol != inum.subvol)
return -EXDEV;
- ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
if (ret)
- goto err;
+ return ret;
inode_u->bi_ctime = now;
ret = bch2_inode_nlink_inc(inode_u);
if (ret)
- return ret;
+ goto err;
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
if (ret)
goto err;
@@ -223,7 +223,7 @@ int bch2_link_trans(struct btree_trans *trans,
ret = bch2_dirent_create(trans, dir, &dir_hash,
mode_to_type(inode_u->bi_mode),
name, inum.inum, &dir_offset,
- BCH_HASH_SET_MUST_CREATE);
+ STR_HASH_must_create);
if (ret)
goto err;
@@ -255,19 +255,19 @@ int bch2_unlink_trans(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
if (ret)
goto err;
dir_hash = bch2_hash_info_init(c, dir_u);
ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
- name, &inum, BTREE_ITER_INTENT);
+ name, &inum, BTREE_ITER_intent);
if (ret)
goto err;
ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
@@ -322,7 +322,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
&dir_hash, &dirent_iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ BTREE_UPDATE_internal_snapshot_node) ?:
bch2_inode_write(trans, &dir_iter, dir_u) ?:
bch2_inode_write(trans, &inode_iter, inode_u);
err:
@@ -363,7 +363,7 @@ static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_p
struct bkey_i_subvolume *s =
bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_subvolumes, POS(0, subvol),
- BTREE_ITER_CACHED, subvolume);
+ BTREE_ITER_cached, subvolume);
int ret = PTR_ERR_OR_ZERO(s);
if (ret)
return ret;
@@ -394,7 +394,7 @@ int bch2_rename_trans(struct btree_trans *trans,
int ret;
ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
@@ -403,7 +403,7 @@ int bch2_rename_trans(struct btree_trans *trans,
if (dst_dir.inum != src_dir.inum ||
dst_dir.subvol != src_dir.subvol) {
ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
@@ -423,13 +423,13 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
if (dst_inum.inum) {
ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto err;
}
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 39292e7ef342..b0a33fabadf8 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -30,15 +30,8 @@ static void bch2_readpages_end_io(struct bio *bio)
{
struct folio_iter fi;
- bio_for_each_folio_all(fi, bio) {
- if (!bio->bi_status) {
- folio_mark_uptodate(fi.folio);
- } else {
- folio_clear_uptodate(fi.folio);
- folio_set_error(fi.folio);
- }
- folio_unlock(fi.folio);
- }
+ bio_for_each_folio_all(fi, bio)
+ folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK);
bio_put(bio);
}
@@ -176,7 +169,7 @@ retry:
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
- BTREE_ITER_SLOTS);
+ BTREE_ITER_slots);
while (1) {
struct bkey_s_c k;
unsigned bytes, sectors, offset_into_extent;
@@ -408,7 +401,6 @@ static void bch2_writepage_io_done(struct bch_write_op *op)
bio_for_each_folio_all(fi, bio) {
struct bch_folio *s;
- folio_set_error(fi.folio);
mapping_set_error(fi.folio->mapping, -EIO);
s = __bch2_folio(fi.folio);
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index b889370a5088..09d21aef879a 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -254,7 +254,7 @@ retry:
for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_SLOTS, k, err) {
+ BTREE_ITER_slots, k, err) {
if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
break;
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index d359aa9b33b8..872283e5bd1e 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -214,7 +214,7 @@ retry:
for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_SLOTS, k, ret) {
+ BTREE_ITER_slots, k, ret) {
unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
unsigned state = bkey_to_sector_state(k);
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 20b40477425f..ef20b64033e0 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -202,7 +202,10 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
ret = bch2_flush_inode(c, inode);
out:
- return bch2_err_class(ret);
+ ret = bch2_err_class(ret);
+ if (ret == -EROFS)
+ ret = -EIO;
+ return ret;
}
/* truncate: */
@@ -594,7 +597,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inode->v.i_ino, start_sector),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
while (!ret && bkey_lt(iter.pos, end_pos)) {
s64 i_sectors_delta = 0;
@@ -1009,7 +1012,7 @@ retry:
for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
SPOS(inode->v.i_ino, offset >> 9, snapshot),
- BTREE_ITER_SLOTS, k, ret) {
+ BTREE_ITER_slots, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
next_hole = bch2_seek_pagecache_hole(&inode->v,
offset, MAX_LFS_FILESIZE, 0, false);
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 3dc8630ff9fe..205a323ffc6d 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -548,7 +548,7 @@ long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
/* These are just misnamed, they actually get/put from/to user an int */
switch (cmd) {
- case FS_IOC_GETFLAGS:
+ case FS_IOC32_GETFLAGS:
cmd = FS_IOC_GETFLAGS;
break;
case FS_IOC32_SETFLAGS:
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 65b04b3c2679..fd851f10d11c 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -90,7 +90,7 @@ retry:
bch2_trans_begin(trans);
ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
- BTREE_ITER_INTENT) ?:
+ BTREE_ITER_intent) ?:
(set ? set(trans, inode, &inode_u, p) : 0) ?:
bch2_inode_write(trans, &iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
@@ -213,19 +213,43 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
_ret; \
})
+static struct inode *bch2_alloc_inode(struct super_block *sb)
+{
+ BUG();
+}
+
+static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
+{
+ struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
+ if (!inode)
+ return NULL;
+
+ inode_init_once(&inode->v);
+ mutex_init(&inode->ei_update_lock);
+ two_state_lock_init(&inode->ei_pagecache_lock);
+ INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
+ mutex_init(&inode->ei_quota_lock);
+ inode->v.i_state = 0;
+
+ if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
+ kmem_cache_free(bch2_inode_cache, inode);
+ return NULL;
+ }
+
+ return inode;
+}
+
/*
* Allocate a new inode, dropping/retaking btree locks if necessary:
*/
static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
{
- struct bch_fs *c = trans->c;
-
struct bch_inode_info *inode =
memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
- to_bch_ei(new_inode(c->vfs_sb)));
+ __bch2_new_inode(trans->c));
if (unlikely(!inode)) {
- int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
+ int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM);
if (ret && inode) {
__destroy_inode(&inode->v);
kmem_cache_free(bch2_inode_cache, inode);
@@ -290,7 +314,7 @@ __bch2_create(struct mnt_idmap *idmap,
if (ret)
return ERR_PTR(ret);
#endif
- inode = to_bch_ei(new_inode(c->vfs_sb));
+ inode = __bch2_new_inode(c);
if (unlikely(!inode)) {
inode = ERR_PTR(-ENOMEM);
goto err;
@@ -323,7 +347,7 @@ retry:
inum.inum = inode_u.bi_inum;
ret = bch2_subvolume_get(trans, inum.subvol, true,
- BTREE_ITER_WITH_UPDATES, &subvol) ?:
+ BTREE_ITER_with_updates, &subvol) ?:
bch2_trans_commit(trans, NULL, &journal_seq, 0);
if (unlikely(ret)) {
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
@@ -376,17 +400,14 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter dirent_iter = {};
subvol_inum inum = {};
+ struct printbuf buf = PRINTBUF;
- int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
- dir_hash_info, dir, name, 0);
+ struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
+ dir_hash_info, dir, name, 0);
+ int ret = bkey_err(k);
if (ret)
return ERR_PTR(ret);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
if (ret > 0)
ret = -ENOENT;
@@ -406,20 +427,31 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
- if (bch2_err_matches(ret, ENOENT)) {
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
- bch_err(c, "%s points to missing inode", buf.buf);
- printbuf_exit(&buf);
- }
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
+ c, "dirent to missing inode:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
if (ret)
goto err;
+ /* regular files may have hardlinks: */
+ if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) &&
+ !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
+ c,
+ "dirent points to inode that does not point back:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k),
+ prt_printf(&buf, "\n "),
+ bch2_inode_unpacked_to_text(&buf, &inode_u),
+ buf.buf))) {
+ ret = -ENOENT;
+ goto err;
+ }
+
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
inode = bch2_inode_insert(c, inode);
out:
bch2_trans_iter_exit(trans, &dirent_iter);
+ printbuf_exit(&buf);
return inode;
err:
inode = ERR_PTR(ret);
@@ -787,7 +819,7 @@ retry:
acl = NULL;
ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
if (ret)
goto btree_err;
@@ -1043,6 +1075,10 @@ retry:
bch2_btree_iter_set_pos(&iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
+
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ break;
}
start = iter.pos.offset;
bch2_trans_iter_exit(trans, &iter);
@@ -1490,34 +1526,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
mapping_set_large_folios(inode->v.i_mapping);
}
-static struct inode *bch2_alloc_inode(struct super_block *sb)
-{
- struct bch_inode_info *inode;
-
- inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
- if (!inode)
- return NULL;
-
- inode_init_once(&inode->v);
- mutex_init(&inode->ei_update_lock);
- two_state_lock_init(&inode->ei_pagecache_lock);
- INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
- mutex_init(&inode->ei_quota_lock);
-
- return &inode->v;
-}
-
-static void bch2_i_callback(struct rcu_head *head)
-{
- struct inode *vinode = container_of(head, struct inode, i_rcu);
- struct bch_inode_info *inode = to_bch_ei(vinode);
-
- kmem_cache_free(bch2_inode_cache, inode);
-}
-
-static void bch2_destroy_inode(struct inode *vinode)
+static void bch2_free_inode(struct inode *vinode)
{
- call_rcu(&vinode->i_rcu, bch2_i_callback);
+ kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
}
static int inode_update_times_fn(struct btree_trans *trans,
@@ -1825,7 +1836,7 @@ static int bch2_unfreeze(struct super_block *sb)
static const struct super_operations bch_super_operations = {
.alloc_inode = bch2_alloc_inode,
- .destroy_inode = bch2_destroy_inode,
+ .free_inode = bch2_free_inode,
.write_inode = bch2_vfs_write_inode,
.evict_inode = bch2_evict_inode,
.sync_fs = bch2_sync_fs,
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 8e2010212cc3..c8f57465131c 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -79,7 +79,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
POS(0, inode_nr),
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_all_snapshots);
k = bch2_btree_iter_peek(&iter);
ret = bkey_err(k);
if (ret)
@@ -127,13 +127,13 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans,
u64 *target, unsigned *type, u32 snapshot)
{
struct btree_iter iter;
- struct bkey_s_c_dirent d;
- int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
- &hash_info, dir, name, 0, snapshot);
+ struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
+ &hash_info, dir, name, 0, snapshot);
+ int ret = bkey_err(k);
if (ret)
return ret;
- d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
*target = le64_to_cpu(d.v->d_inum);
*type = d.v->d_type;
bch2_trans_iter_exit(trans, &iter);
@@ -154,12 +154,12 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
dir_hash_info = bch2_hash_info_init(c, &dir_inode);
- bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
&dir_hash_info, &iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &iter);
err:
bch_err_fn(c, ret);
@@ -274,9 +274,9 @@ create_lostfound:
&lostfound_str,
lostfound->bi_inum,
&lostfound->bi_dir_offset,
- BCH_HASH_SET_MUST_CREATE) ?:
+ STR_HASH_must_create) ?:
bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
err:
bch_err_msg(c, ret, "creating lost+found");
bch2_trans_iter_exit(trans, &lostfound_iter);
@@ -333,7 +333,7 @@ static int reattach_inode(struct btree_trans *trans,
&name,
inode->bi_subvol ?: inode->bi_inum,
&dir_offset,
- BCH_HASH_SET_MUST_CREATE);
+ STR_HASH_must_create);
if (ret)
return ret;
@@ -486,14 +486,9 @@ static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 in
return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG);
}
-struct snapshots_seen_entry {
- u32 id;
- u32 equiv;
-};
-
struct snapshots_seen {
struct bpos pos;
- DARRAY(struct snapshots_seen_entry) ids;
+ snapshot_id_list ids;
};
static inline void snapshots_seen_exit(struct snapshots_seen *s)
@@ -508,20 +503,15 @@ static inline void snapshots_seen_init(struct snapshots_seen *s)
static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
- struct snapshots_seen_entry *i, n = {
- .id = id,
- .equiv = bch2_snapshot_equiv(c, id),
- };
- int ret = 0;
-
+ u32 *i;
__darray_for_each(s->ids, i) {
- if (i->id == id)
+ if (*i == id)
return 0;
- if (i->id > id)
+ if (*i > id)
break;
}
- ret = darray_insert_item(&s->ids, i - s->ids.data, n);
+ int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
if (ret)
bch_err(c, "error reallocating snapshots_seen table (size %zu)",
s->ids.size);
@@ -531,42 +521,11 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
enum btree_id btree_id, struct bpos pos)
{
- struct snapshots_seen_entry n = {
- .id = pos.snapshot,
- .equiv = bch2_snapshot_equiv(c, pos.snapshot),
- };
- int ret = 0;
-
if (!bkey_eq(s->pos, pos))
s->ids.nr = 0;
-
s->pos = pos;
- s->pos.snapshot = n.equiv;
- darray_for_each(s->ids, i) {
- if (i->id == n.id)
- return 0;
-
- /*
- * We currently don't rigorously track for snapshot cleanup
- * needing to be run, so it shouldn't be a fsck error yet:
- */
- if (i->equiv == n.equiv) {
- bch_err(c, "snapshot deletion did not finish:\n"
- " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
- bch2_btree_id_str(btree_id),
- pos.inode, pos.offset,
- i->id, n.id, n.equiv);
- set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
- return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
- }
- }
-
- ret = darray_push(&s->ids, n);
- if (ret)
- bch_err(c, "error reallocating snapshots_seen table (size %zu)",
- s->ids.size);
- return ret;
+ return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
}
/**
@@ -586,12 +545,10 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
ssize_t i;
EBUG_ON(id > ancestor);
- EBUG_ON(!bch2_snapshot_is_equiv(c, id));
- EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
/* @ancestor should be the snapshot most recently added to @seen */
EBUG_ON(ancestor != seen->pos.snapshot);
- EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv);
+ EBUG_ON(ancestor != darray_last(seen->ids));
if (id == ancestor)
return true;
@@ -610,9 +567,9 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
*/
for (i = seen->ids.nr - 2;
- i >= 0 && seen->ids.data[i].equiv >= id;
+ i >= 0 && seen->ids.data[i] >= id;
--i)
- if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv))
+ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
return false;
return true;
@@ -643,9 +600,6 @@ static int ref_visible2(struct bch_fs *c,
u32 src, struct snapshots_seen *src_seen,
u32 dst, struct snapshots_seen *dst_seen)
{
- src = bch2_snapshot_equiv(c, src);
- dst = bch2_snapshot_equiv(c, dst);
-
if (dst > src) {
swap(dst, src);
swap(dst_seen, src_seen);
@@ -692,7 +646,7 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w,
return darray_push(&w->inodes, ((struct inode_walker_entry) {
.inode = u,
- .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot),
+ .snapshot = inode.k->p.snapshot,
}));
}
@@ -708,7 +662,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
w->inodes.nr = 0;
for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ BTREE_ITER_all_snapshots, k, ret) {
if (k.k->p.offset != inum)
break;
@@ -728,21 +682,20 @@ static struct inode_walker_entry *
lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
{
bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
- u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
struct inode_walker_entry *i;
__darray_for_each(w->inodes, i)
- if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
goto found;
return NULL;
found:
- BUG_ON(snapshot > i->snapshot);
+ BUG_ON(k.k->p.snapshot > i->snapshot);
- if (snapshot != i->snapshot && !is_whiteout) {
+ if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
struct inode_walker_entry new = *i;
- new.snapshot = snapshot;
+ new.snapshot = k.k->p.snapshot;
new.count = 0;
struct printbuf buf = PRINTBUF;
@@ -751,10 +704,10 @@ found:
bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
"unexpected because we should always update the inode when we update a key in that inode\n"
"%s",
- w->last_pos.inode, snapshot, i->snapshot, buf.buf);
+ w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
printbuf_exit(&buf);
- while (i > w->inodes.data && i[-1].snapshot > snapshot)
+ while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
--i;
size_t pos = i - w->inodes.data;
@@ -786,10 +739,10 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
return lookup_inode_for_snapshot(trans->c, w, k);
}
-static int __get_visible_inodes(struct btree_trans *trans,
- struct inode_walker *w,
- struct snapshots_seen *s,
- u64 inum)
+static int get_visible_inodes(struct btree_trans *trans,
+ struct inode_walker *w,
+ struct snapshots_seen *s,
+ u64 inum)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
@@ -799,19 +752,17 @@ static int __get_visible_inodes(struct btree_trans *trans,
w->inodes.nr = 0;
for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
-
+ BTREE_ITER_all_snapshots, k, ret) {
if (k.k->p.offset != inum)
break;
- if (!ref_visible(c, s, s->pos.snapshot, equiv))
+ if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
continue;
if (bkey_is_inode(k.k))
add_inode(c, w, k);
- if (equiv >= s->pos.snapshot)
+ if (k.k->p.snapshot >= s->pos.snapshot)
break;
}
bch2_trans_iter_exit(trans, &iter);
@@ -832,7 +783,7 @@ static int check_key_has_snapshot(struct btree_trans *trans,
"key in missing snapshot: %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
+ BTREE_UPDATE_internal_snapshot_node) ?: 1;
fsck_err:
printbuf_exit(&buf);
return ret;
@@ -861,8 +812,8 @@ static int hash_redo_key(struct btree_trans *trans,
bch2_hash_set_in_snapshot(trans, desc, hash_info,
(subvol_inum) { 0, k.k->p.inode },
k.k->p.snapshot, tmp,
- BCH_HASH_SET_MUST_CREATE,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ STR_HASH_must_create|
+ BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
@@ -891,7 +842,7 @@ static int hash_check_key(struct btree_trans *trans,
for_each_btree_key_norestart(trans, iter, desc.btree_id,
SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
- BTREE_ITER_SLOTS, k, ret) {
+ BTREE_ITER_slots, k, ret) {
if (bkey_eq(k.k->p, hash_k.k->p))
break;
@@ -1233,7 +1184,7 @@ int bch2_check_inodes(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_inode(trans, &iter, k, &prev, &s, full)));
@@ -1362,8 +1313,8 @@ static int overlapping_extents_found(struct btree_trans *trans,
BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
bch2_trans_iter_init(trans, &iter1, btree, pos1,
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_NOT_EXTENTS);
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_not_extents);
k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
ret = bkey_err(k1);
if (ret)
@@ -1425,7 +1376,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
ret = bch2_trans_update_extent_overwrite(trans, old_iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+ BTREE_UPDATE_internal_snapshot_node,
k1, k2) ?:
bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(c, &res);
@@ -1466,7 +1417,6 @@ static int check_overlapping_extents(struct btree_trans *trans,
struct snapshots_seen *seen,
struct extent_ends *extent_ends,
struct bkey_s_c k,
- u32 equiv,
struct btree_iter *iter,
bool *fixed)
{
@@ -1535,11 +1485,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
struct printbuf buf = PRINTBUF;
- struct bpos equiv = k.k->p;
int ret = 0;
- equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
-
ret = check_key_has_snapshot(trans, iter, k);
if (ret) {
ret = ret < 0 ? ret : 0;
@@ -1589,8 +1536,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
goto delete;
- ret = check_overlapping_extents(trans, s, extent_ends, k,
- equiv.snapshot, iter,
+ ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
&inode->recalculate_sums);
if (ret)
goto err;
@@ -1607,8 +1553,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
for (;
inode->inodes.data && i >= inode->inodes.data;
--i) {
- if (i->snapshot > equiv.snapshot ||
- !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
+ if (i->snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
continue;
if (k.k->type != KEY_TYPE_whiteout) {
@@ -1625,7 +1571,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
ret = bch2_btree_iter_traverse(&iter2) ?:
bch2_btree_delete_at(trans, &iter2,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &iter2);
if (ret)
goto err;
@@ -1652,7 +1598,7 @@ fsck_err:
bch_err_fn(c, ret);
return ret;
delete:
- ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
goto out;
}
@@ -1673,7 +1619,7 @@ int bch2_check_extents(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL,
BCH_TRANS_COMMIT_no_enospc, ({
bch2_disk_reservation_put(c, &res);
@@ -1698,7 +1644,7 @@ int bch2_check_indirect_extents(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
&res, NULL,
BCH_TRANS_COMMIT_no_enospc, ({
bch2_disk_reservation_put(c, &res);
@@ -1767,6 +1713,15 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
if (inode_points_to_dirent(target, d))
return 0;
+ if (bch2_inode_should_have_bp(target) &&
+ !fsck_err(c, inode_wrong_backpointer,
+ "dirent points to inode that does not point back:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n "),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf)))
+ goto out_noiter;
+
if (!target->bi_dir &&
!target->bi_dir_offset) {
target->bi_dir = d.k->p.inode;
@@ -1835,6 +1790,7 @@ out:
err:
fsck_err:
bch2_trans_iter_exit(trans, &bp_iter);
+out_noiter:
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
@@ -2052,7 +2008,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
struct printbuf buf = PRINTBUF;
- struct bpos equiv;
int ret = 0;
ret = check_key_has_snapshot(trans, iter, k);
@@ -2061,9 +2016,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto out;
}
- equiv = k.k->p;
- equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
-
ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
if (ret)
goto err;
@@ -2104,7 +2056,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
goto out;
}
@@ -2140,14 +2092,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
} else {
- ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
+ ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
if (ret)
goto err;
if (fsck_err_on(!target->inodes.nr,
c, dirent_to_missing_inode,
- "dirent points to missing inode: (equiv %u)\n%s",
- equiv.snapshot,
+ "dirent points to missing inode:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
@@ -2164,7 +2115,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
}
if (d.v->d_type == DT_DIR)
- for_each_visible_inode(c, s, dir, equiv.snapshot, i)
+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
i->count++;
}
out:
@@ -2191,7 +2142,7 @@ int bch2_check_dirents(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
k,
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
@@ -2255,7 +2206,7 @@ int bch2_check_xattrs(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
k,
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
@@ -2422,7 +2373,7 @@ int bch2_check_subvolume_structure(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_subvol_path(trans, &iter, k)));
bch_err_fn(c, ret);
@@ -2457,7 +2408,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
struct btree_iter inode_iter = {};
struct bch_inode_unpacked inode;
struct printbuf buf = PRINTBUF;
- u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot);
+ u32 snapshot = inode_k.k->p.snapshot;
int ret = 0;
p->nr = 0;
@@ -2559,9 +2510,9 @@ int bch2_check_directory_structure(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_intent|
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
if (!bkey_is_inode(k.k))
continue;
@@ -2661,9 +2612,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_inodes,
POS(0, start),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_intent|
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
if (!bkey_is_inode(k.k))
continue;
@@ -2704,9 +2655,9 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_intent|
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
if (ret)
break;
@@ -2717,8 +2668,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
if (d.v->d_type != DT_DIR &&
d.v->d_type != DT_SUBVOL)
inc_link(c, &s, links, range_start, range_end,
- le64_to_cpu(d.v->d_inum),
- bch2_snapshot_equiv(c, d.k->p.snapshot));
+ le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
}
0;
})));
@@ -2781,7 +2731,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS(0, range_start),
- BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
if (ret < 0) {
@@ -2849,7 +2799,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
u->v.front_pad = 0;
u->v.back_pad = 0;
- return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
+ return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
}
int bch2_fix_reflink_p(struct bch_fs *c)
@@ -2860,8 +2810,8 @@ int bch2_fix_reflink_p(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_extents, POS_MIN,
- BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_intent|BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
fix_reflink_p_key(trans, &iter, k)));
bch_err_fn(c, ret);
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 0f95d7fb5ec0..aafa79fa6351 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -339,7 +339,7 @@ int bch2_inode_peek_nowarn(struct btree_trans *trans,
k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
SPOS(0, inum.inum, snapshot),
- flags|BTREE_ITER_CACHED);
+ flags|BTREE_ITER_cached);
ret = bkey_err(k);
if (ret)
return ret;
@@ -371,7 +371,7 @@ int bch2_inode_peek(struct btree_trans *trans,
int bch2_inode_write_flags(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
- enum btree_update_flags flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_inode_buf *inode_p;
@@ -399,7 +399,7 @@ int __bch2_fsck_write_inode(struct btree_trans *trans,
return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
&inode_p->inode.k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
}
int bch2_fsck_write_inode(struct btree_trans *trans,
@@ -473,7 +473,7 @@ fsck_err:
}
int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
@@ -490,7 +490,7 @@ fsck_err:
}
int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
@@ -507,7 +507,7 @@ fsck_err:
}
int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
@@ -535,29 +535,19 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out,
struct bch_inode_unpacked *inode)
{
printbuf_indent_add(out, 2);
- prt_printf(out, "mode=%o", inode->bi_mode);
- prt_newline(out);
+ prt_printf(out, "mode=%o\n", inode->bi_mode);
prt_str(out, "flags=");
prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
- prt_printf(out, " (%x)", inode->bi_flags);
- prt_newline(out);
+ prt_printf(out, " (%x)\n", inode->bi_flags);
- prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq);
- prt_newline(out);
-
- prt_printf(out, "bi_size=%llu", inode->bi_size);
- prt_newline(out);
-
- prt_printf(out, "bi_sectors=%llu", inode->bi_sectors);
- prt_newline(out);
-
- prt_printf(out, "bi_version=%llu", inode->bi_version);
- prt_newline(out);
+ prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq);
+ prt_printf(out, "bi_size=%llu\n", inode->bi_size);
+ prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors);
+ prt_printf(out, "bi_version=%llu\n", inode->bi_version);
#define x(_name, _bits) \
- prt_printf(out, #_name "=%llu", (u64) inode->_name); \
- prt_newline(out);
+ prt_printf(out, #_name "=%llu\n", (u64) inode->_name);
BCH_INODE_FIELDS_v3()
#undef x
printbuf_indent_sub(out, 2);
@@ -604,11 +594,11 @@ int bch2_trigger_inode(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old,
struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
s64 nr = (s64) bkey_is_inode(new.k) - (s64) bkey_is_inode(old.k);
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
if (nr) {
int ret = bch2_replicas_deltas_realloc(trans, 0);
if (ret)
@@ -627,13 +617,13 @@ int bch2_trigger_inode(struct btree_trans *trans,
}
}
- if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
+ if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
BUG_ON(!trans->journal_res.seq);
bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
}
- if (flags & BTREE_TRIGGER_GC) {
+ if (flags & BTREE_TRIGGER_gc) {
struct bch_fs *c = trans->c;
percpu_down_read(&c->mark_lock);
@@ -645,7 +635,7 @@ int bch2_trigger_inode(struct btree_trans *trans,
}
int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
int ret = 0;
@@ -762,8 +752,8 @@ int bch2_inode_create(struct btree_trans *trans,
pos = start;
bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_INTENT);
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_intent);
again:
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k)) &&
@@ -824,7 +814,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
* extent iterator:
*/
bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
while (1) {
bch2_trans_begin(trans);
@@ -846,7 +836,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
bkey_init(&delete.k);
delete.k.p = iter.pos;
- if (iter.flags & BTREE_ITER_IS_EXTENTS)
+ if (iter.flags & BTREE_ITER_is_extents)
bch2_key_resize(&delete.k,
bpos_min(end, k.k->p).offset -
iter.pos.offset);
@@ -895,7 +885,7 @@ retry:
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
SPOS(0, inum.inum, snapshot),
- BTREE_ITER_INTENT|BTREE_ITER_CACHED);
+ BTREE_ITER_intent|BTREE_ITER_cached);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1055,7 +1045,7 @@ retry:
bch2_trans_begin(trans);
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
+ SPOS(0, inum, snapshot), BTREE_ITER_intent);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1100,7 +1090,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
struct bch_inode_unpacked inode;
int ret;
- k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
+ k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
ret = bkey_err(k);
if (ret)
return ret;
@@ -1152,7 +1142,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
inode.bi_flags &= ~BCH_INODE_unlinked;
ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
bch_err_msg(c, ret, "clearing inode unlinked flag");
if (ret)
goto out;
@@ -1199,7 +1189,7 @@ again:
* flushed and we'd spin:
*/
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
if (ret > 0) {
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 056298050550..679f5f5e5d15 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -6,19 +6,20 @@
#include "bkey_methods.h"
#include "opts.h"
-enum bkey_invalid_flags;
+enum bch_validate_flags;
extern const char * const bch2_inode_opts[];
int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_inode ((struct bkey_ops) { \
.key_invalid = bch2_inode_invalid, \
@@ -49,7 +50,7 @@ static inline bool bkey_is_inode(const struct bkey *k)
}
int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \
@@ -101,7 +102,7 @@ int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, subvol_inum, unsigned);
int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, enum btree_update_flags);
+ struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags);
static inline int bch2_inode_write(struct btree_trans *trans,
struct btree_iter *iter,
@@ -220,6 +221,14 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
+static inline bool bch2_inode_should_have_bp(struct bch_inode_unpacked *inode)
+{
+ bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;
+
+ return S_ISDIR(inode->bi_mode) ||
+ (!inode->bi_nlink && inode_has_bp);
+}
+
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 82f9170dab3f..4ec979b4b23e 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -198,7 +198,7 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inum.inum, start),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
@@ -230,7 +230,7 @@ static int truncate_set_isize(struct btree_trans *trans,
struct bch_inode_unpacked inode_u;
int ret;
- ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?:
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent) ?:
(inode_u.bi_size = new_i_size, 0) ?:
bch2_inode_write(trans, &iter, &inode_u);
@@ -256,7 +256,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
bch2_trans_iter_exit(trans, &fpunch_iter);
@@ -317,7 +317,7 @@ static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset
offset <<= 9;
len <<= 9;
- ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent);
if (ret)
return ret;
@@ -365,7 +365,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inum.inum, 0),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
switch (op->v.state) {
case LOGGED_OP_FINSERT_start:
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 8a556e6d1ab6..f57486794484 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -378,7 +378,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
bch2_bkey_buf_init(&sk);
bch2_trans_iter_init(trans, &iter, rbio->data_btree,
- rbio->read_pos, BTREE_ITER_SLOTS);
+ rbio->read_pos, BTREE_ITER_slots);
retry:
rbio->bio.bi_status = 0;
@@ -487,7 +487,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
return 0;
k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
if ((ret = bkey_err(k)))
goto out;
@@ -523,7 +523,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
goto out;
ret = bch2_trans_update(trans, &iter, new,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
out:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -541,7 +541,6 @@ static void __bch2_read_endio(struct work_struct *work)
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
struct bch_fs *c = rbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
struct bio *src = &rbio->bio;
struct bio *dst = &bch2_rbio_parent(rbio)->bio;
struct bvec_iter dst_iter = rbio->bvec_iter;
@@ -647,13 +646,15 @@ csum_err:
prt_str(&buf, "data ");
bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
- bch_err_inum_offset_ratelimited(ca,
- rbio->read_pos.inode,
- rbio->read_pos.offset << 9,
- "data %s", buf.buf);
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ if (ca) {
+ bch_err_inum_offset_ratelimited(ca,
+ rbio->read_pos.inode,
+ rbio->read_pos.offset << 9,
+ "data %s", buf.buf);
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ }
printbuf_exit(&buf);
-
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
decompression_err:
@@ -675,7 +676,7 @@ static void bch2_read_endio(struct bio *bio)
struct bch_read_bio *rbio =
container_of(bio, struct bch_read_bio, bio);
struct bch_fs *c = rbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
struct workqueue_struct *wq = NULL;
enum rbio_context context = RBIO_CONTEXT_NULL;
@@ -687,17 +688,21 @@ static void bch2_read_endio(struct bio *bio)
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
- rbio->read_pos.inode,
- rbio->read_pos.offset,
- "data read error: %s",
- bch2_blk_status_to_str(bio->bi_status))) {
+ if (bio->bi_status) {
+ if (ca) {
+ bch_err_inum_offset_ratelimited(ca,
+ rbio->read_pos.inode,
+ rbio->read_pos.offset,
+ "data read error: %s",
+ bch2_blk_status_to_str(bio->bi_status));
+ bch2_io_error(ca, BCH_MEMBER_ERROR_read);
+ }
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return;
}
if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
- ptr_stale(ca, &rbio->pick.ptr)) {
+ (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
trace_and_count(c, read_reuse_race, &rbio->bio);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
@@ -758,22 +763,21 @@ err:
}
static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+ struct bch_dev *ca,
struct bkey_s_c k,
struct bch_extent_ptr ptr)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
struct btree_iter iter;
struct printbuf buf = PRINTBUF;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- PTR_BUCKET_POS(c, &ptr),
- BTREE_ITER_CACHED);
+ PTR_BUCKET_POS(ca, &ptr),
+ BTREE_ITER_cached);
- prt_printf(&buf, "Attempting to read from stale dirty pointer:");
+ prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
printbuf_indent_add(&buf, 2);
- prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
@@ -801,7 +805,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct bch_read_bio *rbio = NULL;
- struct bch_dev *ca = NULL;
struct promote_op *promote = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k);
@@ -832,7 +835,7 @@ retry_pick:
goto err;
}
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
/*
* Stale dirty pointers are treated as IO errors, but @failed isn't
@@ -842,9 +845,11 @@ retry_pick:
*/
if ((flags & BCH_READ_IN_RETRY) &&
!pick.ptr.cached &&
- unlikely(ptr_stale(ca, &pick.ptr))) {
- read_from_stale_dirty_pointer(trans, k, pick.ptr);
+ ca &&
+ unlikely(dev_ptr_stale(ca, &pick.ptr))) {
+ read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
bch2_mark_io_failure(failed, &pick);
+ percpu_ref_put(&ca->io_ref);
goto retry_pick;
}
@@ -859,8 +864,11 @@ retry_pick:
* can happen if we retry, and the extent we were going to read
* has been merged in the meantime:
*/
- if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
+ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) {
+ if (ca)
+ percpu_ref_put(&ca->io_ref);
goto hole;
+ }
iter.bi_size = pick.crc.compressed_size << 9;
goto get_bio;
@@ -965,7 +973,7 @@ get_bio:
rbio->bvec_iter = iter;
rbio->offset_into_extent= offset_into_extent;
rbio->flags = flags;
- rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
+ rbio->have_ioref = ca != NULL;
rbio->narrow_crcs = narrow_crcs;
rbio->hole = 0;
rbio->retry = 0;
@@ -995,7 +1003,7 @@ get_bio:
* If it's being moved internally, we don't want to flag it as a cache
* hit:
*/
- if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
+ if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE))
bch2_bucket_io_time_reset(trans, pick.ptr.dev,
PTR_BUCKET_NR(ca, &pick.ptr), READ);
@@ -1113,7 +1121,7 @@ retry:
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
- BTREE_ITER_SLOTS);
+ BTREE_ITER_slots);
while (1) {
unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 40d7df7607df..9401d13e31bb 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -166,7 +166,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
bch2_trans_copy_iter(&iter, extent_iter);
for_each_btree_key_upto_continue_norestart(iter,
- new->k.p, BTREE_ITER_SLOTS, old, ret) {
+ new->k.p, BTREE_ITER_slots, old, ret) {
s64 sectors = min(new->k.p.offset, old.k->p.offset) -
max(bkey_start_offset(&new->k),
bkey_start_offset(old.k));
@@ -210,14 +210,14 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
* to be journalled - if we crash, the bi_journal_seq update will be
* lost, but that's fine.
*/
- unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
+ unsigned inode_update_flags = BTREE_UPDATE_nojournal;
struct btree_iter iter;
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
SPOS(0,
extent_iter->pos.inode,
extent_iter->snapshot),
- BTREE_ITER_CACHED);
+ BTREE_ITER_cached);
int ret = bkey_err(k);
if (unlikely(ret))
return ret;
@@ -259,7 +259,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
}
ret = bch2_trans_update(trans, &iter, &inode->k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ BTREE_UPDATE_internal_snapshot_node|
inode_update_flags);
err:
bch2_trans_iter_exit(trans, &iter);
@@ -368,7 +368,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
bkey_start_pos(&sk.k->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
bch2_extent_update(trans, inum, &iter, sk.k,
@@ -407,13 +407,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
BUG_ON(c->opts.nochanges);
bkey_for_each_ptr(ptrs, ptr) {
- BUG_ON(!bch2_dev_exists2(c, ptr->dev));
-
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca = nocow
+ ? bch2_dev_have_ref(c, ptr->dev)
+ : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE);
if (to_entry(ptr + 1) < ptrs.end) {
- n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
- GFP_NOFS, &ca->replica_set));
+ n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));
n->bio.bi_end_io = wbio->bio.bi_end_io;
n->bio.bi_private = wbio->bio.bi_private;
@@ -430,11 +429,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->c = c;
n->dev = ptr->dev;
- n->have_ioref = nocow || bch2_dev_get_ioref(ca,
- type == BCH_DATA_btree ? READ : WRITE);
+ n->have_ioref = ca != NULL;
n->nocow = nocow;
n->submit_time = local_clock();
n->inode_offset = bkey_start_offset(&k->k);
+ if (nocow)
+ n->nocow_bucket = PTR_BUCKET_NR(ca, ptr);
n->bio.bi_iter.bi_sector = ptr->offset;
if (likely(n->have_ioref)) {
@@ -481,7 +481,6 @@ static void bch2_write_done(struct closure *cl)
static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
{
struct keylist *keys = &op->insert_keys;
- struct bch_extent_ptr *ptr;
struct bkey_i *src, *dst = keys->keys, *n;
for (src = keys->keys; src != keys->top; src = n) {
@@ -650,7 +649,9 @@ static void bch2_write_endio(struct bio *bio)
struct bch_write_bio *wbio = to_wbio(bio);
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_fs *c = wbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
+ struct bch_dev *ca = wbio->have_ioref
+ ? bch2_dev_have_ref(c, wbio->dev)
+ : NULL;
if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
op->pos.inode,
@@ -661,8 +662,12 @@ static void bch2_write_endio(struct bio *bio)
op->flags |= BCH_WRITE_IO_ERROR;
}
- if (wbio->nocow)
+ if (wbio->nocow) {
+ bch2_bucket_nocow_unlock(&c->nocow_locks,
+ POS(ca->dev_idx, wbio->nocow_bucket),
+ BUCKET_NOCOW_LOCK_UPDATE);
set_bit(wbio->dev, op->devs_need_flush->d);
+ }
if (wbio->have_ioref) {
bch2_latency_acct(ca, wbio->submit_time, WRITE);
@@ -1101,30 +1106,21 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
return false;
e = bkey_s_c_to_extent(k);
+
+ rcu_read_lock();
extent_for_each_ptr_decode(e, p, entry) {
- if (crc_is_encoded(p.crc) || p.has_ec)
+ if (crc_is_encoded(p.crc) || p.has_ec) {
+ rcu_read_unlock();
return false;
+ }
replicas += bch2_extent_ptr_durability(c, &p);
}
+ rcu_read_unlock();
return replicas >= op->opts.data_replicas;
}
-static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
-
- for_each_keylist_key(&op->insert_keys, k) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
-
- bkey_for_each_ptr(ptrs, ptr)
- bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, ptr),
- BUCKET_NOCOW_LOCK_UPDATE);
- }
-}
-
static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *orig,
@@ -1158,7 +1154,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
return bch2_extent_update_i_size_sectors(trans, iter,
min(new->k.p.offset << 9, new_i_size), 0) ?:
bch2_trans_update(trans, iter, new,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
}
static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
@@ -1169,7 +1165,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
for_each_keylist_key(&op->insert_keys, orig) {
int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
- BTREE_ITER_INTENT, k,
+ BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
}));
@@ -1195,8 +1191,6 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
static void __bch2_nocow_write_done(struct bch_write_op *op)
{
- bch2_nocow_write_unlock(op);
-
if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
op->error = -EIO;
} else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
@@ -1242,12 +1236,16 @@ retry:
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(op->pos.inode, op->pos.offset, snapshot),
- BTREE_ITER_SLOTS);
+ BTREE_ITER_slots);
while (1) {
struct bio *bio = &op->wbio.bio;
buckets.nr = 0;
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ break;
+
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
@@ -1267,14 +1265,15 @@ retry:
/* Get iorefs before dropping btree locks: */
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
- struct bpos b = PTR_BUCKET_POS(c, ptr);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
+ if (unlikely(!ca))
+ goto err_get_ioref;
+
+ struct bpos b = PTR_BUCKET_POS(ca, ptr);
struct nocow_lock_bucket *l =
bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
prefetch(l);
- if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
- goto err_get_ioref;
-
/* XXX allocating memory with btree locks held - rare */
darray_push_gfp(&buckets, ((struct bucket_to_lock) {
.b = b, .gen = ptr->gen, .l = l,
@@ -1293,7 +1292,7 @@ retry:
bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
darray_for_each(buckets, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, i->b.inode);
+ struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);
__bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
bucket_to_u64(i->b),
@@ -1370,7 +1369,7 @@ err:
return;
err_get_ioref:
darray_for_each(buckets, i)
- percpu_ref_put(&bch_dev_bkey_exists(c, i->b.inode)->io_ref);
+ percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref);
/* Fall back to COW path: */
goto out;
@@ -1491,7 +1490,11 @@ err:
if ((op->flags & BCH_WRITE_SYNC) ||
(!(op->flags & BCH_WRITE_DONE) &&
!(op->flags & BCH_WRITE_IN_WORKER))) {
- closure_sync(&op->cl);
+ if (closure_sync_timeout(&op->cl, HZ * 10)) {
+ bch2_print_allocator_stuck(c);
+ closure_sync(&op->cl);
+ }
+
__bch2_write_index(op);
if (!(op->flags & BCH_WRITE_DONE))
@@ -1649,8 +1652,7 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
prt_bitflags(out, bch2_write_flags, op->flags);
prt_newline(out);
- prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
- prt_newline(out);
+ prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl));
printbuf_indent_sub(out, 2);
}
@@ -1658,13 +1660,14 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
void bch2_fs_io_write_exit(struct bch_fs *c)
{
mempool_exit(&c->bio_bounce_pages);
+ bioset_exit(&c->replica_set);
bioset_exit(&c->bio_write);
}
int bch2_fs_io_write_init(struct bch_fs *c)
{
- if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
- BIOSET_NEED_BVECS))
+ if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
+ bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
return -BCH_ERR_ENOMEM_bio_write_init;
if (mempool_init_page_pool(&c->bio_bounce_pages,
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
index c7f97c2c4805..6e878a6f2f0b 100644
--- a/fs/bcachefs/io_write_types.h
+++ b/fs/bcachefs/io_write_types.h
@@ -20,6 +20,7 @@ struct bch_write_bio {
u64 submit_time;
u64 inode_offset;
+ u64 nocow_bucket;
struct bch_devs_list failed;
u8 dev;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index a8b08e76d0d0..adec8e1ea73e 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -53,29 +53,19 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
unsigned i = seq & JOURNAL_BUF_MASK;
struct journal_buf *buf = j->buf + i;
- prt_str(out, "seq:");
- prt_tab(out);
- prt_printf(out, "%llu", seq);
- prt_newline(out);
+ prt_printf(out, "seq:\t%llu\n", seq);
printbuf_indent_add(out, 2);
- prt_str(out, "refcount:");
- prt_tab(out);
- prt_printf(out, "%u", journal_state_count(s, i));
- prt_newline(out);
+ prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i));
- prt_str(out, "size:");
- prt_tab(out);
+ prt_printf(out, "size:\t");
prt_human_readable_u64(out, vstruct_bytes(buf->data));
prt_newline(out);
- prt_str(out, "expires:");
- prt_tab(out);
- prt_printf(out, "%li jiffies", buf->expires - jiffies);
- prt_newline(out);
+ prt_printf(out, "expires:\t");
+ prt_printf(out, "%li jiffies\n", buf->expires - jiffies);
- prt_str(out, "flags:");
- prt_tab(out);
+ prt_printf(out, "flags:\t");
if (buf->noflush)
prt_str(out, "noflush ");
if (buf->must_flush)
@@ -87,9 +77,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
if (buf->write_started)
prt_str(out, "write_started ");
if (buf->write_allocated)
- prt_str(out, "write allocated ");
+ prt_str(out, "write_allocated ");
if (buf->write_done)
- prt_str(out, "write done");
+ prt_str(out, "write_done");
prt_newline(out);
printbuf_indent_sub(out, 2);
@@ -948,7 +938,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
break;
}
} else {
- ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl);
+ ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal,
+ BCH_DATA_journal, cl);
ret = PTR_ERR_OR_ZERO(ob[nr_got]);
if (ret)
break;
@@ -956,7 +947,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
ret = bch2_trans_run(c,
bch2_trans_mark_metadata_bucket(trans, ca,
ob[nr_got]->bucket, BCH_DATA_journal,
- ca->mi.bucket_size));
+ ca->mi.bucket_size, BTREE_TRIGGER_transactional));
if (ret) {
bch2_open_bucket_put(c, ob[nr_got]);
bch_err_msg(c, ret, "marking new journal buckets");
@@ -1036,7 +1027,8 @@ err_unblock:
for (i = 0; i < nr_got; i++)
bch2_trans_run(c,
bch2_trans_mark_metadata_bucket(trans, ca,
- bu[i], BCH_DATA_free, 0));
+ bu[i], BCH_DATA_free, 0,
+ BTREE_TRIGGER_transactional));
err_free:
if (!new_fs)
for (i = 0; i < nr_got; i++)
@@ -1187,12 +1179,14 @@ void bch2_fs_journal_stop(struct journal *j)
bch2_journal_meta(j);
journal_quiesce(j);
+ cancel_delayed_work_sync(&j->write_work);
BUG_ON(!bch2_journal_error(j) &&
- test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
+ test_bit(JOURNAL_replay_done, &j->flags) &&
j->last_empty_seq != journal_cur_seq(j));
- cancel_delayed_work_sync(&j->write_work);
+ if (!bch2_journal_error(j))
+ clear_bit(JOURNAL_running, &j->flags);
}
int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
@@ -1266,7 +1260,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
spin_lock(&j->lock);
- set_bit(JOURNAL_STARTED, &j->flags);
+ set_bit(JOURNAL_running, &j->flags);
j->last_flush_write = jiffies;
j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
@@ -1407,6 +1401,13 @@ int bch2_fs_journal_init(struct journal *j)
/* debug: */
+static const char * const bch2_journal_flags_strs[] = {
+#define x(n) #n,
+ JOURNAL_FLAGS()
+#undef x
+ NULL
+};
+
void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
@@ -1415,19 +1416,22 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 24);
+ printbuf_tabstop_push(out, 28);
out->atomic++;
rcu_read_lock();
s = READ_ONCE(j->reservations);
+ prt_printf(out, "flags:\t");
+ prt_bitflags(out, bch2_journal_flags_strs, j->flags);
+ prt_newline(out);
prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
- prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
- prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk);
- prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
+ prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j));
+ prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk);
+ prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j));
prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
- prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
+ prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]);
prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
@@ -1436,48 +1440,44 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_newline(out);
prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
- prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
+ prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked);
prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
- prt_printf(out, "blocked:\t\t%u\n", j->blocked);
+ prt_printf(out, "blocked:\t%u\n", j->blocked);
prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
- prt_printf(out, "current entry:\t\t");
+ prt_printf(out, "current entry:\t");
switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL:
- prt_printf(out, "error");
+ prt_printf(out, "error\n");
break;
case JOURNAL_ENTRY_CLOSED_VAL:
- prt_printf(out, "closed");
+ prt_printf(out, "closed\n");
break;
default:
- prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s);
+ prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
break;
}
- prt_newline(out);
- prt_printf(out, "unwritten entries:");
- prt_newline(out);
+ prt_printf(out, "unwritten entries:\n");
bch2_journal_bufs_to_text(out, j);
- prt_printf(out,
- "replay done:\t\t%i\n",
- test_bit(JOURNAL_REPLAY_DONE, &j->flags));
-
prt_printf(out, "space:\n");
- prt_printf(out, "\tdiscarded\t%u:%u\n",
+ printbuf_indent_add(out, 2);
+ prt_printf(out, "discarded\t%u:%u\n",
j->space[journal_space_discarded].next_entry,
j->space[journal_space_discarded].total);
- prt_printf(out, "\tclean ondisk\t%u:%u\n",
+ prt_printf(out, "clean ondisk\t%u:%u\n",
j->space[journal_space_clean_ondisk].next_entry,
j->space[journal_space_clean_ondisk].total);
- prt_printf(out, "\tclean\t\t%u:%u\n",
+ prt_printf(out, "clean\t%u:%u\n",
j->space[journal_space_clean].next_entry,
j->space[journal_space_clean].total);
- prt_printf(out, "\ttotal\t\t%u:%u\n",
+ prt_printf(out, "total\t%u:%u\n",
j->space[journal_space_total].next_entry,
j->space[journal_space_total].total);
+ printbuf_indent_sub(out, 2);
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
@@ -1488,14 +1488,16 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
if (!ja->nr)
continue;
- prt_printf(out, "dev %u:\n", ca->dev_idx);
- prt_printf(out, "\tnr\t\t%u\n", ja->nr);
- prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size);
- prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
- prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx);
- prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
- prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
- prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
+ prt_printf(out, "dev %u:\n", ca->dev_idx);
+ printbuf_indent_add(out, 2);
+ prt_printf(out, "nr\t%u\n", ja->nr);
+ prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size);
+ prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
+ prt_printf(out, "discard_idx\t%u\n", ja->discard_idx);
+ prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
+ prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
+ prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
+ printbuf_indent_sub(out, 2);
}
rcu_read_unlock();
@@ -1527,25 +1529,18 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
pin_list = journal_seq_pin(j, *seq);
- prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count));
- prt_newline(out);
+ prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
printbuf_indent_add(out, 2);
for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
- list_for_each_entry(pin, &pin_list->list[i], list) {
- prt_printf(out, "\t%px %ps", pin, pin->flush);
- prt_newline(out);
- }
+ list_for_each_entry(pin, &pin_list->list[i], list)
+ prt_printf(out, "\t%px %ps\n", pin, pin->flush);
- if (!list_empty(&pin_list->flushed)) {
- prt_printf(out, "flushed:");
- prt_newline(out);
- }
+ if (!list_empty(&pin_list->flushed))
+ prt_printf(out, "flushed:\n");
- list_for_each_entry(pin, &pin_list->flushed, list) {
- prt_printf(out, "\t%px %ps", pin, pin->flush);
- prt_newline(out);
- }
+ list_for_each_entry(pin, &pin_list->flushed, list)
+ prt_printf(out, "\t%px %ps\n", pin, pin->flush);
printbuf_indent_sub(out, 2);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 7c7528f839c5..fd1f7cdaa8bc 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -372,7 +372,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re
int ret;
EBUG_ON(res->ref);
- EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+ EBUG_ON(!test_bit(JOURNAL_running, &j->flags));
res->u64s = u64s;
@@ -418,8 +418,8 @@ struct bch_dev;
static inline void bch2_journal_set_replay_done(struct journal *j)
{
- BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
- set_bit(JOURNAL_REPLAY_DONE, &j->flags);
+ BUG_ON(!test_bit(JOURNAL_running, &j->flags));
+ set_bit(JOURNAL_replay_done, &j->flags);
}
void bch2_journal_unblock(struct journal *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index eb1f9d6f5a19..cdcb1ad49af4 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -17,15 +17,38 @@
#include "sb-clean.h"
#include "trace.h"
+void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ for_each_member_device(c, ca) {
+ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+
+ m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx);
+ m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free);
+ }
+}
+
+void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
+{
+ mutex_lock(&c->sb_lock);
+ for_each_member_device(c, ca) {
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
+
+ unsigned idx = le32_to_cpu(m.last_journal_bucket);
+ if (idx < ca->journal.nr)
+ ca->journal.cur_idx = idx;
+ unsigned offset = le32_to_cpu(m.last_journal_bucket_offset);
+ if (offset <= ca->mi.bucket_size)
+ ca->journal.sectors_free = ca->mi.bucket_size - offset;
+ }
+ mutex_unlock(&c->sb_lock);
+}
+
void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct journal_replay *j)
{
darray_for_each(j->ptrs, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
- u64 offset;
-
- div64_u64_rem(i->sector, ca->mi.bucket_size, &offset);
-
if (i != j->ptrs.data)
prt_printf(out, " ");
prt_printf(out, "%u:%u:%u (sector %llu)",
@@ -122,6 +145,10 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
struct printbuf buf = PRINTBUF;
int ret = JOURNAL_ENTRY_ADD_OK;
+ if (!c->journal.oldest_seq_found_ondisk ||
+ le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
+ c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);
+
/* Is this entry older than the range we need? */
if (!c->opts.read_entire_journal &&
le64_to_cpu(j->seq) < jlist->last_seq)
@@ -272,7 +299,7 @@ static void journal_entry_err_msg(struct printbuf *out,
journal_entry_err_msg(&_buf, version, jset, entry); \
prt_printf(&_buf, msg, ##__VA_ARGS__); \
\
- switch (flags & BKEY_INVALID_WRITE) { \
+ switch (flags & BCH_VALIDATE_write) { \
case READ: \
mustfix_fsck_err(c, _err, "%s", _buf.buf); \
break; \
@@ -301,9 +328,9 @@ static int journal_validate_key(struct bch_fs *c,
unsigned level, enum btree_id btree_id,
struct bkey_i *k,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
- int write = flags & BKEY_INVALID_WRITE;
+ int write = flags & BCH_VALIDATE_write;
void *next = vstruct_next(entry);
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -376,7 +403,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
struct bkey_i *k = entry->start;
@@ -385,7 +412,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
entry->level,
entry->btree_id,
k, version, big_endian,
- flags|BKEY_INVALID_JOURNAL);
+ flags|BCH_VALIDATE_journal);
if (ret == FSCK_DELETED_KEY)
continue;
@@ -416,7 +443,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
struct bkey_i *k = entry->start;
int ret = 0;
@@ -455,7 +482,7 @@ static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
/* obsolete, don't care: */
return 0;
@@ -470,7 +497,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
int ret = 0;
@@ -497,7 +524,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
struct jset_entry_blacklist_v2 *bl_entry;
int ret = 0;
@@ -539,7 +566,7 @@ static int journal_entry_usage_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
@@ -573,7 +600,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
@@ -617,7 +644,7 @@ static int journal_entry_clock_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
@@ -657,13 +684,12 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
unsigned expected = sizeof(*u);
- unsigned dev;
int ret = 0;
if (journal_entry_err_on(bytes < expected,
@@ -675,16 +701,6 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
return ret;
}
- dev = le32_to_cpu(u->dev);
-
- if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
- c, version, jset, entry,
- journal_entry_dev_usage_bad_dev,
- "bad dev")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
if (journal_entry_err_on(u->pad,
c, version, jset, entry,
journal_entry_dev_usage_bad_pad,
@@ -719,7 +735,7 @@ static int journal_entry_log_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
return 0;
}
@@ -737,7 +753,7 @@ static int journal_entry_overwrite_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
return journal_entry_btree_keys_validate(c, jset, entry,
version, big_endian, READ);
@@ -753,7 +769,7 @@ static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
return journal_entry_btree_keys_validate(c, jset, entry,
version, big_endian, READ);
@@ -769,7 +785,7 @@ static int journal_entry_datetime_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
unsigned bytes = vstruct_bytes(entry);
unsigned expected = 16;
@@ -799,7 +815,7 @@ static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
- enum bkey_invalid_flags);
+ enum bch_validate_flags);
void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};
@@ -817,7 +833,7 @@ int bch2_journal_entry_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
return entry->type < BCH_JSET_ENTRY_NR
? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
@@ -837,7 +853,7 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
}
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
unsigned version = le32_to_cpu(jset->version);
int ret = 0;
@@ -863,7 +879,7 @@ fsck_err:
static int jset_validate(struct bch_fs *c,
struct bch_dev *ca,
struct jset *jset, u64 sector,
- enum bkey_invalid_flags flags)
+ enum bch_validate_flags flags)
{
unsigned version;
int ret = 0;
@@ -918,7 +934,7 @@ static int jset_validate_early(struct bch_fs *c,
{
size_t bytes = vstruct_bytes(jset);
unsigned version;
- enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
+ enum bch_validate_flags flags = BCH_VALIDATE_journal;
int ret = 0;
if (le64_to_cpu(jset->magic) != jset_magic(c))
@@ -1057,6 +1073,13 @@ reread:
goto err;
}
+ if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
+ ja->highest_seq_found = le64_to_cpu(j->seq);
+ ja->cur_idx = bucket;
+ ja->sectors_free = ca->mi.bucket_size -
+ bucket_remainder(ca, offset) - sectors;
+ }
+
/*
* This happens sometimes if we don't have discards on -
* when we've partially overwritten a bucket with new
@@ -1125,8 +1148,6 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
struct bch_fs *c = ca->fs;
struct journal_list *jlist =
container_of(cl->parent, struct journal_list, cl);
- struct journal_replay *r, **_r;
- struct genradix_iter iter;
struct journal_read_buf buf = { NULL, 0 };
unsigned i;
int ret = 0;
@@ -1146,47 +1167,6 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
goto err;
}
- ja->sectors_free = ca->mi.bucket_size;
-
- mutex_lock(&jlist->lock);
- genradix_for_each_reverse(&c->journal_entries, iter, _r) {
- r = *_r;
-
- if (!r)
- continue;
-
- darray_for_each(r->ptrs, i)
- if (i->dev == ca->dev_idx) {
- unsigned wrote = bucket_remainder(ca, i->sector) +
- vstruct_sectors(&r->j, c->block_bits);
-
- ja->cur_idx = i->bucket;
- ja->sectors_free = ca->mi.bucket_size - wrote;
- goto found;
- }
- }
-found:
- mutex_unlock(&jlist->lock);
-
- if (ja->bucket_seq[ja->cur_idx] &&
- ja->sectors_free == ca->mi.bucket_size) {
-#if 0
- /*
- * Debug code for ZNS support, where we (probably) want to be
- * correlated where we stopped in the journal to the zone write
- * points:
- */
- bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
- bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
- for (i = 0; i < 3; i++) {
- unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
-
- bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
- }
-#endif
- ja->sectors_free = 0;
- }
-
/*
* Set dirty_idx to indicate the entire journal is full and needs to be
* reclaimed - journal reclaim will immediately reclaim whatever isn't
@@ -1255,7 +1235,7 @@ int bch2_journal_read(struct bch_fs *c,
* those entries will be blacklisted:
*/
genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
- enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
+ enum bch_validate_flags flags = BCH_VALIDATE_journal;
i = *_i;
@@ -1366,7 +1346,7 @@ int bch2_journal_read(struct bch_fs *c,
fsck_err(c, journal_entries_missing,
"journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
" prev at %s\n"
- " next at %s",
+ " next at %s, continue?",
missing_start, missing_end,
*last_seq, *blacklist_seq - 1,
buf1.buf, buf2.buf);
@@ -1390,7 +1370,7 @@ int bch2_journal_read(struct bch_fs *c,
continue;
darray_for_each(i->ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
if (!ptr->csum_good)
bch_err_dev_offset(ca, ptr->sector,
@@ -1400,7 +1380,7 @@ int bch2_journal_read(struct bch_fs *c,
}
ret = jset_validate(c,
- bch_dev_bkey_exists(c, i->ptrs.data[0].dev),
+ bch2_dev_have_ref(c, i->ptrs.data[0].dev),
&i->j,
i->ptrs.data[0].sector,
READ);
@@ -1731,10 +1711,8 @@ static CLOSURE_CALLBACK(journal_write_submit)
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct journal_device *ja = &ca->journal;
-
- if (!percpu_ref_tryget(&ca->io_ref)) {
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
+ if (!ca) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
continue;
@@ -1743,6 +1721,7 @@ static CLOSURE_CALLBACK(journal_write_submit)
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
sectors);
+ struct journal_device *ja = &ca->journal;
struct bio *bio = &ja->bio[w->idx]->bio;
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
@@ -1958,14 +1937,14 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
* So if we're in an error state, and we're still starting up, we don't
* write anything at all.
*/
- if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
+ if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
return -EIO;
if (error ||
w->noflush ||
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
- test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
+ test_bit(JOURNAL_may_skip_flush, &j->flags))) {
w->noflush = true;
SET_JSET_NO_FLUSH(w->data, true);
w->data->last_seq = 0;
@@ -1976,7 +1955,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
w->must_flush = true;
j->last_flush_write = jiffies;
j->nr_flush_writes++;
- clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
+ clear_bit(JOURNAL_need_flush_write, &j->flags);
}
return 0;
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 4f1e763ab506..2ca9cde30ea8 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -4,6 +4,9 @@
#include "darray.h"
+void bch2_journal_pos_from_member_info_set(struct bch_fs *);
+void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
+
struct journal_ptr {
bool csum_good;
u8 dev;
@@ -60,7 +63,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
- enum bkey_invalid_flags);
+ enum bch_validate_flags);
void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
struct jset_entry *);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 04a577848b01..79be0eaddfa0 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -67,7 +67,7 @@ void bch2_journal_set_watermark(struct journal *j)
track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
trace_and_count(c, journal_full, c);
- mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || low_on_pin);
+ mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin);
swap(watermark, j->watermark);
if (watermark > j->watermark)
@@ -225,9 +225,9 @@ void bch2_journal_space_available(struct journal *j)
j->space[journal_space_clean_ondisk].total) &&
(clean - clean_ondisk <= total / 8) &&
(clean_ondisk * 2 > clean))
- set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+ set_bit(JOURNAL_may_skip_flush, &j->flags);
else
- clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+ clear_bit(JOURNAL_may_skip_flush, &j->flags);
bch2_journal_set_watermark(j);
out:
@@ -818,7 +818,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
* If journal replay hasn't completed, the unreplayed journal entries
* hold refs on their corresponding sequence numbers
*/
- ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
+ ret = !test_bit(JOURNAL_replay_done, &j->flags) ||
journal_last_seq(j) > seq_to_flush ||
!fifo_used(&j->pin);
@@ -833,7 +833,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
/* time_stats this */
bool did_work = false;
- if (!test_bit(JOURNAL_STARTED, &j->flags))
+ if (!test_bit(JOURNAL_running, &j->flags))
return false;
closure_wait_event(&j->async_wait,
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
index ae4fb8c3a2bc..db80e506e3ab 100644
--- a/fs/bcachefs/journal_sb.c
+++ b/fs/bcachefs/journal_sb.c
@@ -16,9 +16,8 @@ static int u64_cmp(const void *_l, const void *_r)
return cmp_int(*l, *r);
}
-static int bch2_sb_journal_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_journal *journal = field_to_type(f, journal);
struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
@@ -99,9 +98,8 @@ static int u64_range_cmp(const void *_l, const void *_r)
return cmp_int(l->start, r->start);
}
-static int bch2_sb_journal_v2_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index 37a024e034d4..ed4846709611 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "btree_iter.h"
#include "eytzinger.h"
+#include "journal.h"
#include "journal_seq_blacklist.h"
#include "super-io.h"
@@ -162,9 +162,8 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
return 0;
}
-static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_journal_seq_blacklist *bl =
field_to_type(f, journal_seq_blacklist);
@@ -217,78 +216,40 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
.to_text = bch2_sb_journal_seq_blacklist_to_text
};
-void bch2_blacklist_entries_gc(struct work_struct *work)
+bool bch2_blacklist_entries_gc(struct bch_fs *c)
{
- struct bch_fs *c = container_of(work, struct bch_fs,
- journal_seq_blacklist_gc_work);
- struct journal_seq_blacklist_table *t;
- struct bch_sb_field_journal_seq_blacklist *bl;
struct journal_seq_blacklist_entry *src, *dst;
- struct btree_trans *trans = bch2_trans_get(c);
- unsigned i, nr, new_nr;
- int ret;
-
- for (i = 0; i < BTREE_ID_NR; i++) {
- struct btree_iter iter;
- struct btree *b;
-
- bch2_trans_node_iter_init(trans, &iter, i, POS_MIN,
- 0, 0, BTREE_ITER_PREFETCH);
-retry:
- bch2_trans_begin(trans);
-
- b = bch2_btree_iter_peek_node(&iter);
-
- while (!(ret = PTR_ERR_OR_ZERO(b)) &&
- b &&
- !test_bit(BCH_FS_stopping, &c->flags))
- b = bch2_btree_iter_next_node(&iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- bch2_trans_iter_exit(trans, &iter);
- }
-
- bch2_trans_put(trans);
- if (ret)
- return;
-
- mutex_lock(&c->sb_lock);
- bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
+ struct bch_sb_field_journal_seq_blacklist *bl =
+ bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
if (!bl)
- goto out;
+ return false;
- nr = blacklist_nr_entries(bl);
+ unsigned nr = blacklist_nr_entries(bl);
dst = bl->start;
- t = c->journal_seq_blacklist_table;
+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
BUG_ON(nr != t->nr);
+ unsigned i;
for (src = bl->start, i = eytzinger0_first(t->nr);
src < bl->start + nr;
src++, i = eytzinger0_next(i, nr)) {
BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
- if (t->entries[i].dirty)
+ if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk)
*dst++ = *src;
}
- new_nr = dst - bl->start;
-
- bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr);
-
- if (new_nr != nr) {
- bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
- new_nr ? sb_blacklist_u64s(new_nr) : 0);
- BUG_ON(new_nr && !bl);
+ unsigned new_nr = dst - bl->start;
+ if (new_nr == nr)
+ return false;
- if (!new_nr)
- c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3));
+ bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr);
- bch2_write_super(c);
- }
-out:
- mutex_unlock(&c->sb_lock);
+ bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+ new_nr ? sb_blacklist_u64s(new_nr) : 0);
+ BUG_ON(new_nr && !bl);
+ return true;
}
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
index afb886ec8e25..d47636f96fdc 100644
--- a/fs/bcachefs/journal_seq_blacklist.h
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -17,6 +17,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
-void bch2_blacklist_entries_gc(struct work_struct *);
+bool bch2_blacklist_entries_gc(struct bch_fs *);
#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index b5161b5d76a0..19183fcf7ad7 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -129,12 +129,17 @@ enum journal_space_from {
journal_space_nr,
};
+#define JOURNAL_FLAGS() \
+ x(replay_done) \
+ x(running) \
+ x(may_skip_flush) \
+ x(need_flush_write) \
+ x(space_low)
+
enum journal_flags {
- JOURNAL_REPLAY_DONE,
- JOURNAL_STARTED,
- JOURNAL_MAY_SKIP_FLUSH,
- JOURNAL_NEED_FLUSH_WRITE,
- JOURNAL_SPACE_LOW,
+#define x(n) JOURNAL_##n,
+ JOURNAL_FLAGS()
+#undef x
};
/* Reasons we may fail to get a journal reservation: */
@@ -229,6 +234,7 @@ struct journal {
u64 last_seq_ondisk;
u64 err_seq;
u64 last_empty_seq;
+ u64 oldest_seq_found_ondisk;
/*
* FIFO of journal entries whose btree updates have not yet been
@@ -326,6 +332,7 @@ struct journal_device {
/* for bch_journal_read_device */
struct closure read;
+ u64 highest_seq_found;
};
/*
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index b82f8209041f..f49fdca1d07d 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -56,7 +56,7 @@ int bch2_resume_logged_ops(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter,
BTREE_ID_logged_ops, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
resume_logged_op(trans, &iter, k)));
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index 26569043e368..a40d116224ed 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -11,7 +11,7 @@
/* KEY_TYPE_lru is obsolete: */
int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
int ret = 0;
@@ -149,7 +149,7 @@ int bch2_check_lrus(struct bch_fs *c)
struct bpos last_flushed_pos = POS_MIN;
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
- BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
+ BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
bch_err_fn(c, ret);
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index 429dca816df5..fb11ab0dd00e 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -49,7 +49,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l)
}
int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 69098eeb5d48..ddc187fb693d 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -49,7 +49,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
if (!bch2_bkey_has_device_c(k, dev_idx))
return 0;
- n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node);
ret = PTR_ERR_OR_ZERO(n);
if (ret)
return ret;
@@ -67,7 +67,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
/*
* Since we're not inserting through an extent iterator
- * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
* we aren't using the extent overwrite path to delete, we're
* just using the normal key deletion path:
*/
@@ -87,7 +87,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
continue;
ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
if (ret)
@@ -119,7 +119,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
for (id = 0; id < BTREE_ID_NR; id++) {
bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
retry:
ret = 0;
while (bch2_trans_begin(trans),
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 4d94b7742dbb..8171f947fac8 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -41,28 +41,23 @@ static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c
struct data_update_opts *data_opts)
{
printbuf_tabstop_push(out, 20);
- prt_str(out, "rewrite ptrs:");
- prt_tab(out);
+ prt_str(out, "rewrite ptrs:\t");
bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
prt_newline(out);
- prt_str(out, "kill ptrs: ");
- prt_tab(out);
+ prt_str(out, "kill ptrs:\t");
bch2_prt_u64_base2(out, data_opts->kill_ptrs);
prt_newline(out);
- prt_str(out, "target: ");
- prt_tab(out);
+ prt_str(out, "target:\t");
bch2_target_to_text(out, c, data_opts->target);
prt_newline(out);
- prt_str(out, "compression: ");
- prt_tab(out);
+ prt_str(out, "compression:\t");
bch2_compression_opt_to_text(out, background_compression(*io_opts));
prt_newline(out);
- prt_str(out, "extra replicas: ");
- prt_tab(out);
+ prt_str(out, "extra replicas:\t");
prt_u64(out, data_opts->extra_replicas);
}
@@ -421,7 +416,7 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
io_opts->d.nr = 0;
ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_all_snapshots, k, ({
if (k.k->p.offset != extent_k.k->p.inode)
break;
@@ -467,7 +462,7 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans,
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
- BTREE_ITER_CACHED);
+ BTREE_ITER_cached);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
@@ -553,8 +548,8 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
}
bch2_trans_iter_init(trans, &iter, btree_id, start,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots);
if (ctxt->rate)
bch2_ratelimit_reset(ctxt->rate);
@@ -695,6 +690,10 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
struct bpos bp_pos = POS_MIN;
int ret = 0;
+ struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
+ if (!ca)
+ return 0;
+
trace_bucket_evacuate(c, &bucket);
bch2_bkey_buf_init(&sk);
@@ -705,7 +704,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
bch2_trans_begin(trans);
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- bucket, BTREE_ITER_CACHED);
+ bucket, BTREE_ITER_cached);
ret = lockrestart_do(trans,
bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
bch2_trans_iter_exit(trans, &iter);
@@ -716,7 +715,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
a = bch2_alloc_to_v4(k, &a_convert);
dirty_sectors = bch2_bucket_sectors_dirty(*a);
- bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
+ bucket_size = ca->mi.bucket_size;
fragmentation = a->fragmentation_lru;
ret = bch2_btree_write_buffer_tryflush(trans);
@@ -730,9 +729,9 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
bch2_trans_begin(trans);
- ret = bch2_get_next_backpointer(trans, bucket, gen,
+ ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
&bp_pos, &bp,
- BTREE_ITER_CACHED);
+ BTREE_ITER_cached);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
@@ -828,6 +827,7 @@ next:
trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
err:
+ bch2_dev_put(ca);
bch2_bkey_buf_exit(&sk, c);
return ret;
}
@@ -868,7 +868,7 @@ static int bch2_move_btree(struct bch_fs *c,
continue;
bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_prefetch);
retry:
ret = 0;
while (bch2_trans_begin(trans),
@@ -975,26 +975,10 @@ static bool migrate_btree_pred(struct bch_fs *c, void *arg,
*/
static bool bformat_needs_redo(struct bkey_format *f)
{
- for (unsigned i = 0; i < f->nr_fields; i++) {
- unsigned f_bits = f->bits_per_field[i];
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
- u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
- if (f_bits > unpacked_bits)
- return true;
-
- if ((f_bits == unpacked_bits) && field_offset)
+ for (unsigned i = 0; i < f->nr_fields; i++)
+ if (bch2_bkey_format_field_overflows(f, i))
return true;
- u64 f_mask = f_bits
- ? ~((~0ULL << (f_bits - 1)) << 1)
- : 0;
-
- if (((field_offset + f_mask) & unpacked_mask) < field_offset)
- return true;
- }
-
return false;
}
@@ -1049,6 +1033,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
struct extent_ptr_decoded p;
unsigned i = 0;
+ rcu_read_lock();
bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
unsigned d = bch2_extent_ptr_durability(c, &p);
@@ -1059,6 +1044,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
i++;
}
+ rcu_read_unlock();
return data_opts->kill_ptrs != 0;
}
@@ -1143,23 +1129,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_str(out, "keys moved: ");
- prt_u64(out, atomic64_read(&stats->keys_moved));
- prt_newline(out);
-
- prt_str(out, "keys raced: ");
- prt_u64(out, atomic64_read(&stats->keys_raced));
- prt_newline(out);
-
- prt_str(out, "bytes seen: ");
+ prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved));
+ prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced));
+ prt_printf(out, "bytes seen: ");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
prt_newline(out);
- prt_str(out, "bytes moved: ");
+ prt_printf(out, "bytes moved: ");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
prt_newline(out);
- prt_str(out, "bytes raced: ");
+ prt_printf(out, "bytes raced: ");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
prt_newline(out);
@@ -1173,19 +1153,17 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str
bch2_move_stats_to_text(out, ctxt->stats);
printbuf_indent_add(out, 2);
- prt_printf(out, "reads: ios %u/%u sectors %u/%u",
+ prt_printf(out, "reads: ios %u/%u sectors %u/%u\n",
atomic_read(&ctxt->read_ios),
c->opts.move_ios_in_flight,
atomic_read(&ctxt->read_sectors),
c->opts.move_bytes_in_flight >> 9);
- prt_newline(out);
- prt_printf(out, "writes: ios %u/%u sectors %u/%u",
+ prt_printf(out, "writes: ios %u/%u sectors %u/%u\n",
atomic_read(&ctxt->write_ios),
c->opts.move_ios_in_flight,
atomic_read(&ctxt->write_sectors),
c->opts.move_bytes_in_flight >> 9);
- prt_newline(out);
printbuf_indent_add(out, 2);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 0d2b82d8d11f..10bfb31c151b 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -84,7 +84,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
return 0;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
- b->k.bucket, BTREE_ITER_CACHED);
+ b->k.bucket, BTREE_ITER_cached);
ret = bkey_err(k);
if (ret)
return ret;
@@ -158,6 +158,8 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
return ret;
+ bch2_trans_begin(trans);
+
ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 84e452835a17..25530e0bb2f3 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -426,11 +426,6 @@ enum fsck_err_opts {
BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \
NULL, "Set superblock to latest version,\n" \
"allowing any new features to be used") \
- x(buckets_nouse, u8, \
- 0, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Allocate the buckets_nouse bitmap") \
x(stdio, u64, \
0, \
OPT_UINT(0, S64_MAX), \
@@ -480,7 +475,7 @@ enum fsck_err_opts {
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
- NULL, "BTREE_ITER_PREFETCH casuse btree nodes to be\n"\
+ NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\
" prefetched sequentially")
struct bch_opts {
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
index b27d22925929..8b0369185f5c 100644
--- a/fs/bcachefs/printbuf.c
+++ b/fs/bcachefs/printbuf.c
@@ -10,35 +10,50 @@
#include "printbuf.h"
-static inline unsigned printbuf_linelen(struct printbuf *buf)
+static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos)
{
- return buf->pos - buf->last_newline;
+ return pos - buf->last_newline;
}
-int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
+static inline unsigned printbuf_linelen(struct printbuf *buf)
{
- unsigned new_size;
- char *buf;
+ return __printbuf_linelen(buf, buf->pos);
+}
- if (!out->heap_allocated)
- return 0;
+/*
+ * Returns spaces from start of line, if set, or 0 if unset:
+ */
+static inline unsigned cur_tabstop(struct printbuf *buf)
+{
+ return buf->cur_tabstop < buf->nr_tabstops
+ ? buf->_tabstops[buf->cur_tabstop]
+ : 0;
+}
+int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
+{
/* Reserved space for terminating nul: */
extra += 1;
- if (out->pos + extra < out->size)
+ if (out->pos + extra <= out->size)
return 0;
- new_size = roundup_pow_of_two(out->size + extra);
+ if (!out->heap_allocated) {
+ out->overflow = true;
+ return 0;
+ }
+
+ unsigned new_size = roundup_pow_of_two(out->size + extra);
/*
* Note: output buffer must be freeable with kfree(), it's not required
* that the user use printbuf_exit().
*/
- buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
+ char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
if (!buf) {
out->allocation_failure = true;
+ out->overflow = true;
return -ENOMEM;
}
@@ -47,6 +62,92 @@ int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
return 0;
}
+static void printbuf_advance_pos(struct printbuf *out, unsigned len)
+{
+ out->pos += min(len, printbuf_remaining(out));
+}
+
+static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr)
+{
+ unsigned move = out->pos - pos;
+
+ bch2_printbuf_make_room(out, nr);
+
+ if (pos + nr < out->size)
+ memmove(out->buf + pos + nr,
+ out->buf + pos,
+ min(move, out->size - 1 - pos - nr));
+
+ if (pos < out->size)
+ memset(out->buf + pos, ' ', min(nr, out->size - pos));
+
+ printbuf_advance_pos(out, nr);
+ printbuf_nul_terminate_reserved(out);
+}
+
+static void __printbuf_do_indent(struct printbuf *out, unsigned pos)
+{
+ while (true) {
+ int pad;
+ unsigned len = out->pos - pos;
+ char *p = out->buf + pos;
+ char *n = memscan(p, '\n', len);
+ if (cur_tabstop(out)) {
+ n = min(n, (char *) memscan(p, '\r', len));
+ n = min(n, (char *) memscan(p, '\t', len));
+ }
+
+ pos = n - out->buf;
+ if (pos == out->pos)
+ break;
+
+ switch (*n) {
+ case '\n':
+ pos++;
+ out->last_newline = pos;
+
+ printbuf_insert_spaces(out, pos, out->indent);
+
+ pos = min(pos + out->indent, out->pos);
+ out->last_field = pos;
+ out->cur_tabstop = 0;
+ break;
+ case '\r':
+ memmove(n, n + 1, out->pos - pos);
+ --out->pos;
+ pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos);
+ if (pad > 0) {
+ printbuf_insert_spaces(out, out->last_field, pad);
+ pos += pad;
+ }
+
+ out->last_field = pos;
+ out->cur_tabstop++;
+ break;
+ case '\t':
+ pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1;
+ if (pad > 0) {
+ *n = ' ';
+ printbuf_insert_spaces(out, pos, pad - 1);
+ pos += pad;
+ } else {
+ memmove(n, n + 1, out->pos - pos);
+ --out->pos;
+ }
+
+ out->last_field = pos;
+ out->cur_tabstop++;
+ break;
+ }
+ }
+}
+
+static inline void printbuf_do_indent(struct printbuf *out, unsigned pos)
+{
+ if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling)
+ __printbuf_do_indent(out, pos);
+}
+
void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
{
int len;
@@ -55,14 +156,14 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
va_list args2;
va_copy(args2, args);
- len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2);
va_end(args2);
- } while (len + 1 >= printbuf_remaining(out) &&
- !bch2_printbuf_make_room(out, len + 1));
+ } while (len > printbuf_remaining(out) &&
+ !bch2_printbuf_make_room(out, len));
- len = min_t(size_t, len,
- printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
- out->pos += len;
+ unsigned indent_pos = out->pos;
+ printbuf_advance_pos(out, len);
+ printbuf_do_indent(out, indent_pos);
}
void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
@@ -72,14 +173,14 @@ void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
do {
va_start(args, fmt);
- len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args);
va_end(args);
- } while (len + 1 >= printbuf_remaining(out) &&
- !bch2_printbuf_make_room(out, len + 1));
+ } while (len > printbuf_remaining(out) &&
+ !bch2_printbuf_make_room(out, len));
- len = min_t(size_t, len,
- printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
- out->pos += len;
+ unsigned indent_pos = out->pos;
+ printbuf_advance_pos(out, len);
+ printbuf_do_indent(out, indent_pos);
}
/**
@@ -194,33 +295,20 @@ void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
void bch2_prt_newline(struct printbuf *buf)
{
- unsigned i;
-
bch2_printbuf_make_room(buf, 1 + buf->indent);
- __prt_char(buf, '\n');
+ __prt_char_reserved(buf, '\n');
buf->last_newline = buf->pos;
- for (i = 0; i < buf->indent; i++)
- __prt_char(buf, ' ');
+ __prt_chars_reserved(buf, ' ', buf->indent);
- printbuf_nul_terminate(buf);
+ printbuf_nul_terminate_reserved(buf);
buf->last_field = buf->pos;
buf->cur_tabstop = 0;
}
-/*
- * Returns spaces from start of line, if set, or 0 if unset:
- */
-static inline unsigned cur_tabstop(struct printbuf *buf)
-{
- return buf->cur_tabstop < buf->nr_tabstops
- ? buf->_tabstops[buf->cur_tabstop]
- : 0;
-}
-
static void __prt_tab(struct printbuf *out)
{
int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out));
@@ -247,24 +335,9 @@ void bch2_prt_tab(struct printbuf *out)
static void __prt_tab_rjust(struct printbuf *buf)
{
- unsigned move = buf->pos - buf->last_field;
int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf);
-
- if (pad > 0) {
- bch2_printbuf_make_room(buf, pad);
-
- if (buf->last_field + pad < buf->size)
- memmove(buf->buf + buf->last_field + pad,
- buf->buf + buf->last_field,
- min(move, buf->size - 1 - buf->last_field - pad));
-
- if (buf->last_field < buf->size)
- memset(buf->buf + buf->last_field, ' ',
- min((unsigned) pad, buf->size - buf->last_field));
-
- buf->pos += pad;
- printbuf_nul_terminate(buf);
- }
+ if (pad > 0)
+ printbuf_insert_spaces(buf, buf->last_field, pad);
buf->last_field = buf->pos;
buf->cur_tabstop++;
@@ -301,41 +374,9 @@ void bch2_prt_tab_rjust(struct printbuf *buf)
*/
void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
{
- const char *unprinted_start = str;
- const char *end = str + count;
-
- if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) {
- prt_bytes(out, str, count);
- return;
- }
-
- while (str != end) {
- switch (*str) {
- case '\n':
- prt_bytes(out, unprinted_start, str - unprinted_start);
- unprinted_start = str + 1;
- bch2_prt_newline(out);
- break;
- case '\t':
- if (likely(cur_tabstop(out))) {
- prt_bytes(out, unprinted_start, str - unprinted_start);
- unprinted_start = str + 1;
- __prt_tab(out);
- }
- break;
- case '\r':
- if (likely(cur_tabstop(out))) {
- prt_bytes(out, unprinted_start, str - unprinted_start);
- unprinted_start = str + 1;
- __prt_tab_rjust(out);
- }
- break;
- }
-
- str++;
- }
-
- prt_bytes(out, unprinted_start, str - unprinted_start);
+ unsigned indent_pos = out->pos;
+ prt_bytes(out, str, count);
+ printbuf_do_indent(out, indent_pos);
}
/**
@@ -348,9 +389,10 @@ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned cou
void bch2_prt_human_readable_u64(struct printbuf *out, u64 v)
{
bch2_printbuf_make_room(out, 10);
- out->pos += string_get_size(v, 1, !out->si_units,
- out->buf + out->pos,
- printbuf_remaining_size(out));
+ unsigned len = string_get_size(v, 1, !out->si_units,
+ out->buf + out->pos,
+ printbuf_remaining_size(out));
+ printbuf_advance_pos(out, len);
}
/**
@@ -402,9 +444,7 @@ void bch2_prt_string_option(struct printbuf *out,
const char * const list[],
size_t selected)
{
- size_t i;
-
- for (i = 0; list[i]; i++)
+ for (size_t i = 0; list[i]; i++)
bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]);
}
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
index 9a4a56c40937..9ecc56bc9635 100644
--- a/fs/bcachefs/printbuf.h
+++ b/fs/bcachefs/printbuf.h
@@ -86,6 +86,7 @@ struct printbuf {
u8 atomic;
bool allocation_failure:1;
bool heap_allocated:1;
+ bool overflow:1;
enum printbuf_si si_units:1;
bool human_readable_units:1;
bool has_indent_or_tabstops:1;
@@ -142,7 +143,9 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[],
*/
static inline unsigned printbuf_remaining_size(struct printbuf *out)
{
- return out->pos < out->size ? out->size - out->pos : 0;
+ if (WARN_ON(out->size && out->pos >= out->size))
+ out->pos = out->size - 1;
+ return out->size - out->pos;
}
/*
@@ -151,7 +154,7 @@ static inline unsigned printbuf_remaining_size(struct printbuf *out)
*/
static inline unsigned printbuf_remaining(struct printbuf *out)
{
- return out->pos < out->size ? out->size - out->pos - 1 : 0;
+ return out->size ? printbuf_remaining_size(out) - 1 : 0;
}
static inline unsigned printbuf_written(struct printbuf *out)
@@ -159,30 +162,25 @@ static inline unsigned printbuf_written(struct printbuf *out)
return out->size ? min(out->pos, out->size - 1) : 0;
}
-/*
- * Returns true if output was truncated:
- */
-static inline bool printbuf_overflowed(struct printbuf *out)
+static inline void printbuf_nul_terminate_reserved(struct printbuf *out)
{
- return out->pos >= out->size;
+ if (WARN_ON(out->size && out->pos >= out->size))
+ out->pos = out->size - 1;
+ if (out->size)
+ out->buf[out->pos] = 0;
}
static inline void printbuf_nul_terminate(struct printbuf *out)
{
bch2_printbuf_make_room(out, 1);
-
- if (out->pos < out->size)
- out->buf[out->pos] = 0;
- else if (out->size)
- out->buf[out->size - 1] = 0;
+ printbuf_nul_terminate_reserved(out);
}
/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */
static inline void __prt_char_reserved(struct printbuf *out, char c)
{
if (printbuf_remaining(out))
- out->buf[out->pos] = c;
- out->pos++;
+ out->buf[out->pos++] = c;
}
/* Doesn't nul terminate: */
@@ -194,37 +192,34 @@ static inline void __prt_char(struct printbuf *out, char c)
static inline void prt_char(struct printbuf *out, char c)
{
- __prt_char(out, c);
- printbuf_nul_terminate(out);
+ bch2_printbuf_make_room(out, 2);
+ __prt_char_reserved(out, c);
+ printbuf_nul_terminate_reserved(out);
}
static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n)
{
- unsigned i, can_print = min(n, printbuf_remaining(out));
+ unsigned can_print = min(n, printbuf_remaining(out));
- for (i = 0; i < can_print; i++)
+ for (unsigned i = 0; i < can_print; i++)
out->buf[out->pos++] = c;
- out->pos += n - can_print;
}
static inline void prt_chars(struct printbuf *out, char c, unsigned n)
{
bch2_printbuf_make_room(out, n);
__prt_chars_reserved(out, c, n);
- printbuf_nul_terminate(out);
+ printbuf_nul_terminate_reserved(out);
}
static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n)
{
- unsigned i, can_print;
-
bch2_printbuf_make_room(out, n);
- can_print = min(n, printbuf_remaining(out));
+ unsigned can_print = min(n, printbuf_remaining(out));
- for (i = 0; i < can_print; i++)
+ for (unsigned i = 0; i < can_print; i++)
out->buf[out->pos++] = ((char *) b)[i];
- out->pos += n - can_print;
printbuf_nul_terminate(out);
}
@@ -241,18 +236,18 @@ static inline void prt_str_indented(struct printbuf *out, const char *str)
static inline void prt_hex_byte(struct printbuf *out, u8 byte)
{
- bch2_printbuf_make_room(out, 2);
+ bch2_printbuf_make_room(out, 3);
__prt_char_reserved(out, hex_asc_hi(byte));
__prt_char_reserved(out, hex_asc_lo(byte));
- printbuf_nul_terminate(out);
+ printbuf_nul_terminate_reserved(out);
}
static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
{
- bch2_printbuf_make_room(out, 2);
+ bch2_printbuf_make_room(out, 3);
__prt_char_reserved(out, hex_asc_upper_hi(byte));
__prt_char_reserved(out, hex_asc_upper_lo(byte));
- printbuf_nul_terminate(out);
+ printbuf_nul_terminate_reserved(out);
}
/**
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index 556da0738106..a0cca8b70e0a 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -20,7 +20,7 @@ static const char * const bch2_quota_counters[] = {
};
static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_quota *q = field_to_type(f, quota);
@@ -60,8 +60,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = {
};
int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
int ret = 0;
@@ -97,45 +96,14 @@ static void qc_info_to_text(struct printbuf *out, struct qc_info *i)
printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, 20);
- prt_str(out, "i_fieldmask");
- prt_tab(out);
- prt_printf(out, "%x", i->i_fieldmask);
- prt_newline(out);
-
- prt_str(out, "i_flags");
- prt_tab(out);
- prt_printf(out, "%u", i->i_flags);
- prt_newline(out);
-
- prt_str(out, "i_spc_timelimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_spc_timelimit);
- prt_newline(out);
-
- prt_str(out, "i_ino_timelimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_ino_timelimit);
- prt_newline(out);
-
- prt_str(out, "i_rt_spc_timelimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_rt_spc_timelimit);
- prt_newline(out);
-
- prt_str(out, "i_spc_warnlimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_spc_warnlimit);
- prt_newline(out);
-
- prt_str(out, "i_ino_warnlimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_ino_warnlimit);
- prt_newline(out);
-
- prt_str(out, "i_rt_spc_warnlimit");
- prt_tab(out);
- prt_printf(out, "%u", i->i_rt_spc_warnlimit);
- prt_newline(out);
+ prt_printf(out, "i_fieldmask\t%x\n", i->i_fieldmask);
+ prt_printf(out, "i_flags\t%u\n", i->i_flags);
+ prt_printf(out, "i_spc_timelimit\t%u\n", i->i_spc_timelimit);
+ prt_printf(out, "i_ino_timelimit\t%u\n", i->i_ino_timelimit);
+ prt_printf(out, "i_rt_spc_timelimit\t%u\n", i->i_rt_spc_timelimit);
+ prt_printf(out, "i_spc_warnlimit\t%u\n", i->i_spc_warnlimit);
+ prt_printf(out, "i_ino_warnlimit\t%u\n", i->i_ino_warnlimit);
+ prt_printf(out, "i_rt_spc_warnlimit\t%u\n", i->i_rt_spc_warnlimit);
}
static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
@@ -143,60 +111,17 @@ static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, 20);
- prt_str(out, "d_fieldmask");
- prt_tab(out);
- prt_printf(out, "%x", q->d_fieldmask);
- prt_newline(out);
-
- prt_str(out, "d_spc_hardlimit");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_spc_hardlimit);
- prt_newline(out);
-
- prt_str(out, "d_spc_softlimit");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_spc_softlimit);
- prt_newline(out);
-
- prt_str(out, "d_ino_hardlimit");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_ino_hardlimit);
- prt_newline(out);
-
- prt_str(out, "d_ino_softlimit");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_ino_softlimit);
- prt_newline(out);
-
- prt_str(out, "d_space");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_space);
- prt_newline(out);
-
- prt_str(out, "d_ino_count");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_ino_count);
- prt_newline(out);
-
- prt_str(out, "d_ino_timer");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_ino_timer);
- prt_newline(out);
-
- prt_str(out, "d_spc_timer");
- prt_tab(out);
- prt_printf(out, "%llu", q->d_spc_timer);
- prt_newline(out);
-
- prt_str(out, "d_ino_warns");
- prt_tab(out);
- prt_printf(out, "%i", q->d_ino_warns);
- prt_newline(out);
-
- prt_str(out, "d_spc_warns");
- prt_tab(out);
- prt_printf(out, "%i", q->d_spc_warns);
- prt_newline(out);
+ prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask);
+ prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit);
+ prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit);
+ prt_printf(out, "d_ino_hardlimit\%llu\n", q->d_ino_hardlimit);
+ prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit);
+ prt_printf(out, "d_space\t%llu\n", q->d_space);
+ prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count);
+ prt_printf(out, "d_ino_timer\t%llu\n", q->d_ino_timer);
+ prt_printf(out, "d_spc_timer\t%llu\n", q->d_spc_timer);
+ prt_printf(out, "d_ino_warns\t%i\n", q->d_ino_warns);
+ prt_printf(out, "d_spc_warns\t%i\n", q->d_spc_warns);
}
static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
@@ -610,10 +535,10 @@ int bch2_fs_quota_read(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
__bch2_quota_set(c, k, NULL)) ?:
for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
bch2_fs_quota_read_inode(trans, &iter, k)));
bch_err_fn(c, ret);
return ret;
@@ -900,7 +825,7 @@ static int bch2_set_quota_trans(struct btree_trans *trans,
int ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ BTREE_ITER_slots|BTREE_ITER_intent);
ret = bkey_err(k);
if (unlikely(ret))
return ret;
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
index 884f601f41c4..02d37a332218 100644
--- a/fs/bcachefs/quota.h
+++ b/fs/bcachefs/quota.h
@@ -5,11 +5,11 @@
#include "inode.h"
#include "quota_types.h"
-enum bkey_invalid_flags;
+enum bch_validate_flags;
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_quota ((struct bkey_ops) { \
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 56336f3dd1d0..cf81e5128c3a 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -42,7 +42,7 @@ static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
@@ -89,7 +89,7 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum,
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
@@ -140,7 +140,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
bch2_trans_iter_init(trans, extent_iter,
work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
work_pos,
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_all_snapshots);
k = bch2_btree_iter_peek_slot(extent_iter);
if (bkey_err(k))
return k;
@@ -323,12 +323,14 @@ static int do_rebalance(struct moving_context *ctxt)
struct bkey_s_c k;
int ret = 0;
+ bch2_trans_begin(trans);
+
bch2_move_stats_init(&r->work_stats, "rebalance_work");
bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
bch2_trans_iter_init(trans, &rebalance_work_iter,
BTREE_ID_rebalance_work, POS_MIN,
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_all_snapshots);
while (!bch2_move_ratelimit(ctxt)) {
if (!r->enabled) {
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 8091d0686029..1266916ac03f 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -65,9 +65,20 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
+
+ __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent);
+
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent);
+
__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
@@ -125,9 +136,9 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
{
struct btree_iter iter;
unsigned iter_flags =
- BTREE_ITER_INTENT|
- BTREE_ITER_NOT_EXTENTS;
- unsigned update_flags = BTREE_TRIGGER_NORUN;
+ BTREE_ITER_intent|
+ BTREE_ITER_not_extents;
+ unsigned update_flags = BTREE_TRIGGER_norun;
int ret;
if (k->overwritten)
@@ -136,17 +147,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
trans->journal_res.seq = k->journal_seq;
/*
- * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
+ * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to
* keep the key cache coherent with the underlying btree. Nothing
* besides the allocator is doing updates yet so we don't need key cache
* coherency for non-alloc btrees, and key cache fills for snapshots
- * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until
+ * btrees use BTREE_ITER_filter_snapshots, which isn't available until
* the snapshots recovery pass runs.
*/
if (!k->level && k->btree_id == BTREE_ID_alloc)
- iter_flags |= BTREE_ITER_CACHED;
+ iter_flags |= BTREE_ITER_cached;
else
- update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM;
+ update_flags |= BTREE_UPDATE_key_cache_reclaim;
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, k->level,
@@ -191,7 +202,7 @@ int bch2_journal_replay(struct bch_fs *c)
struct journal *j = &c->journal;
u64 start_seq = c->journal_replay_seq_start;
u64 end_seq = c->journal_replay_seq_start;
- struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_trans *trans = NULL;
bool immediate_flush = false;
int ret = 0;
@@ -205,6 +216,7 @@ int bch2_journal_replay(struct bch_fs *c)
BUG_ON(!atomic_read(&keys->ref));
move_gap(keys, keys->nr);
+ trans = bch2_trans_get(c);
/*
* First, attempt to replay keys in sorted order. This is more
@@ -361,14 +373,17 @@ static int journal_replay_entry_early(struct bch_fs *c,
case BCH_JSET_ENTRY_dev_usage: {
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
- struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
- unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
-
- for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
- ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
- ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
- ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
- }
+ unsigned nr_types = jset_entry_dev_usage_nr_types(u);
+
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, le32_to_cpu(u->dev));
+ if (ca)
+ for (unsigned i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
+ ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
+ ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
+ ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
+ }
+ rcu_read_unlock();
break;
}
@@ -597,56 +612,54 @@ int bch2_fs_recovery(struct bch_fs *c)
if (c->opts.norecovery)
c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1;
- if (!c->opts.nochanges) {
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- bool write_sb = false;
-
- if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
- ext->recovery_passes_required[0] |=
- cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
- write_sb = true;
- }
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ bool write_sb = false;
- u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
- if (sb_passes) {
- struct printbuf buf = PRINTBUF;
- prt_str(&buf, "superblock requires following recovery passes to be run:\n ");
- prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
+ if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
+ ext->recovery_passes_required[0] |=
+ cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
+ write_sb = true;
+ }
- if (bch2_check_version_downgrade(c)) {
- struct printbuf buf = PRINTBUF;
+ u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ if (sb_passes) {
+ struct printbuf buf = PRINTBUF;
+ prt_str(&buf, "superblock requires following recovery passes to be run:\n ");
+ prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
- prt_str(&buf, "Version downgrade required:");
+ if (bch2_check_version_downgrade(c)) {
+ struct printbuf buf = PRINTBUF;
- __le64 passes = ext->recovery_passes_required[0];
- bch2_sb_set_downgrade(c,
- BCH_VERSION_MINOR(bcachefs_metadata_version_current),
- BCH_VERSION_MINOR(c->sb.version));
- passes = ext->recovery_passes_required[0] & ~passes;
- if (passes) {
- prt_str(&buf, "\n running recovery passes: ");
- prt_bitflags(&buf, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
- }
+ prt_str(&buf, "Version downgrade required:");
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- write_sb = true;
+ __le64 passes = ext->recovery_passes_required[0];
+ bch2_sb_set_downgrade(c,
+ BCH_VERSION_MINOR(bcachefs_metadata_version_current),
+ BCH_VERSION_MINOR(c->sb.version));
+ passes = ext->recovery_passes_required[0] & ~passes;
+ if (passes) {
+ prt_str(&buf, "\n running recovery passes: ");
+ prt_bitflags(&buf, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
}
- if (check_version_upgrade(c))
- write_sb = true;
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ write_sb = true;
+ }
- if (write_sb)
- bch2_write_super(c);
+ if (check_version_upgrade(c))
+ write_sb = true;
- c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
- mutex_unlock(&c->sb_lock);
- }
+ if (write_sb)
+ bch2_write_super(c);
+
+ c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ mutex_unlock(&c->sb_lock);
if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
@@ -660,7 +673,9 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) {
+ bch2_journal_pos_from_member_info_resume(c);
+
+ if (!c->sb.clean || c->opts.retain_recovery_info) {
struct genradix_iter iter;
struct journal_replay **i;
@@ -832,8 +847,8 @@ use_clean:
}
mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- bool write_sb = false;
+ ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ write_sb = false;
if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
@@ -868,6 +883,9 @@ use_clean:
write_sb = true;
}
+ if (bch2_blacklist_entries_gc(c))
+ write_sb = true;
+
if (write_sb)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -890,10 +908,6 @@ use_clean:
bch_info(c, "scanning for old btree nodes done");
}
- if (c->journal_seq_blacklist_table &&
- c->journal_seq_blacklist_table->nr > 128)
- queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
-
ret = 0;
out:
bch2_flush_fsck_errs(c);
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index 0cec0f7d9703..4a9eb9582b6e 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -26,11 +26,6 @@ const char * const bch2_recovery_passes[] = {
NULL
};
-static int bch2_check_allocations(struct bch_fs *c)
-{
- return bch2_gc(c, true, false);
-}
-
static int bch2_set_may_go_rw(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
@@ -227,7 +222,8 @@ int bch2_run_recovery_passes(struct bch_fs *c)
if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
unsigned pass = c->curr_recovery_pass;
- ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
+ ret = bch2_run_recovery_pass(c, c->curr_recovery_pass) ?:
+ bch2_journal_flush(&c->journal);
if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
(ret && c->curr_recovery_pass < pass))
continue;
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index ff7864731a07..9ac6cf21cfbf 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -30,7 +30,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k)
/* reflink pointers */
int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
@@ -74,20 +74,20 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
}
static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 *idx, unsigned flags)
+ struct bkey_s_c_reflink_p p, u64 *idx,
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_i *k;
__le64 *refcount;
- int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1;
struct printbuf buf = PRINTBUF;
int ret;
k = bch2_bkey_get_mut_noupdate(trans, &iter,
BTREE_ID_reflink, POS(0, *idx),
- BTREE_ITER_WITH_UPDATES);
+ BTREE_ITER_with_updates);
ret = PTR_ERR_OR_ZERO(k);
if (ret)
goto err;
@@ -102,7 +102,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
goto err;
}
- if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
+ if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) {
bch2_bkey_val_to_text(&buf, c, p.s_c);
bch2_trans_inconsistent(trans,
"indirect extent refcount underflow at %llu while marking\n %s",
@@ -111,7 +111,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
goto err;
}
- if (flags & BTREE_TRIGGER_INSERT) {
+ if (flags & BTREE_TRIGGER_insert) {
struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
u64 pad;
@@ -141,12 +141,13 @@ err:
}
static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 *idx, unsigned flags, size_t r_idx)
+ struct bkey_s_c_reflink_p p, u64 *idx,
+ enum btree_iter_update_trigger_flags flags,
+ size_t r_idx)
{
struct bch_fs *c = trans->c;
struct reflink_gc *r;
- int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1;
u64 start = le64_to_cpu(p.v->idx);
u64 end = le64_to_cpu(p.v->idx) + p.k->size;
u64 next_idx = end + le32_to_cpu(p.v->back_pad);
@@ -163,10 +164,13 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
BUG_ON((s64) r->refcount + add < 0);
- r->refcount += add;
+ if (flags & BTREE_TRIGGER_gc)
+ r->refcount += add;
*idx = r->offset;
return 0;
not_found:
+ BUG_ON(!(flags & BTREE_TRIGGER_check_repair));
+
if (fsck_err(c, reflink_p_to_missing_reflink_v,
"pointer to missing indirect extent\n"
" %s\n"
@@ -189,7 +193,7 @@ not_found:
set_bkey_val_u64s(&update->k, 0);
}
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_NORUN);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_norun);
}
*idx = next_idx;
@@ -200,8 +204,8 @@ fsck_err:
}
static int __trigger_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+ enum btree_id btree_id, unsigned level, struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
@@ -210,12 +214,12 @@ static int __trigger_reflink_p(struct btree_trans *trans,
u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad);
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
while (idx < end && !ret)
ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags);
}
- if (flags & BTREE_TRIGGER_GC) {
+ if (flags & (BTREE_TRIGGER_check_repair|BTREE_TRIGGER_gc)) {
size_t l = 0, r = c->reflink_gc_nr;
while (l < r) {
@@ -238,10 +242,10 @@ int bch2_trigger_reflink_p(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old,
struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
- if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
- (flags & BTREE_TRIGGER_INSERT)) {
+ if ((flags & BTREE_TRIGGER_transactional) &&
+ (flags & BTREE_TRIGGER_insert)) {
struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v;
v->front_pad = v->back_pad = 0;
@@ -253,7 +257,7 @@ int bch2_trigger_reflink_p(struct btree_trans *trans,
/* indirect extents */
int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
return bch2_bkey_ptrs_invalid(c, k, flags, err);
@@ -281,23 +285,25 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
}
#endif
-static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *flags)
+static inline void
+check_indirect_extent_deleting(struct bkey_s new,
+ enum btree_iter_update_trigger_flags *flags)
{
- if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
+ if ((*flags & BTREE_TRIGGER_insert) && !*bkey_refcount(new)) {
new.k->type = KEY_TYPE_deleted;
new.k->size = 0;
set_bkey_val_u64s(new.k, 0);
- *flags &= ~BTREE_TRIGGER_INSERT;
+ *flags &= ~BTREE_TRIGGER_insert;
}
}
int bch2_trigger_reflink_v(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
- if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
- (flags & BTREE_TRIGGER_INSERT))
+ if ((flags & BTREE_TRIGGER_transactional) &&
+ (flags & BTREE_TRIGGER_insert))
check_indirect_extent_deleting(new, &flags);
return bch2_trigger_extent(trans, btree_id, level, old, new, flags);
@@ -306,7 +312,7 @@ int bch2_trigger_reflink_v(struct btree_trans *trans,
/* indirect inline data */
int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
return 0;
@@ -326,7 +332,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out,
int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
check_indirect_extent_deleting(new, &flags);
@@ -349,7 +355,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
k = bch2_btree_iter_peek_prev(&reflink_iter);
ret = bkey_err(k);
if (ret)
@@ -394,7 +400,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
err:
bch2_trans_iter_exit(trans, &reflink_iter);
@@ -455,9 +461,9 @@ s64 bch2_remap_range(struct bch_fs *c,
goto err;
bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
while ((ret == 0 ||
bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
@@ -567,7 +573,7 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_trans_begin(trans);
ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u,
- dst_inum, BTREE_ITER_INTENT);
+ dst_inum, BTREE_ITER_intent);
if (!ret2 &&
inode_u.bi_size < new_i_size) {
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index 4d8867289717..e894f3a2c67a 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -2,15 +2,16 @@
#ifndef _BCACHEFS_REFLINK_H
#define _BCACHEFS_REFLINK_H
-enum bkey_invalid_flags;
+enum bch_validate_flags;
int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \
.key_invalid = bch2_reflink_p_invalid, \
@@ -21,11 +22,12 @@ int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
})
int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
.key_invalid = bch2_reflink_v_invalid, \
@@ -36,13 +38,13 @@ int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
})
int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
int bch2_trigger_indirect_inline_data(struct btree_trans *,
enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
- unsigned);
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
.key_invalid = bch2_indirect_inline_data_invalid, \
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 678b9c20e251..bd1d5d085e23 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -84,7 +84,7 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
}
for (unsigned i = 0; i < r->nr_devs; i++)
- if (!bch2_dev_exists(sb, r->devs[i])) {
+ if (!bch2_member_exists(sb, r->devs[i])) {
prt_printf(err, "invalid device %u in entry ", r->devs[i]);
goto bad;
}
@@ -200,7 +200,7 @@ cpu_replicas_add_entry(struct bch_fs *c,
};
for (i = 0; i < new_entry->nr_devs; i++)
- BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i]));
+ BUG_ON(!bch2_dev_exists(c, new_entry->devs[i]));
BUG_ON(!new_entry->data_type);
verify_replicas_entry(new_entry);
@@ -860,7 +860,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
}
static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
struct bch_replicas_cpu cpu_r;
@@ -899,7 +899,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
};
static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
struct bch_replicas_cpu cpu_r;
@@ -947,18 +947,20 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
percpu_down_read(&c->mark_lock);
for_each_cpu_replicas_entry(&c->replicas, e) {
- unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
+ unsigned nr_online = 0, nr_failed = 0, dflags = 0;
bool metadata = e->data_type < BCH_DATA_user;
if (e->data_type == BCH_DATA_cached)
continue;
- for (i = 0; i < e->nr_devs; i++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);
-
+ rcu_read_lock();
+ for (unsigned i = 0; i < e->nr_devs; i++) {
nr_online += test_bit(e->devs[i], devs.d);
- nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
+
+ struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]);
+ nr_failed += ca && ca->mi.state == BCH_MEMBER_STATE_failed;
}
+ rcu_read_unlock();
if (nr_failed == e->nr_devs)
continue;
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index 194e55b11137..47f10ab57f40 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -266,9 +266,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
}
}
-static int bch2_sb_clean_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_clean *clean = field_to_type(f, clean);
@@ -283,7 +282,7 @@ static int bch2_sb_clean_validate(struct bch_sb *sb,
entry = vstruct_next(entry)) {
if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) {
prt_str(err, "entry type ");
- bch2_prt_jset_entry_type(err, le16_to_cpu(entry->type));
+ bch2_prt_jset_entry_type(err, entry->type);
prt_str(err, " overruns end of section");
return -BCH_ERR_invalid_sb_clean;
}
@@ -298,10 +297,8 @@ static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field_clean *clean = field_to_type(f, clean);
struct jset_entry *entry;
- prt_printf(out, "flags: %x", le32_to_cpu(clean->flags));
- prt_newline(out);
- prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq));
- prt_newline(out);
+ prt_printf(out, "flags: %x\n", le32_to_cpu(clean->flags));
+ prt_printf(out, "journal_seq: %llu\n", le64_to_cpu(clean->journal_seq));
for (entry = clean->start;
entry != vstruct_end(&clean->field);
@@ -392,6 +389,8 @@ void bch2_fs_mark_clean(struct bch_fs *c)
goto out;
}
+ bch2_journal_pos_from_member_info_set(c);
+
bch2_write_super(c);
out:
mutex_unlock(&c->sb_lock);
diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c
index 7dc898761bb3..6992e7469112 100644
--- a/fs/bcachefs/sb-counters.c
+++ b/fs/bcachefs/sb-counters.c
@@ -20,9 +20,8 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
};
-static int bch2_sb_counters_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
return 0;
};
@@ -31,19 +30,12 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
- unsigned int i;
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
- for (i = 0; i < nr; i++) {
- if (i < BCH_COUNTER_NR)
- prt_printf(out, "%s ", bch2_counter_names[i]);
- else
- prt_printf(out, "(unknown)");
-
- prt_tab(out);
- prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < nr; i++)
+ prt_printf(out, "%s \t%llu\n",
+ i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)",
+ le64_to_cpu(ctrs->d[i]));
};
int bch2_sb_counters_to_cpu(struct bch_fs *c)
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index a98ef940b7a3..390a1bbd2567 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -134,15 +134,25 @@ downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e)
#define for_each_downgrade_entry(_d, _i) \
for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \
(void *) _i < vstruct_end(&(_d)->field) && \
- (void *) &_i->errors[0] < vstruct_end(&(_d)->field); \
+ (void *) &_i->errors[0] <= vstruct_end(&(_d)->field) && \
+ (void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field); \
_i = downgrade_entry_next_c(_i))
static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
- for_each_downgrade_entry(e, i) {
+ for (const struct bch_sb_field_downgrade_entry *i = e->entries;
+ (void *) i < vstruct_end(&e->field);
+ i = downgrade_entry_next_c(i)) {
+ if (flags & BCH_VALIDATE_write &&
+ ((void *) &i->errors[0] > vstruct_end(&e->field) ||
+ (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field))) {
+ prt_printf(err, "downgrade entry overruns end of superblock section)");
+ return -BCH_ERR_invalid_sb_downgrade;
+ }
+
if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) !=
BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) {
prt_printf(err, "downgrade entry with mismatched major version (%u != %u)",
@@ -164,19 +174,16 @@ static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
printbuf_tabstop_push(out, 16);
for_each_downgrade_entry(e, i) {
- prt_str(out, "version:");
- prt_tab(out);
+ prt_str(out, "version:\t");
bch2_version_to_text(out, le16_to_cpu(i->version));
prt_newline(out);
- prt_str(out, "recovery passes:");
- prt_tab(out);
+ prt_str(out, "recovery passes:\t");
prt_bitflags(out, bch2_recovery_passes,
bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0])));
prt_newline(out);
- prt_str(out, "errors:");
- prt_tab(out);
+ prt_str(out, "errors:\t");
bool first = true;
for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
if (!first)
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
index 5f5bcae391fb..bda33e59e226 100644
--- a/fs/bcachefs/sb-errors.c
+++ b/fs/bcachefs/sb-errors.c
@@ -30,7 +30,7 @@ static inline unsigned bch2_sb_field_errors_u64s(unsigned nr)
}
static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_errors *e = field_to_type(f, errors);
unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
index 06c7a644f4a4..87324747351a 100644
--- a/fs/bcachefs/sb-errors_types.h
+++ b/fs/bcachefs/sb-errors_types.h
@@ -272,7 +272,8 @@
x(snapshot_node_missing, 264) \
x(dup_backpointer_to_bad_csum_extent, 265) \
x(btree_bitmap_not_marked, 266) \
- x(sb_clean_entry_overrun, 267)
+ x(sb_clean_entry_overrun, 267) \
+ x(btree_ptr_v2_written_0, 268)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 44b3f0cb7b49..39196f2a4197 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -3,11 +3,22 @@
#include "bcachefs.h"
#include "btree_cache.h"
#include "disk_groups.h"
+#include "error.h"
#include "opts.h"
#include "replicas.h"
#include "sb-members.h"
#include "super-io.h"
+void bch2_dev_missing(struct bch_fs *c, unsigned dev)
+{
+ bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
+}
+
+void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket)
+{
+ bch2_fs_inconsistent(c, "pointer to nonexistent bucket %llu:%llu", bucket.inode, bucket.offset);
+}
+
#define x(t, n, ...) [n] = #t,
static const char * const bch2_iops_measurements[] = {
BCH_IOPS_MEASUREMENTS()
@@ -164,18 +175,14 @@ static void member_to_text(struct printbuf *out,
u64 bucket_size = le16_to_cpu(m.bucket_size);
u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size;
- if (!bch2_member_exists(&m))
+ if (!bch2_member_alive(&m))
return;
- prt_printf(out, "Device:");
- prt_tab(out);
- prt_printf(out, "%u", i);
- prt_newline(out);
+ prt_printf(out, "Device:\t%u\n", i);
printbuf_indent_add(out, 2);
- prt_printf(out, "Label:");
- prt_tab(out);
+ prt_printf(out, "Label:\t");
if (BCH_MEMBER_GROUP(&m)) {
unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
@@ -189,103 +196,73 @@ static void member_to_text(struct printbuf *out,
}
prt_newline(out);
- prt_printf(out, "UUID:");
- prt_tab(out);
+ prt_printf(out, "UUID:\t");
pr_uuid(out, m.uuid.b);
prt_newline(out);
- prt_printf(out, "Size:");
- prt_tab(out);
+ prt_printf(out, "Size:\t");
prt_units_u64(out, device_size << 9);
prt_newline(out);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
- prt_printf(out, "%s errors:", bch2_member_error_strs[i]);
- prt_tab(out);
- prt_u64(out, le64_to_cpu(m.errors[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
+ prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i]));
- for (unsigned i = 0; i < BCH_IOPS_NR; i++) {
- prt_printf(out, "%s iops:", bch2_iops_measurements[i]);
- prt_tab(out);
- prt_printf(out, "%u", le32_to_cpu(m.iops[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_IOPS_NR; i++)
+ prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i]));
- prt_printf(out, "Bucket size:");
- prt_tab(out);
+ prt_printf(out, "Bucket size:\t");
prt_units_u64(out, bucket_size << 9);
prt_newline(out);
- prt_printf(out, "First bucket:");
- prt_tab(out);
- prt_printf(out, "%u", le16_to_cpu(m.first_bucket));
- prt_newline(out);
-
- prt_printf(out, "Buckets:");
- prt_tab(out);
- prt_printf(out, "%llu", le64_to_cpu(m.nbuckets));
- prt_newline(out);
+ prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket));
+ prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets));
- prt_printf(out, "Last mount:");
- prt_tab(out);
+ prt_printf(out, "Last mount:\t");
if (m.last_mount)
bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
else
prt_printf(out, "(never)");
prt_newline(out);
- prt_printf(out, "Last superblock write:");
- prt_tab(out);
- prt_u64(out, le64_to_cpu(m.seq));
- prt_newline(out);
+ prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq));
- prt_printf(out, "State:");
- prt_tab(out);
- prt_printf(out, "%s",
+ prt_printf(out, "State:\t%s\n",
BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR
? bch2_member_states[BCH_MEMBER_STATE(&m)]
: "unknown");
- prt_newline(out);
- prt_printf(out, "Data allowed:");
- prt_tab(out);
+ prt_printf(out, "Data allowed:\t");
if (BCH_MEMBER_DATA_ALLOWED(&m))
prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
else
prt_printf(out, "(none)");
prt_newline(out);
- prt_printf(out, "Has data:");
- prt_tab(out);
+ prt_printf(out, "Has data:\t");
if (data_have)
prt_bitflags(out, __bch2_data_types, data_have);
else
prt_printf(out, "(none)");
prt_newline(out);
- prt_str(out, "Durability:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
+ prt_printf(out, "Btree allocated bitmap blocksize:\t");
+ prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
prt_newline(out);
- prt_printf(out, "Discard:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m));
+ prt_printf(out, "Btree allocated bitmap:\t");
+ bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64);
prt_newline(out);
- prt_printf(out, "Freespace initialized:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
- prt_newline(out);
+ prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
+
+ prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m));
+ prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
printbuf_indent_sub(out, 2);
}
-static int bch2_sb_members_v1_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
unsigned i;
@@ -333,9 +310,8 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
member_to_text(out, members_v2_get(mi, i), gi, sb, i);
}
-static int bch2_sb_members_v2_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
+static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) -
@@ -390,12 +366,8 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
prt_newline(out);
printbuf_indent_add(out, 2);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
- prt_printf(out, "%s:", bch2_member_error_strs[i]);
- prt_tab(out);
- prt_u64(out, atomic64_read(&ca->errors[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
+ prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i]));
printbuf_indent_sub(out, 2);
prt_str(out, "IO errors since ");
@@ -404,12 +376,9 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
prt_newline(out);
printbuf_indent_add(out, 2);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
- prt_printf(out, "%s:", bch2_member_error_strs[i]);
- prt_tab(out);
- prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
+ prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i],
+ atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
printbuf_indent_sub(out, 2);
}
@@ -437,11 +406,20 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
{
- bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr)
- if (!bch2_dev_btree_bitmap_marked_sectors(bch_dev_bkey_exists(c, ptr->dev),
- ptr->offset, btree_sectors(c)))
- return false;
- return true;
+ bool ret = true;
+ rcu_read_lock();
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
+
+ if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) {
+ ret = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
}
static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
@@ -463,6 +441,9 @@ static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, uns
m->btree_bitmap_shift += resize;
}
+ BUG_ON(m->btree_bitmap_shift > 57);
+ BUG_ON(end > 64ULL << m->btree_bitmap_shift);
+
for (unsigned bit = start >> m->btree_bitmap_shift;
(u64) bit << m->btree_bitmap_shift < end;
bit++)
@@ -476,6 +457,10 @@ void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
lockdep_assert_held(&c->sb_lock);
struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr)
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ if (!bch2_member_exists(c->disk_sb.sb, ptr->dev))
+ continue;
+
__bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
+ }
}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index 5bf27d30ca29..dd93192ec065 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -29,19 +29,6 @@ static inline bool bch2_dev_is_readable(struct bch_dev *ca)
ca->mi.state != BCH_MEMBER_STATE_failed;
}
-static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
-{
- if (!percpu_ref_tryget(&ca->io_ref))
- return false;
-
- if (ca->mi.state == BCH_MEMBER_STATE_rw ||
- (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
- return true;
-
- percpu_ref_put(&ca->io_ref);
- return false;
-}
-
static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
{
return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
@@ -105,14 +92,41 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *
for (struct bch_dev *_ca = NULL; \
(_ca = __bch2_next_dev((_c), _ca, (_mask)));)
-static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
+static inline void bch2_dev_get(struct bch_dev *ca)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ BUG_ON(atomic_long_inc_return(&ca->ref) <= 1L);
+#else
+ percpu_ref_get(&ca->ref);
+#endif
+}
+
+static inline void __bch2_dev_put(struct bch_dev *ca)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ long r = atomic_long_dec_return(&ca->ref);
+ if (r < (long) !ca->dying)
+ panic("bch_dev->ref underflow, last put: %pS\n", (void *) ca->last_put);
+ ca->last_put = _THIS_IP_;
+ if (!r)
+ complete(&ca->ref_completion);
+#else
+ percpu_ref_put(&ca->ref);
+#endif
+}
+
+static inline void bch2_dev_put(struct bch_dev *ca)
{
- rcu_read_lock();
if (ca)
- percpu_ref_put(&ca->ref);
+ __bch2_dev_put(ca);
+}
+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
+{
+ rcu_read_lock();
+ bch2_dev_put(ca);
if ((ca = __bch2_next_dev(c, ca, NULL)))
- percpu_ref_get(&ca->ref);
+ bch2_dev_get(ca);
rcu_read_unlock();
return ca;
@@ -158,26 +172,113 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
#define for_each_readable_member(c, ca) \
__for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro))
-/*
- * If a key exists that references a device, the device won't be going away and
- * we can omit rcu_read_lock():
- */
-static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
+static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev)
{
- EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+ return dev < c->sb.nr_devices && c->devs[dev];
+}
- return rcu_dereference_check(c->devs[idx], 1);
+static inline bool bucket_valid(const struct bch_dev *ca, u64 b)
+{
+ return b - ca->mi.first_bucket < ca->mi.nbuckets_minus_first;
}
-static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
+static inline struct bch_dev *bch2_dev_have_ref(const struct bch_fs *c, unsigned dev)
{
- EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+ EBUG_ON(!bch2_dev_exists(c, dev));
+
+ return rcu_dereference_check(c->devs[dev], 1);
+}
- return rcu_dereference_protected(c->devs[idx],
+static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev)
+{
+ EBUG_ON(!bch2_dev_exists(c, dev));
+
+ return rcu_dereference_protected(c->devs[dev],
lockdep_is_held(&c->sb_lock) ||
lockdep_is_held(&c->state_lock));
}
+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
+{
+ return c && dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[dev])
+ : NULL;
+}
+
+static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ if (ca)
+ bch2_dev_get(ca);
+ rcu_read_unlock();
+ return ca;
+}
+
+void bch2_dev_missing(struct bch_fs *, unsigned);
+
+static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
+{
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
+ if (!ca)
+ bch2_dev_missing(c, dev);
+ return ca;
+}
+
+static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket)
+{
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode);
+ if (ca && !bucket_valid(ca, bucket.offset)) {
+ bch2_dev_put(ca);
+ ca = NULL;
+ }
+ return ca;
+}
+
+void bch2_dev_bucket_missing(struct bch_fs *, struct bpos);
+
+static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket)
+{
+ struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, bucket);
+ if (!ca)
+ bch2_dev_bucket_missing(c, bucket);
+ return ca;
+}
+
+static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
+{
+ if (ca && ca->dev_idx == dev_idx)
+ return ca;
+ bch2_dev_put(ca);
+ return bch2_dev_tryget_noerror(c, dev_idx);
+}
+
+static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
+{
+ if (ca && ca->dev_idx == dev_idx)
+ return ca;
+ bch2_dev_put(ca);
+ return bch2_dev_tryget(c, dev_idx);
+}
+
+static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ if (ca && !percpu_ref_tryget(&ca->io_ref))
+ ca = NULL;
+ rcu_read_unlock();
+
+ if (ca &&
+ (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)))
+ return ca;
+
+ if (ca)
+ percpu_ref_put(&ca->io_ref);
+ return NULL;
+}
+
/* XXX kill, move to struct bch_fs */
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
{
@@ -192,16 +293,16 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
-static inline bool bch2_member_exists(struct bch_member *m)
+static inline bool bch2_member_alive(struct bch_member *m)
{
return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
}
-static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev)
+static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
{
if (dev < sb->nr_devices) {
struct bch_member m = bch2_sb_member_get(sb, dev);
- return bch2_member_exists(&m);
+ return bch2_member_alive(&m);
}
return false;
}
@@ -210,6 +311,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
{
return (struct bch_member_cpu) {
.nbuckets = le64_to_cpu(mi->nbuckets),
+ .nbuckets_minus_first = le64_to_cpu(mi->nbuckets) -
+ le16_to_cpu(mi->first_bucket),
.first_bucket = le16_to_cpu(mi->first_bucket),
.bucket_size = le16_to_cpu(mi->bucket_size),
.group = BCH_MEMBER_GROUP(mi),
@@ -220,7 +323,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
? BCH_MEMBER_DURABILITY(mi) - 1
: 1,
.freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
- .valid = bch2_member_exists(mi),
+ .valid = bch2_member_alive(mi),
.btree_bitmap_shift = mi->btree_bitmap_shift,
.btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
};
diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h
new file mode 100644
index 000000000000..c0eda888fe39
--- /dev/null
+++ b/fs/bcachefs/sb-members_types.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_MEMBERS_TYPES_H
+#define _BCACHEFS_SB_MEMBERS_TYPES_H
+
+struct bch_member_cpu {
+ u64 nbuckets; /* device size */
+ u64 nbuckets_minus_first;
+ u16 first_bucket; /* index of first bucket used */
+ u16 bucket_size; /* sectors */
+ u16 group;
+ u8 state;
+ u8 discard;
+ u8 data_allowed;
+ u8 durability;
+ u8 freespace_initialized;
+ u8 valid;
+ u8 btree_bitmap_shift;
+ u64 btree_allocated_bitmap;
+};
+
+#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 544322d5c251..629900a5e641 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -32,7 +32,7 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
}
int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
int ret = 0;
@@ -49,7 +49,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot_tree *s)
{
int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
- BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
+ BTREE_ITER_with_updates, snapshot_tree, s);
if (bch2_err_matches(ret, ENOENT))
ret = -BCH_ERR_ENOENT_snapshot_tree;
@@ -223,7 +223,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
}
int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_snapshot s;
@@ -298,7 +298,7 @@ static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
static int __bch2_mark_snapshot(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
struct snapshot_t *t;
@@ -352,7 +352,7 @@ err:
int bch2_mark_snapshot(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
}
@@ -361,7 +361,7 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot *s)
{
return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_WITH_UPDATES, snapshot, s);
+ BTREE_ITER_with_updates, snapshot, s);
}
static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
@@ -618,7 +618,7 @@ int bch2_check_snapshot_trees(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_snapshot_trees, POS_MIN,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_snapshot_tree(trans, &iter, k)));
bch_err_fn(c, ret);
@@ -695,7 +695,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans,
root = bch2_bkey_get_iter_typed(trans, &root_iter,
BTREE_ID_snapshots, POS(0, root_id),
- BTREE_ITER_WITH_UPDATES, snapshot);
+ BTREE_ITER_with_updates, snapshot);
ret = bkey_err(root);
if (ret)
goto err;
@@ -886,7 +886,7 @@ int bch2_check_snapshots(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_reverse_commit(trans, iter,
BTREE_ID_snapshots, POS_MAX,
- BTREE_ITER_PREFETCH, k,
+ BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_snapshot(trans, &iter, k)));
bch_err_fn(c, ret);
@@ -900,7 +900,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
if (bch2_snapshot_equiv(c, id))
return 0;
- u32 tree_id;
+ /* 0 is an invalid tree ID */
+ u32 tree_id = 0;
int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
if (ret)
return ret;
@@ -1001,7 +1002,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c)
r.btree = btree;
ret = for_each_btree_key(trans, iter, btree, POS_MIN,
- BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({
+ BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
get_snapshot_trees(c, &r, k.k->p);
}));
if (ret)
@@ -1018,7 +1019,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c)
darray_for_each(*t, id) {
if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
c, snapshot_node_missing,
- "snapshot node %u from tree %s missing", *id, buf.buf)) {
+ "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
if (t->nr > 1) {
bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
ret = -BCH_ERR_fsck_repair_unimplemented;
@@ -1090,7 +1091,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
int ret = 0;
s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_INTENT, snapshot);
+ BTREE_ITER_intent, snapshot);
ret = bkey_err(s);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
"missing snapshot %u", id);
@@ -1199,7 +1200,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
- POS_MIN, BTREE_ITER_INTENT);
+ POS_MIN, BTREE_ITER_intent);
k = bch2_btree_iter_peek(&iter);
ret = bkey_err(k);
if (ret)
@@ -1367,7 +1368,7 @@ static int snapshot_delete_key(struct btree_trans *trans,
if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
snapshot_list_has_id(equiv_seen, equiv)) {
return bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
} else {
return snapshot_list_add(c, equiv_seen, equiv);
}
@@ -1404,15 +1405,15 @@ static int move_key_to_correct_snapshot(struct btree_trans *trans,
new->k.p.snapshot = equiv;
bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
ret = bch2_btree_iter_traverse(&new_iter) ?:
bch2_trans_update(trans, &new_iter, new,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ BTREE_UPDATE_internal_snapshot_node) ?:
bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &new_iter);
if (ret)
return ret;
@@ -1603,12 +1604,12 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter,
id, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc,
snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
for_each_btree_key_commit(trans, iter,
id, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc,
move_key_to_correct_snapshot(trans, &iter, k));
@@ -1643,7 +1644,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
* nodes some depth fields will be off:
*/
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
- BTREE_ITER_INTENT, k,
+ BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
if (ret)
@@ -1699,8 +1700,8 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
int ret;
bch2_trans_iter_init(trans, &iter, id, pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
+ BTREE_ITER_not_extents|
+ BTREE_ITER_all_snapshots);
while (1) {
k = bch2_btree_iter_prev(&iter);
ret = bkey_err(k);
@@ -1752,7 +1753,7 @@ static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
pos.snapshot = leaf_id;
- bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index b7d2fed37c4f..bd5d74269d15 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -2,11 +2,11 @@
#ifndef _BCACHEFS_SNAPSHOT_H
#define _BCACHEFS_SNAPSHOT_H
-enum bkey_invalid_flags;
+enum bch_validate_flags;
void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \
.key_invalid = bch2_snapshot_tree_invalid, \
@@ -20,9 +20,10 @@ int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tre
void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \
.key_invalid = bch2_snapshot_invalid, \
@@ -77,7 +78,7 @@ static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
return 0;
u32 parent = s->parent;
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBU) &&
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
parent &&
s->depth != snapshot_t(c, parent)->depth + 1)
panic("id %u depth=%u parent %u depth=%u\n",
@@ -135,11 +136,6 @@ static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
return id;
}
-static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
-{
- return id == bch2_snapshot_equiv(c, id);
-}
-
static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
{
rcu_read_lock();
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 3976f80721bf..cbad9b27874f 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -15,16 +15,6 @@
#include <crypto/hash.h>
#include <crypto/sha2.h>
-typedef unsigned __bitwise bch_str_hash_flags_t;
-
-enum bch_str_hash_flags {
- __BCH_HASH_SET_MUST_CREATE,
- __BCH_HASH_SET_MUST_REPLACE,
-};
-
-#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE)
-#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE)
-
static inline enum bch_str_hash_type
bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
{
@@ -159,13 +149,14 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s
desc.is_visible(inum, k));
}
-static __always_inline int
+static __always_inline struct bkey_s_c
bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, const void *key,
- unsigned flags, u32 snapshot)
+ enum btree_iter_update_trigger_flags flags,
+ u32 snapshot)
{
struct bkey_s_c k;
int ret;
@@ -173,10 +164,10 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
POS(inum.inum, U64_MAX),
- BTREE_ITER_SLOTS|flags, k, ret) {
+ BTREE_ITER_slots|flags, k, ret) {
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_key(k, key))
- return 0;
+ return k;
} else if (k.k->type == KEY_TYPE_hash_whiteout) {
;
} else {
@@ -186,20 +177,23 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
}
bch2_trans_iter_exit(trans, iter);
- return ret ?: -BCH_ERR_ENOENT_str_hash_lookup;
+ return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup);
}
-static __always_inline int
+static __always_inline struct bkey_s_c
bch2_hash_lookup(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, const void *key,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
u32 snapshot;
- return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
- bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
+ int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return bkey_s_c_err(ret);
+
+ return bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
}
static __always_inline int
@@ -220,7 +214,7 @@ bch2_hash_hole(struct btree_trans *trans,
for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
POS(inum.inum, U64_MAX),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret)
+ BTREE_ITER_slots|BTREE_ITER_intent, k, ret)
if (!is_visible_key(desc, inum, k))
return 0;
bch2_trans_iter_exit(trans, iter);
@@ -242,7 +236,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
bch2_btree_iter_advance(&iter);
- for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) {
+ for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) {
if (k.k->type != desc.key_type &&
k.k->type != KEY_TYPE_hash_whiteout)
break;
@@ -264,8 +258,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
struct bkey_i *insert,
- bch_str_hash_flags_t str_hash_flags,
- int update_flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter, slot = { NULL };
struct bkey_s_c k;
@@ -277,7 +270,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
POS(insert->k.p.inode, U64_MAX),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;
@@ -286,8 +279,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
continue;
}
- if (!slot.path &&
- !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE))
+ if (!slot.path && !(flags & STR_HASH_must_replace))
bch2_trans_copy_iter(&slot, &iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
@@ -305,16 +297,16 @@ found:
found = true;
not_found:
- if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) {
+ if (!found && (flags & STR_HASH_must_replace)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
- } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) {
+ } else if (found && (flags & STR_HASH_must_create)) {
ret = -EEXIST;
} else {
if (!found && slot.path)
swap(iter, slot);
insert->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, insert, update_flags);
+ ret = bch2_trans_update(trans, &iter, insert, flags);
}
goto out;
@@ -326,14 +318,14 @@ int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_info *info,
subvol_inum inum,
struct bkey_i *insert,
- bch_str_hash_flags_t str_hash_flags)
+ enum btree_iter_update_trigger_flags flags)
{
insert->k.p.inode = inum.inum;
u32 snapshot;
return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
bch2_hash_set_in_snapshot(trans, desc, info, inum,
- snapshot, insert, str_hash_flags, 0);
+ snapshot, insert, flags);
}
static __always_inline
@@ -341,7 +333,7 @@ int bch2_hash_delete_at(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
struct btree_iter *iter,
- unsigned update_flags)
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_i *delete;
int ret;
@@ -359,7 +351,7 @@ int bch2_hash_delete_at(struct btree_trans *trans,
delete->k.p = iter->pos;
delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
- return bch2_trans_update(trans, iter, delete, update_flags);
+ return bch2_trans_update(trans, iter, delete, flags);
}
static __always_inline
@@ -369,14 +361,10 @@ int bch2_hash_delete(struct btree_trans *trans,
subvol_inum inum, const void *key)
{
struct btree_iter iter;
- int ret;
-
- ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
- BTREE_ITER_INTENT);
- if (ret)
- return ret;
-
- ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
+ BTREE_ITER_intent);
+ int ret = bkey_err(k) ?:
+ bch2_hash_delete_at(trans, desc, info, &iter, 0);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 88a79c823276..132213761ef6 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -162,7 +162,7 @@ int bch2_check_subvols(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_subvol(trans, &iter, k)));
bch_err_fn(c, ret);
@@ -198,7 +198,7 @@ int bch2_check_subvol_children(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k,
+ BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_subvol_child(trans, &iter, k)));
bch_err_fn(c, ret);
@@ -208,7 +208,7 @@ int bch2_check_subvol_children(struct bch_fs *c)
/* Subvolumes: */
int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
int ret = 0;
@@ -245,9 +245,9 @@ static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bo
int bch2_subvolume_trigger(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
- unsigned flags)
+ enum btree_iter_update_trigger_flags flags)
{
- if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (flags & BTREE_TRIGGER_transactional) {
struct bpos children_pos_old = subvolume_children_pos(old);
struct bpos children_pos_new = subvolume_children_pos(new.s_c);
@@ -333,7 +333,7 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
subvol = bch2_bkey_get_iter_typed(trans, &iter,
BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES,
+ BTREE_ITER_cached|BTREE_ITER_with_updates,
subvolume);
ret = bkey_err(subvol);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
@@ -383,9 +383,9 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
return lockrestart_do(trans,
bch2_subvolume_get(trans, subvolid_to_delete, true,
- BTREE_ITER_CACHED, &s)) ?:
+ BTREE_ITER_cached, &s)) ?:
for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_subvolume_reparent(trans, &iter, k,
subvolid_to_delete, le32_to_cpu(s.creation_parent)));
@@ -404,7 +404,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
subvol = bch2_bkey_get_iter_typed(trans, &iter,
BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_CACHED|BTREE_ITER_INTENT,
+ BTREE_ITER_cached|BTREE_ITER_intent,
subvolume);
ret = bkey_err(subvol);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
@@ -505,7 +505,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
n = bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_CACHED, subvolume);
+ BTREE_ITER_cached, subvolume);
ret = PTR_ERR_OR_ZERO(n);
if (unlikely(ret)) {
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
@@ -547,7 +547,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter,
BTREE_ID_subvolumes, POS(0, src_subvolid),
- BTREE_ITER_CACHED, subvolume);
+ BTREE_ITER_cached, subvolume);
ret = PTR_ERR_OR_ZERO(src_subvol);
if (unlikely(ret)) {
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index d2015d549bd2..afa5e871efb2 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -5,16 +5,17 @@
#include "darray.h"
#include "subvolume_types.h"
-enum bkey_invalid_flags;
+enum bch_validate_flags;
int bch2_check_subvols(struct bch_fs *);
int bch2_check_subvol_children(struct bch_fs *);
int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s, unsigned);
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
.key_invalid = bch2_subvolume_invalid, \
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index bfdb15e7d778..f1bee6c5222d 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -76,7 +76,7 @@ const char * const bch2_sb_fields[] = {
};
static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
- struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
enum bch_sb_field_type type)
@@ -344,8 +344,8 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
return 0;
}
-static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
- int rw)
+static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
+ enum bch_validate_flags flags, struct printbuf *out)
{
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field_members_v1 *mi;
@@ -401,7 +401,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
return -BCH_ERR_invalid_sb_time_precision;
}
- if (rw == READ) {
+ if (!flags) {
/*
* Been seeing a bug where these are getting inexplicably
* zeroed, so we're now validating them, but we have to be
@@ -457,7 +457,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
return -BCH_ERR_invalid_sb_members_missing;
}
- ret = bch2_sb_field_validate(sb, &mi->field, out);
+ ret = bch2_sb_field_validate(sb, &mi->field, flags, out);
if (ret)
return ret;
@@ -465,12 +465,12 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1)
continue;
- ret = bch2_sb_field_validate(sb, f, out);
+ ret = bch2_sb_field_validate(sb, f, flags, out);
if (ret)
return ret;
}
- if (rw == WRITE &&
+ if ((flags & BCH_VALIDATE_write) &&
bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) {
prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu",
le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq),
@@ -819,7 +819,7 @@ got_super:
sb->have_layout = true;
- ret = bch2_sb_validate(sb, &err, READ);
+ ret = bch2_sb_validate(sb, 0, &err);
if (ret) {
bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf);
@@ -975,7 +975,7 @@ int bch2_write_super(struct bch_fs *c)
darray_for_each(online_devices, ca) {
printbuf_reset(&err);
- ret = bch2_sb_validate(&(*ca)->disk_sb, &err, WRITE);
+ ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err);
if (ret) {
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
goto out;
@@ -1020,26 +1020,35 @@ int bch2_write_super(struct bch_fs *c)
continue;
if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
- bch2_fs_fatal_error(c,
+ struct printbuf buf = PRINTBUF;
+ prt_char(&buf, ' ');
+ prt_bdevname(&buf, ca->disk_sb.bdev);
+ prt_printf(&buf,
": Superblock write was silently dropped! (seq %llu expected %llu)",
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
- percpu_ref_put(&ca->io_ref);
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
ret = -BCH_ERR_erofs_sb_err;
- goto out;
}
if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
- bch2_fs_fatal_error(c,
+ struct printbuf buf = PRINTBUF;
+ prt_char(&buf, ' ');
+ prt_bdevname(&buf, ca->disk_sb.bdev);
+ prt_printf(&buf,
": Superblock modified by another process (seq %llu expected %llu)",
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
- percpu_ref_put(&ca->io_ref);
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
ret = -BCH_ERR_erofs_sb_err;
- goto out;
}
}
+ if (ret)
+ goto out;
+
do {
wrote = false;
darray_for_each(online_devices, cap) {
@@ -1152,7 +1161,7 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version)
}
static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
if (vstruct_bytes(f) < 88) {
prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88);
@@ -1167,8 +1176,7 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
{
struct bch_sb_field_ext *e = field_to_type(f, ext);
- prt_printf(out, "Recovery passes required:");
- prt_tab(out);
+ prt_printf(out, "Recovery passes required:\t");
prt_bitflags(out, bch2_recovery_passes,
bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0])));
prt_newline(out);
@@ -1177,16 +1185,14 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
if (errors_silent) {
le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8);
- prt_printf(out, "Errors to silently fix:");
- prt_tab(out);
+ prt_printf(out, "Errors to silently fix:\t");
prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8);
prt_newline(out);
kfree(errors_silent);
}
- prt_printf(out, "Btrees with missing data:");
- prt_tab(out);
+ prt_printf(out, "Btrees with missing data:\t");
prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data));
prt_newline(out);
}
@@ -1213,14 +1219,14 @@ static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type)
}
static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *err)
+ enum bch_validate_flags flags, struct printbuf *err)
{
unsigned type = le32_to_cpu(f->type);
struct printbuf field_err = PRINTBUF;
const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
int ret;
- ret = ops->validate ? ops->validate(sb, f, &field_err) : 0;
+ ret = ops->validate ? ops->validate(sb, f, flags, &field_err) : 0;
if (ret) {
prt_printf(err, "Invalid superblock section %s: %s",
bch2_sb_fields[type], field_err.buf);
@@ -1294,97 +1300,73 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
printbuf_tabstop_push(out, 44);
for (int i = 0; i < sb->nr_devices; i++)
- nr_devices += bch2_dev_exists(sb, i);
+ nr_devices += bch2_member_exists(sb, i);
- prt_printf(out, "External UUID:");
- prt_tab(out);
+ prt_printf(out, "External UUID:\t");
pr_uuid(out, sb->user_uuid.b);
prt_newline(out);
- prt_printf(out, "Internal UUID:");
- prt_tab(out);
+ prt_printf(out, "Internal UUID:\t");
pr_uuid(out, sb->uuid.b);
prt_newline(out);
- prt_printf(out, "Magic number:");
- prt_tab(out);
+ prt_printf(out, "Magic number:\t");
pr_uuid(out, sb->magic.b);
prt_newline(out);
- prt_str(out, "Device index:");
- prt_tab(out);
- prt_printf(out, "%u", sb->dev_idx);
- prt_newline(out);
+ prt_printf(out, "Device index:\t%u\n", sb->dev_idx);
- prt_str(out, "Label:");
- prt_tab(out);
+ prt_str(out, "Label:\t");
prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
prt_newline(out);
- prt_str(out, "Version:");
- prt_tab(out);
+ prt_str(out, "Version:\t");
bch2_version_to_text(out, le16_to_cpu(sb->version));
prt_newline(out);
- prt_str(out, "Version upgrade complete:");
- prt_tab(out);
+ prt_str(out, "Version upgrade complete:\t");
bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
prt_newline(out);
- prt_printf(out, "Oldest version on disk:");
- prt_tab(out);
+ prt_printf(out, "Oldest version on disk:\t");
bch2_version_to_text(out, le16_to_cpu(sb->version_min));
prt_newline(out);
- prt_printf(out, "Created:");
- prt_tab(out);
+ prt_printf(out, "Created:\t");
if (sb->time_base_lo)
bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
else
prt_printf(out, "(not set)");
prt_newline(out);
- prt_printf(out, "Sequence number:");
- prt_tab(out);
+ prt_printf(out, "Sequence number:\t");
prt_printf(out, "%llu", le64_to_cpu(sb->seq));
prt_newline(out);
- prt_printf(out, "Time of last write:");
- prt_tab(out);
+ prt_printf(out, "Time of last write:\t");
bch2_prt_datetime(out, le64_to_cpu(sb->write_time));
prt_newline(out);
- prt_printf(out, "Superblock size:");
- prt_tab(out);
+ prt_printf(out, "Superblock size:\t");
prt_units_u64(out, vstruct_bytes(sb));
prt_str(out, "/");
prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits);
prt_newline(out);
- prt_printf(out, "Clean:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_SB_CLEAN(sb));
- prt_newline(out);
-
- prt_printf(out, "Devices:");
- prt_tab(out);
- prt_printf(out, "%u", nr_devices);
- prt_newline(out);
+ prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb));
+ prt_printf(out, "Devices:\t%u\n", nr_devices);
- prt_printf(out, "Sections:");
+ prt_printf(out, "Sections:\t");
vstruct_for_each(sb, f)
fields_have |= 1 << le32_to_cpu(f->type);
- prt_tab(out);
prt_bitflags(out, bch2_sb_fields, fields_have);
prt_newline(out);
- prt_printf(out, "Features:");
- prt_tab(out);
+ prt_printf(out, "Features:\t");
prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0]));
prt_newline(out);
- prt_printf(out, "Compat features:");
- prt_tab(out);
+ prt_printf(out, "Compat features:\t");
prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0]));
prt_newline(out);
@@ -1401,8 +1383,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
if (opt->get_sb != BCH2_NO_SB_OPT) {
u64 v = bch2_opt_from_sb(sb, id);
- prt_printf(out, "%s:", opt->attr.name);
- prt_tab(out);
+ prt_printf(out, "%s:\t", opt->attr.name);
bch2_opt_to_text(out, NULL, sb, opt, v,
OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST);
prt_newline(out);
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index 95e80e06316b..fadd364e2802 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -51,7 +51,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
extern const char * const bch2_sb_fields[];
struct bch_sb_field_ops {
- int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *);
+ int (*validate)(struct bch_sb *, struct bch_sb_field *,
+ enum bch_validate_flags, struct printbuf *);
void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *);
};
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index dddf57ec4511..294a9d35a9f2 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -264,7 +264,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_open_buckets_stop(c, NULL, true);
bch2_rebalance_stop(c);
bch2_copygc_stop(c);
- bch2_gc_thread_stop(c);
bch2_fs_ec_flush(c);
bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
@@ -285,7 +284,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
journal_cur_seq(&c->journal));
- if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
+ if (test_bit(JOURNAL_replay_done, &c->journal.flags) &&
!test_bit(BCH_FS_emergency_ro, &c->flags))
set_bit(BCH_FS_clean_shutdown, &c->flags);
@@ -467,7 +466,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
* overwriting whatever was there previously, and there must always be
* at least one non-flush write in the journal or recovery will fail:
*/
- set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags);
+ set_bit(JOURNAL_need_flush_write, &c->journal.flags);
+ set_bit(JOURNAL_running, &c->journal.flags);
for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
@@ -485,12 +485,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
}
#endif
- ret = bch2_gc_thread_start(c);
- if (ret) {
- bch_err(c, "error starting gc thread");
- return ret;
- }
-
ret = bch2_journal_reclaim_start(&c->journal);
if (ret)
goto err;
@@ -537,9 +531,7 @@ int bch2_fs_read_write_early(struct bch_fs *c)
static void __bch2_fs_free(struct bch_fs *c)
{
- unsigned i;
-
- for (i = 0; i < BCH_TIME_STAT_NR; i++)
+ for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
@@ -572,6 +564,7 @@ static void __bch2_fs_free(struct bch_fs *c)
BUG_ON(atomic_read(&c->journal_keys.ref));
bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
+ EBUG_ON(percpu_u64_get(c->online_reserved));
free_percpu(c->online_reserved);
darray_exit(&c->btree_roots_extra);
@@ -616,8 +609,6 @@ void __bch2_fs_stop(struct bch_fs *c)
set_bit(BCH_FS_stopping, &c->flags);
- cancel_work_sync(&c->journal_seq_blacklist_gc_work);
-
down_write(&c->state_lock);
bch2_fs_read_only(c);
up_write(&c->state_lock);
@@ -665,6 +656,7 @@ void bch2_fs_free(struct bch_fs *c)
struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
if (ca) {
+ EBUG_ON(atomic_long_read(&ca->ref) != 1);
bch2_free_super(&ca->disk_sb);
bch2_dev_free(ca);
}
@@ -719,7 +711,7 @@ static int bch2_fs_online(struct bch_fs *c)
ret = bch2_dev_sysfs_online(c, ca);
if (ret) {
bch_err(c, "error creating sysfs objects");
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
goto err;
}
}
@@ -778,6 +770,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
+ bch2_fs_gc_init(c);
bch2_fs_copygc_init(c);
bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
bch2_fs_btree_iter_init_early(c);
@@ -800,16 +793,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
spin_lock_init(&c->btree_write_error_lock);
- INIT_WORK(&c->journal_seq_blacklist_gc_work,
- bch2_blacklist_entries_gc);
-
INIT_LIST_HEAD(&c->journal_iters);
INIT_LIST_HEAD(&c->fsck_error_msgs);
mutex_init(&c->fsck_error_msgs_lock);
- seqcount_init(&c->gc_pos_lock);
-
seqcount_init(&c->usage_lock);
sema_init(&c->io_in_flight, 128);
@@ -940,7 +928,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto err;
for (i = 0; i < c->sb.nr_devices; i++)
- if (bch2_dev_exists(c->disk_sb.sb, i) &&
+ if (bch2_member_exists(c->disk_sb.sb, i) &&
bch2_dev_alloc(c, i)) {
ret = -EEXIST;
goto err;
@@ -1101,7 +1089,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
return -BCH_ERR_device_not_a_member_of_filesystem;
- if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx))
+ if (!bch2_member_exists(fs->sb, sb->sb->dev_idx))
return -BCH_ERR_device_has_been_removed;
if (fs->sb->block_size != sb->sb->block_size)
@@ -1200,11 +1188,11 @@ static void bch2_dev_free(struct bch_dev *ca)
if (ca->kobj.state_in_sysfs)
kobject_del(&ca->kobj);
+ kfree(ca->buckets_nouse);
bch2_free_super(&ca->disk_sb);
bch2_dev_journal_exit(ca);
free_percpu(ca->io_done);
- bioset_exit(&ca->replica_set);
bch2_dev_buckets_free(ca);
free_page((unsigned long) ca->sb_read_scratch);
@@ -1212,7 +1200,9 @@ static void bch2_dev_free(struct bch_dev *ca)
bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
percpu_ref_exit(&ca->io_ref);
+#ifndef CONFIG_BCACHEFS_DEBUG
percpu_ref_exit(&ca->ref);
+#endif
kobject_put(&ca->kobj);
}
@@ -1239,12 +1229,14 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_journal_exit(ca);
}
+#ifndef CONFIG_BCACHEFS_DEBUG
static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
complete(&ca->ref_completion);
}
+#endif
static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
{
@@ -1313,14 +1305,17 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
ca->mi.bucket_size / btree_sectors(c));
- if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
- 0, GFP_KERNEL) ||
- percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
+#ifndef CONFIG_BCACHEFS_DEBUG
+ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL))
+ goto err;
+#else
+ atomic_long_set(&ca->ref, 1);
+#endif
+
+ if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
bch2_dev_buckets_alloc(c, ca) ||
- bioset_init(&ca->replica_set, 4,
- offsetof(struct bch_write_bio, bio), 0) ||
!(ca->io_done = alloc_percpu(*ca->io_done)))
goto err;
@@ -1411,10 +1406,9 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
le64_to_cpu(c->disk_sb.sb->seq))
bch2_sb_to_fs(c, sb->sb);
- BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
- !c->devs[sb->sb->dev_idx]);
+ BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));
- ca = bch_dev_locked(c, sb->sb->dev_idx);
+ ca = bch2_dev_locked(c, sb->sb->dev_idx);
ret = __bch2_dev_attach_bdev(ca, sb);
if (ret)
@@ -1506,10 +1500,10 @@ static bool bch2_fs_may_start(struct bch_fs *c)
mutex_lock(&c->sb_lock);
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
- if (!bch2_dev_exists(c->disk_sb.sb, i))
+ if (!bch2_member_exists(c->disk_sb.sb, i))
continue;
- ca = bch_dev_locked(c, i);
+ ca = bch2_dev_locked(c, i);
if (!bch2_dev_is_online(ca) &&
(ca->mi.state == BCH_MEMBER_STATE_rw ||
@@ -1599,17 +1593,17 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
* with bch2_do_invalidates() and bch2_do_discards()
*/
ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
- BTREE_TRIGGER_NORUN, NULL) ?:
+ BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
- BTREE_TRIGGER_NORUN, NULL) ?:
+ BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
- BTREE_TRIGGER_NORUN, NULL) ?:
+ BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
- BTREE_TRIGGER_NORUN, NULL) ?:
+ BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
- BTREE_TRIGGER_NORUN, NULL) ?:
+ BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
- BTREE_TRIGGER_NORUN, NULL);
+ BTREE_TRIGGER_norun, NULL);
bch_err_msg(c, ret, "removing dev alloc info");
return ret;
}
@@ -1626,7 +1620,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
* We consume a reference to ca->ref, regardless of whether we succeed
* or fail:
*/
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot remove without losing data");
@@ -1678,7 +1672,12 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
mutex_unlock(&c->sb_lock);
+#ifndef CONFIG_BCACHEFS_DEBUG
percpu_ref_kill(&ca->ref);
+#else
+ ca->dying = true;
+ bch2_dev_put(ca);
+#endif
wait_for_completion(&ca->ref_completion);
bch2_dev_free(ca);
@@ -1777,9 +1776,28 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (dynamic_fault("bcachefs:add:no_slot"))
goto no_slot;
- for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
- if (!bch2_dev_exists(c->disk_sb.sb, dev_idx))
- goto have_slot;
+ if (c->sb.nr_devices < BCH_SB_MEMBERS_MAX) {
+ dev_idx = c->sb.nr_devices;
+ goto have_slot;
+ }
+
+ int best = -1;
+ u64 best_last_mount = 0;
+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
+ if (bch2_member_alive(&m))
+ continue;
+
+ u64 last_mount = le64_to_cpu(m.last_mount);
+ if (best < 0 || last_mount < best_last_mount) {
+ best = dev_idx;
+ best_last_mount = last_mount;
+ }
+ }
+ if (best >= 0) {
+ dev_idx = best;
+ goto have_slot;
+ }
no_slot:
ret = -BCH_ERR_ENOSPC_sb_members;
bch_err_msg(c, ret, "setting up new superblock");
@@ -1821,7 +1839,7 @@ have_slot:
bch2_dev_usage_journal_reserve(c);
- ret = bch2_trans_mark_dev_sb(c, ca);
+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
bch_err_msg(ca, ret, "marking new superblock");
if (ret)
goto err_late;
@@ -1884,9 +1902,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
if (ret)
goto err;
- ca = bch_dev_locked(c, dev_idx);
+ ca = bch2_dev_locked(c, dev_idx);
- ret = bch2_trans_mark_dev_sb(c, ca);
+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
if (ret)
goto err;
@@ -1979,7 +1997,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
if (ret)
goto err;
- ret = bch2_trans_mark_dev_sb(c, ca);
+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
if (ret)
goto err;
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 11bcef170c2c..368a63d938cf 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -26,19 +26,4 @@ struct bch_devs_list {
u8 data[BCH_BKEY_PTRS_MAX];
};
-struct bch_member_cpu {
- u64 nbuckets; /* device size */
- u16 first_bucket; /* index of first bucket used */
- u16 bucket_size; /* sectors */
- u16 group;
- u8 state;
- u8 discard;
- u8 data_allowed;
- u8 durability;
- u8 freespace_initialized;
- u8 valid;
- u8 btree_bitmap_shift;
- u64 btree_allocated_bitmap;
-};
-
#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 5be92fe3f4ea..93ca74d108b1 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -140,9 +140,8 @@ write_attribute(trigger_gc);
write_attribute(trigger_discards);
write_attribute(trigger_invalidates);
write_attribute(trigger_journal_flush);
-write_attribute(prune_cache);
-write_attribute(btree_wakeup);
-rw_attribute(btree_gc_periodic);
+write_attribute(trigger_btree_cache_shrink);
+write_attribute(trigger_btree_key_cache_shrink);
rw_attribute(gc_gens_pos);
read_attribute(uuid);
@@ -189,12 +188,8 @@ static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
{
bch2_printbuf_tabstop_push(out, 24);
- for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) {
- prt_str(out, bch2_write_refs[i]);
- prt_tab(out);
- prt_printf(out, "%li", atomic_long_read(&c->writes[i]));
- prt_newline(out);
- }
+ for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++)
+ prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i]));
}
#endif
@@ -278,7 +273,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
continue;
ret = for_each_btree_key(trans, iter, id, POS_MIN,
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ BTREE_ITER_all_snapshots, k, ({
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *entry;
@@ -313,22 +308,11 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
if (ret)
return ret;
- prt_str(out, "type");
printbuf_tabstop_push(out, 12);
- prt_tab(out);
-
- prt_str(out, "compressed");
printbuf_tabstop_push(out, 16);
- prt_tab_rjust(out);
-
- prt_str(out, "uncompressed");
printbuf_tabstop_push(out, 16);
- prt_tab_rjust(out);
-
- prt_str(out, "average extent size");
printbuf_tabstop_push(out, 24);
- prt_tab_rjust(out);
- prt_newline(out);
+ prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n");
for (unsigned i = 0; i < ARRAY_SIZE(s); i++) {
bch2_prt_compression_type(out, i);
@@ -362,21 +346,6 @@ static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
prt_printf(out, "\n");
}
-static void bch2_btree_wakeup_all(struct bch_fs *c)
-{
- struct btree_trans *trans;
-
- seqmutex_lock(&c->btree_trans_lock);
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- struct btree_bkey_cached_common *b = READ_ONCE(trans->locking);
-
- if (b)
- six_lock_wakeup_all(&b->lock);
-
- }
- seqmutex_unlock(&c->btree_trans_lock);
-}
-
SHOW(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@@ -392,8 +361,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_btree_write_stats)
bch2_btree_write_stats_to_text(out, c);
- sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
-
if (attr == &sysfs_gc_gens_pos)
bch2_gc_gens_pos_to_text(out, c);
@@ -416,7 +383,7 @@ SHOW(bch2_fs)
bch2_journal_debug_to_text(out, &c->journal);
if (attr == &sysfs_btree_cache)
- bch2_btree_cache_to_text(out, c);
+ bch2_btree_cache_to_text(out, &c->btree_cache);
if (attr == &sysfs_btree_key_cache)
bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
@@ -459,6 +426,9 @@ SHOW(bch2_fs)
if (attr == &sysfs_disk_groups)
bch2_disk_groups_to_text(out, c);
+ if (attr == &sysfs_alloc_debug)
+ bch2_fs_alloc_debug_to_text(out, c);
+
return 0;
}
@@ -466,14 +436,6 @@ STORE(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
- if (attr == &sysfs_btree_gc_periodic) {
- ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
- ?: (ssize_t) size;
-
- wake_up_process(c->gc_thread);
- return ret;
- }
-
if (attr == &sysfs_copy_gc_enabled) {
ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
?: (ssize_t) size;
@@ -505,7 +467,7 @@ STORE(bch2_fs)
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
return -EROFS;
- if (attr == &sysfs_prune_cache) {
+ if (attr == &sysfs_trigger_btree_cache_shrink) {
struct shrink_control sc;
sc.gfp_mask = GFP_KERNEL;
@@ -513,22 +475,17 @@ STORE(bch2_fs)
c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
}
- if (attr == &sysfs_btree_wakeup)
- bch2_btree_wakeup_all(c);
-
- if (attr == &sysfs_trigger_gc) {
- /*
- * Full gc is currently incompatible with btree key cache:
- */
-#if 0
- down_read(&c->state_lock);
- bch2_gc(c, false, false);
- up_read(&c->state_lock);
-#else
- bch2_gc_gens(c);
-#endif
+ if (attr == &sysfs_trigger_btree_key_cache_shrink) {
+ struct shrink_control sc;
+
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = strtoul_or_return(buf);
+ c->btree_key_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
}
+ if (attr == &sysfs_trigger_gc)
+ bch2_gc_gens(c);
+
if (attr == &sysfs_trigger_discards)
bch2_do_discards(c);
@@ -594,13 +551,11 @@ SHOW(bch2_fs_counters)
if (attr == &sysfs_##t) { \
counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
- prt_printf(out, "since mount:"); \
- prt_tab(out); \
+ prt_printf(out, "since mount:\t"); \
prt_human_readable_u64(out, counter_since_mount); \
prt_newline(out); \
\
- prt_printf(out, "since filesystem creation:"); \
- prt_tab(out); \
+ prt_printf(out, "since filesystem creation:\t"); \
prt_human_readable_u64(out, counter); \
prt_newline(out); \
}
@@ -660,8 +615,8 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_discards,
&sysfs_trigger_invalidates,
&sysfs_trigger_journal_flush,
- &sysfs_prune_cache,
- &sysfs_btree_wakeup,
+ &sysfs_trigger_btree_cache_shrink,
+ &sysfs_trigger_btree_key_cache_shrink,
&sysfs_gc_gens_pos,
@@ -677,6 +632,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_internal_uuid,
&sysfs_disk_groups,
+ &sysfs_alloc_debug,
NULL
};
@@ -792,88 +748,6 @@ struct attribute *bch2_fs_time_stats_files[] = {
NULL
};
-static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bch_dev_usage stats = bch2_dev_usage_read(ca);
- unsigned i, nr[BCH_DATA_NR];
-
- memset(nr, 0, sizeof(nr));
-
- for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
- nr[c->open_buckets[i].data_type]++;
-
- printbuf_tabstop_push(out, 8);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
-
- bch2_dev_usage_to_text(out, &stats);
-
- prt_newline(out);
-
- prt_printf(out, "reserves:");
- prt_newline(out);
- for (i = 0; i < BCH_WATERMARK_NR; i++) {
- prt_str(out, bch2_watermarks[i]);
- prt_tab(out);
- prt_u64(out, bch2_dev_buckets_reserved(ca, i));
- prt_tab_rjust(out);
- prt_newline(out);
- }
-
- prt_newline(out);
-
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 24);
-
- prt_str(out, "freelist_wait");
- prt_tab(out);
- prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty");
- prt_newline(out);
-
- prt_str(out, "open buckets allocated");
- prt_tab(out);
- prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
- prt_newline(out);
-
- prt_str(out, "open buckets this dev");
- prt_tab(out);
- prt_u64(out, ca->nr_open_buckets);
- prt_newline(out);
-
- prt_str(out, "open buckets total");
- prt_tab(out);
- prt_u64(out, OPEN_BUCKETS_COUNT);
- prt_newline(out);
-
- prt_str(out, "open_buckets_wait");
- prt_tab(out);
- prt_str(out, c->open_buckets_wait.list.first ? "waiting" : "empty");
- prt_newline(out);
-
- prt_str(out, "open_buckets_btree");
- prt_tab(out);
- prt_u64(out, nr[BCH_DATA_btree]);
- prt_newline(out);
-
- prt_str(out, "open_buckets_user");
- prt_tab(out);
- prt_u64(out, nr[BCH_DATA_user]);
- prt_newline(out);
-
- prt_str(out, "buckets_to_invalidate");
- prt_tab(out);
- prt_u64(out, should_invalidate_buckets(ca, stats));
- prt_newline(out);
-
- prt_str(out, "btree reserve cache");
- prt_tab(out);
- prt_u64(out, c->btree_reserve_cache_nr);
- prt_newline(out);
-}
-
static const char * const bch2_rw[] = {
"read",
"write",
@@ -943,7 +817,7 @@ SHOW(bch2_dev)
* 100 / CONGESTED_MAX);
if (attr == &sysfs_alloc_debug)
- dev_alloc_debug_to_text(out, ca);
+ bch2_dev_alloc_debug_to_text(out, ca);
return 0;
}
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index bfec656f94c0..68104b2056d9 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -40,7 +40,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
k.k.p.snapshot = U32_MAX;
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = commit_do(trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
@@ -81,7 +81,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
k.k.p.snapshot = U32_MAX;
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
ret = commit_do(trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
@@ -261,7 +261,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
ret = bch2_trans_run(c,
for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_SLOTS, k, ({
+ BTREE_ITER_slots, k, ({
if (i >= nr * 2)
break;
@@ -322,7 +322,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
ret = bch2_trans_run(c,
for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_SLOTS, k, ({
+ BTREE_ITER_slots, k, ({
if (i == nr)
break;
BUG_ON(bkey_deleted(k.k) != !(i % 16));
@@ -452,7 +452,7 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start,
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+ BTREE_UPDATE_internal_snapshot_node));
bch_err_fn(c, ret);
return ret;
}
@@ -671,7 +671,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
int ret = 0;
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
- BTREE_ITER_INTENT);
+ BTREE_ITER_intent);
k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX));
ret = bkey_err(k);
if (ret)
@@ -714,7 +714,7 @@ static int seq_insert(struct bch_fs *c, u64 nr)
return bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k,
+ BTREE_ITER_slots|BTREE_ITER_intent, k,
NULL, NULL, 0, ({
if (iter.pos.offset >= nr)
break;
@@ -737,7 +737,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr)
return bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX),
- BTREE_ITER_INTENT, k,
+ BTREE_ITER_intent, k,
NULL, NULL, 0, ({
struct bkey_i_cookie u;
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 6aa81d1e6d36..84fcf26e306e 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -43,7 +43,7 @@ DECLARE_EVENT_CLASS(fs_str,
TP_fast_assign(
__entry->dev = c->dev;
- __assign_str(str, str);
+ __assign_str(str);
),
TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
@@ -64,7 +64,7 @@ DECLARE_EVENT_CLASS(trans_str,
__entry->dev = trans->c->dev;
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
- __assign_str(str, str);
+ __assign_str(str);
),
TP_printk("%d,%d %s %pS %s",
@@ -85,7 +85,7 @@ DECLARE_EVENT_CLASS(trans_str_nocaller,
TP_fast_assign(
__entry->dev = trans->c->dev;
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __assign_str(str, str);
+ __assign_str(str);
),
TP_printk("%d,%d %s %s",
@@ -638,99 +638,14 @@ DEFINE_EVENT(bch_fs, gc_gens_end,
/* Allocator */
-DECLARE_EVENT_CLASS(bucket_alloc,
- TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
- u64 bucket,
- u64 free,
- u64 avail,
- u64 copygc_wait_amount,
- s64 copygc_waiting_for,
- struct bucket_alloc_state *s,
- bool nonblocking,
- const char *err),
- TP_ARGS(ca, alloc_reserve, bucket, free, avail,
- copygc_wait_amount, copygc_waiting_for,
- s, nonblocking, err),
-
- TP_STRUCT__entry(
- __field(u8, dev )
- __array(char, reserve, 16 )
- __field(u64, bucket )
- __field(u64, free )
- __field(u64, avail )
- __field(u64, copygc_wait_amount )
- __field(s64, copygc_waiting_for )
- __field(u64, seen )
- __field(u64, open )
- __field(u64, need_journal_commit )
- __field(u64, nouse )
- __field(bool, nonblocking )
- __field(u64, nocow )
- __array(char, err, 32 )
- ),
-
- TP_fast_assign(
- __entry->dev = ca->dev_idx;
- strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
- __entry->bucket = bucket;
- __entry->free = free;
- __entry->avail = avail;
- __entry->copygc_wait_amount = copygc_wait_amount;
- __entry->copygc_waiting_for = copygc_waiting_for;
- __entry->seen = s->buckets_seen;
- __entry->open = s->skipped_open;
- __entry->need_journal_commit = s->skipped_need_journal_commit;
- __entry->nouse = s->skipped_nouse;
- __entry->nonblocking = nonblocking;
- __entry->nocow = s->skipped_nocow;
- strscpy(__entry->err, err, sizeof(__entry->err));
- ),
-
- TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s",
- __entry->reserve,
- __entry->dev,
- __entry->bucket,
- __entry->free,
- __entry->avail,
- __entry->copygc_wait_amount,
- __entry->copygc_waiting_for,
- __entry->seen,
- __entry->open,
- __entry->need_journal_commit,
- __entry->nouse,
- __entry->nocow,
- __entry->nonblocking,
- __entry->err)
+DEFINE_EVENT(fs_str, bucket_alloc,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(bucket_alloc, bucket_alloc,
- TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
- u64 bucket,
- u64 free,
- u64 avail,
- u64 copygc_wait_amount,
- s64 copygc_waiting_for,
- struct bucket_alloc_state *s,
- bool nonblocking,
- const char *err),
- TP_ARGS(ca, alloc_reserve, bucket, free, avail,
- copygc_wait_amount, copygc_waiting_for,
- s, nonblocking, err)
-);
-
-DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
- TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
- u64 bucket,
- u64 free,
- u64 avail,
- u64 copygc_wait_amount,
- s64 copygc_waiting_for,
- struct bucket_alloc_state *s,
- bool nonblocking,
- const char *err),
- TP_ARGS(ca, alloc_reserve, bucket, free, avail,
- copygc_wait_amount, copygc_waiting_for,
- s, nonblocking, err)
+DEFINE_EVENT(fs_str, bucket_alloc_fail,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
TRACE_EVENT(discard_buckets,
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 92c6ad75e702..de331dec2a99 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -348,15 +348,12 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
const struct time_unit *u = bch2_pick_time_units(ns);
- prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
- prt_tab_rjust(out);
- prt_printf(out, "%s", u->name);
+ prt_printf(out, "%llu \r%s", div64_u64(ns, u->nsecs), u->name);
}
static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
{
- prt_str(out, name);
- prt_tab(out);
+ prt_printf(out, "%s\t", name);
bch2_pr_time_units_aligned(out, ns);
prt_newline(out);
}
@@ -389,12 +386,8 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
}
printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
- prt_printf(out, "count:");
- prt_tab(out);
- prt_printf(out, "%llu ",
- stats->duration_stats.n);
+ prt_printf(out, "count:\t%llu\n", stats->duration_stats.n);
printbuf_tabstop_pop(out);
- prt_newline(out);
printbuf_tabstops_reset(out);
@@ -403,13 +396,8 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
printbuf_tabstop_push(out, 0);
printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
- prt_tab(out);
- prt_printf(out, "since mount");
- prt_tab_rjust(out);
- prt_tab(out);
+ prt_printf(out, "\tsince mount\r\trecent\r\n");
prt_printf(out, "recent");
- prt_tab_rjust(out);
- prt_newline(out);
printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, out->indent + 20);
@@ -417,23 +405,20 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
printbuf_tabstop_push(out, 2);
printbuf_tabstop_push(out, TABSTOP_SIZE);
- prt_printf(out, "duration of events");
- prt_newline(out);
+ prt_printf(out, "duration of events\n");
printbuf_indent_add(out, 2);
pr_name_and_units(out, "min:", stats->min_duration);
pr_name_and_units(out, "max:", stats->max_duration);
pr_name_and_units(out, "total:", stats->total_duration);
- prt_printf(out, "mean:");
- prt_tab(out);
+ prt_printf(out, "mean:\t");
bch2_pr_time_units_aligned(out, d_mean);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
- prt_printf(out, "stddev:");
- prt_tab(out);
+ prt_printf(out, "stddev:\t");
bch2_pr_time_units_aligned(out, d_stddev);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
@@ -441,22 +426,19 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
printbuf_indent_sub(out, 2);
prt_newline(out);
- prt_printf(out, "time between events");
- prt_newline(out);
+ prt_printf(out, "time between events\n");
printbuf_indent_add(out, 2);
pr_name_and_units(out, "min:", stats->min_freq);
pr_name_and_units(out, "max:", stats->max_freq);
- prt_printf(out, "mean:");
- prt_tab(out);
+ prt_printf(out, "mean:\t");
bch2_pr_time_units_aligned(out, f_mean);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
- prt_printf(out, "stddev:");
- prt_tab(out);
+ prt_printf(out, "stddev:\t");
bch2_pr_time_units_aligned(out, f_stddev);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
@@ -589,40 +571,31 @@ void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_contro
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 20);
- prt_printf(out, "rate:");
- prt_tab(out);
+ prt_printf(out, "rate:\t");
prt_human_readable_s64(out, pd->rate.rate);
prt_newline(out);
- prt_printf(out, "target:");
- prt_tab(out);
+ prt_printf(out, "target:\t");
prt_human_readable_u64(out, pd->last_target);
prt_newline(out);
- prt_printf(out, "actual:");
- prt_tab(out);
+ prt_printf(out, "actual:\t");
prt_human_readable_u64(out, pd->last_actual);
prt_newline(out);
- prt_printf(out, "proportional:");
- prt_tab(out);
+ prt_printf(out, "proportional:\t");
prt_human_readable_s64(out, pd->last_proportional);
prt_newline(out);
- prt_printf(out, "derivative:");
- prt_tab(out);
+ prt_printf(out, "derivative:\t");
prt_human_readable_s64(out, pd->last_derivative);
prt_newline(out);
- prt_printf(out, "change:");
- prt_tab(out);
+ prt_printf(out, "change:\t");
prt_human_readable_s64(out, pd->last_change);
prt_newline(out);
- prt_printf(out, "next io:");
- prt_tab(out);
- prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
- prt_newline(out);
+ prt_printf(out, "next io:\t%llims\n", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
}
/* misc: */
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 5cf885b09986..5d2c470a49ac 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -445,11 +445,6 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
void bch2_bio_map(struct bio *bio, void *base, size_t);
int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
-static inline sector_t bdev_sectors(struct block_device *bdev)
-{
- return bdev->bd_inode->i_size >> 9;
-}
-
#define closure_bio_submit(bio, cl) \
do { \
closure_get(cl); \
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 754f17bba68e..c11bf6dacc2c 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -71,7 +71,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
};
int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
+ enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
@@ -118,11 +118,17 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
else
prt_printf(out, "(unknown type %u)", xattr.v->x_type);
+ unsigned name_len = xattr.v->x_name_len;
+ unsigned val_len = le16_to_cpu(xattr.v->x_val_len);
+ unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) -
+ offsetof(struct bch_xattr, x_name);
+
+ val_len = min_t(int, val_len, max_name_val_bytes - name_len);
+ name_len = min(name_len, max_name_val_bytes);
+
prt_printf(out, "%.*s:%.*s",
- xattr.v->x_name_len,
- xattr.v->x_name,
- le16_to_cpu(xattr.v->x_val_len),
- (char *) xattr_val(xattr.v));
+ name_len, xattr.v->x_name,
+ val_len, (char *) xattr_val(xattr.v));
if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
@@ -138,21 +144,13 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
struct btree_iter iter;
- struct bkey_s_c_xattr xattr;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
- inode_inum(inode), &search, 0);
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
+ inode_inum(inode), &search, 0);
+ int ret = bkey_err(k);
if (ret)
- goto err1;
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err2;
+ return ret;
- xattr = bkey_s_c_to_xattr(k);
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
ret = le16_to_cpu(xattr.v->x_val_len);
if (buffer) {
if (ret > size)
@@ -160,10 +158,8 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
else
memcpy(buffer, xattr_val(xattr.v), ret);
}
-err2:
bch2_trans_iter_exit(trans, &iter);
-err1:
- return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret;
+ return ret;
}
int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
@@ -177,7 +173,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
int ret;
ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
- bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
+ bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
if (ret)
return ret;
@@ -212,8 +208,8 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
inum, &xattr->k_i,
- (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
- (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
+ (flags & XATTR_CREATE ? STR_HASH_must_create : 0)|
+ (flags & XATTR_REPLACE ? STR_HASH_must_replace : 0));
} else {
struct xattr_search_key search =
X_SEARCH(type, name, strlen(name));
@@ -359,6 +355,9 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
int ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
+ if (ret < 0 && bch2_err_matches(ret, ENOENT))
+ ret = -ENODATA;
+
return bch2_err_class(ret);
}
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
index 1337f31a5c49..1574b9eb4c85 100644
--- a/fs/bcachefs/xattr.h
+++ b/fs/bcachefs/xattr.h
@@ -7,7 +7,7 @@
extern const struct bch_hash_desc bch2_xattr_hash_desc;
int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
+ enum bch_validate_flags, struct printbuf *);
void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_xattr ((struct bkey_ops) { \
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b5a25ee49eea..a43897b03ce9 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1934,7 +1934,7 @@ static void free_note_info(struct elf_note_info *info)
threads = t->next;
WARN_ON(t->notes[0].data && t->notes[0].data != &t->prstatus);
for (i = 1; i < info->thread_notes; ++i)
- kfree(t->notes[i].data);
+ kvfree(t->notes[i].data);
kfree(t);
}
kfree(info->psinfo.data);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7696beec4c21..7130040d92ab 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -316,7 +316,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->dev_stats_valid = 1;
- set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+ set_blocksize(bdev_file, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_devices;
ret = btrfs_get_dev_zone_info(device, false);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a91a8056758a..1b20b3e390df 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3656,7 +3656,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
struct btrfs_super_block *super;
struct page *page;
u64 bytenr, bytenr_orig;
- struct address_space *mapping = bdev->bd_inode->i_mapping;
+ struct address_space *mapping = bdev->bd_mapping;
int ret;
bytenr_orig = btrfs_sb_offset(copy_num);
@@ -3743,7 +3743,7 @@ static int write_dev_supers(struct btrfs_device *device,
struct btrfs_super_block *sb, int max_mirrors)
{
struct btrfs_fs_info *fs_info = device->fs_info;
- struct address_space *mapping = device->bdev->bd_inode->i_mapping;
+ struct address_space *mapping = device->bdev->bd_mapping;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
int i;
int ret;
@@ -3861,7 +3861,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
device->commit_total_bytes)
break;
- folio = filemap_get_folio(device->bdev->bd_inode->i_mapping,
+ folio = filemap_get_folio(device->bdev->bd_mapping,
bytenr >> PAGE_SHIFT);
/* If the folio has been removed, then we know it completed. */
if (IS_ERR(folio))
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b6a701011fb0..c39145e8c4ad 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -482,10 +482,12 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
if (flush)
sync_blockdev(bdev);
- ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE);
- if (ret) {
- fput(*bdev_file);
- goto error;
+ if (holder) {
+ ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
+ if (ret) {
+ fput(*bdev_file);
+ goto error;
+ }
}
invalidate_bdev(bdev);
*disk_super = btrfs_read_dev_super(bdev);
@@ -498,6 +500,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
return 0;
error:
+ *disk_super = NULL;
*bdev_file = NULL;
return ret;
}
@@ -1287,7 +1290,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
return ERR_PTR(-EINVAL);
/* pull in the page with our super */
- page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
+ page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL);
if (IS_ERR(page))
return ERR_CAST(page);
@@ -2714,7 +2717,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->dev_stats_valid = 1;
- set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+ set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) {
btrfs_clear_sb_rdonly(sb);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 4cba80b34387..9b43fa493219 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -118,7 +118,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
return -ENOENT;
} else if (full[0] && full[1]) {
/* Compare two super blocks */
- struct address_space *mapping = bdev->bd_inode->i_mapping;
+ struct address_space *mapping = bdev->bd_mapping;
struct page *page[BTRFS_NR_SB_LOG_ZONES];
struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
int i;
diff --git a/fs/buffer.c b/fs/buffer.c
index 4f73d23c2c46..8c19e705b9c3 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -189,8 +189,8 @@ EXPORT_SYMBOL(end_buffer_write_sync);
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
- struct inode *bd_inode = bdev->bd_inode;
- struct address_space *bd_mapping = bd_inode->i_mapping;
+ struct address_space *bd_mapping = bdev->bd_mapping;
+ const int blkbits = bd_mapping->host->i_blkbits;
struct buffer_head *ret = NULL;
pgoff_t index;
struct buffer_head *bh;
@@ -199,7 +199,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
int all_mapped = 1;
static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
- index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE;
+ index = ((loff_t)block << blkbits) / PAGE_SIZE;
folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
if (IS_ERR(folio))
goto out;
@@ -233,7 +233,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
(unsigned long long)block,
(unsigned long long)bh->b_blocknr,
bh->b_state, bh->b_size, bdev,
- 1 << bd_inode->i_blkbits);
+ 1 << blkbits);
}
out_unlock:
spin_unlock(&bd_mapping->i_private_lock);
@@ -687,30 +687,37 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);
-/*
- * Add a page to the dirty page list.
- *
- * It is a sad fact of life that this function is called from several places
- * deeply under spinlocking. It may not sleep.
- *
- * If the page has buffers, the uptodate buffers are set dirty, to preserve
- * dirty-state coherency between the page and the buffers. It the page does
- * not have buffers then when they are later attached they will all be set
- * dirty.
- *
- * The buffers are dirtied before the page is dirtied. There's a small race
- * window in which a writepage caller may see the page cleanness but not the
- * buffer dirtiness. That's fine. If this code were to set the page dirty
- * before the buffers, a concurrent writepage caller could clear the page dirty
- * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
- * page on the dirty page list.
- *
- * We use i_private_lock to lock against try_to_free_buffers while using the
- * page's buffer list. Also use this to protect against clean buffers being
- * added to the page after it was set dirty.
- *
- * FIXME: may need to call ->reservepage here as well. That's rather up to the
- * address_space though.
+/**
+ * block_dirty_folio - Mark a folio as dirty.
+ * @mapping: The address space containing this folio.
+ * @folio: The folio to mark dirty.
+ *
+ * Filesystems which use buffer_heads can use this function as their
+ * ->dirty_folio implementation. Some filesystems need to do a little
+ * work before calling this function. Filesystems which do not use
+ * buffer_heads should call filemap_dirty_folio() instead.
+ *
+ * If the folio has buffers, the uptodate buffers are set dirty, to
+ * preserve dirty-state coherency between the folio and the buffers.
+ * Buffers added to a dirty folio are created dirty.
+ *
+ * The buffers are dirtied before the folio is dirtied. There's a small
+ * race window in which writeback may see the folio cleanness but not the
+ * buffer dirtiness. That's fine. If this code were to set the folio
+ * dirty before the buffers, writeback could clear the folio dirty flag,
+ * see a bunch of clean buffers and we'd end up with dirty buffers/clean
+ * folio on the dirty folio list.
+ *
+ * We use i_private_lock to lock against try_to_free_buffers() while
+ * using the folio's buffer list. This also prevents clean buffers
+ * being added to the folio after it was set dirty.
+ *
+ * Context: May only be called from process context. Does not sleep.
+ * Caller must ensure that @folio cannot be truncated during this call,
+ * typically by holding the folio lock or having a page in the folio
+ * mapped and holding the page table lock.
+ *
+ * Return: True if the folio was dirtied; false if it was already dirtied.
*/
bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
{
@@ -1034,12 +1041,12 @@ static sector_t folio_init_buffers(struct folio *folio,
static bool grow_dev_folio(struct block_device *bdev, sector_t block,
pgoff_t index, unsigned size, gfp_t gfp)
{
- struct inode *inode = bdev->bd_inode;
+ struct address_space *mapping = bdev->bd_mapping;
struct folio *folio;
struct buffer_head *bh;
sector_t end_block = 0;
- folio = __filemap_get_folio(inode->i_mapping, index,
+ folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
if (IS_ERR(folio))
return false;
@@ -1073,10 +1080,10 @@ static bool grow_dev_folio(struct block_device *bdev, sector_t block,
* lock to be atomic wrt __find_get_block(), which does not
* run under the folio lock.
*/
- spin_lock(&inode->i_mapping->i_private_lock);
+ spin_lock(&mapping->i_private_lock);
link_dev_buffers(folio, bh);
end_block = folio_init_buffers(folio, bdev, size);
- spin_unlock(&inode->i_mapping->i_private_lock);
+ spin_unlock(&mapping->i_private_lock);
unlock:
folio_unlock(folio);
folio_put(folio);
@@ -1219,26 +1226,28 @@ void mark_buffer_write_io_error(struct buffer_head *bh)
}
EXPORT_SYMBOL(mark_buffer_write_io_error);
-/*
- * Decrement a buffer_head's reference count. If all buffers against a page
- * have zero reference count, are clean and unlocked, and if the page is clean
- * and unlocked then try_to_free_buffers() may strip the buffers from the page
- * in preparation for freeing it (sometimes, rarely, buffers are removed from
- * a page but it ends up not being freed, and buffers may later be reattached).
+/**
+ * __brelse - Release a buffer.
+ * @bh: The buffer to release.
+ *
+ * This variant of brelse() can be called if @bh is guaranteed to not be NULL.
*/
-void __brelse(struct buffer_head * buf)
+void __brelse(struct buffer_head *bh)
{
- if (atomic_read(&buf->b_count)) {
- put_bh(buf);
+ if (atomic_read(&bh->b_count)) {
+ put_bh(bh);
return;
}
WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
}
EXPORT_SYMBOL(__brelse);
-/*
- * bforget() is like brelse(), except it discards any
- * potentially dirty data.
+/**
+ * __bforget - Discard any dirty data in a buffer.
+ * @bh: The buffer to forget.
+ *
+ * This variant of bforget() can be called if @bh is guaranteed to not
+ * be NULL.
*/
void __bforget(struct buffer_head *bh)
{
@@ -1415,6 +1424,11 @@ EXPORT_SYMBOL(__find_get_block);
* @size: The size of buffer_heads for this @bdev.
* @gfp: The memory allocation flags to use.
*
+ * The returned buffer head has its reference count incremented, but is
+ * not locked. The caller should call brelse() when it has finished
+ * with the buffer. The buffer may not be uptodate. If needed, the
+ * caller can bring it uptodate either by reading it or overwriting it.
+ *
* Return: The buffer head, or NULL if memory could not be allocated.
*/
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
@@ -1446,24 +1460,33 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
EXPORT_SYMBOL(__breadahead);
/**
- * __bread_gfp() - reads a specified block and returns the bh
- * @bdev: the block_device to read from
- * @block: number of block
- * @size: size (in bytes) to read
- * @gfp: page allocation flag
- *
- * Reads a specified block, and returns buffer head that contains it.
- * The page cache can be allocated from non-movable area
- * not to prevent page migration if you set gfp to zero.
- * It returns NULL if the block was unreadable.
+ * __bread_gfp() - Read a block.
+ * @bdev: The block device to read from.
+ * @block: Block number in units of block size.
+ * @size: The block size of this device in bytes.
+ * @gfp: Not page allocation flags; see below.
+ *
+ * You are not expected to call this function. You should use one of
+ * sb_bread(), sb_bread_unmovable() or __bread().
+ *
+ * Read a specified block, and return the buffer head that refers to it.
+ * If @gfp is 0, the memory will be allocated using the block device's
+ * default GFP flags. If @gfp is __GFP_MOVABLE, the memory may be
+ * allocated from a movable area. Do not pass in a complete set of
+ * GFP flags.
+ *
+ * The returned buffer head has its refcount increased. The caller should
+ * call brelse() when it has finished with the buffer.
+ *
+ * Context: May sleep waiting for I/O.
+ * Return: NULL if the block was unreadable.
*/
-struct buffer_head *
-__bread_gfp(struct block_device *bdev, sector_t block,
- unsigned size, gfp_t gfp)
+struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
{
struct buffer_head *bh;
- gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+ gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);
/*
* Prefer looping in the allocator rather than here, at least that
@@ -1696,16 +1719,16 @@ EXPORT_SYMBOL(create_empty_buffers);
*/
void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
{
- struct inode *bd_inode = bdev->bd_inode;
- struct address_space *bd_mapping = bd_inode->i_mapping;
+ struct address_space *bd_mapping = bdev->bd_mapping;
+ const int blkbits = bd_mapping->host->i_blkbits;
struct folio_batch fbatch;
- pgoff_t index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE;
+ pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE;
pgoff_t end;
int i, count;
struct buffer_head *bh;
struct buffer_head *head;
- end = ((loff_t)(block + len - 1) << bd_inode->i_blkbits) / PAGE_SIZE;
+ end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE;
folio_batch_init(&fbatch);
while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
count = folio_batch_count(&fbatch);
@@ -2861,26 +2884,6 @@ int sync_dirty_buffer(struct buffer_head *bh)
}
EXPORT_SYMBOL(sync_dirty_buffer);
-/*
- * try_to_free_buffers() checks if all the buffers on this particular folio
- * are unused, and releases them if so.
- *
- * Exclusion against try_to_free_buffers may be obtained by either
- * locking the folio or by holding its mapping's i_private_lock.
- *
- * If the folio is dirty but all the buffers are clean then we need to
- * be sure to mark the folio clean as well. This is because the folio
- * may be against a block device, and a later reattachment of buffers
- * to a dirty folio will set *all* buffers dirty. Which would corrupt
- * filesystem data on the same device.
- *
- * The same applies to regular filesystem folios: if all the buffers are
- * clean then we set the folio clean and proceed. To do that, we require
- * total exclusion from block_dirty_folio(). That is obtained with
- * i_private_lock.
- *
- * try_to_free_buffers() is non-blocking.
- */
static inline int buffer_busy(struct buffer_head *bh)
{
return atomic_read(&bh->b_count) |
@@ -2914,6 +2917,30 @@ failed:
return false;
}
+/**
+ * try_to_free_buffers - Release buffers attached to this folio.
+ * @folio: The folio.
+ *
+ * If any buffers are in use (dirty, under writeback, elevated refcount),
+ * no buffers will be freed.
+ *
+ * If the folio is dirty but all the buffers are clean then we need to
+ * be sure to mark the folio clean as well. This is because the folio
+ * may be against a block device, and a later reattachment of buffers
+ * to a dirty folio will set *all* buffers dirty. Which would corrupt
+ * filesystem data on the same device.
+ *
+ * The same applies to regular filesystem folios: if all the buffers are
+ * clean then we set the folio clean and proceed. To do that, we require
+ * total exclusion from block_dirty_folio(). That is obtained with
+ * i_private_lock.
+ *
+ * Exclusion against try_to_free_buffers may be obtained by either
+ * locking the folio or by holding its mapping's i_private_lock.
+ *
+ * Context: Process context. @folio must be locked. Will not sleep.
+ * Return: true if all buffers attached to this folio were freed.
+ */
bool try_to_free_buffers(struct folio *folio)
{
struct address_space * const mapping = folio->mapping;
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 7ade836beb58..f53977169db4 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -563,8 +563,7 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
*/
path.mnt = cache->mnt;
path.dentry = dentry;
- file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
- d_backing_inode(dentry), cache->cache_cred);
+ file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT, cache->cache_cred);
if (IS_ERR(file)) {
trace_cachefiles_vfs_error(object, d_backing_inode(dentry),
PTR_ERR(file),
diff --git a/fs/coredump.c b/fs/coredump.c
index 317065e3eb9b..a57a06b80f57 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -376,9 +376,7 @@ static int zap_process(struct task_struct *start, int exit_code)
if (t != current && !(t->flags & PF_POSTCOREDUMP)) {
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
- /* The vhost_worker does not particpate in coredumps */
- if ((t->flags & (PF_USER_WORKER | PF_IO_WORKER)) != PF_USER_WORKER)
- nr++;
+ nr++;
}
}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 9901057a15ba..460690ca0174 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -183,7 +183,7 @@ static int next_buffer;
static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
unsigned int len)
{
- struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct address_space *mapping = sb->s_bdev->bd_mapping;
struct file_ra_state ra = {};
struct page *pages[BLKS_PER_BUF];
unsigned i, blocknr, buffer;
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index b4002aea7cdb..40de69860dcf 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -284,7 +284,7 @@ static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh,
const struct inode **inode_ret,
u64 *lblk_num_ret)
{
- struct page *page = bh->b_page;
+ struct folio *folio = bh->b_folio;
const struct address_space *mapping;
const struct inode *inode;
@@ -292,13 +292,13 @@ static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh,
* The ext4 journal (jbd2) can submit a buffer_head it directly created
* for a non-pagecache page. fscrypt doesn't care about these.
*/
- mapping = page_mapping(page);
+ mapping = folio_mapping(folio);
if (!mapping)
return false;
inode = mapping->host;
*inode_ret = inode;
- *lblk_num_ret = ((u64)page->index << (PAGE_SHIFT - inode->i_blkbits)) +
+ *lblk_num_ret = ((u64)folio->index << (PAGE_SHIFT - inode->i_blkbits)) +
(bh_offset(bh) >> inode->i_blkbits);
return true;
}
diff --git a/fs/dax.c b/fs/dax.c
index 423fc1607dfa..becb4a6920c6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1207,17 +1207,17 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = mapping->host;
pgtable_t pgtable = NULL;
- struct page *zero_page;
+ struct folio *zero_folio;
spinlock_t *ptl;
pmd_t pmd_entry;
pfn_t pfn;
- zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
+ zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
- if (unlikely(!zero_page))
+ if (unlikely(!zero_folio))
goto fallback;
- pfn = page_to_pfn_t(zero_page);
+ pfn = page_to_pfn_t(&zero_folio->page);
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
DAX_PMD | DAX_ZERO_PAGE);
@@ -1237,17 +1237,17 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
mm_inc_nr_ptes(vma->vm_mm);
}
- pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
+ pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot);
pmd_entry = pmd_mkhuge(pmd_entry);
set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
spin_unlock(ptl);
- trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
+ trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
return VM_FAULT_NOPAGE;
fallback:
if (pgtable)
pte_free(vma->vm_mm, pgtable);
- trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
+ trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
return VM_FAULT_FALLBACK;
}
#else
diff --git a/fs/dcache.c b/fs/dcache.c
index 407095188f83..1ee6404b430b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2360,19 +2360,17 @@ EXPORT_SYMBOL(d_hash_and_lookup);
* - unhash this dentry and free it.
*
* Usually, we want to just turn this into
- * a negative dentry, but if anybody else is
- * currently using the dentry or the inode
- * we can't do that and we fall back on removing
- * it from the hash queues and waiting for
- * it to be deleted later when it has no users
+ * a negative dentry, but certain workloads can
+ * generate a large number of negative dentries.
+ * Therefore, it would be better to simply
+ * unhash it.
*/
/**
* d_delete - delete a dentry
* @dentry: The dentry to delete
*
- * Turn the dentry into a negative dentry if possible, otherwise
- * remove it from the hash queues so it can be deleted later
+ * Remove the dentry from the hash queues so it can be deleted later.
*/
void d_delete(struct dentry * dentry)
@@ -2381,6 +2379,8 @@ void d_delete(struct dentry * dentry)
spin_lock(&inode->i_lock);
spin_lock(&dentry->d_lock);
+ __d_drop(dentry);
+
/*
* Are we the only user?
*/
@@ -2388,7 +2388,6 @@ void d_delete(struct dentry * dentry)
dentry->d_flags &= ~DCACHE_CANT_MOUNT;
dentry_unlink_inode(dentry);
} else {
- __d_drop(dentry);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
}
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 52524bd9698b..5fc03c1e2757 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -29,11 +29,9 @@ void erofs_put_metabuf(struct erofs_buf *buf)
* Derive the block size from inode->i_blkbits to make compatible with
* anonymous inode in fscache mode.
*/
-void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
enum erofs_kmap_type type)
{
- struct inode *inode = buf->inode;
- erofs_off_t offset = (erofs_off_t)blkaddr << inode->i_blkbits;
pgoff_t index = offset >> PAGE_SHIFT;
struct page *page = buf->page;
struct folio *folio;
@@ -43,7 +41,7 @@ void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
erofs_put_metabuf(buf);
nofs_flag = memalloc_nofs_save();
- folio = read_cache_folio(inode->i_mapping, index, NULL, NULL);
+ folio = read_cache_folio(buf->mapping, index, NULL, NULL);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(folio))
return folio;
@@ -68,16 +66,16 @@ void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
{
if (erofs_is_fscache_mode(sb))
- buf->inode = EROFS_SB(sb)->s_fscache->inode;
+ buf->mapping = EROFS_SB(sb)->s_fscache->inode->i_mapping;
else
- buf->inode = sb->s_bdev->bd_inode;
+ buf->mapping = sb->s_bdev->bd_mapping;
}
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
erofs_blk_t blkaddr, enum erofs_kmap_type type)
{
erofs_init_metabuf(buf, sb);
- return erofs_bread(buf, blkaddr, type);
+ return erofs_bread(buf, erofs_pos(sb, blkaddr), type);
}
static int erofs_map_blocks_flatmode(struct inode *inode,
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index b80abec0531a..2193a6710c8f 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -58,12 +58,12 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
int err = 0;
bool initial = true;
- buf.inode = dir;
+ buf.mapping = dir->i_mapping;
while (ctx->pos < dirsize) {
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
- de = erofs_bread(&buf, i, EROFS_KMAP);
+ de = erofs_bread(&buf, erofs_pos(sb, i), EROFS_KMAP);
if (IS_ERR(de)) {
erofs_err(sb, "fail to readdir of logical block %u of nid %llu",
i, EROFS_I(dir)->nid);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 21def866a482..00bbb288c08c 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -216,7 +216,7 @@ enum erofs_kmap_type {
};
struct erofs_buf {
- struct inode *inode;
+ struct address_space *mapping;
struct page *page;
void *base;
enum erofs_kmap_type kmap_type;
@@ -402,7 +402,7 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
erofs_off_t *offset, int *lengthp);
void erofs_unmap_metabuf(struct erofs_buf *buf);
void erofs_put_metabuf(struct erofs_buf *buf);
-void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
enum erofs_kmap_type type);
void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb);
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index f0110a78acb2..c94d0c1608a8 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -99,8 +99,8 @@ static void *erofs_find_target_block(struct erofs_buf *target,
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_dirent *de;
- buf.inode = dir;
- de = erofs_bread(&buf, mid, EROFS_KMAP);
+ buf.mapping = dir->i_mapping;
+ de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), EROFS_KMAP);
if (!IS_ERR(de)) {
const int nameoff = nameoff_from_disk(de->nameoff, bsz);
const int ndirents = nameoff / sizeof(*de);
@@ -171,7 +171,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid,
qn.name = name->name;
qn.end = name->name + name->len;
- buf.inode = dir;
+ buf.mapping = dir->i_mapping;
ndirents = 0;
de = erofs_find_target_block(&buf, dir, &qn, &ndirents);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 044c79229a78..348ef1660e50 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -132,11 +132,11 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
int len, i, cnt;
*offset = round_up(*offset, 4);
- ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP);
+ ptr = erofs_bread(buf, *offset, EROFS_KMAP);
if (IS_ERR(ptr))
return ptr;
- len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(sb, *offset)]);
+ len = le16_to_cpu(*(__le16 *)ptr);
if (!len)
len = U16_MAX + 1;
buffer = kmalloc(len, GFP_KERNEL);
@@ -148,12 +148,12 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
for (i = 0; i < len; i += cnt) {
cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset),
len - i);
- ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP);
+ ptr = erofs_bread(buf, *offset, EROFS_KMAP);
if (IS_ERR(ptr)) {
kfree(buffer);
return ptr;
}
- memcpy(buffer + i, ptr + erofs_blkoff(sb, *offset), cnt);
+ memcpy(buffer + i, ptr, cnt);
*offset += cnt;
}
return buffer;
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index b58316b49a43..a90d7d649739 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -81,13 +81,13 @@ static int erofs_init_inode_xattrs(struct inode *inode)
it.pos = erofs_iloc(inode) + vi->inode_isize;
/* read in shared xattr array (non-atomic, see kmalloc below) */
- it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP);
if (IS_ERR(it.kaddr)) {
ret = PTR_ERR(it.kaddr);
goto out_unlock;
}
- ih = it.kaddr + erofs_blkoff(sb, it.pos);
+ ih = it.kaddr;
vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter);
vi->xattr_shared_count = ih->h_shared_count;
vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
@@ -102,16 +102,14 @@ static int erofs_init_inode_xattrs(struct inode *inode)
it.pos += sizeof(struct erofs_xattr_ibody_header);
for (i = 0; i < vi->xattr_shared_count; ++i) {
- it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos),
- EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP);
if (IS_ERR(it.kaddr)) {
kfree(vi->xattr_shared_xattrs);
vi->xattr_shared_xattrs = NULL;
ret = PTR_ERR(it.kaddr);
goto out_unlock;
}
- vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)
- (it.kaddr + erofs_blkoff(sb, it.pos)));
+ vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)it.kaddr);
it.pos += sizeof(__le32);
}
erofs_put_metabuf(&it.buf);
@@ -185,12 +183,11 @@ static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it,
void *src;
for (processed = 0; processed < len; processed += slice) {
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
- src = it->kaddr + erofs_blkoff(sb, it->pos);
+ src = it->kaddr;
slice = min_t(unsigned int, sb->s_blocksize -
erofs_blkoff(sb, it->pos), len - processed);
memcpy(it->buffer + it->buffer_ofs, src, slice);
@@ -208,8 +205,7 @@ static int erofs_listxattr_foreach(struct erofs_xattr_iter *it)
int err;
/* 1. handle xattr entry */
- entry = *(struct erofs_xattr_entry *)
- (it->kaddr + erofs_blkoff(it->sb, it->pos));
+ entry = *(struct erofs_xattr_entry *)it->kaddr;
it->pos += sizeof(struct erofs_xattr_entry);
base_index = entry.e_name_index;
@@ -259,8 +255,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
unsigned int slice, processed, value_sz;
/* 1. handle xattr entry */
- entry = *(struct erofs_xattr_entry *)
- (it->kaddr + erofs_blkoff(sb, it->pos));
+ entry = *(struct erofs_xattr_entry *)it->kaddr;
it->pos += sizeof(struct erofs_xattr_entry);
value_sz = le16_to_cpu(entry.e_value_size);
@@ -291,8 +286,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
/* 2. handle xattr name */
for (processed = 0; processed < entry.e_name_len; processed += slice) {
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -300,7 +294,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
sb->s_blocksize - erofs_blkoff(sb, it->pos),
entry.e_name_len - processed);
if (memcmp(it->name.name + it->infix_len + processed,
- it->kaddr + erofs_blkoff(sb, it->pos), slice))
+ it->kaddr, slice))
return -ENOATTR;
it->pos += slice;
}
@@ -336,13 +330,11 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it,
it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz;
while (remaining) {
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(it->sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
- entry_sz = erofs_xattr_entry_size(it->kaddr +
- erofs_blkoff(it->sb, it->pos));
+ entry_sz = erofs_xattr_entry_size(it->kaddr);
/* xattr on-disk corruption: xattr entry beyond xattr_isize */
if (remaining < entry_sz) {
DBG_BUGON(1);
@@ -375,8 +367,7 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it,
for (i = 0; i < vi->xattr_shared_count; ++i) {
it->pos = erofs_pos(sb, sbi->xattr_blkaddr) +
vi->xattr_shared_xattrs[i] * sizeof(__le32);
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -492,7 +483,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
return -ENOMEM;
if (sbi->packed_inode)
- buf.inode = sbi->packed_inode;
+ buf.mapping = sbi->packed_inode->i_mapping;
else
erofs_init_metabuf(&buf, sb);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 3216b920d369..283c9c3a611d 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -936,16 +936,16 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
if (!packed_inode)
return -EFSCORRUPTED;
- buf.inode = packed_inode;
+ buf.mapping = packed_inode->i_mapping;
for (; cur < end; cur += cnt, pos += cnt) {
cnt = min_t(unsigned int, end - cur,
sb->s_blocksize - erofs_blkoff(sb, pos));
- src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP);
+ src = erofs_bread(&buf, pos, EROFS_KMAP);
if (IS_ERR(src)) {
erofs_put_metabuf(&buf);
return PTR_ERR(src);
}
- memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt);
+ memcpy_to_page(page, cur, src, cnt);
}
erofs_put_metabuf(&buf);
return 0;
diff --git a/fs/exec.c b/fs/exec.c
index b3c40fbb325f..40073142288f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -67,6 +67,7 @@
#include <linux/time_namespace.h>
#include <linux/user_events.h>
#include <linux/rseq.h>
+#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -268,6 +269,14 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
}
/*
+ * Need to be called with mmap write lock
+ * held, to avoid race with ksmd.
+ */
+ err = ksm_execve(mm);
+ if (err)
+ goto err_ksm;
+
+ /*
* Place the stack at the largest stack address the architecture
* supports. Later, we'll move this to an appropriate place. We don't
* use STACK_TOP because that can depend on attributes which aren't
@@ -288,6 +297,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
bprm->p = vma->vm_end - sizeof(void *);
return 0;
err:
+ ksm_exit(mm);
+err_ksm:
mmap_write_unlock(mm);
err_free:
bprm->vma = NULL;
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index d6cfb1849580..d5bce83ad905 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -3,7 +3,6 @@ config EXT2_FS
tristate "Second extended fs support (DEPRECATED)"
select BUFFER_HEAD
select FS_IOMAP
- select LEGACY_DIRECT_IO
help
Ext2 is a standard Linux file system for hard disks.
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 4fb155b5a958..087457061c6e 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -175,7 +175,6 @@ Eend:
(unsigned long) le32_to_cpu(p->inode));
}
fail:
- folio_set_error(folio);
return false;
}
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 4ddc36f4dbd4..10b061ac5bc0 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -302,6 +302,12 @@ static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
return generic_file_write_iter(iocb, from);
}
+static int ext2_file_open(struct inode *inode, struct file *filp)
+{
+ filp->f_mode |= FMODE_CAN_ODIRECT;
+ return dquot_file_open(inode, filp);
+}
+
const struct file_operations ext2_file_operations = {
.llseek = generic_file_llseek,
.read_iter = ext2_file_read_iter,
@@ -311,7 +317,7 @@ const struct file_operations ext2_file_operations = {
.compat_ioctl = ext2_compat_ioctl,
#endif
.mmap = ext2_file_mmap,
- .open = dquot_file_open,
+ .open = ext2_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
.get_unmapped_area = thp_get_unmapped_area,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index f3d570a9302b..0caa1650cee8 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -965,7 +965,6 @@ const struct address_space_operations ext2_aops = {
.write_begin = ext2_write_begin,
.write_end = ext2_write_end,
.bmap = ext2_bmap,
- .direct_IO = noop_direct_IO,
.writepages = ext2_writepages,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -974,7 +973,6 @@ const struct address_space_operations ext2_aops = {
static const struct address_space_operations ext2_dax_aops = {
.writepages = ext2_dax_writepages,
- .direct_IO = noop_direct_IO,
.dirty_folio = noop_dirty_folio,
};
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index ef4c19e5f570..0c5a79c3b5d4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -68,11 +68,6 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
static inline int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
- /* usually, the umask is applied by posix_acl_create(), but if
- ext4 ACL support is disabled at compile time, we need to do
- it here, because posix_acl_create() will never be called */
- inode->i_mode &= ~current_umask();
-
return 0;
}
#endif /* CONFIG_EXT4_FS_POSIX_ACL */
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 3985f8c33f95..ff4514e4626b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -192,7 +192,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
(PAGE_SHIFT - inode->i_blkbits);
if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
- sb->s_bdev->bd_inode->i_mapping,
+ sb->s_bdev->bd_mapping,
&file->f_ra, file,
index, 1);
file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8d126654019e..983dad8c07ec 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -213,11 +213,14 @@ enum criteria {
#define EXT4_MB_USE_RESERVED 0x2000
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK 0x4000
-/* Large fragment size list lookup succeeded at least once for cr = 0 */
+/* Large fragment size list lookup succeeded at least once for
+ * CR_POWER2_ALIGNED */
#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000
-/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
+/* Avg fragment size rb tree lookup succeeded at least once for
+ * CR_GOAL_LEN_FAST */
#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000
-/* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */
+/* Avg fragment size rb tree lookup succeeded at least once for
+ * CR_BEST_AVAIL_LEN */
#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000
struct ext4_allocation_request {
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 5d8055161acd..da4a82456383 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -206,7 +206,7 @@ static void ext4_journal_abort_handle(const char *caller, unsigned int line,
static void ext4_check_bdev_write_error(struct super_block *sb)
{
- struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct address_space *mapping = sb->s_bdev->bd_mapping;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int err;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e57054bdc5fd..e067f2dd0335 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3402,9 +3402,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
struct ext4_extent *ex, *abut_ex;
ext4_lblk_t ee_block, eof_block;
unsigned int ee_len, depth, map_len = map->m_len;
- int allocated = 0, max_zeroout = 0;
int err = 0;
int split_flag = EXT4_EXT_DATA_VALID2;
+ int allocated = 0;
+ unsigned int max_zeroout = 0;
ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)map->m_lblk, map_len);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 28c51b0cc4db..c89e434db6b7 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -844,8 +844,7 @@ static int ext4_sample_last_mounted(struct super_block *sb,
if (err)
goto out_journal;
lock_buffer(sbi->s_sbh);
- strncpy(sbi->s_es->s_last_mounted, cp,
- sizeof(sbi->s_es->s_last_mounted));
+ strtomem_pad(sbi->s_es->s_last_mounted, cp, 0);
ext4_superblock_csum_set(sb);
unlock_buffer(sbi->s_sbh);
ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
@@ -885,7 +884,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
return ret;
}
- filp->f_mode |= FMODE_NOWAIT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
return dquot_file_open(inode, filp);
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 537803250ca9..4bae9ccf5fe0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1865,7 +1865,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
len = folio_size(folio);
if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(mpd->inode))
- len = size & ~PAGE_MASK;
+ len = size & (len - 1);
err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
if (!err)
mpd->wbc->nr_to_write--;
@@ -2334,7 +2334,7 @@ static int mpage_journal_page_buffers(handle_t *handle,
if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(inode))
- len = size - folio_pos(folio);
+ len = size & (len - 1);
return ext4_journal_folio_buffers(handle, folio, len);
}
@@ -2887,9 +2887,6 @@ retry:
if (IS_ERR(folio))
return PTR_ERR(folio);
- /* In case writeback began while the folio was unlocked */
- folio_wait_stable(folio);
-
#ifdef CONFIG_FS_ENCRYPTION
ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep);
#else
@@ -3530,7 +3527,6 @@ static const struct address_space_operations ext4_aops = {
.bmap = ext4_bmap,
.invalidate_folio = ext4_invalidate_folio,
.release_folio = ext4_release_folio,
- .direct_IO = noop_direct_IO,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_folio = generic_error_remove_folio,
@@ -3547,7 +3543,6 @@ static const struct address_space_operations ext4_journalled_aops = {
.bmap = ext4_bmap,
.invalidate_folio = ext4_journalled_invalidate_folio,
.release_folio = ext4_release_folio,
- .direct_IO = noop_direct_IO,
.migrate_folio = buffer_migrate_folio_norefs,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_folio = generic_error_remove_folio,
@@ -3564,7 +3559,6 @@ static const struct address_space_operations ext4_da_aops = {
.bmap = ext4_bmap,
.invalidate_folio = ext4_invalidate_folio,
.release_folio = ext4_release_folio,
- .direct_IO = noop_direct_IO,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_folio = generic_error_remove_folio,
@@ -3573,7 +3567,6 @@ static const struct address_space_operations ext4_da_aops = {
static const struct address_space_operations ext4_dax_aops = {
.writepages = ext4_dax_writepages,
- .direct_IO = noop_direct_IO,
.dirty_folio = noop_dirty_folio,
.bmap = ext4_bmap,
.swap_activate = ext4_iomap_swap_activate,
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7160a71044c8..dab7acd49709 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1150,9 +1150,8 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label
*/
BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX);
- memset(label, 0, sizeof(label));
lock_buffer(sbi->s_sbh);
- strncpy(label, sbi->s_es->s_volume_name, EXT4_LABEL_MAX);
+ strscpy_pad(label, sbi->s_es->s_volume_name);
unlock_buffer(sbi->s_sbh);
if (copy_to_user(user_label, label, sizeof(label)))
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index 044ca5238f41..bb2a223b207c 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -30,7 +30,31 @@ struct mbt_ext4_super_block {
#define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx)
#define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group])
+static struct inode *mbt_alloc_inode(struct super_block *sb)
+{
+ struct ext4_inode_info *ei;
+
+ ei = kmalloc(sizeof(struct ext4_inode_info), GFP_KERNEL);
+ if (!ei)
+ return NULL;
+
+ INIT_LIST_HEAD(&ei->i_orphan);
+ init_rwsem(&ei->xattr_sem);
+ init_rwsem(&ei->i_data_sem);
+ inode_init_once(&ei->vfs_inode);
+ ext4_fc_init_inode(&ei->vfs_inode);
+
+ return &ei->vfs_inode;
+}
+
+static void mbt_free_inode(struct inode *inode)
+{
+ kfree(EXT4_I(inode));
+}
+
static const struct super_operations mbt_sops = {
+ .alloc_inode = mbt_alloc_inode,
+ .free_inode = mbt_free_inode,
};
static void mbt_kill_sb(struct super_block *sb)
@@ -859,6 +883,56 @@ static void test_mb_free_blocks(struct kunit *test)
ext4_mb_unload_buddy(&e4b);
}
+#define COUNT_FOR_ESTIMATE 100000
+static void test_mb_mark_used_cost(struct kunit *test)
+{
+ struct ext4_buddy e4b;
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct ext4_free_extent ex;
+ int ret;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i, j;
+ unsigned long start, end, all = 0;
+
+ /* buddy cache assumes that each page contains at least one block */
+ if (sb->s_blocksize > PAGE_SIZE)
+ kunit_skip(test, "blocksize exceeds pagesize");
+
+ ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ ex.fe_group = TEST_GOAL_GROUP;
+ for (j = 0; j < COUNT_FOR_ESTIMATE; j++) {
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ start = jiffies;
+ for (i = 0; i < TEST_RANGE_COUNT; i++) {
+ if (ranges[i].len == 0)
+ continue;
+
+ ex.fe_start = ranges[i].start;
+ ex.fe_len = ranges[i].len;
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_mark_used(&e4b, &ex);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+ }
+ end = jiffies;
+ all += (end - start);
+
+ for (i = 0; i < TEST_RANGE_COUNT; i++) {
+ if (ranges[i].len == 0)
+ continue;
+
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_free_blocks(NULL, &e4b, ranges[i].start,
+ ranges[i].len);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+ }
+ }
+
+ kunit_info(test, "costed jiffies %lu\n", all);
+ ext4_mb_unload_buddy(&e4b);
+}
+
static const struct mbt_ext4_block_layout mbt_test_layouts[] = {
{
.blocksize_bits = 10,
@@ -901,6 +975,8 @@ static struct kunit_case mbt_test_cases[] = {
KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params),
KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params),
KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM_ATTR(test_mb_mark_used_cost, mbt_layouts_gen_params,
+ { .speed = KUNIT_SPEED_SLOW }),
{}
};
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 12b3f196010b..9dda9cd68ab2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -831,6 +831,8 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
return 0;
if (order == MB_NUM_ORDERS(sb))
order--;
+ if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
+ order = MB_NUM_ORDERS(sb) - 1;
return order;
}
@@ -1008,6 +1010,8 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
* goal length.
*/
order = fls(ac->ac_g_ex.fe_len) - 1;
+ if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb)))
+ order = MB_NUM_ORDERS(ac->ac_sb);
min_order = order - sbi->s_mb_best_avail_max_trim_order;
if (min_order < 0)
min_order = 0;
@@ -1076,23 +1080,11 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
}
/*
- * Return next linear group for allocation. If linear traversal should not be
- * performed, this function just returns the same group
+ * Return next linear group for allocation.
*/
static ext4_group_t
-next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group,
- ext4_group_t ngroups)
+next_linear_group(ext4_group_t group, ext4_group_t ngroups)
{
- if (!should_optimize_scan(ac))
- goto inc_and_return;
-
- if (ac->ac_groups_linear_remaining) {
- ac->ac_groups_linear_remaining--;
- goto inc_and_return;
- }
-
- return group;
-inc_and_return:
/*
* Artificially restricted ngroups for non-extent
* files makes group > ngroups possible on first loop.
@@ -1118,8 +1110,19 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
{
*new_cr = ac->ac_criteria;
- if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
- *group = next_linear_group(ac, *group, ngroups);
+ if (!should_optimize_scan(ac)) {
+ *group = next_linear_group(*group, ngroups);
+ return;
+ }
+
+ /*
+ * Optimized scanning can return non adjacent groups which can cause
+ * seek overhead for rotational disks. So try few linear groups before
+ * trying optimized scan.
+ */
+ if (ac->ac_groups_linear_remaining) {
+ *group = next_linear_group(*group, ngroups);
+ ac->ac_groups_linear_remaining--;
return;
}
@@ -1131,8 +1134,9 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
ext4_mb_choose_next_group_best_avail(ac, new_cr, group);
} else {
/*
- * TODO: For CR=2, we can arrange groups in an rb tree sorted by
- * bb_free. But until that happens, we should never come here.
+ * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an
+ * rb tree sorted by bb_free. But until that happens, we should
+ * never come here.
*/
WARN_ON(1);
}
@@ -1270,7 +1274,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
* for this page; do not hold this lock when calling this routine!
*/
-static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
+static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
{
ext4_group_t ngroups;
unsigned int blocksize;
@@ -1288,13 +1292,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
char *bitmap;
struct ext4_group_info *grinfo;
- inode = page->mapping->host;
+ inode = folio->mapping->host;
sb = inode->i_sb;
ngroups = ext4_get_groups_count(sb);
blocksize = i_blocksize(inode);
blocks_per_page = PAGE_SIZE / blocksize;
- mb_debug(sb, "init page %lu\n", page->index);
+ mb_debug(sb, "init folio %lu\n", folio->index);
groups_per_page = blocks_per_page >> 1;
if (groups_per_page == 0)
@@ -1309,9 +1313,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
} else
bh = &bhs;
- first_group = page->index * blocks_per_page / 2;
+ first_group = folio->index * blocks_per_page / 2;
- /* read all groups the page covers into the cache */
+ /* read all groups the folio covers into the cache */
for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
if (group >= ngroups)
break;
@@ -1322,10 +1326,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
/*
* If page is uptodate then we came here after online resize
* which added some new uninitialized group info structs, so
- * we must skip all initialized uptodate buddies on the page,
+ * we must skip all initialized uptodate buddies on the folio,
* which may be currently in use by an allocating task.
*/
- if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
+ if (folio_test_uptodate(folio) &&
+ !EXT4_MB_GRP_NEED_INIT(grinfo)) {
bh[i] = NULL;
continue;
}
@@ -1349,7 +1354,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
err = err2;
}
- first_block = page->index * blocks_per_page;
+ first_block = folio->index * blocks_per_page;
for (i = 0; i < blocks_per_page; i++) {
group = (first_block + i) >> 1;
if (group >= ngroups)
@@ -1370,7 +1375,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
* above
*
*/
- data = page_address(page) + (i * blocksize);
+ data = folio_address(folio) + (i * blocksize);
bitmap = bh[group - first_group]->b_data;
/*
@@ -1385,8 +1390,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
if ((first_block + i) & 1) {
/* this is block of buddy */
BUG_ON(incore == NULL);
- mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
- group, page->index, i * blocksize);
+ mb_debug(sb, "put buddy for group %u in folio %lu/%x\n",
+ group, folio->index, i * blocksize);
trace_ext4_mb_buddy_bitmap_load(sb, group);
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
@@ -1404,8 +1409,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
} else {
/* this is block of bitmap */
BUG_ON(incore != NULL);
- mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
- group, page->index, i * blocksize);
+ mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n",
+ group, folio->index, i * blocksize);
trace_ext4_mb_bitmap_load(sb, group);
/* see comments in ext4_mb_put_pa() */
@@ -1423,7 +1428,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
incore = data;
}
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
out:
if (bh) {
@@ -1439,7 +1444,7 @@ out:
* Lock the buddy and bitmap pages. This make sure other parallel init_group
* on the same buddy page doesn't happen whild holding the buddy page lock.
* Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
- * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
+ * are on the same page e4b->bd_buddy_folio is NULL and return value is 0.
*/
static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
@@ -1447,10 +1452,10 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
int block, pnum, poff;
int blocks_per_page;
- struct page *page;
+ struct folio *folio;
- e4b->bd_buddy_page = NULL;
- e4b->bd_bitmap_page = NULL;
+ e4b->bd_buddy_folio = NULL;
+ e4b->bd_bitmap_folio = NULL;
blocks_per_page = PAGE_SIZE / sb->s_blocksize;
/*
@@ -1461,12 +1466,13 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
block = group * 2;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, gfp);
- if (!page)
- return -ENOMEM;
- BUG_ON(page->mapping != inode->i_mapping);
- e4b->bd_bitmap_page = page;
- e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+ folio = __filemap_get_folio(inode->i_mapping, pnum,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ BUG_ON(folio->mapping != inode->i_mapping);
+ e4b->bd_bitmap_folio = folio;
+ e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize);
if (blocks_per_page >= 2) {
/* buddy and bitmap are on the same page */
@@ -1474,23 +1480,24 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
}
/* blocks_per_page == 1, hence we need another page for the buddy */
- page = find_or_create_page(inode->i_mapping, block + 1, gfp);
- if (!page)
- return -ENOMEM;
- BUG_ON(page->mapping != inode->i_mapping);
- e4b->bd_buddy_page = page;
+ folio = __filemap_get_folio(inode->i_mapping, block + 1,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ BUG_ON(folio->mapping != inode->i_mapping);
+ e4b->bd_buddy_folio = folio;
return 0;
}
static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
{
- if (e4b->bd_bitmap_page) {
- unlock_page(e4b->bd_bitmap_page);
- put_page(e4b->bd_bitmap_page);
+ if (e4b->bd_bitmap_folio) {
+ folio_unlock(e4b->bd_bitmap_folio);
+ folio_put(e4b->bd_bitmap_folio);
}
- if (e4b->bd_buddy_page) {
- unlock_page(e4b->bd_buddy_page);
- put_page(e4b->bd_buddy_page);
+ if (e4b->bd_buddy_folio) {
+ folio_unlock(e4b->bd_buddy_folio);
+ folio_put(e4b->bd_buddy_folio);
}
}
@@ -1505,7 +1512,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
struct ext4_group_info *this_grp;
struct ext4_buddy e4b;
- struct page *page;
+ struct folio *folio;
int ret = 0;
might_sleep();
@@ -1532,16 +1539,16 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
goto err;
}
- page = e4b.bd_bitmap_page;
- ret = ext4_mb_init_cache(page, NULL, gfp);
+ folio = e4b.bd_bitmap_folio;
+ ret = ext4_mb_init_cache(folio, NULL, gfp);
if (ret)
goto err;
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto err;
}
- if (e4b.bd_buddy_page == NULL) {
+ if (e4b.bd_buddy_folio == NULL) {
/*
* If both the bitmap and buddy are in
* the same page we don't need to force
@@ -1551,11 +1558,11 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
goto err;
}
/* init buddy cache */
- page = e4b.bd_buddy_page;
- ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
+ folio = e4b.bd_buddy_folio;
+ ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp);
if (ret)
goto err;
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto err;
}
@@ -1577,7 +1584,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
int block;
int pnum;
int poff;
- struct page *page;
+ struct folio *folio;
int ret;
struct ext4_group_info *grp;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1595,8 +1602,8 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
e4b->bd_info = grp;
e4b->bd_sb = sb;
e4b->bd_group = group;
- e4b->bd_buddy_page = NULL;
- e4b->bd_bitmap_page = NULL;
+ e4b->bd_buddy_folio = NULL;
+ e4b->bd_bitmap_folio = NULL;
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
/*
@@ -1617,102 +1624,103 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- /* we could use find_or_create_page(), but it locks page
- * what we'd like to avoid in fast path ... */
- page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
- if (page == NULL || !PageUptodate(page)) {
- if (page)
+ /* Avoid locking the folio in the fast path ... */
+ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
+ if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
+ if (!IS_ERR(folio))
/*
- * drop the page reference and try
- * to get the page with lock. If we
+ * drop the folio reference and try
+ * to get the folio with lock. If we
* are not uptodate that implies
- * somebody just created the page but
- * is yet to initialize the same. So
+ * somebody just created the folio but
+ * is yet to initialize it. So
* wait for it to initialize.
*/
- put_page(page);
- page = find_or_create_page(inode->i_mapping, pnum, gfp);
- if (page) {
- if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
- "ext4: bitmap's paging->mapping != inode->i_mapping\n")) {
+ folio_put(folio);
+ folio = __filemap_get_folio(inode->i_mapping, pnum,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (!IS_ERR(folio)) {
+ if (WARN_RATELIMIT(folio->mapping != inode->i_mapping,
+ "ext4: bitmap's mapping != inode->i_mapping\n")) {
/* should never happen */
- unlock_page(page);
+ folio_unlock(folio);
ret = -EINVAL;
goto err;
}
- if (!PageUptodate(page)) {
- ret = ext4_mb_init_cache(page, NULL, gfp);
+ if (!folio_test_uptodate(folio)) {
+ ret = ext4_mb_init_cache(folio, NULL, gfp);
if (ret) {
- unlock_page(page);
+ folio_unlock(folio);
goto err;
}
- mb_cmp_bitmaps(e4b, page_address(page) +
+ mb_cmp_bitmaps(e4b, folio_address(folio) +
(poff * sb->s_blocksize));
}
- unlock_page(page);
+ folio_unlock(folio);
}
}
- if (page == NULL) {
- ret = -ENOMEM;
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto err;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto err;
}
- /* Pages marked accessed already */
- e4b->bd_bitmap_page = page;
- e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+ /* Folios marked accessed already */
+ e4b->bd_bitmap_folio = folio;
+ e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize);
block++;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
- if (page == NULL || !PageUptodate(page)) {
- if (page)
- put_page(page);
- page = find_or_create_page(inode->i_mapping, pnum, gfp);
- if (page) {
- if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
- "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) {
+ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
+ if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
+ if (!IS_ERR(folio))
+ folio_put(folio);
+ folio = __filemap_get_folio(inode->i_mapping, pnum,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (!IS_ERR(folio)) {
+ if (WARN_RATELIMIT(folio->mapping != inode->i_mapping,
+ "ext4: buddy bitmap's mapping != inode->i_mapping\n")) {
/* should never happen */
- unlock_page(page);
+ folio_unlock(folio);
ret = -EINVAL;
goto err;
}
- if (!PageUptodate(page)) {
- ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
+ if (!folio_test_uptodate(folio)) {
+ ret = ext4_mb_init_cache(folio, e4b->bd_bitmap,
gfp);
if (ret) {
- unlock_page(page);
+ folio_unlock(folio);
goto err;
}
}
- unlock_page(page);
+ folio_unlock(folio);
}
}
- if (page == NULL) {
- ret = -ENOMEM;
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto err;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto err;
}
- /* Pages marked accessed already */
- e4b->bd_buddy_page = page;
- e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
+ /* Folios marked accessed already */
+ e4b->bd_buddy_folio = folio;
+ e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize);
return 0;
err:
- if (page)
- put_page(page);
- if (e4b->bd_bitmap_page)
- put_page(e4b->bd_bitmap_page);
+ if (!IS_ERR_OR_NULL(folio))
+ folio_put(folio);
+ if (e4b->bd_bitmap_folio)
+ folio_put(e4b->bd_bitmap_folio);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
@@ -1727,10 +1735,10 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
- if (e4b->bd_bitmap_page)
- put_page(e4b->bd_bitmap_page);
- if (e4b->bd_buddy_page)
- put_page(e4b->bd_buddy_page);
+ if (e4b->bd_bitmap_folio)
+ folio_put(e4b->bd_bitmap_folio);
+ if (e4b->bd_buddy_folio)
+ folio_put(e4b->bd_buddy_folio);
}
@@ -2040,13 +2048,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
int ord;
int mlen = 0;
int max = 0;
- int cur;
int start = ex->fe_start;
int len = ex->fe_len;
unsigned ret = 0;
int len0 = len;
void *buddy;
- bool split = false;
+ int ord_start, ord_end;
BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
BUG_ON(e4b->bd_group != ex->fe_group);
@@ -2071,16 +2078,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
/* let's maintain buddy itself */
while (len) {
- if (!split)
- ord = mb_find_order_for_block(e4b, start);
+ ord = mb_find_order_for_block(e4b, start);
if (((start >> ord) << ord) == start && len >= (1 << ord)) {
/* the whole chunk may be allocated at once! */
mlen = 1 << ord;
- if (!split)
- buddy = mb_find_buddy(e4b, ord, &max);
- else
- split = false;
+ buddy = mb_find_buddy(e4b, ord, &max);
BUG_ON((start >> ord) >= max);
mb_set_bit(start >> ord, buddy);
e4b->bd_info->bb_counters[ord]--;
@@ -2094,20 +2097,29 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
if (ret == 0)
ret = len | (ord << 16);
- /* we have to split large buddy */
BUG_ON(ord <= 0);
buddy = mb_find_buddy(e4b, ord, &max);
mb_set_bit(start >> ord, buddy);
e4b->bd_info->bb_counters[ord]--;
- ord--;
- cur = (start >> ord) & ~1U;
- buddy = mb_find_buddy(e4b, ord, &max);
- mb_clear_bit(cur, buddy);
- mb_clear_bit(cur + 1, buddy);
- e4b->bd_info->bb_counters[ord]++;
- e4b->bd_info->bb_counters[ord]++;
- split = true;
+ ord_start = (start >> ord) << ord;
+ ord_end = ord_start + (1 << ord);
+ /* first chunk */
+ if (start > ord_start)
+ ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy,
+ ord_start, start - ord_start,
+ e4b->bd_info);
+
+ /* last chunk */
+ if (start + len < ord_end) {
+ ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy,
+ start + len,
+ ord_end - (start + len),
+ e4b->bd_info);
+ break;
+ }
+ len = start + len - ord_end;
+ start = ord_end;
}
mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
@@ -2149,10 +2161,10 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
* double allocate blocks. The reference is dropped
* in ext4_mb_release_context
*/
- ac->ac_bitmap_page = e4b->bd_bitmap_page;
- get_page(ac->ac_bitmap_page);
- ac->ac_buddy_page = e4b->bd_buddy_page;
- get_page(ac->ac_buddy_page);
+ ac->ac_bitmap_folio = e4b->bd_bitmap_folio;
+ folio_get(ac->ac_bitmap_folio);
+ ac->ac_buddy_folio = e4b->bd_buddy_folio;
+ folio_get(ac->ac_buddy_folio);
/* store last allocated for subsequent stream allocation */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
spin_lock(&sbi->s_md_lock);
@@ -2675,7 +2687,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
int ret;
/*
- * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
+ * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
* search to find large good chunks almost for free. If buddy
* data is not ready, then this optimization makes no sense. But
* we never skip the first block group in a flex_bg, since this
@@ -2856,6 +2868,7 @@ repeat:
group = ac->ac_g_ex.fe_group;
ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
prefetch_grp = group;
+ nr = 0;
for (i = 0, new_cr = cr; i < ngroups; i++,
ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
@@ -3186,7 +3199,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
}
static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
-__acquires(&EXT4_SB(sb)->s_mb_rb_lock)
{
struct super_block *sb = pde_data(file_inode(seq->file));
unsigned long position;
@@ -3440,10 +3452,11 @@ static int ext4_mb_init_backend(struct super_block *sb)
}
if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
sbi->s_mb_prefetch = ext4_get_groups_count(sb);
- /* now many real IOs to prefetch within a single allocation at cr=0
- * given cr=0 is an CPU-related optimization we shouldn't try to
- * load too many groups, at some point we should start to use what
- * we've got in memory.
+ /*
+ * now many real IOs to prefetch within a single allocation at
+ * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is an CPU-related
+ * optimization we shouldn't try to load too many groups, at some point
+ * we should start to use what we've got in memory.
* with an average random access time 5ms, it'd take a second to get
* 200 groups (* N with flex_bg), so let's make this limit 4
*/
@@ -3884,8 +3897,8 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
/* No more items in the per group rb tree
* balance refcounts from ext4_mb_free_metadata()
*/
- put_page(e4b.bd_buddy_page);
- put_page(e4b.bd_bitmap_page);
+ folio_put(e4b.bd_buddy_folio);
+ folio_put(e4b.bd_bitmap_folio);
}
ext4_unlock_group(sb, entry->efd_group);
ext4_mb_unload_buddy(&e4b);
@@ -5989,10 +6002,10 @@ static void ext4_mb_release_context(struct ext4_allocation_context *ac)
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
- if (ac->ac_bitmap_page)
- put_page(ac->ac_bitmap_page);
- if (ac->ac_buddy_page)
- put_page(ac->ac_buddy_page);
+ if (ac->ac_bitmap_folio)
+ folio_put(ac->ac_bitmap_folio);
+ if (ac->ac_buddy_folio)
+ folio_put(ac->ac_buddy_folio);
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
mutex_unlock(&ac->ac_lg->lg_mutex);
ext4_mb_collect_stats(ac);
@@ -6113,6 +6126,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
ext4_mb_mark_bb(sb, block, 1, true);
ar->len = 1;
+ *errp = 0;
return block;
}
@@ -6307,8 +6321,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
struct rb_node *parent = NULL, *new_node;
BUG_ON(!ext4_handle_valid(handle));
- BUG_ON(e4b->bd_bitmap_page == NULL);
- BUG_ON(e4b->bd_buddy_page == NULL);
+ BUG_ON(e4b->bd_bitmap_folio == NULL);
+ BUG_ON(e4b->bd_buddy_folio == NULL);
new_node = &new_entry->efd_node;
cluster = new_entry->efd_start_cluster;
@@ -6319,8 +6333,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
* otherwise we'll refresh it from
* on-disk bitmap and lose not-yet-available
* blocks */
- get_page(e4b->bd_buddy_page);
- get_page(e4b->bd_bitmap_page);
+ folio_get(e4b->bd_buddy_folio);
+ folio_get(e4b->bd_bitmap_folio);
}
while (*n) {
parent = *n;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 56938532b4ce..d8553f1498d3 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -187,14 +187,14 @@ struct ext4_allocation_context {
struct ext4_free_extent ac_f_ex;
/*
- * goal len can change in CR1.5, so save the original len. This is
- * used while adjusting the PA window and for accounting.
+ * goal len can change in CR_BEST_AVAIL_LEN, so save the original len.
+ * This is used while adjusting the PA window and for accounting.
*/
ext4_grpblk_t ac_orig_goal_len;
__u32 ac_flags; /* allocation hints */
+ __u32 ac_groups_linear_remaining;
__u16 ac_groups_scanned;
- __u16 ac_groups_linear_remaining;
__u16 ac_found;
__u16 ac_cX_found[EXT4_MB_NUM_CRS];
__u16 ac_tail;
@@ -204,8 +204,8 @@ struct ext4_allocation_context {
__u8 ac_2order; /* if request is to allocate 2^N blocks and
* N > 0, the field stores N, otherwise 0 */
__u8 ac_op; /* operation, for history only */
- struct page *ac_bitmap_page;
- struct page *ac_buddy_page;
+ struct folio *ac_bitmap_folio;
+ struct folio *ac_buddy_folio;
struct ext4_prealloc_space *ac_pa;
struct ext4_locality_group *ac_lg;
};
@@ -215,9 +215,9 @@ struct ext4_allocation_context {
#define AC_STATUS_BREAK 3
struct ext4_buddy {
- struct page *bd_buddy_page;
+ struct folio *bd_buddy_folio;
void *bd_buddy;
- struct page *bd_bitmap_page;
+ struct folio *bd_bitmap_folio;
void *bd_bitmap;
struct ext4_group_info *bd_info;
struct super_block *bd_sb;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 7cd4afa4de1d..204f53b23622 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -199,10 +199,8 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to)
continue;
if (!buffer_mapped(bh)) {
err = ext4_get_block(inode, block, bh, 0);
- if (err) {
- folio_set_error(folio);
+ if (err)
return err;
- }
if (!buffer_mapped(bh)) {
folio_zero_range(folio, block_start, blocksize);
set_buffer_uptodate(bh);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5e4f65c14dfb..a630b27a4cc6 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2897,7 +2897,7 @@ retry:
inode = ext4_new_inode_start_handle(idmap, dir, mode,
NULL, 0, NULL,
EXT4_HT_DIR,
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+ EXT4_MAXQUOTAS_TRANS_BLOCKS(dir->i_sb) +
4 + EXT4_XATTR_TRANS_BLOCKS);
handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 312bc6813357..ad5543866d21 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -117,7 +117,6 @@ static void ext4_finish_bio(struct bio *bio)
if (bio->bi_status) {
int err = blk_status_to_errno(bio->bi_status);
- folio_set_error(folio);
mapping_set_error(folio->mapping, err);
}
bh = head = folio_buffers(folio);
@@ -441,8 +440,6 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
BUG_ON(!folio_test_locked(folio));
BUG_ON(folio_test_writeback(folio));
- folio_clear_error(folio);
-
/*
* Comments copied from block_write_full_folio:
*
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 21e8f0aebb3c..8494492582ab 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -289,7 +289,6 @@ int ext4_mpage_readpages(struct inode *inode,
if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
set_error_page:
- folio_set_error(folio);
folio_zero_segment(folio, 0,
folio_size(folio));
folio_unlock(folio);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3fce1b80c419..c682fb927b64 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -244,7 +244,7 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
blk_opf_t op_flags)
{
- gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping,
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping,
~__GFP_FS) | __GFP_MOVABLE;
return __ext4_sb_bread_gfp(sb, block, op_flags, gfp);
@@ -253,7 +253,7 @@ struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
sector_t block)
{
- gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping,
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping,
~__GFP_FS);
return __ext4_sb_bread_gfp(sb, block, 0, gfp);
@@ -492,22 +492,6 @@ static void ext4_maybe_update_superblock(struct super_block *sb)
schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
}
-/*
- * The del_gendisk() function uninitializes the disk-specific data
- * structures, including the bdi structure, without telling anyone
- * else. Once this happens, any attempt to call mark_buffer_dirty()
- * (for example, by ext4_commit_super), will cause a kernel OOPS.
- * This is a kludge to prevent these oops until we can put in a proper
- * hook in del_gendisk() to inform the VFS and file system layers.
- */
-static int block_device_ejected(struct super_block *sb)
-{
- struct inode *bd_inode = sb->s_bdev->bd_inode;
- struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
-
- return bdi->dev == NULL;
-}
-
static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
{
struct super_block *sb = journal->j_private;
@@ -2074,8 +2058,7 @@ static int unnote_qf_name(struct fs_context *fc, int qtype)
{
struct ext4_fs_context *ctx = fc->fs_private;
- if (ctx->s_qf_names[qtype])
- kfree(ctx->s_qf_names[qtype]);
+ kfree(ctx->s_qf_names[qtype]);
ctx->s_qf_names[qtype] = NULL;
ctx->qname_spec |= 1 << qtype;
@@ -2480,8 +2463,7 @@ static int parse_options(struct fs_context *fc, char *options)
param.size = v_len;
ret = ext4_parse_param(fc, &param);
- if (param.string)
- kfree(param.string);
+ kfree(param.string);
if (ret < 0)
return ret;
}
@@ -5338,6 +5320,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid));
+ super_set_sysfs_name_bdev(sb);
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
@@ -5547,19 +5530,15 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (err)
goto failed_mount6;
- err = ext4_register_sysfs(sb);
- if (err)
- goto failed_mount7;
-
err = ext4_init_orphan_info(sb);
if (err)
- goto failed_mount8;
+ goto failed_mount7;
#ifdef CONFIG_QUOTA
/* Enable quota usage during mount. */
if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
err = ext4_enable_quotas(sb);
if (err)
- goto failed_mount9;
+ goto failed_mount8;
}
#endif /* CONFIG_QUOTA */
@@ -5568,7 +5547,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* used to detect the metadata async write error.
*/
spin_lock_init(&sbi->s_bdev_wb_lock);
- errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
+ errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err,
&sbi->s_bdev_wb_err);
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
@@ -5585,7 +5564,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
ext4_msg(sb, KERN_INFO, "recovery complete");
err = ext4_mark_recovery_complete(sb, es);
if (err)
- goto failed_mount10;
+ goto failed_mount9;
}
if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
@@ -5602,15 +5581,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
atomic_set(&sbi->s_warning_count, 0);
atomic_set(&sbi->s_msg_count, 0);
+ /* Register sysfs after all initializations are complete. */
+ err = ext4_register_sysfs(sb);
+ if (err)
+ goto failed_mount9;
+
return 0;
-failed_mount10:
+failed_mount9:
ext4_quotas_off(sb, EXT4_MAXQUOTAS);
-failed_mount9: __maybe_unused
+failed_mount8: __maybe_unused
ext4_release_orphan_info(sb);
-failed_mount8:
- ext4_unregister_sysfs(sb);
- kobject_put(&sbi->s_kobj);
failed_mount7:
ext4_unregister_li_request(sb);
failed_mount6:
@@ -5869,7 +5850,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb,
sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
offset = EXT4_MIN_BLOCK_SIZE % blocksize;
- set_blocksize(bdev, blocksize);
+ set_blocksize(bdev_file, blocksize);
bh = __bread(bdev, sb_block, blocksize);
if (!bh) {
ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
@@ -6126,8 +6107,8 @@ static void ext4_update_super(struct super_block *sb)
__ext4_update_tstamp(&es->s_first_error_time,
&es->s_first_error_time_hi,
sbi->s_first_error_time);
- strncpy(es->s_first_error_func, sbi->s_first_error_func,
- sizeof(es->s_first_error_func));
+ strtomem_pad(es->s_first_error_func,
+ sbi->s_first_error_func, 0);
es->s_first_error_line =
cpu_to_le32(sbi->s_first_error_line);
es->s_first_error_ino =
@@ -6140,8 +6121,7 @@ static void ext4_update_super(struct super_block *sb)
__ext4_update_tstamp(&es->s_last_error_time,
&es->s_last_error_time_hi,
sbi->s_last_error_time);
- strncpy(es->s_last_error_func, sbi->s_last_error_func,
- sizeof(es->s_last_error_func));
+ strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0);
es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line);
es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino);
es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block);
@@ -6168,8 +6148,6 @@ static int ext4_commit_super(struct super_block *sb)
if (!sbh)
return -EINVAL;
- if (block_device_ejected(sb))
- return -ENODEV;
ext4_update_super(sb);
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 6d332dff79dd..ddb54608ca2e 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -29,7 +29,10 @@ typedef enum {
attr_trigger_test_error,
attr_first_error_time,
attr_last_error_time,
+ attr_clusters_in_group,
+ attr_mb_order,
attr_feature,
+ attr_pointer_pi,
attr_pointer_ui,
attr_pointer_ul,
attr_pointer_u64,
@@ -104,7 +107,7 @@ static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi,
int ret;
ret = kstrtoull(skip_spaces(buf), 0, &val);
- if (ret || val >= clusters)
+ if (ret || val >= clusters || (s64)val < 0)
return -EINVAL;
atomic64_set(&sbi->s_resv_clusters, val);
@@ -178,6 +181,9 @@ static struct ext4_attr ext4_attr_##_name = { \
#define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \
EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname)
+#define EXT4_RW_ATTR_SBI_PI(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname)
+
#define EXT4_RW_ATTR_SBI_UI(_name,_elname) \
EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
@@ -207,23 +213,25 @@ EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
ext4_sb_info, s_inode_readahead_blks);
+EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group,
+ ext4_sb_info, s_mb_group_prealloc);
+EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order,
+ ext4_sb_info, s_mb_best_avail_max_trim_order);
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
-EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order);
+EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
#ifdef CONFIG_EXT4_DEBUG
EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
#endif
@@ -366,13 +374,45 @@ static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi)
#define print_tstamp(buf, es, tstamp) \
__print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi)
+static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ void *ptr = calc_ptr(a, sbi);
+
+ if (!ptr)
+ return 0;
+
+ switch (a->attr_id) {
+ case attr_inode_readahead:
+ case attr_clusters_in_group:
+ case attr_mb_order:
+ case attr_pointer_pi:
+ case attr_pointer_ui:
+ if (a->attr_ptr == ptr_ext4_super_block_offset)
+ return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr));
+ return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr));
+ case attr_pointer_ul:
+ return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr));
+ case attr_pointer_u8:
+ return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr));
+ case attr_pointer_u64:
+ if (a->attr_ptr == ptr_ext4_super_block_offset)
+ return sysfs_emit(buf, "%llu\n", le64_to_cpup(ptr));
+ return sysfs_emit(buf, "%llu\n", *((unsigned long long *) ptr));
+ case attr_pointer_string:
+ return sysfs_emit(buf, "%.*s\n", a->attr_size, (char *) ptr);
+ case attr_pointer_atomic:
+ return sysfs_emit(buf, "%d\n", atomic_read((atomic_t *) ptr));
+ }
+ return 0;
+}
+
static ssize_t ext4_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
s_kobj);
struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
- void *ptr = calc_ptr(a, sbi);
switch (a->attr_id) {
case attr_delayed_allocation_blocks:
@@ -391,45 +431,6 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
return sysfs_emit(buf, "%llu\n",
(unsigned long long)
percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit));
- case attr_inode_readahead:
- case attr_pointer_ui:
- if (!ptr)
- return 0;
- if (a->attr_ptr == ptr_ext4_super_block_offset)
- return sysfs_emit(buf, "%u\n",
- le32_to_cpup(ptr));
- else
- return sysfs_emit(buf, "%u\n",
- *((unsigned int *) ptr));
- case attr_pointer_ul:
- if (!ptr)
- return 0;
- return sysfs_emit(buf, "%lu\n",
- *((unsigned long *) ptr));
- case attr_pointer_u8:
- if (!ptr)
- return 0;
- return sysfs_emit(buf, "%u\n",
- *((unsigned char *) ptr));
- case attr_pointer_u64:
- if (!ptr)
- return 0;
- if (a->attr_ptr == ptr_ext4_super_block_offset)
- return sysfs_emit(buf, "%llu\n",
- le64_to_cpup(ptr));
- else
- return sysfs_emit(buf, "%llu\n",
- *((unsigned long long *) ptr));
- case attr_pointer_string:
- if (!ptr)
- return 0;
- return sysfs_emit(buf, "%.*s\n", a->attr_size,
- (char *) ptr);
- case attr_pointer_atomic:
- if (!ptr)
- return 0;
- return sysfs_emit(buf, "%d\n",
- atomic_read((atomic_t *) ptr));
case attr_feature:
return sysfs_emit(buf, "supported\n");
case attr_first_error_time:
@@ -438,29 +439,34 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
return print_tstamp(buf, sbi->s_es, s_last_error_time);
case attr_journal_task:
return journal_task_show(sbi, buf);
+ default:
+ return ext4_generic_attr_show(a, sbi, buf);
}
-
- return 0;
}
-static ssize_t ext4_attr_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buf, size_t len)
+static ssize_t ext4_generic_attr_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t len)
{
- struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
- s_kobj);
- struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
- void *ptr = calc_ptr(a, sbi);
- unsigned long t;
int ret;
+ unsigned int t;
+ unsigned long lt;
+ void *ptr = calc_ptr(a, sbi);
+
+ if (!ptr)
+ return 0;
switch (a->attr_id) {
- case attr_reserved_clusters:
- return reserved_clusters_store(sbi, buf, len);
+ case attr_pointer_pi:
+ ret = kstrtouint(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if ((int)t < 0)
+ return -EINVAL;
+ *((unsigned int *) ptr) = t;
+ return len;
case attr_pointer_ui:
- if (!ptr)
- return 0;
- ret = kstrtoul(skip_spaces(buf), 0, &t);
+ ret = kstrtouint(skip_spaces(buf), 0, &t);
if (ret)
return ret;
if (a->attr_ptr == ptr_ext4_super_block_offset)
@@ -468,20 +474,50 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
else
*((unsigned int *) ptr) = t;
return len;
+ case attr_mb_order:
+ ret = kstrtouint(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t > 64)
+ return -EINVAL;
+ *((unsigned int *) ptr) = t;
+ return len;
+ case attr_clusters_in_group:
+ ret = kstrtouint(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t > sbi->s_clusters_per_group)
+ return -EINVAL;
+ *((unsigned int *) ptr) = t;
+ return len;
case attr_pointer_ul:
- if (!ptr)
- return 0;
- ret = kstrtoul(skip_spaces(buf), 0, &t);
+ ret = kstrtoul(skip_spaces(buf), 0, &lt);
if (ret)
return ret;
- *((unsigned long *) ptr) = t;
+ *((unsigned long *) ptr) = lt;
return len;
+ }
+ return 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+ switch (a->attr_id) {
+ case attr_reserved_clusters:
+ return reserved_clusters_store(sbi, buf, len);
case attr_inode_readahead:
return inode_readahead_blks_store(sbi, buf, len);
case attr_trigger_test_error:
return trigger_test_error(sbi, buf, len);
+ default:
+ return ext4_generic_attr_store(a, sbi, buf, len);
}
- return 0;
}
static void ext4_sb_release(struct kobject *kobj)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b67a176bfcf9..6460879b9fcb 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1619,6 +1619,7 @@ out_err:
static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
struct ext4_xattr_search *s,
handle_t *handle, struct inode *inode,
+ struct inode *new_ea_inode,
bool is_block)
{
struct ext4_xattr_entry *last, *next;
@@ -1626,7 +1627,6 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
size_t min_offs = s->end - s->base, name_len = strlen(i->name);
int in_inode = i->in_inode;
struct inode *old_ea_inode = NULL;
- struct inode *new_ea_inode = NULL;
size_t old_size, new_size;
int ret;
@@ -1711,38 +1711,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
old_ea_inode = NULL;
goto out;
}
- }
- if (i->value && in_inode) {
- WARN_ON_ONCE(!i->value_len);
-
- new_ea_inode = ext4_xattr_inode_lookup_create(handle, inode,
- i->value, i->value_len);
- if (IS_ERR(new_ea_inode)) {
- ret = PTR_ERR(new_ea_inode);
- new_ea_inode = NULL;
- goto out;
- }
- }
- if (old_ea_inode) {
/* We are ready to release ref count on the old_ea_inode. */
ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
- if (ret) {
- /* Release newly required ref count on new_ea_inode. */
- if (new_ea_inode) {
- int err;
-
- err = ext4_xattr_inode_dec_ref(handle,
- new_ea_inode);
- if (err)
- ext4_warning_inode(new_ea_inode,
- "dec ref new_ea_inode err=%d",
- err);
- ext4_xattr_inode_free_quota(inode, new_ea_inode,
- i->value_len);
- }
+ if (ret)
goto out;
- }
ext4_xattr_inode_free_quota(inode, old_ea_inode,
le32_to_cpu(here->e_value_size));
@@ -1866,7 +1839,6 @@ update_hash:
ret = 0;
out:
iput(old_ea_inode);
- iput(new_ea_inode);
return ret;
}
@@ -1929,9 +1901,21 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
size_t old_ea_inode_quota = 0;
unsigned int ea_ino;
-
#define header(x) ((struct ext4_xattr_header *)(x))
+ /* If we need EA inode, prepare it before locking the buffer */
+ if (i->value && i->in_inode) {
+ WARN_ON_ONCE(!i->value_len);
+
+ ea_inode = ext4_xattr_inode_lookup_create(handle, inode,
+ i->value, i->value_len);
+ if (IS_ERR(ea_inode)) {
+ error = PTR_ERR(ea_inode);
+ ea_inode = NULL;
+ goto cleanup;
+ }
+ }
+
if (s->base) {
int offset = (char *)s->here - bs->bh->b_data;
@@ -1940,6 +1924,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
EXT4_JTR_NONE);
if (error)
goto cleanup;
+
lock_buffer(bs->bh);
if (header(s->base)->h_refcount == cpu_to_le32(1)) {
@@ -1966,7 +1951,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
}
ea_bdebug(bs->bh, "modifying in-place");
error = ext4_xattr_set_entry(i, s, handle, inode,
- true /* is_block */);
+ ea_inode, true /* is_block */);
ext4_xattr_block_csum_set(inode, bs->bh);
unlock_buffer(bs->bh);
if (error == -EFSCORRUPTED)
@@ -2034,33 +2019,22 @@ clone_block:
s->end = s->base + sb->s_blocksize;
}
- error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */);
+ error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode,
+ true /* is_block */);
if (error == -EFSCORRUPTED)
goto bad_block;
if (error)
goto cleanup;
- if (i->value && s->here->e_value_inum) {
- /*
- * A ref count on ea_inode has been taken as part of the call to
- * ext4_xattr_set_entry() above. We would like to drop this
- * extra ref but we have to wait until the xattr block is
- * initialized and has its own ref count on the ea_inode.
- */
- ea_ino = le32_to_cpu(s->here->e_value_inum);
- error = ext4_xattr_inode_iget(inode, ea_ino,
- le32_to_cpu(s->here->e_hash),
- &ea_inode);
- if (error) {
- ea_inode = NULL;
+inserted:
+ if (!IS_LAST_ENTRY(s->first)) {
+ new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce);
+ if (IS_ERR(new_bh)) {
+ error = PTR_ERR(new_bh);
+ new_bh = NULL;
goto cleanup;
}
- }
-inserted:
- if (!IS_LAST_ENTRY(s->first)) {
- new_bh = ext4_xattr_block_cache_find(inode, header(s->base),
- &ce);
if (new_bh) {
/* We found an identical block in the cache. */
if (new_bh == bs->bh)
@@ -2158,6 +2132,17 @@ getblk_failed:
ENTRY(header(s->base)+1));
if (error)
goto getblk_failed;
+ if (ea_inode) {
+ /* Drop the extra ref on ea_inode. */
+ error = ext4_xattr_inode_dec_ref(handle,
+ ea_inode);
+ if (error)
+ ext4_warning_inode(ea_inode,
+ "dec ref error=%d",
+ error);
+ iput(ea_inode);
+ ea_inode = NULL;
+ }
lock_buffer(new_bh);
error = ext4_journal_get_create_access(handle, sb,
@@ -2198,17 +2183,16 @@ getblk_failed:
cleanup:
if (ea_inode) {
- int error2;
-
- error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
- if (error2)
- ext4_warning_inode(ea_inode, "dec ref error=%d",
- error2);
+ if (error) {
+ int error2;
- /* If there was an error, revert the quota charge. */
- if (error)
+ error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
+ if (error2)
+ ext4_warning_inode(ea_inode, "dec ref error=%d",
+ error2);
ext4_xattr_inode_free_quota(inode, ea_inode,
i_size_read(ea_inode));
+ }
iput(ea_inode);
}
if (ce)
@@ -2266,14 +2250,38 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
{
struct ext4_xattr_ibody_header *header;
struct ext4_xattr_search *s = &is->s;
+ struct inode *ea_inode = NULL;
int error;
if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
return -ENOSPC;
- error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
- if (error)
+ /* If we need EA inode, prepare it before locking the buffer */
+ if (i->value && i->in_inode) {
+ WARN_ON_ONCE(!i->value_len);
+
+ ea_inode = ext4_xattr_inode_lookup_create(handle, inode,
+ i->value, i->value_len);
+ if (IS_ERR(ea_inode))
+ return PTR_ERR(ea_inode);
+ }
+ error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode,
+ false /* is_block */);
+ if (error) {
+ if (ea_inode) {
+ int error2;
+
+ error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
+ if (error2)
+ ext4_warning_inode(ea_inode, "dec ref error=%d",
+ error2);
+
+ ext4_xattr_inode_free_quota(inode, ea_inode,
+ i_size_read(ea_inode));
+ iput(ea_inode);
+ }
return error;
+ }
header = IHDR(inode, ext4_raw_inode(&is->iloc));
if (!IS_LAST_ENTRY(s->first)) {
header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
@@ -2282,6 +2290,7 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
header->h_magic = cpu_to_le32(0);
ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
}
+ iput(ea_inode);
return 0;
}
@@ -3090,8 +3099,8 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
*
* Find an identical extended attribute block.
*
- * Returns a pointer to the block found, or NULL if such a block was
- * not found or an error occurred.
+ * Returns a pointer to the block found, or NULL if such a block was not
+ * found, or an error pointer if an error occurred while reading ea block.
*/
static struct buffer_head *
ext4_xattr_block_cache_find(struct inode *inode,
@@ -3113,11 +3122,11 @@ ext4_xattr_block_cache_find(struct inode *inode,
bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO);
if (IS_ERR(bh)) {
- if (PTR_ERR(bh) == -ENOMEM)
- return NULL;
- bh = NULL;
- EXT4_ERROR_INODE(inode, "block %lu read error",
- (unsigned long)ce->e_value);
+ if (PTR_ERR(bh) != -ENOMEM)
+ EXT4_ERROR_INODE(inode, "block %lu read error",
+ (unsigned long)ce->e_value);
+ mb_cache_entry_put(ea_block_cache, ce);
+ return bh;
} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
*pce = ce;
return bh;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index eac698b8dd38..55d444bec5c0 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -179,22 +179,22 @@ static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
break;
case META_SIT:
if (unlikely(blkaddr >= SIT_BLK_CNT(sbi)))
- goto err;
+ goto check_only;
break;
case META_SSA:
if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) ||
blkaddr < SM_I(sbi)->ssa_blkaddr))
- goto err;
+ goto check_only;
break;
case META_CP:
if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr ||
blkaddr < __start_cp_addr(sbi)))
- goto err;
+ goto check_only;
break;
case META_POR:
if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
blkaddr < MAIN_BLKADDR(sbi)))
- goto err;
+ goto check_only;
break;
case DATA_GENERIC:
case DATA_GENERIC_ENHANCE:
@@ -228,6 +228,7 @@ static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
return true;
err:
f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
+check_only:
return false;
}
@@ -345,7 +346,7 @@ static int __f2fs_write_meta_page(struct page *page,
{
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
- trace_f2fs_writepage(page, META);
+ trace_f2fs_writepage(page_folio(page), META);
if (unlikely(f2fs_cp_error(sbi))) {
if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) {
@@ -492,7 +493,7 @@ stop:
static bool f2fs_dirty_meta_folio(struct address_space *mapping,
struct folio *folio)
{
- trace_f2fs_set_page_dirty(&folio->page, META);
+ trace_f2fs_set_page_dirty(folio, META);
if (!folio_test_uptodate(folio))
folio_mark_uptodate(folio);
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 8892c8262141..1ef82a546391 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -198,8 +198,8 @@ static int lzo_compress_pages(struct compress_ctx *cc)
ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata,
&cc->clen, cc->private);
if (ret != LZO_E_OK) {
- printk_ratelimited("%sF2FS-fs (%s): lzo compress failed, ret:%d\n",
- KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret);
+ f2fs_err_ratelimited(F2FS_I_SB(cc->inode),
+ "lzo compress failed, ret:%d", ret);
return -EIO;
}
return 0;
@@ -212,17 +212,15 @@ static int lzo_decompress_pages(struct decompress_io_ctx *dic)
ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen,
dic->rbuf, &dic->rlen);
if (ret != LZO_E_OK) {
- printk_ratelimited("%sF2FS-fs (%s): lzo decompress failed, ret:%d\n",
- KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret);
+ f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ "lzo decompress failed, ret:%d", ret);
return -EIO;
}
if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) {
- printk_ratelimited("%sF2FS-fs (%s): lzo invalid rlen:%zu, "
- "expected:%lu\n", KERN_ERR,
- F2FS_I_SB(dic->inode)->sb->s_id,
- dic->rlen,
- PAGE_SIZE << dic->log_cluster_size);
+ f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ "lzo invalid rlen:%zu, expected:%lu",
+ dic->rlen, PAGE_SIZE << dic->log_cluster_size);
return -EIO;
}
return 0;
@@ -294,16 +292,15 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic)
ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf,
dic->clen, dic->rlen);
if (ret < 0) {
- printk_ratelimited("%sF2FS-fs (%s): lz4 decompress failed, ret:%d\n",
- KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret);
+ f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ "lz4 decompress failed, ret:%d", ret);
return -EIO;
}
if (ret != PAGE_SIZE << dic->log_cluster_size) {
- printk_ratelimited("%sF2FS-fs (%s): lz4 invalid ret:%d, "
- "expected:%lu\n", KERN_ERR,
- F2FS_I_SB(dic->inode)->sb->s_id, ret,
- PAGE_SIZE << dic->log_cluster_size);
+ f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ "lz4 invalid ret:%d, expected:%lu",
+ ret, PAGE_SIZE << dic->log_cluster_size);
return -EIO;
}
return 0;
@@ -350,9 +347,8 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc)
stream = zstd_init_cstream(&params, 0, workspace, workspace_size);
if (!stream) {
- printk_ratelimited("%sF2FS-fs (%s): %s zstd_init_cstream failed\n",
- KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
- __func__);
+ f2fs_err_ratelimited(F2FS_I_SB(cc->inode),
+ "%s zstd_init_cstream failed", __func__);
kvfree(workspace);
return -EIO;
}
@@ -390,16 +386,16 @@ static int zstd_compress_pages(struct compress_ctx *cc)
ret = zstd_compress_stream(stream, &outbuf, &inbuf);
if (zstd_is_error(ret)) {
- printk_ratelimited("%sF2FS-fs (%s): %s zstd_compress_stream failed, ret: %d\n",
- KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+ f2fs_err_ratelimited(F2FS_I_SB(cc->inode),
+ "%s zstd_compress_stream failed, ret: %d",
__func__, zstd_get_error_code(ret));
return -EIO;
}
ret = zstd_end_stream(stream, &outbuf);
if (zstd_is_error(ret)) {
- printk_ratelimited("%sF2FS-fs (%s): %s zstd_end_stream returned %d\n",
- KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+ f2fs_err_ratelimited(F2FS_I_SB(cc->inode),
+ "%s zstd_end_stream returned %d",
__func__, zstd_get_error_code(ret));
return -EIO;
}
@@ -432,9 +428,8 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic)
stream = zstd_init_dstream(max_window_size, workspace, workspace_size);
if (!stream) {
- printk_ratelimited("%sF2FS-fs (%s): %s zstd_init_dstream failed\n",
- KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id,
- __func__);
+ f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ "%s zstd_init_dstream failed", __func__);
kvfree(workspace);
return -EIO;
}
@@ -469,16 +464,15 @@ static int zstd_decompress_pages(struct decompress_io_ctx *dic)
ret = zstd_decompress_stream(stream, &outbuf, &inbuf);
if (zstd_is_error(ret)) {
- printk_ratelimited("%sF2FS-fs (%s): %s zstd_decompress_stream failed, ret: %d\n",
- KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id,
+ f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ "%s zstd_decompress_stream failed, ret: %d",
__func__, zstd_get_error_code(ret));
return -EIO;
}
if (dic->rlen != outbuf.pos) {
- printk_ratelimited("%sF2FS-fs (%s): %s ZSTD invalid rlen:%zu, "
- "expected:%lu\n", KERN_ERR,
- F2FS_I_SB(dic->inode)->sb->s_id,
+ f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ "%s ZSTD invalid rlen:%zu, expected:%lu",
__func__, dic->rlen,
PAGE_SIZE << dic->log_cluster_size);
return -EIO;
@@ -1031,6 +1025,31 @@ static void set_cluster_writeback(struct compress_ctx *cc)
}
}
+static void cancel_cluster_writeback(struct compress_ctx *cc,
+ struct compress_io_ctx *cic, int submitted)
+{
+ int i;
+
+ /* Wait for submitted IOs. */
+ if (submitted > 1) {
+ f2fs_submit_merged_write(F2FS_I_SB(cc->inode), DATA);
+ while (atomic_read(&cic->pending_pages) !=
+ (cc->valid_nr_cpages - submitted + 1))
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
+ }
+
+ /* Cancel writeback and stay locked. */
+ for (i = 0; i < cc->cluster_size; i++) {
+ if (i < submitted) {
+ inode_inc_dirty_pages(cc->inode);
+ lock_page(cc->rpages[i]);
+ }
+ clear_page_private_gcing(cc->rpages[i]);
+ if (folio_test_writeback(page_folio(cc->rpages[i])))
+ end_page_writeback(cc->rpages[i]);
+ }
+}
+
static void set_cluster_dirty(struct compress_ctx *cc)
{
int i;
@@ -1232,7 +1251,6 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
.page = NULL,
.encrypted_page = NULL,
.compressed_page = NULL,
- .submitted = 0,
.io_type = io_type,
.io_wbc = wbc,
.encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode) ?
@@ -1358,7 +1376,16 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
fio.compressed_page = cc->cpages[i - 1];
cc->cpages[i - 1] = NULL;
+ fio.submitted = 0;
f2fs_outplace_write_data(&dn, &fio);
+ if (unlikely(!fio.submitted)) {
+ cancel_cluster_writeback(cc, cic, i);
+
+ /* To call fscrypt_finalize_bounce_page */
+ i = cc->valid_nr_cpages;
+ *submitted = 0;
+ goto out_destroy_crypt;
+ }
(*submitted)++;
unlock_continue:
inode_dec_dirty_pages(cc->inode);
@@ -1392,8 +1419,11 @@ unlock_continue:
out_destroy_crypt:
page_array_free(cc->inode, cic->rpages, cc->cluster_size);
- for (--i; i >= 0; i--)
+ for (--i; i >= 0; i--) {
+ if (!cc->cpages[i])
+ continue;
fscrypt_finalize_bounce_page(&cc->cpages[i]);
+ }
out_put_cic:
kmem_cache_free(cic_entry_slab, cic);
out_put_dnode:
@@ -1484,7 +1514,7 @@ continue_unlock:
if (!PageDirty(cc->rpages[i]))
goto continue_unlock;
- if (PageWriteback(cc->rpages[i])) {
+ if (folio_test_writeback(page_folio(cc->rpages[i]))) {
if (wbc->sync_mode == WB_SYNC_NONE)
goto continue_unlock;
f2fs_wait_on_page_writeback(cc->rpages[i], DATA, true, true);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index d9494b5fc7c1..b9b0debc6b3d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -465,6 +465,8 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
} else {
bio->bi_end_io = f2fs_write_end_io;
bio->bi_private = sbi;
+ bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi,
+ fio->type, fio->temp);
}
iostat_alloc_and_bind_ctx(sbi, bio, NULL);
@@ -593,17 +595,20 @@ int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi)
return -ENOMEM;
for (j = HOT; j < n; j++) {
- init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem);
- sbi->write_io[i][j].sbi = sbi;
- sbi->write_io[i][j].bio = NULL;
- spin_lock_init(&sbi->write_io[i][j].io_lock);
- INIT_LIST_HEAD(&sbi->write_io[i][j].io_list);
- INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list);
- init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock);
+ struct f2fs_bio_info *io = &sbi->write_io[i][j];
+
+ init_f2fs_rwsem(&io->io_rwsem);
+ io->sbi = sbi;
+ io->bio = NULL;
+ io->last_block_in_bio = 0;
+ spin_lock_init(&io->io_lock);
+ INIT_LIST_HEAD(&io->io_list);
+ INIT_LIST_HEAD(&io->bio_list);
+ init_f2fs_rwsem(&io->bio_list_lock);
#ifdef CONFIG_BLK_DEV_ZONED
- init_completion(&sbi->write_io[i][j].zone_wait);
- sbi->write_io[i][j].zone_pending_bio = NULL;
- sbi->write_io[i][j].bi_private = NULL;
+ init_completion(&io->zone_wait);
+ io->zone_pending_bio = NULL;
+ io->bi_private = NULL;
#endif
}
}
@@ -1507,6 +1512,25 @@ static bool f2fs_map_blocks_cached(struct inode *inode,
return true;
}
+static bool map_is_mergeable(struct f2fs_sb_info *sbi,
+ struct f2fs_map_blocks *map,
+ block_t blkaddr, int flag, int bidx,
+ int ofs)
+{
+ if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev)
+ return false;
+ if (map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs))
+ return true;
+ if (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR)
+ return true;
+ if (flag == F2FS_GET_BLOCK_PRE_DIO)
+ return true;
+ if (flag == F2FS_GET_BLOCK_DIO &&
+ map->m_pblk == NULL_ADDR && blkaddr == NULL_ADDR)
+ return true;
+ return false;
+}
+
/*
* f2fs_map_blocks() tries to find or build mapping relationship which
* maps continuous logical blocks to physical blocks, and return such
@@ -1574,8 +1598,9 @@ next_block:
}
/* use out-place-update for direct IO under LFS mode */
- if (map->m_may_create &&
- (is_hole || (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO))) {
+ if (map->m_may_create && (is_hole ||
+ (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) &&
+ !f2fs_is_pinned_file(inode)))) {
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
goto sync_out;
@@ -1628,6 +1653,10 @@ next_block:
goto sync_out;
}
break;
+ case F2FS_GET_BLOCK_DIO:
+ if (map->m_next_pgofs)
+ *map->m_next_pgofs = pgofs + 1;
+ break;
default:
/* for defragment case */
if (map->m_next_pgofs)
@@ -1646,19 +1675,15 @@ next_block:
/* reserved delalloc block should be mapped for fiemap. */
if (blkaddr == NEW_ADDR)
map->m_flags |= F2FS_MAP_DELALLOC;
- map->m_flags |= F2FS_MAP_MAPPED;
+ if (flag != F2FS_GET_BLOCK_DIO || !is_hole)
+ map->m_flags |= F2FS_MAP_MAPPED;
map->m_pblk = blkaddr;
map->m_len = 1;
if (map->m_multidev_dio)
map->m_bdev = FDEV(bidx).bdev;
- } else if ((map->m_pblk != NEW_ADDR &&
- blkaddr == (map->m_pblk + ofs)) ||
- (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) ||
- flag == F2FS_GET_BLOCK_PRE_DIO) {
- if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev)
- goto sync_out;
+ } else if (map_is_mergeable(sbi, map, blkaddr, flag, bidx, ofs)) {
ofs++;
map->m_len++;
} else {
@@ -2042,7 +2067,7 @@ static inline loff_t f2fs_readpage_limit(struct inode *inode)
return i_size_read(inode);
}
-static int f2fs_read_single_page(struct inode *inode, struct page *page,
+static int f2fs_read_single_page(struct inode *inode, struct folio *folio,
unsigned nr_pages,
struct f2fs_map_blocks *map,
struct bio **bio_ret,
@@ -2055,9 +2080,10 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page,
sector_t last_block;
sector_t last_block_in_file;
sector_t block_nr;
+ pgoff_t index = folio_index(folio);
int ret = 0;
- block_in_file = (sector_t)page_index(page);
+ block_in_file = (sector_t)index;
last_block = block_in_file + nr_pages;
last_block_in_file = bytes_to_blks(inode,
f2fs_readpage_limit(inode) + blocksize - 1);
@@ -2088,7 +2114,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page,
got_it:
if ((map->m_flags & F2FS_MAP_MAPPED)) {
block_nr = map->m_pblk + block_in_file - map->m_lblk;
- SetPageMappedToDisk(page);
+ folio_set_mappedtodisk(folio);
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
DATA_GENERIC_ENHANCE_READ)) {
@@ -2097,15 +2123,15 @@ got_it:
}
} else {
zero_out:
- zero_user_segment(page, 0, PAGE_SIZE);
- if (f2fs_need_verity(inode, page->index) &&
- !fsverity_verify_page(page)) {
+ folio_zero_segment(folio, 0, folio_size(folio));
+ if (f2fs_need_verity(inode, index) &&
+ !fsverity_verify_folio(folio)) {
ret = -EIO;
goto out;
}
- if (!PageUptodate(page))
- SetPageUptodate(page);
- unlock_page(page);
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
goto out;
}
@@ -2115,14 +2141,14 @@ zero_out:
*/
if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
*last_block_in_bio, block_nr) ||
- !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) {
+ !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
submit_and_realloc:
f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
}
if (bio == NULL) {
bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
- is_readahead ? REQ_RAHEAD : 0, page->index,
+ is_readahead ? REQ_RAHEAD : 0, index,
false);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
@@ -2137,7 +2163,7 @@ submit_and_realloc:
*/
f2fs_wait_on_block_writeback(inode, block_nr);
- if (bio_add_page(bio, page, blocksize, 0) < blocksize)
+ if (!bio_add_folio(bio, folio, blocksize, 0))
goto submit_and_realloc;
inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
@@ -2324,7 +2350,7 @@ out:
* Major change was from block_size == page_size in f2fs by default.
*/
static int f2fs_mpage_readpages(struct inode *inode,
- struct readahead_control *rac, struct page *page)
+ struct readahead_control *rac, struct folio *folio)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
@@ -2344,6 +2370,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
#endif
unsigned nr_pages = rac ? readahead_count(rac) : 1;
unsigned max_nr_pages = nr_pages;
+ pgoff_t index;
int ret = 0;
map.m_pblk = 0;
@@ -2357,64 +2384,63 @@ static int f2fs_mpage_readpages(struct inode *inode,
for (; nr_pages; nr_pages--) {
if (rac) {
- page = readahead_page(rac);
- prefetchw(&page->flags);
+ folio = readahead_folio(rac);
+ prefetchw(&folio->flags);
}
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (f2fs_compressed_file(inode)) {
- /* there are remained compressed pages, submit them */
- if (!f2fs_cluster_can_merge_page(&cc, page->index)) {
- ret = f2fs_read_multi_pages(&cc, &bio,
- max_nr_pages,
- &last_block_in_bio,
- rac != NULL, false);
- f2fs_destroy_compress_ctx(&cc, false);
- if (ret)
- goto set_error_page;
- }
- if (cc.cluster_idx == NULL_CLUSTER) {
- if (nc_cluster_idx ==
- page->index >> cc.log_cluster_size) {
- goto read_single_page;
- }
-
- ret = f2fs_is_compressed_cluster(inode, page->index);
- if (ret < 0)
- goto set_error_page;
- else if (!ret) {
- nc_cluster_idx =
- page->index >> cc.log_cluster_size;
- goto read_single_page;
- }
+ index = folio_index(folio);
- nc_cluster_idx = NULL_CLUSTER;
- }
- ret = f2fs_init_compress_ctx(&cc);
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (!f2fs_compressed_file(inode))
+ goto read_single_page;
+
+ /* there are remained compressed pages, submit them */
+ if (!f2fs_cluster_can_merge_page(&cc, index)) {
+ ret = f2fs_read_multi_pages(&cc, &bio,
+ max_nr_pages,
+ &last_block_in_bio,
+ rac != NULL, false);
+ f2fs_destroy_compress_ctx(&cc, false);
if (ret)
goto set_error_page;
+ }
+ if (cc.cluster_idx == NULL_CLUSTER) {
+ if (nc_cluster_idx == index >> cc.log_cluster_size)
+ goto read_single_page;
- f2fs_compress_ctx_add_page(&cc, page);
+ ret = f2fs_is_compressed_cluster(inode, index);
+ if (ret < 0)
+ goto set_error_page;
+ else if (!ret) {
+ nc_cluster_idx =
+ index >> cc.log_cluster_size;
+ goto read_single_page;
+ }
- goto next_page;
+ nc_cluster_idx = NULL_CLUSTER;
}
+ ret = f2fs_init_compress_ctx(&cc);
+ if (ret)
+ goto set_error_page;
+
+ f2fs_compress_ctx_add_page(&cc, &folio->page);
+
+ goto next_page;
read_single_page:
#endif
- ret = f2fs_read_single_page(inode, page, max_nr_pages, &map,
+ ret = f2fs_read_single_page(inode, folio, max_nr_pages, &map,
&bio, &last_block_in_bio, rac);
if (ret) {
#ifdef CONFIG_F2FS_FS_COMPRESSION
set_error_page:
#endif
- zero_user_segment(page, 0, PAGE_SIZE);
- unlock_page(page);
+ folio_zero_segment(folio, 0, folio_size(folio));
+ folio_unlock(folio);
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
next_page:
#endif
- if (rac)
- put_page(page);
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
@@ -2436,22 +2462,21 @@ next_page:
static int f2fs_read_data_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = folio_file_mapping(folio)->host;
int ret = -EAGAIN;
- trace_f2fs_readpage(page, DATA);
+ trace_f2fs_readpage(folio, DATA);
if (!f2fs_is_compress_backend_ready(inode)) {
- unlock_page(page);
+ folio_unlock(folio);
return -EOPNOTSUPP;
}
/* If the file has inline data, try to read it directly */
if (f2fs_has_inline_data(inode))
- ret = f2fs_read_inline_data(inode, page);
+ ret = f2fs_read_inline_data(inode, folio);
if (ret == -EAGAIN)
- ret = f2fs_mpage_readpages(inode, NULL, page);
+ ret = f2fs_mpage_readpages(inode, NULL, folio);
return ret;
}
@@ -2685,12 +2710,11 @@ got_it:
if (err) {
if (fscrypt_inode_uses_fs_layer_crypto(inode))
fscrypt_finalize_bounce_page(&fio->encrypted_page);
- if (PageWriteback(page))
- end_page_writeback(page);
+ end_page_writeback(page);
} else {
set_inode_flag(inode, FI_UPDATE_WRITE);
}
- trace_f2fs_do_write_data_page(fio->page, IPU);
+ trace_f2fs_do_write_data_page(page_folio(page), IPU);
return err;
}
@@ -2719,7 +2743,7 @@ got_it:
/* LFS mode write path */
f2fs_outplace_write_data(&dn, fio);
- trace_f2fs_do_write_data_page(page, OPU);
+ trace_f2fs_do_write_data_page(page_folio(page), OPU);
set_inode_flag(inode, FI_APPEND_WRITE);
out_writepage:
f2fs_put_dnode(&dn);
@@ -2766,7 +2790,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
.last_block = last_block,
};
- trace_f2fs_writepage(page, DATA);
+ trace_f2fs_writepage(page_folio(page), DATA);
/* we should bypass data pages to proceed the kworker jobs */
if (unlikely(f2fs_cp_error(sbi))) {
@@ -3379,7 +3403,7 @@ restart:
if (f2fs_has_inline_data(inode)) {
if (pos + len <= MAX_INLINE_DATA(inode)) {
- f2fs_do_read_inline_data(page, ipage);
+ f2fs_do_read_inline_data(page_folio(page), ipage);
set_inode_flag(inode, FI_DATA_EXIST);
if (inode->i_nlink)
set_page_private_inline(ipage);
@@ -3740,7 +3764,7 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping,
{
struct inode *inode = mapping->host;
- trace_f2fs_set_page_dirty(&folio->page, DATA);
+ trace_f2fs_set_page_dirty(folio, DATA);
if (!folio_test_uptodate(folio))
folio_mark_uptodate(folio);
@@ -3896,15 +3920,14 @@ static int check_swap_activate(struct swap_info_struct *sis,
struct address_space *mapping = swap_file->f_mapping;
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- sector_t cur_lblock;
- sector_t last_lblock;
- sector_t pblock;
- sector_t lowest_pblock = -1;
- sector_t highest_pblock = 0;
+ block_t cur_lblock;
+ block_t last_lblock;
+ block_t pblock;
+ block_t lowest_pblock = -1;
+ block_t highest_pblock = 0;
int nr_extents = 0;
- unsigned long nr_pblocks;
+ unsigned int nr_pblocks;
unsigned int blks_per_sec = BLKS_PER_SEC(sbi);
- unsigned int sec_blks_mask = BLKS_PER_SEC(sbi) - 1;
unsigned int not_aligned = 0;
int ret = 0;
@@ -3942,8 +3965,8 @@ retry:
pblock = map.m_pblk;
nr_pblocks = map.m_len;
- if ((pblock - SM_I(sbi)->main_blkaddr) & sec_blks_mask ||
- nr_pblocks & sec_blks_mask ||
+ if ((pblock - SM_I(sbi)->main_blkaddr) % blks_per_sec ||
+ nr_pblocks % blks_per_sec ||
!f2fs_valid_pinned_area(sbi, pblock)) {
bool last_extent = false;
@@ -4082,11 +4105,12 @@ const struct address_space_operations f2fs_dblock_aops = {
void f2fs_clear_page_cache_dirty_tag(struct page *page)
{
- struct address_space *mapping = page_mapping(page);
+ struct folio *folio = page_folio(page);
+ struct address_space *mapping = folio->mapping;
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
- __xa_clear_mark(&mapping->i_pages, page_index(page),
+ __xa_clear_mark(&mapping->i_pages, folio->index,
PAGECACHE_TAG_DIRTY);
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
@@ -4159,7 +4183,8 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_lblk = bytes_to_blks(inode, offset);
map.m_len = bytes_to_blks(inode, offset + length - 1) - map.m_lblk + 1;
map.m_next_pgofs = &next_pgofs;
- map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint);
+ map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode),
+ inode->i_write_hint);
if (flags & IOMAP_WRITE)
map.m_may_create = true;
@@ -4180,12 +4205,13 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
* We should never see delalloc or compressed extents here based on
* prior flushing and checks.
*/
- if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR))
- return -EINVAL;
if (WARN_ON_ONCE(map.m_pblk == COMPRESS_ADDR))
return -EINVAL;
- if (map.m_pblk != NULL_ADDR) {
+ if (map.m_flags & F2FS_MAP_MAPPED) {
+ if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR))
+ return -EINVAL;
+
iomap->length = blks_to_bytes(inode, map.m_len);
iomap->type = IOMAP_MAPPED;
iomap->flags |= IOMAP_F_MERGED;
@@ -4194,9 +4220,17 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
} else {
if (flags & IOMAP_WRITE)
return -ENOTBLK;
- iomap->length = blks_to_bytes(inode, next_pgofs) -
- iomap->offset;
- iomap->type = IOMAP_HOLE;
+
+ if (map.m_pblk == NULL_ADDR) {
+ iomap->length = blks_to_bytes(inode, next_pgofs) -
+ iomap->offset;
+ iomap->type = IOMAP_HOLE;
+ } else if (map.m_pblk == NEW_ADDR) {
+ iomap->length = blks_to_bytes(inode, map.m_len);
+ iomap->type = IOMAP_UNWRITTEN;
+ } else {
+ f2fs_bug_on(F2FS_I_SB(inode), 1);
+ }
iomap->addr = IOMAP_NULL_ADDR;
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index fced2b7652f4..1974b6aff397 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -72,7 +72,7 @@ enum {
struct f2fs_fault_info {
atomic_t inject_ops;
- unsigned int inject_rate;
+ int inject_rate;
unsigned int inject_type;
};
@@ -765,11 +765,6 @@ enum {
#define DEF_DIR_LEVEL 0
-enum {
- GC_FAILURE_PIN,
- MAX_GC_FAILURE
-};
-
/* used for f2fs_inode_info->flags */
enum {
FI_NEW_INODE, /* indicate newly allocated inode */
@@ -816,9 +811,10 @@ struct f2fs_inode_info {
unsigned long i_flags; /* keep an inode flags for ioctl */
unsigned char i_advise; /* use to give file attribute hints */
unsigned char i_dir_level; /* use for dentry level for large dir */
- unsigned int i_current_depth; /* only for directory depth */
- /* for gc failure statistic */
- unsigned int i_gc_failures[MAX_GC_FAILURE];
+ union {
+ unsigned int i_current_depth; /* only for directory depth */
+ unsigned short i_gc_failures; /* for gc failure statistic */
+ };
unsigned int i_pino; /* parent inode number */
umode_t i_acl_mode; /* keep file acl mode temporarily */
@@ -1557,6 +1553,7 @@ struct f2fs_sb_info {
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int blocks_per_blkz; /* F2FS blocks per zone */
+ unsigned int max_open_zones; /* max open zone resources of the zoned device */
#endif
/* for node-related operations */
@@ -1676,7 +1673,7 @@ struct f2fs_sb_info {
unsigned long long skipped_gc_rwsem; /* FG_GC only */
/* threshold for gc trials on pinned files */
- u64 gc_pin_file_threshold;
+ unsigned short gc_pin_file_threshold;
struct f2fs_rwsem pin_sem;
/* maximum # of trials to find a victim segment for SSR and GC */
@@ -2309,7 +2306,7 @@ static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool);
static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
struct inode *inode, blkcnt_t *count, bool partial)
{
- blkcnt_t diff = 0, release = 0;
+ long long diff = 0, release = 0;
block_t avail_user_block_count;
int ret;
@@ -2329,26 +2326,27 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
percpu_counter_add(&sbi->alloc_valid_block_count, (*count));
spin_lock(&sbi->stat_lock);
- sbi->total_valid_block_count += (block_t)(*count);
- avail_user_block_count = get_available_block_count(sbi, inode, true);
- if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) {
+ avail_user_block_count = get_available_block_count(sbi, inode, true);
+ diff = (long long)sbi->total_valid_block_count + *count -
+ avail_user_block_count;
+ if (unlikely(diff > 0)) {
if (!partial) {
spin_unlock(&sbi->stat_lock);
+ release = *count;
goto enospc;
}
-
- diff = sbi->total_valid_block_count - avail_user_block_count;
if (diff > *count)
diff = *count;
*count -= diff;
release = diff;
- sbi->total_valid_block_count -= diff;
if (!*count) {
spin_unlock(&sbi->stat_lock);
goto enospc;
}
}
+ sbi->total_valid_block_count += (block_t)(*count);
+
spin_unlock(&sbi->stat_lock);
if (unlikely(release)) {
@@ -3132,7 +3130,7 @@ static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth)
static inline void f2fs_i_gc_failures_write(struct inode *inode,
unsigned int count)
{
- F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = count;
+ F2FS_I(inode)->i_gc_failures = count;
f2fs_mark_inode_dirty_sync(inode, true);
}
@@ -3497,6 +3495,8 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr);
int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end);
void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count);
+int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag,
+ bool readonly);
int f2fs_precache_extents(struct inode *inode);
int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int f2fs_fileattr_set(struct mnt_idmap *idmap,
@@ -3719,6 +3719,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
block_t old_addr, block_t new_addr,
unsigned char version, bool recover_curseg,
bool recover_newaddr);
+int f2fs_get_segment_temp(int seg_type);
int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
@@ -3741,7 +3742,9 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi);
void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi);
int __init f2fs_create_segment_manager_caches(void);
void f2fs_destroy_segment_manager_caches(void);
-int f2fs_rw_hint_to_seg_type(enum rw_hint hint);
+int f2fs_rw_hint_to_seg_type(struct f2fs_sb_info *sbi, enum rw_hint hint);
+enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
+ enum page_type type, enum temp_type temp);
unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
unsigned int segno);
unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
@@ -4148,10 +4151,10 @@ extern struct kmem_cache *f2fs_inode_entry_slab;
bool f2fs_may_inline_data(struct inode *inode);
bool f2fs_sanity_check_inline_data(struct inode *inode);
bool f2fs_may_inline_dentry(struct inode *inode);
-void f2fs_do_read_inline_data(struct page *page, struct page *ipage);
+void f2fs_do_read_inline_data(struct folio *folio, struct page *ipage);
void f2fs_truncate_inline_inode(struct inode *inode,
struct page *ipage, u64 from);
-int f2fs_read_inline_data(struct inode *inode, struct page *page);
+int f2fs_read_inline_data(struct inode *inode, struct folio *folio);
int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page);
int f2fs_convert_inline_inode(struct inode *inode);
int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry);
@@ -4596,10 +4599,14 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
}
#ifdef CONFIG_F2FS_FAULT_INJECTION
-extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
- unsigned int type);
+extern int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate,
+ unsigned long type);
#else
-#define f2fs_build_fault_attr(sbi, rate, type) do { } while (0)
+static inline int f2fs_build_fault_attr(struct f2fs_sb_info *sbi,
+ unsigned long rate, unsigned long type)
+{
+ return 0;
+}
#endif
static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
@@ -4657,7 +4664,7 @@ static inline void f2fs_truncate_meta_inode_pages(struct f2fs_sb_info *sbi,
page = find_get_page(META_MAPPING(sbi), blkaddr + i);
if (page) {
- if (PageWriteback(page))
+ if (folio_test_writeback(page_folio(page)))
need_submit = true;
f2fs_put_page(page, 0);
}
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 2b65e09822d4..5c0b281a70f3 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -58,7 +58,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
struct inode *inode = file_inode(vmf->vma->vm_file);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
- bool need_alloc = true;
+ bool need_alloc = !f2fs_is_pinned_file(inode);
int err = 0;
vm_fault_t ret;
@@ -115,19 +115,18 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
goto out_sem;
}
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
if (need_alloc) {
/* block allocation */
- set_new_dnode(&dn, inode, NULL, NULL, 0);
err = f2fs_get_block_locked(&dn, page->index);
- }
-
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (!need_alloc) {
- set_new_dnode(&dn, inode, NULL, NULL, 0);
+ } else {
err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
f2fs_put_dnode(&dn);
+ if (f2fs_is_pinned_file(inode) &&
+ !__is_valid_data_blkaddr(dn.data_blkaddr))
+ err = -EIO;
}
-#endif
+
if (err) {
unlock_page(page);
goto out_sem;
@@ -834,7 +833,8 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw)
* for blkzoned device, fallback direct IO to buffered IO, so
* all IOs can be serialized by log-structured write.
*/
- if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE))
+ if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE) &&
+ !f2fs_is_pinned_file(inode))
return true;
if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
return true;
@@ -952,9 +952,14 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
ATTR_GID | ATTR_TIMES_SET))))
return -EPERM;
- if ((attr->ia_valid & ATTR_SIZE) &&
- !f2fs_is_compress_backend_ready(inode))
- return -EOPNOTSUPP;
+ if ((attr->ia_valid & ATTR_SIZE)) {
+ if (!f2fs_is_compress_backend_ready(inode))
+ return -EOPNOTSUPP;
+ if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED) &&
+ !IS_ALIGNED(attr->ia_size,
+ F2FS_BLK_TO_BYTES(F2FS_I(inode)->i_cluster_size)))
+ return -EINVAL;
+ }
err = setattr_prepare(idmap, dentry, attr);
if (err)
@@ -1325,6 +1330,9 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
f2fs_put_page(psrc, 1);
return PTR_ERR(pdst);
}
+
+ f2fs_wait_on_page_writeback(pdst, DATA, true, true);
+
memcpy_page(pdst, 0, psrc, 0, PAGE_SIZE);
set_page_dirty(pdst);
set_page_private_gcing(pdst);
@@ -1817,15 +1825,6 @@ static long f2fs_fallocate(struct file *file, int mode,
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
- /*
- * Pinned file should not support partial truncation since the block
- * can be used by applications.
- */
- if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) &&
- (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE |
- FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)))
- return -EOPNOTSUPP;
-
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
FALLOC_FL_INSERT_RANGE))
@@ -1833,6 +1832,17 @@ static long f2fs_fallocate(struct file *file, int mode,
inode_lock(inode);
+ /*
+ * Pinned file should not support partial truncation since the block
+ * can be used by applications.
+ */
+ if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) &&
+ (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE |
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE))) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
ret = file_modified(file);
if (ret)
goto out;
@@ -2224,34 +2234,13 @@ static int f2fs_ioc_abort_atomic_write(struct file *filp)
return ret;
}
-static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
+int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag,
+ bool readonly)
{
- struct inode *inode = file_inode(filp);
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct super_block *sb = sbi->sb;
- __u32 in;
int ret = 0;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (get_user(in, (__u32 __user *)arg))
- return -EFAULT;
-
- if (in != F2FS_GOING_DOWN_FULLSYNC) {
- ret = mnt_want_write_file(filp);
- if (ret) {
- if (ret == -EROFS) {
- ret = 0;
- f2fs_stop_checkpoint(sbi, false,
- STOP_CP_REASON_SHUTDOWN);
- trace_f2fs_shutdown(sbi, in, ret);
- }
- return ret;
- }
- }
-
- switch (in) {
+ switch (flag) {
case F2FS_GOING_DOWN_FULLSYNC:
ret = bdev_freeze(sb->s_bdev);
if (ret)
@@ -2290,6 +2279,9 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
goto out;
}
+ if (readonly)
+ goto out;
+
f2fs_stop_gc_thread(sbi);
f2fs_stop_discard_thread(sbi);
@@ -2298,10 +2290,44 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
f2fs_update_time(sbi, REQ_TIME);
out:
- if (in != F2FS_GOING_DOWN_FULLSYNC)
- mnt_drop_write_file(filp);
- trace_f2fs_shutdown(sbi, in, ret);
+ trace_f2fs_shutdown(sbi, flag, ret);
+
+ return ret;
+}
+
+static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ __u32 in;
+ int ret;
+ bool need_drop = false, readonly = false;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(in, (__u32 __user *)arg))
+ return -EFAULT;
+
+ if (in != F2FS_GOING_DOWN_FULLSYNC) {
+ ret = mnt_want_write_file(filp);
+ if (ret) {
+ if (ret != -EROFS)
+ return ret;
+
+ /* fallback to nosync shutdown for readonly fs */
+ in = F2FS_GOING_DOWN_NOSYNC;
+ readonly = true;
+ } else {
+ need_drop = true;
+ }
+ }
+
+ ret = f2fs_do_shutdown(sbi, in, readonly);
+
+ if (need_drop)
+ mnt_drop_write_file(filp);
return ret;
}
@@ -2354,13 +2380,14 @@ static bool uuid_is_nonzero(__u8 u[16])
static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
+ int ret;
if (!f2fs_sb_has_encrypt(F2FS_I_SB(inode)))
return -EOPNOTSUPP;
+ ret = fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
-
- return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
+ return ret;
}
static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
@@ -2607,12 +2634,13 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
bool fragmented = false;
int err;
- pg_start = range->start >> PAGE_SHIFT;
- pg_end = (range->start + range->len) >> PAGE_SHIFT;
-
f2fs_balance_fs(sbi, true);
inode_lock(inode);
+ pg_start = range->start >> PAGE_SHIFT;
+ pg_end = min_t(pgoff_t,
+ (range->start + range->len) >> PAGE_SHIFT,
+ DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE));
if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
err = -EINVAL;
@@ -2627,8 +2655,9 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
}
/* writeback all dirty pages in the range */
- err = filemap_write_and_wait_range(inode->i_mapping, range->start,
- range->start + range->len - 1);
+ err = filemap_write_and_wait_range(inode->i_mapping,
+ pg_start << PAGE_SHIFT,
+ (pg_end << PAGE_SHIFT) - 1);
if (err)
goto out;
@@ -2786,7 +2815,8 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
err = f2fs_defragment_range(sbi, filp, &range);
mnt_drop_write_file(filp);
- f2fs_update_time(sbi, REQ_TIME);
+ if (range.len)
+ f2fs_update_time(sbi, REQ_TIME);
if (err < 0)
return err;
@@ -2837,7 +2867,8 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
goto out;
}
- if (f2fs_compressed_file(src) || f2fs_compressed_file(dst)) {
+ if (f2fs_compressed_file(src) || f2fs_compressed_file(dst) ||
+ f2fs_is_pinned_file(src) || f2fs_is_pinned_file(dst)) {
ret = -EOPNOTSUPP;
goto out_unlock;
}
@@ -3189,18 +3220,17 @@ int f2fs_pin_file_control(struct inode *inode, bool inc)
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- /* Use i_gc_failures for normal file as a risk signal. */
- if (inc)
- f2fs_i_gc_failures_write(inode,
- fi->i_gc_failures[GC_FAILURE_PIN] + 1);
-
- if (fi->i_gc_failures[GC_FAILURE_PIN] > sbi->gc_pin_file_threshold) {
+ if (fi->i_gc_failures >= sbi->gc_pin_file_threshold) {
f2fs_warn(sbi, "%s: Enable GC = ino %lx after %x GC trials",
- __func__, inode->i_ino,
- fi->i_gc_failures[GC_FAILURE_PIN]);
+ __func__, inode->i_ino, fi->i_gc_failures);
clear_inode_flag(inode, FI_PIN_FILE);
return -EAGAIN;
}
+
+ /* Use i_gc_failures for normal file as a risk signal. */
+ if (inc)
+ f2fs_i_gc_failures_write(inode, fi->i_gc_failures + 1);
+
return 0;
}
@@ -3234,7 +3264,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
goto done;
}
- if (f2fs_sb_has_blkzoned(sbi) && F2FS_HAS_BLOCKS(inode)) {
+ if (F2FS_HAS_BLOCKS(inode)) {
ret = -EFBIG;
goto out;
}
@@ -3261,7 +3291,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
}
set_inode_flag(inode, FI_PIN_FILE);
- ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN];
+ ret = F2FS_I(inode)->i_gc_failures;
done:
f2fs_update_time(sbi, REQ_TIME);
out:
@@ -3276,7 +3306,7 @@ static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg)
__u32 pin = 0;
if (is_inode_flag_set(inode, FI_PIN_FILE))
- pin = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN];
+ pin = F2FS_I(inode)->i_gc_failures;
return put_user(pin, (u32 __user *)arg);
}
@@ -3522,9 +3552,6 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
if (!f2fs_sb_has_compression(sbi))
return -EOPNOTSUPP;
- if (!f2fs_compressed_file(inode))
- return -EINVAL;
-
if (f2fs_readonly(sbi->sb))
return -EROFS;
@@ -3543,7 +3570,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
goto out;
}
- if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ if (!f2fs_compressed_file(inode) ||
+ is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
ret = -EINVAL;
goto out;
}
@@ -3570,9 +3598,12 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
struct dnode_of_data dn;
pgoff_t end_offset, count;
+ f2fs_lock_op(sbi);
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE);
if (ret) {
+ f2fs_unlock_op(sbi);
if (ret == -ENOENT) {
page_idx = f2fs_get_next_page_offset(&dn,
page_idx);
@@ -3590,6 +3621,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
f2fs_put_dnode(&dn);
+ f2fs_unlock_op(sbi);
+
if (ret < 0)
break;
@@ -3600,6 +3633,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
filemap_invalidate_unlock(inode->i_mapping);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
out:
+ if (released_blocks)
+ f2fs_update_time(sbi, REQ_TIME);
inode_unlock(inode);
mnt_drop_write_file(filp);
@@ -3641,7 +3676,8 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count,
while (count) {
int compr_blocks = 0;
- blkcnt_t reserved;
+ blkcnt_t reserved = 0;
+ blkcnt_t to_reserved;
int ret;
for (i = 0; i < cluster_size; i++) {
@@ -3661,20 +3697,26 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count,
* fails in release_compress_blocks(), so NEW_ADDR
* is a possible case.
*/
- if (blkaddr == NEW_ADDR ||
- __is_valid_data_blkaddr(blkaddr)) {
+ if (blkaddr == NEW_ADDR) {
+ reserved++;
+ continue;
+ }
+ if (__is_valid_data_blkaddr(blkaddr)) {
compr_blocks++;
continue;
}
}
- reserved = cluster_size - compr_blocks;
+ to_reserved = cluster_size - compr_blocks - reserved;
/* for the case all blocks in cluster were reserved */
- if (reserved == 1)
+ if (to_reserved == 1) {
+ dn->ofs_in_node += cluster_size;
goto next;
+ }
- ret = inc_valid_block_count(sbi, dn->inode, &reserved, false);
+ ret = inc_valid_block_count(sbi, dn->inode,
+ &to_reserved, false);
if (unlikely(ret))
return ret;
@@ -3685,7 +3727,7 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count,
f2fs_i_compr_blocks_update(dn->inode, compr_blocks, true);
- *reserved_blocks += reserved;
+ *reserved_blocks += to_reserved;
next:
count -= cluster_size;
}
@@ -3704,9 +3746,6 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
if (!f2fs_sb_has_compression(sbi))
return -EOPNOTSUPP;
- if (!f2fs_compressed_file(inode))
- return -EINVAL;
-
if (f2fs_readonly(sbi->sb))
return -EROFS;
@@ -3718,7 +3757,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
inode_lock(inode);
- if (!is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ if (!f2fs_compressed_file(inode) ||
+ !is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
ret = -EINVAL;
goto unlock_inode;
}
@@ -3735,9 +3775,12 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
struct dnode_of_data dn;
pgoff_t end_offset, count;
+ f2fs_lock_op(sbi);
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE);
if (ret) {
+ f2fs_unlock_op(sbi);
if (ret == -ENOENT) {
page_idx = f2fs_get_next_page_offset(&dn,
page_idx);
@@ -3755,6 +3798,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
f2fs_put_dnode(&dn);
+ f2fs_unlock_op(sbi);
+
if (ret < 0)
break;
@@ -3770,6 +3815,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
f2fs_mark_inode_dirty_sync(inode, true);
}
unlock_inode:
+ if (reserved_blocks)
+ f2fs_update_time(sbi, REQ_TIME);
inode_unlock(inode);
mnt_drop_write_file(filp);
@@ -3778,7 +3825,7 @@ unlock_inode:
} else if (reserved_blocks &&
atomic_read(&F2FS_I(inode)->i_compr_blocks)) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx "
+ f2fs_warn(sbi, "%s: partial blocks were reserved i_ino=%lx "
"iblocks=%llu, reserved=%u, compr_blocks=%u, "
"run fsck to fix.",
__func__, inode->i_ino, inode->i_blocks,
@@ -3966,6 +4013,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
if (len)
ret = f2fs_secure_erase(prev_bdev, inode, prev_index,
prev_block, len, range.flags);
+ f2fs_update_time(sbi, REQ_TIME);
out:
filemap_invalidate_unlock(mapping);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -4119,9 +4167,6 @@ static int f2fs_ioc_decompress_file(struct file *filp)
if (!(filp->f_mode & FMODE_WRITE))
return -EBADF;
- if (!f2fs_compressed_file(inode))
- return -EINVAL;
-
f2fs_balance_fs(sbi, true);
file_start_write(filp);
@@ -4132,7 +4177,8 @@ static int f2fs_ioc_decompress_file(struct file *filp)
goto out;
}
- if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ if (!f2fs_compressed_file(inode) ||
+ is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
ret = -EINVAL;
goto out;
}
@@ -4175,6 +4221,7 @@ static int f2fs_ioc_decompress_file(struct file *filp)
if (ret)
f2fs_warn(sbi, "%s: The file might be partially decompressed (errno=%d). Please delete the file.",
__func__, ret);
+ f2fs_update_time(sbi, REQ_TIME);
out:
inode_unlock(inode);
file_end_write(filp);
@@ -4197,9 +4244,6 @@ static int f2fs_ioc_compress_file(struct file *filp)
if (!(filp->f_mode & FMODE_WRITE))
return -EBADF;
- if (!f2fs_compressed_file(inode))
- return -EINVAL;
-
f2fs_balance_fs(sbi, true);
file_start_write(filp);
@@ -4210,7 +4254,8 @@ static int f2fs_ioc_compress_file(struct file *filp)
goto out;
}
- if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ if (!f2fs_compressed_file(inode) ||
+ is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
ret = -EINVAL;
goto out;
}
@@ -4254,6 +4299,7 @@ static int f2fs_ioc_compress_file(struct file *filp)
if (ret)
f2fs_warn(sbi, "%s: The file might be partially compressed (errno=%d). Please delete the file.",
__func__, ret);
+ f2fs_update_time(sbi, REQ_TIME);
out:
inode_unlock(inode);
file_end_write(filp);
@@ -4612,7 +4658,8 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter,
map.m_may_create = true;
if (dio) {
- map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint);
+ map.m_seg_type = f2fs_rw_hint_to_seg_type(sbi,
+ inode->i_write_hint);
flag = F2FS_GET_BLOCK_PRE_DIO;
} else {
map.m_seg_type = NO_CHECK_TYPE;
@@ -4660,8 +4707,21 @@ static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error,
return 0;
}
+static void f2fs_dio_write_submit_io(const struct iomap_iter *iter,
+ struct bio *bio, loff_t file_offset)
+{
+ struct inode *inode = iter->inode;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int seg_type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint);
+ enum temp_type temp = f2fs_get_segment_temp(seg_type);
+
+ bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, DATA, temp);
+ submit_bio(bio);
+}
+
static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = {
- .end_io = f2fs_dio_write_end_io,
+ .end_io = f2fs_dio_write_end_io,
+ .submit_io = f2fs_dio_write_submit_io,
};
static void f2fs_flush_buffered_write(struct address_space *mapping,
@@ -4798,6 +4858,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
bool dio;
bool may_need_sync = true;
int preallocated;
+ const loff_t pos = iocb->ki_pos;
+ const ssize_t count = iov_iter_count(from);
ssize_t ret;
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) {
@@ -4819,6 +4881,12 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
inode_lock(inode);
}
+ if (f2fs_is_pinned_file(inode) &&
+ !f2fs_overwrite_io(inode, pos, count)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
ret = f2fs_write_checks(iocb, from);
if (ret <= 0)
goto out_unlock;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 8852814dab7f..6066c6eecf41 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1434,7 +1434,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
goto out;
if (gc_type == BG_GC) {
- if (PageWriteback(page)) {
+ if (folio_test_writeback(page_folio(page))) {
err = -EAGAIN;
goto out;
}
@@ -1554,10 +1554,15 @@ next_step:
int err;
inode = f2fs_iget(sb, dni.ino);
- if (IS_ERR(inode) || is_bad_inode(inode) ||
- special_file(inode->i_mode))
+ if (IS_ERR(inode))
continue;
+ if (is_bad_inode(inode) ||
+ special_file(inode->i_mode)) {
+ iput(inode);
+ continue;
+ }
+
err = f2fs_gc_pinned_control(inode, gc_type, segno);
if (err == -EAGAIN) {
iput(inode);
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 9c0d06c4d19a..a8ea3301b815 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -26,6 +26,7 @@
#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
#define DEF_GC_FAILED_PINNED_FILES 2048
+#define MAX_GC_FAILED_PINNED_FILES USHRT_MAX
/* Search max. number of dirty segments to select a victim segment */
#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index ac00423f117b..7638d0d7b7ee 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -61,22 +61,22 @@ bool f2fs_may_inline_dentry(struct inode *inode)
return true;
}
-void f2fs_do_read_inline_data(struct page *page, struct page *ipage)
+void f2fs_do_read_inline_data(struct folio *folio, struct page *ipage)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio_file_mapping(folio)->host;
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
return;
- f2fs_bug_on(F2FS_P_SB(page), page->index);
+ f2fs_bug_on(F2FS_I_SB(inode), folio_index(folio));
- zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE);
+ folio_zero_segment(folio, MAX_INLINE_DATA(inode), folio_size(folio));
/* Copy the whole inline data block */
- memcpy_to_page(page, 0, inline_data_addr(inode, ipage),
+ memcpy_to_folio(folio, 0, inline_data_addr(inode, ipage),
MAX_INLINE_DATA(inode));
- if (!PageUptodate(page))
- SetPageUptodate(page);
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
}
void f2fs_truncate_inline_inode(struct inode *inode,
@@ -97,13 +97,13 @@ void f2fs_truncate_inline_inode(struct inode *inode,
clear_inode_flag(inode, FI_DATA_EXIST);
}
-int f2fs_read_inline_data(struct inode *inode, struct page *page)
+int f2fs_read_inline_data(struct inode *inode, struct folio *folio)
{
struct page *ipage;
ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage)) {
- unlock_page(page);
+ folio_unlock(folio);
return PTR_ERR(ipage);
}
@@ -112,15 +112,15 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
return -EAGAIN;
}
- if (page->index)
- zero_user_segment(page, 0, PAGE_SIZE);
+ if (folio_index(folio))
+ folio_zero_segment(folio, 0, folio_size(folio));
else
- f2fs_do_read_inline_data(page, ipage);
+ f2fs_do_read_inline_data(folio, ipage);
- if (!PageUptodate(page))
- SetPageUptodate(page);
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
f2fs_put_page(ipage, 1);
- unlock_page(page);
+ folio_unlock(folio);
return 0;
}
@@ -164,9 +164,9 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
return -EFSCORRUPTED;
}
- f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page));
+ f2fs_bug_on(F2FS_P_SB(page), folio_test_writeback(page_folio(page)));
- f2fs_do_read_inline_data(page, dn->inode_page);
+ f2fs_do_read_inline_data(page_folio(page), dn->inode_page);
set_page_dirty(page);
/* clear dirty state */
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index c26effdce9aa..005dde72aff3 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -161,7 +161,8 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page)
if (!f2fs_enable_inode_chksum(sbi, page))
#else
if (!f2fs_enable_inode_chksum(sbi, page) ||
- PageDirty(page) || PageWriteback(page))
+ PageDirty(page) ||
+ folio_test_writeback(page_folio(page)))
#endif
return true;
@@ -361,6 +362,12 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
+ if (fi->i_xattr_nid && f2fs_check_nid_range(sbi, fi->i_xattr_nid)) {
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_xattr_nid: %u, run fsck to fix.",
+ __func__, inode->i_ino, fi->i_xattr_nid);
+ return false;
+ }
+
return true;
}
@@ -408,8 +415,7 @@ static int do_read_inode(struct inode *inode)
if (S_ISDIR(inode->i_mode))
fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
else if (S_ISREG(inode->i_mode))
- fi->i_gc_failures[GC_FAILURE_PIN] =
- le16_to_cpu(ri->i_gc_failures);
+ fi->i_gc_failures = le16_to_cpu(ri->i_gc_failures);
fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
fi->i_flags = le32_to_cpu(ri->i_flags);
if (S_ISREG(inode->i_mode))
@@ -679,8 +685,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
ri->i_current_depth =
cpu_to_le32(F2FS_I(inode)->i_current_depth);
else if (S_ISREG(inode->i_mode))
- ri->i_gc_failures =
- cpu_to_le16(F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]);
+ ri->i_gc_failures = cpu_to_le16(F2FS_I(inode)->i_gc_failures);
ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid);
ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
@@ -804,6 +809,7 @@ void f2fs_evict_inode(struct inode *inode)
struct f2fs_inode_info *fi = F2FS_I(inode);
nid_t xnid = fi->i_xattr_nid;
int err = 0;
+ bool freeze_protected = false;
f2fs_abort_atomic_write(inode, true);
@@ -843,8 +849,10 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO);
- if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
+ if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) {
sb_start_intwrite(inode->i_sb);
+ freeze_protected = true;
+ }
set_inode_flag(inode, FI_NO_ALLOC);
i_size_write(inode, 0);
retry:
@@ -887,7 +895,7 @@ retry:
if (dquot_initialize_needed(inode))
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
}
- if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
+ if (freeze_protected)
sb_end_intwrite(inode->i_sb);
no_delete:
dquot_drop(inode);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b3de6d6cdb02..b72ef96f7e33 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1187,7 +1187,17 @@ skip_partial:
default:
BUG();
}
- if (err < 0 && err != -ENOENT)
+ if (err == -ENOENT) {
+ set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK);
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
+ f2fs_err_ratelimited(sbi,
+ "truncate node fail, ino:%lu, nid:%u, "
+ "offset[0]:%d, offset[1]:%d, nofs:%d",
+ inode->i_ino, dn.nid, offset[0],
+ offset[1], nofs);
+ err = 0;
+ }
+ if (err < 0)
goto fail;
if (offset[1] == 0 &&
ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
@@ -1319,6 +1329,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
}
if (unlikely(new_ni.blk_addr != NULL_ADDR)) {
err = -EFSCORRUPTED;
+ dec_valid_node_count(sbi, dn->inode, !ofs);
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto fail;
@@ -1345,7 +1356,6 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
if (ofs == 0)
inc_valid_inode_count(sbi);
return page;
-
fail:
clear_node_page_dirty(page);
f2fs_put_page(page, 1);
@@ -1614,7 +1624,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
};
unsigned int seq;
- trace_f2fs_writepage(page, NODE);
+ trace_f2fs_writepage(page_folio(page), NODE);
if (unlikely(f2fs_cp_error(sbi))) {
/* keep node pages in remount-ro mode */
@@ -1733,7 +1743,7 @@ int f2fs_move_node_page(struct page *node_page, int gc_type)
goto release_page;
} else {
/* set page dirty and write it */
- if (!PageWriteback(node_page))
+ if (!folio_test_writeback(page_folio(node_page)))
set_page_dirty(node_page);
}
out_page:
@@ -2161,7 +2171,7 @@ skip_write:
static bool f2fs_dirty_node_folio(struct address_space *mapping,
struct folio *folio)
{
- trace_f2fs_set_page_dirty(&folio->page, NODE);
+ trace_f2fs_set_page_dirty(folio, NODE);
if (!folio_test_uptodate(folio))
folio_mark_uptodate(folio);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index e7bf15b8240a..496aee53c38a 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -330,8 +330,7 @@ static int recover_inode(struct inode *inode, struct page *page)
F2FS_I(inode)->i_advise = raw->i_advise;
F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags);
f2fs_set_inode_flags(inode);
- F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] =
- le16_to_cpu(raw->i_gc_failures);
+ F2FS_I(inode)->i_gc_failures = le16_to_cpu(raw->i_gc_failures);
recover_inline_flags(inode, raw);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 4fd76e867e0a..a0ce3d080f80 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -771,8 +771,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
block_t valid_blocks =
get_valid_blocks(sbi, segno, true);
- f2fs_bug_on(sbi, unlikely(!valid_blocks ||
- valid_blocks == CAP_BLKS_PER_SEC(sbi)));
+ f2fs_bug_on(sbi,
+ (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
+ !valid_blocks) ||
+ valid_blocks == CAP_BLKS_PER_SEC(sbi));
if (!IS_CURSEC(sbi, secno))
set_bit(secno, dirty_i->dirty_secmap);
@@ -1109,9 +1111,8 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi,
dc->error = 0;
if (dc->error)
- printk_ratelimited(
- "%sF2FS-fs (%s): Issue discard(%u, %u, %u) failed, ret: %d",
- KERN_INFO, sbi->sb->s_id,
+ f2fs_info_ratelimited(sbi,
+ "Issue discard(%u, %u, %u) failed, ret: %d",
dc->di.lstart, dc->di.start, dc->di.len, dc->error);
__detach_discard_cmd(dcc, dc);
}
@@ -2645,7 +2646,7 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi,
}
static int is_next_segment_free(struct f2fs_sb_info *sbi,
- struct curseg_info *curseg, int type)
+ struct curseg_info *curseg)
{
unsigned int segno = curseg->segno + 1;
struct free_segmap_info *free_i = FREE_I(sbi);
@@ -3073,8 +3074,7 @@ static bool need_new_seg(struct f2fs_sb_info *sbi, int type)
if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
curseg->seg_type == CURSEG_WARM_NODE)
return true;
- if (curseg->alloc_type == LFS &&
- is_next_segment_free(sbi, curseg, type) &&
+ if (curseg->alloc_type == LFS && is_next_segment_free(sbi, curseg) &&
likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
return true;
if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0))
@@ -3352,8 +3352,14 @@ out:
return err;
}
-int f2fs_rw_hint_to_seg_type(enum rw_hint hint)
+int f2fs_rw_hint_to_seg_type(struct f2fs_sb_info *sbi, enum rw_hint hint)
{
+ if (F2FS_OPTION(sbi).active_logs == 2)
+ return CURSEG_HOT_DATA;
+ else if (F2FS_OPTION(sbi).active_logs == 4)
+ return CURSEG_COLD_DATA;
+
+ /* active_log == 6 */
switch (hint) {
case WRITE_LIFE_SHORT:
return CURSEG_HOT_DATA;
@@ -3364,6 +3370,65 @@ int f2fs_rw_hint_to_seg_type(enum rw_hint hint)
}
}
+/*
+ * This returns write hints for each segment type. This hints will be
+ * passed down to block layer as below by default.
+ *
+ * User F2FS Block
+ * ---- ---- -----
+ * META WRITE_LIFE_NONE|REQ_META
+ * HOT_NODE WRITE_LIFE_NONE
+ * WARM_NODE WRITE_LIFE_MEDIUM
+ * COLD_NODE WRITE_LIFE_LONG
+ * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
+ * extension list " "
+ *
+ * -- buffered io
+ * COLD_DATA WRITE_LIFE_EXTREME
+ * HOT_DATA WRITE_LIFE_SHORT
+ * WARM_DATA WRITE_LIFE_NOT_SET
+ *
+ * -- direct io
+ * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
+ * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
+ * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
+ * WRITE_LIFE_NONE " WRITE_LIFE_NONE
+ * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
+ * WRITE_LIFE_LONG " WRITE_LIFE_LONG
+ */
+enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
+ enum page_type type, enum temp_type temp)
+{
+ switch (type) {
+ case DATA:
+ switch (temp) {
+ case WARM:
+ return WRITE_LIFE_NOT_SET;
+ case HOT:
+ return WRITE_LIFE_SHORT;
+ case COLD:
+ return WRITE_LIFE_EXTREME;
+ default:
+ return WRITE_LIFE_NONE;
+ }
+ case NODE:
+ switch (temp) {
+ case WARM:
+ return WRITE_LIFE_MEDIUM;
+ case HOT:
+ return WRITE_LIFE_NONE;
+ case COLD:
+ return WRITE_LIFE_LONG;
+ default:
+ return WRITE_LIFE_NONE;
+ }
+ case META:
+ return WRITE_LIFE_NONE;
+ default:
+ return WRITE_LIFE_NONE;
+ }
+}
+
static int __get_segment_type_2(struct f2fs_io_info *fio)
{
if (fio->type == DATA)
@@ -3434,7 +3499,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
is_inode_flag_set(inode, FI_HOT_DATA) ||
f2fs_is_cow_file(inode))
return CURSEG_HOT_DATA;
- return f2fs_rw_hint_to_seg_type(inode->i_write_hint);
+ return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode),
+ inode->i_write_hint);
} else {
if (IS_DNODE(fio->page))
return is_cold_node(fio->page) ? CURSEG_WARM_NODE :
@@ -3443,6 +3509,15 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
}
}
+int f2fs_get_segment_temp(int seg_type)
+{
+ if (IS_HOT(seg_type))
+ return HOT;
+ else if (IS_WARM(seg_type))
+ return WARM;
+ return COLD;
+}
+
static int __get_segment_type(struct f2fs_io_info *fio)
{
int type = 0;
@@ -3461,12 +3536,8 @@ static int __get_segment_type(struct f2fs_io_info *fio)
f2fs_bug_on(fio->sbi, true);
}
- if (IS_HOT(type))
- fio->temp = HOT;
- else if (IS_WARM(type))
- fio->temp = WARM;
- else
- fio->temp = COLD;
+ fio->temp = f2fs_get_segment_temp(type);
+
return type;
}
@@ -3559,6 +3630,8 @@ int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
if (segment_full) {
if (type == CURSEG_COLD_DATA_PINNED &&
!((curseg->segno + 1) % sbi->segs_per_sec)) {
+ write_sum_page(sbi, curseg->sum_blk,
+ GET_SUM_BLOCK(sbi, curseg->segno));
reset_curseg_fields(curseg);
goto skip_new_segment;
}
@@ -3612,13 +3685,13 @@ skip_new_segment:
mutex_unlock(&curseg->curseg_mutex);
f2fs_up_read(&SM_I(sbi)->curseg_lock);
return 0;
+
out_err:
*new_blkaddr = NULL_ADDR;
up_write(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
f2fs_up_read(&SM_I(sbi)->curseg_lock);
return ret;
-
}
void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
@@ -3660,8 +3733,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
&fio->new_blkaddr, sum, type, fio)) {
if (fscrypt_inode_uses_fs_layer_crypto(fio->page->mapping->host))
fscrypt_finalize_bounce_page(&fio->encrypted_page);
- if (PageWriteback(fio->page))
- end_page_writeback(fio->page);
+ end_page_writeback(fio->page);
if (f2fs_in_warm_node_list(fio->sbi, fio->page))
f2fs_del_fsync_node_entry(fio->sbi, fio->page);
goto out;
@@ -3904,7 +3976,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
void f2fs_wait_on_page_writeback(struct page *page,
enum page_type type, bool ordered, bool locked)
{
- if (PageWriteback(page)) {
+ if (folio_test_writeback(page_folio(page))) {
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
/* submit cached LFS IO */
@@ -3913,7 +3985,8 @@ void f2fs_wait_on_page_writeback(struct page *page,
f2fs_submit_merged_ipu_write(sbi, NULL, page);
if (ordered) {
wait_on_page_writeback(page);
- f2fs_bug_on(sbi, locked && PageWriteback(page));
+ f2fs_bug_on(sbi, locked &&
+ folio_test_writeback(page_folio(page)));
} else {
wait_for_stable_page(page);
}
@@ -4959,17 +5032,6 @@ out:
}
#ifdef CONFIG_BLK_DEV_ZONED
-static const char *f2fs_zone_status[BLK_ZONE_COND_OFFLINE + 1] = {
- [BLK_ZONE_COND_NOT_WP] = "NOT_WP",
- [BLK_ZONE_COND_EMPTY] = "EMPTY",
- [BLK_ZONE_COND_IMP_OPEN] = "IMPLICIT_OPEN",
- [BLK_ZONE_COND_EXP_OPEN] = "EXPLICIT_OPEN",
- [BLK_ZONE_COND_CLOSED] = "CLOSED",
- [BLK_ZONE_COND_READONLY] = "READONLY",
- [BLK_ZONE_COND_FULL] = "FULL",
- [BLK_ZONE_COND_OFFLINE] = "OFFLINE",
-};
-
static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
struct f2fs_dev_info *fdev,
struct blk_zone *zone)
@@ -5000,7 +5062,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) {
f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]",
zone_segno, valid_block_cnt,
- f2fs_zone_status[zone->cond]);
+ blk_zone_cond_str(zone->cond));
return 0;
}
@@ -5011,7 +5073,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
if (!valid_block_cnt) {
f2fs_notice(sbi, "Zone without valid block has non-zero write "
"pointer. Reset the write pointer: cond[%s]",
- f2fs_zone_status[zone->cond]);
+ blk_zone_cond_str(zone->cond));
ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block,
zone->len >> log_sectors_per_block);
if (ret)
@@ -5029,7 +5091,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
*/
f2fs_notice(sbi, "Valid blocks are not aligned with write "
"pointer: valid block[0x%x,0x%x] cond[%s]",
- zone_segno, valid_block_cnt, f2fs_zone_status[zone->cond]);
+ zone_segno, valid_block_cnt, blk_zone_cond_str(zone->cond));
nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index a4bc26dfdb1a..1f1b3647a998 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -66,21 +66,31 @@ const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_NO_SEGMENT] = "no free segment",
};
-void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
- unsigned int type)
+int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate,
+ unsigned long type)
{
struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info;
if (rate) {
+ if (rate > INT_MAX)
+ return -EINVAL;
atomic_set(&ffi->inject_ops, 0);
- ffi->inject_rate = rate;
+ ffi->inject_rate = (int)rate;
}
- if (type)
- ffi->inject_type = type;
+ if (type) {
+ if (type >= BIT(FAULT_MAX))
+ return -EINVAL;
+ ffi->inject_type = (unsigned int)type;
+ }
if (!rate && !type)
memset(ffi, 0, sizeof(struct f2fs_fault_info));
+ else
+ f2fs_info(sbi,
+ "build fault injection attr: rate: %lu, type: 0x%lx",
+ rate, type);
+ return 0;
}
#endif
@@ -886,14 +896,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
case Opt_fault_injection:
if (args->from && match_int(args, &arg))
return -EINVAL;
- f2fs_build_fault_attr(sbi, arg, F2FS_ALL_FAULT_TYPE);
+ if (f2fs_build_fault_attr(sbi, arg,
+ F2FS_ALL_FAULT_TYPE))
+ return -EINVAL;
set_opt(sbi, FAULT_INJECTION);
break;
case Opt_fault_type:
if (args->from && match_int(args, &arg))
return -EINVAL;
- f2fs_build_fault_attr(sbi, 0, arg);
+ if (f2fs_build_fault_attr(sbi, 0, arg))
+ return -EINVAL;
set_opt(sbi, FAULT_INJECTION);
break;
#else
@@ -2132,8 +2145,6 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount)
F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL;
F2FS_OPTION(sbi).errors = MOUNT_ERRORS_CONTINUE;
- sbi->sb->s_flags &= ~SB_INLINECRYPT;
-
set_opt(sbi, INLINE_XATTR);
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
@@ -2326,6 +2337,17 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
if (err)
goto restore_opts;
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (f2fs_sb_has_blkzoned(sbi) &&
+ sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) {
+ f2fs_err(sbi,
+ "zoned: max open zones %u is too small, need at least %u open zones",
+ sbi->max_open_zones, F2FS_OPTION(sbi).active_logs);
+ err = -EINVAL;
+ goto restore_opts;
+ }
+#endif
+
/* flush outstanding errors before changing fs state */
flush_work(&sbi->s_error_work);
@@ -2547,6 +2569,11 @@ restore_opts:
return err;
}
+static void f2fs_shutdown(struct super_block *sb)
+{
+ f2fs_do_shutdown(F2FS_SB(sb), F2FS_GOING_DOWN_NOSYNC, false);
+}
+
#ifdef CONFIG_QUOTA
static bool f2fs_need_recovery(struct f2fs_sb_info *sbi)
{
@@ -3146,6 +3173,7 @@ static const struct super_operations f2fs_sops = {
.unfreeze_fs = f2fs_unfreeze,
.statfs = f2fs_statfs,
.remount_fs = f2fs_remount,
+ .shutdown = f2fs_shutdown,
};
#ifdef CONFIG_FS_ENCRYPTION
@@ -3441,7 +3469,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
}
}
- /* Currently, support only 4KB block size */
+ /* only support block_size equals to PAGE_SIZE */
if (le32_to_cpu(raw_super->log_blocksize) != F2FS_BLKSIZE_BITS) {
f2fs_info(sbi, "Invalid log_blocksize (%u), supports only %u",
le32_to_cpu(raw_super->log_blocksize),
@@ -3862,11 +3890,24 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
sector_t nr_sectors = bdev_nr_sectors(bdev);
struct f2fs_report_zones_args rep_zone_arg;
u64 zone_sectors;
+ unsigned int max_open_zones;
int ret;
if (!f2fs_sb_has_blkzoned(sbi))
return 0;
+ if (bdev_is_zoned(FDEV(devi).bdev)) {
+ max_open_zones = bdev_max_open_zones(bdev);
+ if (max_open_zones && (max_open_zones < sbi->max_open_zones))
+ sbi->max_open_zones = max_open_zones;
+ if (sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) {
+ f2fs_err(sbi,
+ "zoned: max open zones %u is too small, need at least %u open zones",
+ sbi->max_open_zones, F2FS_OPTION(sbi).active_logs);
+ return -EINVAL;
+ }
+ }
+
zone_sectors = bdev_zone_sectors(bdev);
if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
SECTOR_TO_BLOCK(zone_sectors))
@@ -4131,9 +4172,15 @@ void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason,
if (shutdown)
set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
- /* continue filesystem operators if errors=continue */
- if (continue_fs || f2fs_readonly(sb))
+ /*
+ * Continue filesystem operators if errors=continue. Should not set
+ * RO by shutdown, since RO bypasses thaw_super which can hang the
+ * system.
+ */
+ if (continue_fs || f2fs_readonly(sb) || shutdown) {
+ f2fs_warn(sbi, "Stopped filesystem due to reason: %d", reason);
return;
+ }
f2fs_warn(sbi, "Remounting filesystem read-only");
/*
@@ -4180,6 +4227,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev);
sbi->aligned_blksize = true;
+#ifdef CONFIG_BLK_DEV_ZONED
+ sbi->max_open_zones = UINT_MAX;
+#endif
for (i = 0; i < max_devices; i++) {
if (i == 0)
@@ -4894,12 +4944,6 @@ static int __init init_f2fs_fs(void)
{
int err;
- if (PAGE_SIZE != F2FS_BLKSIZE) {
- printk("F2FS not supported on PAGE_SIZE(%lu) != BLOCK_SIZE(%lu)\n",
- PAGE_SIZE, F2FS_BLKSIZE);
- return -EINVAL;
- }
-
err = init_inodecache();
if (err)
goto fail;
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index a568ce96cf56..09d3ecfaa4f1 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -484,10 +484,16 @@ out:
if (ret < 0)
return ret;
#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (a->struct_type == FAULT_INFO_TYPE && t >= BIT(FAULT_MAX))
- return -EINVAL;
- if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX)
- return -EINVAL;
+ if (a->struct_type == FAULT_INFO_TYPE) {
+ if (f2fs_build_fault_attr(sbi, 0, t))
+ return -EINVAL;
+ return count;
+ }
+ if (a->struct_type == FAULT_INFO_RATE) {
+ if (f2fs_build_fault_attr(sbi, t, 0))
+ return -EINVAL;
+ return count;
+ }
#endif
if (a->struct_type == RESERVED_BLOCKS) {
spin_lock(&sbi->stat_lock);
@@ -675,6 +681,13 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "gc_pin_file_threshold")) {
+ if (t > MAX_GC_FAILED_PINNED_FILES)
+ return -EINVAL;
+ sbi->gc_pin_file_threshold = t;
+ return count;
+ }
+
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
if (t != 0)
return -EINVAL;
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 00235b8a1823..acbec5bdd521 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -269,6 +269,18 @@ enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, };
/**
* fat_parse_long - Parse extended directory entry.
*
+ * @dir: Pointer to the inode that represents the directory.
+ * @pos: On input, contains the starting position to read from.
+ * On output, updated with the new position.
+ * @bh: Pointer to the buffer head that may be used for reading directory
+ * entries. May be updated.
+ * @de: On input, points to the current directory entry.
+ * On output, points to the next directory entry.
+ * @unicode: Pointer to a buffer where the parsed Unicode long filename will be
+ * stored.
+ * @nr_slots: Pointer to a variable that will store the number of longname
+ * slots found.
+ *
* This function returns zero on success, negative value on error, or one of
* the following:
*
diff --git a/fs/file.c b/fs/file.c
index 3b683b9101d8..8076aef9c210 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -271,6 +271,11 @@ static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
}
+static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
+{
+ return test_bit(fd, fdt->open_fds);
+}
+
static unsigned int count_open_files(struct fdtable *fdt)
{
unsigned int size = fdt->max_fds;
@@ -915,13 +920,8 @@ struct file *get_file_rcu(struct file __rcu **f)
struct file __rcu *file;
file = __get_file_rcu(f);
- if (unlikely(!file))
- return NULL;
-
- if (unlikely(IS_ERR(file)))
- continue;
-
- return file;
+ if (!IS_ERR(file))
+ return file;
}
}
EXPORT_SYMBOL_GPL(get_file_rcu);
@@ -1219,12 +1219,9 @@ void set_close_on_exec(unsigned int fd, int flag)
bool get_close_on_exec(unsigned int fd)
{
- struct files_struct *files = current->files;
- struct fdtable *fdt;
bool res;
rcu_read_lock();
- fdt = files_fdtable(files);
- res = close_on_exec(fd, fdt);
+ res = close_on_exec(fd, current->files);
rcu_read_unlock();
return res;
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 3ec8bb5e68ff..9eb191b5c4de 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1813,7 +1813,8 @@ static void fuse_resend(struct fuse_conn *fc)
spin_unlock(&fc->lock);
list_for_each_entry_safe(req, next, &to_queue, list) {
- __set_bit(FR_PENDING, &req->flags);
+ set_bit(FR_PENDING, &req->flags);
+ clear_bit(FR_SENT, &req->flags);
/* mark the request as resend request */
req->in.h.unique |= FUSE_UNIQUE_RESEND;
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index b57ce4157640..f39456c65ed7 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -935,14 +935,10 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
}
for (i = 0; i < ap->num_pages; i++) {
- struct page *page = ap->pages[i];
+ struct folio *folio = page_folio(ap->pages[i]);
- if (!err)
- SetPageUptodate(page);
- else
- SetPageError(page);
- unlock_page(page);
- put_page(page);
+ folio_end_read(folio, !err);
+ folio_put(folio);
}
if (ia->ff)
fuse_file_put(ia->ff, false);
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 726640fa439e..572ce8a82ceb 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -8,6 +8,7 @@
#include <linux/uio.h>
#include <linux/compat.h>
#include <linux/fileattr.h>
+#include <linux/fsverity.h>
static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args,
struct fuse_ioctl_out *outarg)
@@ -117,6 +118,53 @@ static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
return 0;
}
+/* For fs-verity, determine iov lengths from input */
+static int fuse_setup_measure_verity(unsigned long arg, struct iovec *iov)
+{
+ __u16 digest_size;
+ struct fsverity_digest __user *uarg = (void __user *)arg;
+
+ if (copy_from_user(&digest_size, &uarg->digest_size, sizeof(digest_size)))
+ return -EFAULT;
+
+ if (digest_size > SIZE_MAX - sizeof(struct fsverity_digest))
+ return -EINVAL;
+
+ iov->iov_len = sizeof(struct fsverity_digest) + digest_size;
+
+ return 0;
+}
+
+static int fuse_setup_enable_verity(unsigned long arg, struct iovec *iov,
+ unsigned int *in_iovs)
+{
+ struct fsverity_enable_arg enable;
+ struct fsverity_enable_arg __user *uarg = (void __user *)arg;
+ const __u32 max_buffer_len = FUSE_MAX_MAX_PAGES * PAGE_SIZE;
+
+ if (copy_from_user(&enable, uarg, sizeof(enable)))
+ return -EFAULT;
+
+ if (enable.salt_size > max_buffer_len || enable.sig_size > max_buffer_len)
+ return -ENOMEM;
+
+ if (enable.salt_size > 0) {
+ iov++;
+ (*in_iovs)++;
+
+ iov->iov_base = u64_to_user_ptr(enable.salt_ptr);
+ iov->iov_len = enable.salt_size;
+ }
+
+ if (enable.sig_size > 0) {
+ iov++;
+ (*in_iovs)++;
+
+ iov->iov_base = u64_to_user_ptr(enable.sig_ptr);
+ iov->iov_len = enable.sig_size;
+ }
+ return 0;
+}
/*
* For ioctls, there is no generic way to determine how much memory
@@ -227,6 +275,18 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
out_iov = iov;
out_iovs = 1;
}
+
+ err = 0;
+ switch (cmd) {
+ case FS_IOC_MEASURE_VERITY:
+ err = fuse_setup_measure_verity(arg, iov);
+ break;
+ case FS_IOC_ENABLE_VERITY:
+ err = fuse_setup_enable_verity(arg, iov, &in_iovs);
+ break;
+ }
+ if (err)
+ goto out;
}
retry:
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index bb3e941b9503..1a52a51b6b07 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -7,6 +7,8 @@
#include <linux/fs.h>
#include <linux/dax.h>
#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/group_cpus.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/module.h>
@@ -67,6 +69,8 @@ struct virtio_fs {
unsigned int num_request_queues; /* number of request queues */
struct dax_device *dax_dev;
+ unsigned int *mq_map; /* index = cpu id, value = request vq id */
+
/* DAX memory window where file contents are mapped */
void *window_kaddr;
phys_addr_t window_phys_addr;
@@ -185,6 +189,7 @@ static void virtio_fs_ktype_release(struct kobject *kobj)
{
struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj);
+ kfree(vfs->mq_map);
kfree(vfs->vqs);
kfree(vfs);
}
@@ -706,6 +711,44 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
}
}
+static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+ const struct cpumask *mask, *masks;
+ unsigned int q, cpu;
+
+ /* First attempt to map using existing transport layer affinities
+ * e.g. PCIe MSI-X
+ */
+ if (!vdev->config->get_vq_affinity)
+ goto fallback;
+
+ for (q = 0; q < fs->num_request_queues; q++) {
+ mask = vdev->config->get_vq_affinity(vdev, VQ_REQUEST + q);
+ if (!mask)
+ goto fallback;
+
+ for_each_cpu(cpu, mask)
+ fs->mq_map[cpu] = q;
+ }
+
+ return;
+fallback:
+ /* Attempt to map evenly in groups over the CPUs */
+ masks = group_cpus_evenly(fs->num_request_queues);
+ /* If even this fails we default to all CPUs use queue zero */
+ if (!masks) {
+ for_each_possible_cpu(cpu)
+ fs->mq_map[cpu] = 0;
+ return;
+ }
+
+ for (q = 0; q < fs->num_request_queues; q++) {
+ for_each_cpu(cpu, &masks[q])
+ fs->mq_map[cpu] = q;
+ }
+ kfree(masks);
+}
+
/* Virtqueue interrupt handler */
static void virtio_fs_vq_done(struct virtqueue *vq)
{
@@ -742,6 +785,11 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
{
struct virtqueue **vqs;
vq_callback_t **callbacks;
+ /* Specify pre_vectors to ensure that the queues before the
+ * request queues (e.g. hiprio) don't claim any of the CPUs in
+ * the multi-queue mapping and interrupt affinities
+ */
+ struct irq_affinity desc = { .pre_vectors = VQ_REQUEST };
const char **names;
unsigned int i;
int ret = 0;
@@ -751,6 +799,9 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
if (fs->num_request_queues == 0)
return -EINVAL;
+ /* Truncate nr of request queues to nr_cpu_id */
+ fs->num_request_queues = min_t(unsigned int, fs->num_request_queues,
+ nr_cpu_ids);
fs->nvqs = VQ_REQUEST + fs->num_request_queues;
fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
if (!fs->vqs)
@@ -760,7 +811,9 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]),
GFP_KERNEL);
names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL);
- if (!vqs || !callbacks || !names) {
+ fs->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*fs->mq_map), GFP_KERNEL,
+ dev_to_node(&vdev->dev));
+ if (!vqs || !callbacks || !names || !fs->mq_map) {
ret = -ENOMEM;
goto out;
}
@@ -780,7 +833,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
names[i] = fs->vqs[i].name;
}
- ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL);
+ ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, &desc);
if (ret < 0)
goto out;
@@ -792,8 +845,10 @@ out:
kfree(names);
kfree(callbacks);
kfree(vqs);
- if (ret)
+ if (ret) {
kfree(fs->vqs);
+ kfree(fs->mq_map);
+ }
return ret;
}
@@ -939,7 +994,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
if (ret < 0)
goto out;
- /* TODO vq affinity */
+ virtio_fs_map_queues(vdev, fs);
ret = virtio_fs_setup_dax(vdev, fs);
if (ret < 0)
@@ -1023,7 +1078,6 @@ static const unsigned int feature_table[] = {};
static struct virtio_driver virtio_fs_driver = {
.driver.name = KBUILD_MODNAME,
- .driver.owner = THIS_MODULE,
.id_table = id_table,
.feature_table = feature_table,
.feature_table_size = ARRAY_SIZE(feature_table),
@@ -1288,7 +1342,7 @@ out:
static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
__releases(fiq->lock)
{
- unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
+ unsigned int queue_id;
struct virtio_fs *fs;
struct fuse_req *req;
struct virtio_fs_vq *fsvq;
@@ -1302,11 +1356,13 @@ __releases(fiq->lock)
spin_unlock(&fiq->lock);
fs = fiq->priv;
+ queue_id = VQ_REQUEST + fs->mq_map[raw_smp_processor_id()];
- pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
- __func__, req->in.h.opcode, req->in.h.unique,
+ pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u queue_id %u\n",
+ __func__, req->in.h.opcode, req->in.h.unique,
req->in.h.nodeid, req->in.h.len,
- fuse_len_args(req->args->out_numargs, req->args->out_args));
+ fuse_len_args(req->args->out_numargs, req->args->out_args),
+ queue_id);
fsvq = &fs->vqs[queue_id];
ret = virtio_fs_enqueue_req(fsvq, req, false);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9f11fc1e79eb..4ea6c8bfb4e6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1267,7 +1267,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
mapping = gfs2_glock2aspace(gl);
if (mapping) {
mapping->a_ops = &gfs2_meta_aops;
- mapping->host = s->s_bdev->bd_inode;
+ mapping->host = s->s_bdev->bd_mapping->host;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
mapping->i_private_data = NULL;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 227edbaddfbc..05975ec76d35 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -114,7 +114,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
address_space_init_once(mapping);
mapping->a_ops = &gfs2_rgrp_aops;
- mapping->host = sb->s_bdev->bd_inode;
+ mapping->host = sb->s_bdev->bd_mapping->host;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
mapping->i_private_data = NULL;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 34ac73cc36b1..412f295acebe 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -176,14 +176,12 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
- info.flags = 0;
info.length = len;
info.low_limit = current->mm->mmap_base;
info.high_limit = arch_get_mmap_end(addr, len, flags);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
return vm_unmapped_area(&info);
}
@@ -192,14 +190,13 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
addr = vm_unmapped_area(&info);
/*
@@ -249,11 +246,11 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
}
/*
- * Use mm->get_unmapped_area value as a hint to use topdown routine.
+ * Use MMF_TOPDOWN flag as a hint to use topdown routine.
* If architectures have special needs, they should define their own
* version of hugetlb_get_unmapped_area.
*/
- if (mm->get_unmapped_area == arch_get_unmapped_area_topdown)
+ if (test_bit(MMF_TOPDOWN, &mm->flags))
return hugetlb_get_unmapped_area_topdown(file, addr, len,
pgoff, flags);
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
diff --git a/fs/internal.h b/fs/internal.h
index 7ca738904e34..ab2225136f60 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -62,6 +62,9 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode);
int do_symlinkat(struct filename *from, int newdfd, struct filename *to);
int do_linkat(int olddfd, struct filename *old, int newdfd,
struct filename *new, int flags);
+int vfs_tmpfile(struct mnt_idmap *idmap,
+ const struct path *parentpath,
+ struct file *file, umode_t mode);
/*
* namespace.c
diff --git a/fs/ioctl.c b/fs/ioctl.c
index fb0628e680c4..64776891120c 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -796,6 +796,9 @@ static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp)
*
* When you add any new common ioctls to the switches above and below,
* please ensure they have compatible arguments in compat mode.
+ *
+ * The LSM mailing list should also be notified of any command additions or
+ * changes, as specific LSMs may be affected.
*/
static int do_vfs_ioctl(struct file *filp, unsigned int fd,
unsigned int cmd, unsigned long arg)
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index fc070184b7fa..381d76c5c232 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -4,7 +4,7 @@
# All Rights Reserved.
#
-ccflags-y += -I $(srctree)/$(src) # needed for trace events
+ccflags-y += -I $(src) # needed for trace events
obj-$(CONFIG_FS_IOMAP) += iomap.o
diff --git a/fs/isofs/Makefile b/fs/isofs/Makefile
index 6498fd2b0f60..b25bc542a22b 100644
--- a/fs/isofs/Makefile
+++ b/fs/isofs/Makefile
@@ -5,7 +5,6 @@
obj-$(CONFIG_ISO9660_FS) += isofs.o
-isofs-objs-y := namei.o inode.o dir.o util.o rock.o export.o
-isofs-objs-$(CONFIG_JOLIET) += joliet.o
-isofs-objs-$(CONFIG_ZISOFS) += compress.o
-isofs-objs := $(isofs-objs-y)
+isofs-y := namei.o inode.o dir.o util.o rock.o export.o
+isofs-$(CONFIG_JOLIET) += joliet.o
+isofs-$(CONFIG_ZISOFS) += compress.o
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index c4da3f634b92..34d5baa5d88a 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -346,8 +346,6 @@ static int zisofs_read_folio(struct file *file, struct folio *folio)
for (i = 0; i < pcount; i++, index++) {
if (i != full_page)
pages[i] = grab_cache_page_nowait(mapping, index);
- if (pages[i])
- ClearPageError(pages[i]);
}
err = zisofs_fill_pages(inode, full_page, pcount, pages);
@@ -356,8 +354,6 @@ static int zisofs_read_folio(struct file *file, struct folio *folio)
for (i = 0; i < pcount; i++) {
if (pages[i]) {
flush_dcache_page(pages[i]);
- if (i == full_page && err)
- SetPageError(pages[i]);
unlock_page(pages[i]);
if (i != full_page)
put_page(pages[i]);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 2a616a9f289d..93b1077a380a 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -21,11 +21,12 @@
#include <linux/ctype.h>
#include <linux/statfs.h>
#include <linux/cdrom.h>
-#include <linux/parser.h>
#include <linux/mpage.h>
#include <linux/user_namespace.h>
#include <linux/seq_file.h>
#include <linux/blkdev.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include "isofs.h"
#include "zisofs.h"
@@ -110,10 +111,10 @@ static void destroy_inodecache(void)
kmem_cache_destroy(isofs_inode_cachep);
}
-static int isofs_remount(struct super_block *sb, int *flags, char *data)
+static int isofs_reconfigure(struct fs_context *fc)
{
- sync_filesystem(sb);
- if (!(*flags & SB_RDONLY))
+ sync_filesystem(fc->root->d_sb);
+ if (!(fc->sb_flags & SB_RDONLY))
return -EROFS;
return 0;
}
@@ -123,7 +124,6 @@ static const struct super_operations isofs_sops = {
.free_inode = isofs_free_inode,
.put_super = isofs_put_super,
.statfs = isofs_statfs,
- .remount_fs = isofs_remount,
.show_options = isofs_show_options,
};
@@ -145,7 +145,7 @@ static const struct dentry_operations isofs_dentry_ops[] = {
#endif
};
-struct iso9660_options{
+struct isofs_options{
unsigned int rock:1;
unsigned int joliet:1;
unsigned int cruft:1;
@@ -289,197 +289,161 @@ isofs_dentry_cmpi_ms(const struct dentry *dentry,
#endif
enum {
- Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore,
- Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet,
- Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err,
- Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm,
+ Opt_block, Opt_check, Opt_cruft, Opt_gid, Opt_ignore, Opt_iocharset,
+ Opt_map, Opt_mode, Opt_nojoliet, Opt_norock, Opt_sb, Opt_session,
+ Opt_uid, Opt_unhide, Opt_utf8, Opt_err, Opt_nocompress, Opt_hide,
+ Opt_showassoc, Opt_dmode, Opt_overriderockperm,
};
-static const match_table_t tokens = {
- {Opt_norock, "norock"},
- {Opt_nojoliet, "nojoliet"},
- {Opt_unhide, "unhide"},
- {Opt_hide, "hide"},
- {Opt_showassoc, "showassoc"},
- {Opt_cruft, "cruft"},
- {Opt_utf8, "utf8"},
- {Opt_iocharset, "iocharset=%s"},
- {Opt_map_a, "map=acorn"},
- {Opt_map_a, "map=a"},
- {Opt_map_n, "map=normal"},
- {Opt_map_n, "map=n"},
- {Opt_map_o, "map=off"},
- {Opt_map_o, "map=o"},
- {Opt_session, "session=%u"},
- {Opt_sb, "sbsector=%u"},
- {Opt_check_r, "check=relaxed"},
- {Opt_check_r, "check=r"},
- {Opt_check_s, "check=strict"},
- {Opt_check_s, "check=s"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_mode, "mode=%u"},
- {Opt_dmode, "dmode=%u"},
- {Opt_overriderockperm, "overriderockperm"},
- {Opt_block, "block=%u"},
- {Opt_ignore, "conv=binary"},
- {Opt_ignore, "conv=b"},
- {Opt_ignore, "conv=text"},
- {Opt_ignore, "conv=t"},
- {Opt_ignore, "conv=mtext"},
- {Opt_ignore, "conv=m"},
- {Opt_ignore, "conv=auto"},
- {Opt_ignore, "conv=a"},
- {Opt_nocompress, "nocompress"},
- {Opt_err, NULL}
+static const struct constant_table isofs_param_map[] = {
+ {"acorn", 'a'},
+ {"a", 'a'},
+ {"normal", 'n'},
+ {"n", 'n'},
+ {"off", 'o'},
+ {"o", 'o'},
+ {}
};
-static int parse_options(char *options, struct iso9660_options *popt)
-{
- char *p;
- int option;
- unsigned int uv;
-
- popt->map = 'n';
- popt->rock = 1;
- popt->joliet = 1;
- popt->cruft = 0;
- popt->hide = 0;
- popt->showassoc = 0;
- popt->check = 'u'; /* unset */
- popt->nocompress = 0;
- popt->blocksize = 1024;
- popt->fmode = popt->dmode = ISOFS_INVALID_MODE;
- popt->uid_set = 0;
- popt->gid_set = 0;
- popt->gid = GLOBAL_ROOT_GID;
- popt->uid = GLOBAL_ROOT_UID;
- popt->iocharset = NULL;
- popt->overriderockperm = 0;
- popt->session=-1;
- popt->sbsector=-1;
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- substring_t args[MAX_OPT_ARGS];
- unsigned n;
-
- if (!*p)
- continue;
+static const struct constant_table isofs_param_check[] = {
+ {"relaxed", 'r'},
+ {"r", 'r'},
+ {"strict", 's'},
+ {"s", 's'},
+ {}
+};
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_norock:
- popt->rock = 0;
- break;
- case Opt_nojoliet:
- popt->joliet = 0;
- break;
- case Opt_hide:
- popt->hide = 1;
- break;
- case Opt_unhide:
- case Opt_showassoc:
- popt->showassoc = 1;
- break;
- case Opt_cruft:
- popt->cruft = 1;
- break;
+static const struct fs_parameter_spec isofs_param_spec[] = {
+ fsparam_flag ("norock", Opt_norock),
+ fsparam_flag ("nojoliet", Opt_nojoliet),
+ fsparam_flag ("unhide", Opt_unhide),
+ fsparam_flag ("hide", Opt_hide),
+ fsparam_flag ("showassoc", Opt_showassoc),
+ fsparam_flag ("cruft", Opt_cruft),
+ fsparam_flag ("utf8", Opt_utf8),
+ fsparam_string ("iocharset", Opt_iocharset),
+ fsparam_enum ("map", Opt_map, isofs_param_map),
+ fsparam_u32 ("session", Opt_session),
+ fsparam_u32 ("sbsector", Opt_sb),
+ fsparam_enum ("check", Opt_check, isofs_param_check),
+ fsparam_u32 ("uid", Opt_uid),
+ fsparam_u32 ("gid", Opt_gid),
+ /* Note: mode/dmode historically accepted %u not strictly %o */
+ fsparam_u32 ("mode", Opt_mode),
+ fsparam_u32 ("dmode", Opt_dmode),
+ fsparam_flag ("overriderockperm", Opt_overriderockperm),
+ fsparam_u32 ("block", Opt_block),
+ fsparam_string ("conv", Opt_ignore),
+ fsparam_flag ("nocompress", Opt_nocompress),
+ {}
+};
+
+static int isofs_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct isofs_options *popt = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+ kuid_t uid;
+ kgid_t gid;
+ unsigned int n;
+
+ /* There are no remountable options */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+ return 0;
+
+ opt = fs_parse(fc, isofs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_norock:
+ popt->rock = 0;
+ break;
+ case Opt_nojoliet:
+ popt->joliet = 0;
+ break;
+ case Opt_hide:
+ popt->hide = 1;
+ break;
+ case Opt_unhide:
+ case Opt_showassoc:
+ popt->showassoc = 1;
+ break;
+ case Opt_cruft:
+ popt->cruft = 1;
+ break;
#ifdef CONFIG_JOLIET
- case Opt_utf8:
- kfree(popt->iocharset);
- popt->iocharset = kstrdup("utf8", GFP_KERNEL);
- if (!popt->iocharset)
- return 0;
- break;
- case Opt_iocharset:
- kfree(popt->iocharset);
- popt->iocharset = match_strdup(&args[0]);
- if (!popt->iocharset)
- return 0;
- break;
+ case Opt_utf8:
+ kfree(popt->iocharset);
+ popt->iocharset = kstrdup("utf8", GFP_KERNEL);
+ if (!popt->iocharset)
+ return -ENOMEM;
+ break;
+ case Opt_iocharset:
+ kfree(popt->iocharset);
+ popt->iocharset = kstrdup(param->string, GFP_KERNEL);
+ if (!popt->iocharset)
+ return -ENOMEM;
+ break;
#endif
- case Opt_map_a:
- popt->map = 'a';
- break;
- case Opt_map_o:
- popt->map = 'o';
- break;
- case Opt_map_n:
- popt->map = 'n';
- break;
- case Opt_session:
- if (match_int(&args[0], &option))
- return 0;
- n = option;
- /*
- * Track numbers are supposed to be in range 1-99, the
- * mount option starts indexing at 0.
- */
- if (n >= 99)
- return 0;
- popt->session = n + 1;
- break;
- case Opt_sb:
- if (match_int(&args[0], &option))
- return 0;
- popt->sbsector = option;
- break;
- case Opt_check_r:
- popt->check = 'r';
- break;
- case Opt_check_s:
- popt->check = 's';
- break;
- case Opt_ignore:
- break;
- case Opt_uid:
- if (match_uint(&args[0], &uv))
- return 0;
- popt->uid = make_kuid(current_user_ns(), uv);
- if (!uid_valid(popt->uid))
- return 0;
- popt->uid_set = 1;
- break;
- case Opt_gid:
- if (match_uint(&args[0], &uv))
- return 0;
- popt->gid = make_kgid(current_user_ns(), uv);
- if (!gid_valid(popt->gid))
- return 0;
- popt->gid_set = 1;
- break;
- case Opt_mode:
- if (match_int(&args[0], &option))
- return 0;
- popt->fmode = option;
- break;
- case Opt_dmode:
- if (match_int(&args[0], &option))
- return 0;
- popt->dmode = option;
- break;
- case Opt_overriderockperm:
- popt->overriderockperm = 1;
- break;
- case Opt_block:
- if (match_int(&args[0], &option))
- return 0;
- n = option;
- if (n != 512 && n != 1024 && n != 2048)
- return 0;
- popt->blocksize = n;
- break;
- case Opt_nocompress:
- popt->nocompress = 1;
- break;
- default:
- return 0;
- }
+ case Opt_map:
+ popt->map = result.uint_32;
+ break;
+ case Opt_session:
+ n = result.uint_32;
+ /*
+ * Track numbers are supposed to be in range 1-99, the
+ * mount option starts indexing at 0.
+ */
+ if (n >= 99)
+ return -EINVAL;
+ popt->session = n + 1;
+ break;
+ case Opt_sb:
+ popt->sbsector = result.uint_32;
+ break;
+ case Opt_check:
+ popt->check = result.uint_32;
+ break;
+ case Opt_ignore:
+ break;
+ case Opt_uid:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ return -EINVAL;
+ popt->uid = uid;
+ popt->uid_set = 1;
+ break;
+ case Opt_gid:
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ return -EINVAL;
+ popt->gid = gid;
+ popt->gid_set = 1;
+ break;
+ case Opt_mode:
+ popt->fmode = result.uint_32;
+ break;
+ case Opt_dmode:
+ popt->dmode = result.uint_32;
+ break;
+ case Opt_overriderockperm:
+ popt->overriderockperm = 1;
+ break;
+ case Opt_block:
+ n = result.uint_32;
+ if (n != 512 && n != 1024 && n != 2048)
+ return -EINVAL;
+ popt->blocksize = n;
+ break;
+ case Opt_nocompress:
+ popt->nocompress = 1;
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
/*
@@ -615,7 +579,7 @@ static bool rootdir_empty(struct super_block *sb, unsigned long block)
/*
* Initialize the superblock and read the root inode.
*/
-static int isofs_fill_super(struct super_block *s, void *data, int silent)
+static int isofs_fill_super(struct super_block *s, struct fs_context *fc)
{
struct buffer_head *bh = NULL, *pri_bh = NULL;
struct hs_primary_descriptor *h_pri = NULL;
@@ -623,7 +587,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
struct iso_supplementary_descriptor *sec = NULL;
struct iso_directory_record *rootp;
struct inode *inode;
- struct iso9660_options opt;
+ struct isofs_options *opt = fc->fs_private;
struct isofs_sb_info *sbi;
unsigned long first_data_zone;
int joliet_level = 0;
@@ -631,15 +595,13 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
int orig_zonesize;
int table, error = -EINVAL;
unsigned int vol_desc_start;
+ int silent = fc->sb_flags & SB_SILENT;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
s->s_fs_info = sbi;
- if (!parse_options((char *)data, &opt))
- goto out_freesbi;
-
/*
* First of all, get the hardware blocksize for this device.
* If we don't know what it is, or the hardware blocksize is
@@ -655,14 +617,14 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
bdev_logical_block_size(s->s_bdev));
goto out_freesbi;
}
- opt.blocksize = sb_min_blocksize(s, opt.blocksize);
+ opt->blocksize = sb_min_blocksize(s, opt->blocksize);
sbi->s_high_sierra = 0; /* default is iso9660 */
- sbi->s_session = opt.session;
- sbi->s_sbsector = opt.sbsector;
+ sbi->s_session = opt->session;
+ sbi->s_sbsector = opt->sbsector;
- vol_desc_start = (opt.sbsector != -1) ?
- opt.sbsector : isofs_get_last_session(s,opt.session);
+ vol_desc_start = (opt->sbsector != -1) ?
+ opt->sbsector : isofs_get_last_session(s, opt->session);
for (iso_blknum = vol_desc_start+16;
iso_blknum < vol_desc_start+100; iso_blknum++) {
@@ -696,7 +658,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) {
sec = (struct iso_supplementary_descriptor *)vdp;
if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) {
- if (opt.joliet) {
+ if (opt->joliet) {
if (sec->escape[2] == 0x40)
joliet_level = 1;
else if (sec->escape[2] == 0x43)
@@ -721,7 +683,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
goto out_freebh;
sbi->s_high_sierra = 1;
- opt.rock = 0;
+ opt->rock = 0;
h_pri = (struct hs_primary_descriptor *)vdp;
goto root_found;
}
@@ -749,7 +711,7 @@ root_found:
goto out_freebh;
}
- if (joliet_level && (!pri || !opt.rock)) {
+ if (joliet_level && (!pri || !opt->rock)) {
/* This is the case of Joliet with the norock mount flag.
* A disc with both Joliet and Rock Ridge is handled later
*/
@@ -780,7 +742,7 @@ root_found:
* blocks that were 512 bytes (which should only very rarely
* happen.)
*/
- if (orig_zonesize < opt.blocksize)
+ if (orig_zonesize < opt->blocksize)
goto out_bad_size;
/* RDE: convert log zone size to bit shift */
@@ -865,10 +827,10 @@ root_found:
#ifdef CONFIG_JOLIET
if (joliet_level) {
- char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT;
+ char *p = opt->iocharset ? opt->iocharset : CONFIG_NLS_DEFAULT;
if (strcmp(p, "utf8") != 0) {
- sbi->s_nls_iocharset = opt.iocharset ?
- load_nls(opt.iocharset) : load_nls_default();
+ sbi->s_nls_iocharset = opt->iocharset ?
+ load_nls(opt->iocharset) : load_nls_default();
if (!sbi->s_nls_iocharset)
goto out_freesbi;
}
@@ -876,29 +838,29 @@ root_found:
#endif
s->s_op = &isofs_sops;
s->s_export_op = &isofs_export_ops;
- sbi->s_mapping = opt.map;
- sbi->s_rock = (opt.rock ? 2 : 0);
+ sbi->s_mapping = opt->map;
+ sbi->s_rock = (opt->rock ? 2 : 0);
sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/
- sbi->s_cruft = opt.cruft;
- sbi->s_hide = opt.hide;
- sbi->s_showassoc = opt.showassoc;
- sbi->s_uid = opt.uid;
- sbi->s_gid = opt.gid;
- sbi->s_uid_set = opt.uid_set;
- sbi->s_gid_set = opt.gid_set;
- sbi->s_nocompress = opt.nocompress;
- sbi->s_overriderockperm = opt.overriderockperm;
+ sbi->s_cruft = opt->cruft;
+ sbi->s_hide = opt->hide;
+ sbi->s_showassoc = opt->showassoc;
+ sbi->s_uid = opt->uid;
+ sbi->s_gid = opt->gid;
+ sbi->s_uid_set = opt->uid_set;
+ sbi->s_gid_set = opt->gid_set;
+ sbi->s_nocompress = opt->nocompress;
+ sbi->s_overriderockperm = opt->overriderockperm;
/*
* It would be incredibly stupid to allow people to mark every file
* on the disk as suid, so we merely allow them to set the default
* permissions.
*/
- if (opt.fmode != ISOFS_INVALID_MODE)
- sbi->s_fmode = opt.fmode & 0777;
+ if (opt->fmode != ISOFS_INVALID_MODE)
+ sbi->s_fmode = opt->fmode & 0777;
else
sbi->s_fmode = ISOFS_INVALID_MODE;
- if (opt.dmode != ISOFS_INVALID_MODE)
- sbi->s_dmode = opt.dmode & 0777;
+ if (opt->dmode != ISOFS_INVALID_MODE)
+ sbi->s_dmode = opt->dmode & 0777;
else
sbi->s_dmode = ISOFS_INVALID_MODE;
@@ -960,12 +922,12 @@ root_found:
}
}
- if (opt.check == 'u') {
+ if (opt->check == 'u') {
/* Only Joliet is case insensitive by default */
if (joliet_level)
- opt.check = 'r';
+ opt->check = 'r';
else
- opt.check = 's';
+ opt->check = 's';
}
sbi->s_joliet_level = joliet_level;
@@ -980,9 +942,9 @@ root_found:
table = 0;
if (joliet_level)
table += 2;
- if (opt.check == 'r')
+ if (opt->check == 'r')
table++;
- sbi->s_check = opt.check;
+ sbi->s_check = opt->check;
if (table)
s->s_d_op = &isofs_dentry_ops[table - 1];
@@ -994,7 +956,7 @@ root_found:
goto out_no_inode;
}
- kfree(opt.iocharset);
+ kfree(opt->iocharset);
return 0;
@@ -1023,7 +985,7 @@ out_bad_zone_size:
goto out_freebh;
out_bad_size:
printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n",
- orig_zonesize, opt.blocksize);
+ orig_zonesize, opt->blocksize);
goto out_freebh;
out_unknown_format:
if (!silent)
@@ -1033,7 +995,7 @@ out_freebh:
brelse(bh);
brelse(pri_bh);
out_freesbi:
- kfree(opt.iocharset);
+ kfree(opt->iocharset);
kfree(sbi);
s->s_fs_info = NULL;
return error;
@@ -1567,18 +1529,63 @@ struct inode *__isofs_iget(struct super_block *sb,
return inode;
}
-static struct dentry *isofs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int isofs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
+ return get_tree_bdev(fc, isofs_fill_super);
+}
+
+static void isofs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations isofs_context_ops = {
+ .parse_param = isofs_parse_param,
+ .get_tree = isofs_get_tree,
+ .reconfigure = isofs_reconfigure,
+ .free = isofs_free_fc,
+};
+
+static int isofs_init_fs_context(struct fs_context *fc)
+{
+ struct isofs_options *opt;
+
+ opt = kzalloc(sizeof(*opt), GFP_KERNEL);
+ if (!opt)
+ return -ENOMEM;
+
+ opt->map = 'n';
+ opt->rock = 1;
+ opt->joliet = 1;
+ opt->cruft = 0;
+ opt->hide = 0;
+ opt->showassoc = 0;
+ opt->check = 'u'; /* unset */
+ opt->nocompress = 0;
+ opt->blocksize = 1024;
+ opt->fmode = opt->dmode = ISOFS_INVALID_MODE;
+ opt->uid_set = 0;
+ opt->gid_set = 0;
+ opt->gid = GLOBAL_ROOT_GID;
+ opt->uid = GLOBAL_ROOT_UID;
+ opt->iocharset = NULL;
+ opt->overriderockperm = 0;
+ opt->session = -1;
+ opt->sbsector = -1;
+
+ fc->fs_private = opt;
+ fc->ops = &isofs_context_ops;
+
+ return 0;
}
static struct file_system_type iso9660_fs_type = {
.owner = THIS_MODULE,
.name = "iso9660",
- .mount = isofs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = isofs_init_fs_context,
+ .parameters = isofs_param_spec,
};
MODULE_ALIAS_FS("iso9660");
MODULE_ALIAS("iso9660");
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 1c97e64c4784..951f78634adf 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -337,8 +337,6 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
/* Checkpoint list management */
-enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP};
-
/*
* journal_shrink_one_cp_list
*
@@ -350,7 +348,7 @@ enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP};
* Called with j_list_lock held.
*/
static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
- enum shrink_type type,
+ enum jbd2_shrink_type type,
bool *released)
{
struct journal_head *last_jh;
@@ -367,12 +365,12 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
jh = next_jh;
next_jh = jh->b_cpnext;
- if (type == SHRINK_DESTROY) {
+ if (type == JBD2_SHRINK_DESTROY) {
ret = __jbd2_journal_remove_checkpoint(jh);
} else {
ret = jbd2_journal_try_remove_checkpoint(jh);
if (ret < 0) {
- if (type == SHRINK_BUSY_SKIP)
+ if (type == JBD2_SHRINK_BUSY_SKIP)
continue;
break;
}
@@ -439,7 +437,7 @@ again:
tid = transaction->t_tid;
freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list,
- SHRINK_BUSY_SKIP, &released);
+ JBD2_SHRINK_BUSY_SKIP, &released);
nr_freed += freed;
(*nr_to_scan) -= min(*nr_to_scan, freed);
if (*nr_to_scan == 0)
@@ -472,21 +470,25 @@ out:
* journal_clean_checkpoint_list
*
* Find all the written-back checkpoint buffers in the journal and release them.
- * If 'destroy' is set, release all buffers unconditionally.
+ * If 'type' is JBD2_SHRINK_DESTROY, release all buffers unconditionally. If
+ * 'type' is JBD2_SHRINK_BUSY_STOP, will stop release buffers if encounters a
+ * busy buffer. To avoid wasting CPU cycles scanning the buffer list in some
+ * cases, don't pass JBD2_SHRINK_BUSY_SKIP 'type' for this function.
*
* Called with j_list_lock held.
*/
-void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
+void __jbd2_journal_clean_checkpoint_list(journal_t *journal,
+ enum jbd2_shrink_type type)
{
transaction_t *transaction, *last_transaction, *next_transaction;
- enum shrink_type type;
bool released;
+ WARN_ON_ONCE(type == JBD2_SHRINK_BUSY_SKIP);
+
transaction = journal->j_checkpoint_transactions;
if (!transaction)
return;
- type = destroy ? SHRINK_DESTROY : SHRINK_BUSY_STOP;
last_transaction = transaction->t_cpprev;
next_transaction = transaction;
do {
@@ -527,7 +529,7 @@ void jbd2_journal_destroy_checkpoint(journal_t *journal)
spin_unlock(&journal->j_list_lock);
break;
}
- __jbd2_journal_clean_checkpoint_list(journal, true);
+ __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_DESTROY);
spin_unlock(&journal->j_list_lock);
cond_resched();
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 5e122586e06e..75ea4e9a5cab 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -501,7 +501,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* frees some memory
*/
spin_lock(&journal->j_list_lock);
- __jbd2_journal_clean_checkpoint_list(journal, false);
+ __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP);
spin_unlock(&journal->j_list_lock);
jbd2_debug(3, "JBD2: commit phase 1\n");
@@ -571,7 +571,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(commit_transaction->t_nr_buffers <=
atomic_read(&commit_transaction->t_outstanding_credits));
- err = 0;
bufs = 0;
descriptor = NULL;
while (commit_transaction->t_buffers) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b6c114c11b97..03c4b9214f56 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2009,7 +2009,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
byte_count = (block_stop - block_start + 1) *
journal->j_blocksize;
- truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
+ truncate_inode_pages_range(journal->j_dev->bd_mapping,
byte_start, byte_stop);
if (flags & JBD2_JOURNAL_FLUSH_DISCARD) {
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index e29f4edf9572..1358c21837f1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -206,7 +206,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
struct super_block *sb)
{
struct dentry *dentry;
- struct kernfs_node *knparent = NULL;
+ struct kernfs_node *knparent;
BUG_ON(sb->s_op != &kernfs_sops);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 127a728fcbc8..c11516801784 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -117,7 +117,6 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
if (nsm != NULL)
refcount_inc(&nsm->sm_count);
else {
- host = NULL;
nsm = nsm_get_handle(ni->net, ni->sap, ni->salen,
ni->hostname, ni->hostname_len);
if (unlikely(nsm == NULL)) {
diff --git a/fs/namei.c b/fs/namei.c
index cb5dde0e309f..37fb0a8aa09a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3676,9 +3676,9 @@ static int do_open(struct nameidata *nd,
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply pass @nop_mnt_idmap.
*/
-static int vfs_tmpfile(struct mnt_idmap *idmap,
- const struct path *parentpath,
- struct file *file, umode_t mode)
+int vfs_tmpfile(struct mnt_idmap *idmap,
+ const struct path *parentpath,
+ struct file *file, umode_t mode)
{
struct dentry *child;
struct inode *dir = d_inode(parentpath->dentry);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index f7e32d76e34d..57249f040dfc 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -33,12 +33,12 @@ config NFS_FS
config NFS_V2
tristate "NFS client support for NFS version 2"
depends on NFS_FS
- default y
+ default n
help
This option enables support for version 2 of the NFS protocol
(RFC 1094) in the kernel's NFS client.
- If unsure, say Y.
+ If unsure, say N.
config NFS_V3
tristate "NFS client support for NFS version 3"
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ac505671efbd..342930996226 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -56,6 +56,8 @@ static int nfs_readdir(struct file *, struct dir_context *);
static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
static loff_t nfs_llseek_dir(struct file *, loff_t, int);
static void nfs_readdir_clear_array(struct folio *);
+static int nfs_do_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, int open_flags);
const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
@@ -2243,6 +2245,41 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
#endif /* CONFIG_NFSV4 */
+int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned int open_flags,
+ umode_t mode)
+{
+
+ /* Same as look+open from lookup_open(), but with different O_TRUNC
+ * handling.
+ */
+ int error = 0;
+
+ if (open_flags & O_CREAT) {
+ file->f_mode |= FMODE_CREATED;
+ error = nfs_do_create(dir, dentry, mode, open_flags);
+ if (error)
+ return error;
+ return finish_open(file, dentry, NULL);
+ } else if (d_in_lookup(dentry)) {
+ /* The only flags nfs_lookup considers are
+ * LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and
+ * we want those to be zero so the lookup isn't skipped.
+ */
+ struct dentry *res = nfs_lookup(dir, dentry, 0);
+
+ d_lookup_done(dentry);
+ if (unlikely(res)) {
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+ return finish_no_open(file, res);
+ }
+ }
+ return finish_no_open(file, NULL);
+
+}
+EXPORT_SYMBOL_GPL(nfs_atomic_open_v23);
+
struct dentry *
nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
struct nfs_fattr *fattr)
@@ -2303,18 +2340,23 @@ EXPORT_SYMBOL_GPL(nfs_instantiate);
* that the operation succeeded on the server, but an error in the
* reply path made it appear to have failed.
*/
-int nfs_create(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode, bool excl)
+static int nfs_do_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, int open_flags)
{
struct iattr attr;
- int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;
int error;
+ open_flags |= O_CREAT;
+
dfprintk(VFS, "NFS: create(%s/%lu), %pd\n",
dir->i_sb->s_id, dir->i_ino, dentry);
attr.ia_mode = mode;
attr.ia_valid = ATTR_MODE;
+ if (open_flags & O_TRUNC) {
+ attr.ia_size = 0;
+ attr.ia_valid |= ATTR_SIZE;
+ }
trace_nfs_create_enter(dir, dentry, open_flags);
error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags);
@@ -2326,6 +2368,12 @@ out_err:
d_drop(dentry);
return error;
}
+
+int nfs_create(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
+{
+ return nfs_do_create(dir, dentry, mode, excl ? O_EXCL : 0);
+}
EXPORT_SYMBOL_GPL(nfs_create);
/*
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index ce8f8934bca5..29d84dc66ca3 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -605,14 +605,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
- /* FIXME: remove this check when layout segment support is added */
- if (lgr->range.offset != 0 ||
- lgr->range.length != NFS4_MAX_UINT64) {
- dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
- __func__);
- goto out;
- }
-
if (fl->pattern_offset > lgr->range.offset) {
dprintk("%s pattern_offset %lld too large\n",
__func__, fl->pattern_offset);
@@ -875,15 +867,15 @@ static void
filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- pnfs_generic_pg_check_layout(pgio);
+ pnfs_generic_pg_check_layout(pgio, req);
if (!pgio->pg_lseg) {
pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
nfs_req_openctx(req),
- 0,
- NFS4_MAX_UINT64,
+ req_offset(req),
+ req->wb_bytes,
IOMODE_READ,
false,
- GFP_KERNEL);
+ nfs_io_gfp_mask());
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
@@ -899,15 +891,15 @@ static void
filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- pnfs_generic_pg_check_layout(pgio);
+ pnfs_generic_pg_check_layout(pgio, req);
if (!pgio->pg_lseg) {
pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
nfs_req_openctx(req),
- 0,
- NFS4_MAX_UINT64,
+ req_offset(req),
+ req->wb_bytes,
IOMODE_RW,
false,
- GFP_NOFS);
+ nfs_io_gfp_mask());
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 3e724cb7ef01..24188af56d5b 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -823,14 +823,6 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
}
static void
-ff_layout_pg_check_layout(struct nfs_pageio_descriptor *pgio,
- struct nfs_page *req)
-{
- pnfs_generic_pg_check_layout(pgio);
- pnfs_generic_pg_check_range(pgio, req);
-}
-
-static void
ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
@@ -840,7 +832,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
u32 ds_idx;
retry:
- ff_layout_pg_check_layout(pgio, req);
+ pnfs_generic_pg_check_layout(pgio, req);
/* Use full layout for now */
if (!pgio->pg_lseg) {
ff_layout_pg_get_read(pgio, req, false);
@@ -895,7 +887,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
u32 i;
retry:
- ff_layout_pg_check_layout(pgio, req);
+ pnfs_generic_pg_check_layout(pgio, req);
if (!pgio->pg_lseg) {
pgio->pg_lseg =
pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index d0a0956f8a13..6c9f3f6645dd 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -600,9 +600,11 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
break;
case Opt_lock:
if (result.negated) {
+ ctx->lock_status = NFS_LOCK_NOLOCK;
ctx->flags |= NFS_MOUNT_NONLM;
ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL);
} else {
+ ctx->lock_status = NFS_LOCK_LOCK;
ctx->flags &= ~NFS_MOUNT_NONLM;
ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL);
}
@@ -1112,9 +1114,12 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
ctx->acdirmax = data->acdirmax;
ctx->need_mount = false;
- memcpy(sap, &data->addr, sizeof(data->addr));
- ctx->nfs_server.addrlen = sizeof(data->addr);
- ctx->nfs_server.port = ntohs(data->addr.sin_port);
+ if (!is_remount_fc(fc)) {
+ memcpy(sap, &data->addr, sizeof(data->addr));
+ ctx->nfs_server.addrlen = sizeof(data->addr);
+ ctx->nfs_server.port = ntohs(data->addr.sin_port);
+ }
+
if (sap->ss_family != AF_INET ||
!nfs_verify_server_address(sap))
goto out_no_address;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 06253695fe53..9f0f4534744b 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -112,6 +112,7 @@ struct nfs_fs_context {
unsigned short protofamily;
unsigned short mountfamily;
bool has_sec_mnt_opts;
+ int lock_status;
struct {
union {
@@ -153,6 +154,12 @@ struct nfs_fs_context {
} clone_data;
};
+enum nfs_lock_status {
+ NFS_LOCK_NOT_SET = 0,
+ NFS_LOCK_LOCK = 1,
+ NFS_LOCK_NOLOCK = 2,
+};
+
#define nfs_errorf(fc, fmt, ...) ((fc)->log.log ? \
errorf(fc, fmt, ## __VA_ARGS__) : \
({ dprintk(fmt "\n", ## __VA_ARGS__); }))
@@ -710,9 +717,9 @@ unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
if ((bsize & (bsize - 1)) || nrbitsp) {
unsigned char nrbits;
- for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--)
+ for (nrbits = 31; nrbits && !(bsize & (1UL << nrbits)); nrbits--)
;
- bsize = 1 << nrbits;
+ bsize = 1UL << nrbits;
if (nrbitsp)
*nrbitsp = nrbits;
}
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 5aa776b5a3e7..b17a9eb9b148 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -46,10 +46,7 @@ static inline void nfs_add_stats(const struct inode *inode,
nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
}
-static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
-{
- return alloc_percpu(struct nfs_iostats);
-}
+#define nfs_alloc_iostats() alloc_percpu(struct nfs_iostats)
static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
{
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index cbbe3f0193b8..74bda639a7cf 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -986,6 +986,7 @@ static int nfs3_have_delegation(struct inode *inode, fmode_t flags)
static const struct inode_operations nfs3_dir_inode_operations = {
.create = nfs_create,
+ .atomic_open = nfs_atomic_open_v23,
.lookup = nfs_lookup,
.link = nfs_link,
.unlink = nfs_unlink,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ea390db94b62..c93c12063b3a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5456,7 +5456,7 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task,
struct rpc_message *msg = &task->tk_msg;
if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS] &&
- server->caps & NFS_CAP_READ_PLUS && task->tk_status == -ENOTSUPP) {
+ task->tk_status == -ENOTSUPP) {
server->caps &= ~NFS_CAP_READ_PLUS;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
rpc_restart_call_prepare(task);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 662e86ea3a2d..5b452411e8fd 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2116,6 +2116,7 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_fs_locations *locations = NULL;
+ struct nfs_fattr *fattr;
struct inode *inode;
struct page *page;
int status, result;
@@ -2125,19 +2126,16 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
(unsigned long long)server->fsid.minor,
clp->cl_hostname);
- result = 0;
page = alloc_page(GFP_KERNEL);
locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
- if (page == NULL || locations == NULL) {
- dprintk("<-- %s: no memory\n", __func__);
- goto out;
- }
- locations->fattr = nfs_alloc_fattr();
- if (locations->fattr == NULL) {
+ fattr = nfs_alloc_fattr();
+ if (page == NULL || locations == NULL || fattr == NULL) {
dprintk("<-- %s: no memory\n", __func__);
+ result = 0;
goto out;
}
+ locations->fattr = fattr;
inode = d_inode(server->super->s_root);
result = nfs4_proc_get_locations(server, NFS_FH(inode), locations,
page, cred);
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 10985a4b8259..4de8780a7c48 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -47,7 +47,7 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event,
TP_fast_assign(
__entry->error = error < 0 ? -error : 0;
- __assign_str(dstaddr, clp->cl_hostname);
+ __assign_str(dstaddr);
),
TP_printk(
@@ -94,8 +94,8 @@ TRACE_EVENT(nfs4_trunked_exchange_id,
TP_fast_assign(
__entry->error = error < 0 ? -error : 0;
- __assign_str(main_addr, clp->cl_hostname);
- __assign_str(trunk_addr, addr);
+ __assign_str(main_addr);
+ __assign_str(trunk_addr);
),
TP_printk(
@@ -365,7 +365,7 @@ TRACE_EVENT(nfs4_state_mgr,
TP_fast_assign(
__entry->state = clp->cl_state;
- __assign_str(hostname, clp->cl_hostname);
+ __assign_str(hostname);
),
TP_printk(
@@ -393,8 +393,8 @@ TRACE_EVENT(nfs4_state_mgr_failed,
TP_fast_assign(
__entry->error = status < 0 ? -status : 0;
__entry->state = clp->cl_state;
- __assign_str(hostname, clp->cl_hostname);
- __assign_str(section, section);
+ __assign_str(hostname);
+ __assign_str(section);
),
TP_printk(
@@ -578,7 +578,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__entry->fhandle = 0;
}
__entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent));
- __assign_str(name, ctx->dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -1072,7 +1072,7 @@ DECLARE_EVENT_CLASS(nfs4_lookup_event,
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->error = -error;
- __assign_str(name, name->name);
+ __assign_str(name);
),
TP_printk(
@@ -1156,8 +1156,8 @@ TRACE_EVENT(nfs4_rename,
__entry->olddir = NFS_FILEID(olddir);
__entry->newdir = NFS_FILEID(newdir);
__entry->error = error < 0 ? -error : 0;
- __assign_str(oldname, oldname->name);
- __assign_str(newname, newname->name);
+ __assign_str(oldname);
+ __assign_str(newname);
),
TP_printk(
@@ -1359,7 +1359,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
__entry->fileid = 0;
__entry->dev = 0;
}
- __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown");
+ __assign_str(dstaddr);
),
TP_printk(
@@ -1416,7 +1416,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
__entry->fileid = 0;
__entry->dev = 0;
}
- __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown");
+ __assign_str(dstaddr);
__entry->stateid_seq =
be32_to_cpu(stateid->seqid);
__entry->stateid_hash =
@@ -1960,7 +1960,7 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_event,
),
TP_fast_assign(
- __assign_str(dstaddr, clp->cl_hostname);
+ __assign_str(dstaddr);
memcpy(__entry->deviceid, deviceid->data,
NFS4_DEVICEID4_SIZE);
),
@@ -1998,7 +1998,7 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_status,
TP_fast_assign(
__entry->dev = server->s_dev;
__entry->status = status;
- __assign_str(dstaddr, server->nfs_client->cl_hostname);
+ __assign_str(dstaddr);
memcpy(__entry->deviceid, deviceid->data,
NFS4_DEVICEID4_SIZE);
),
@@ -2036,8 +2036,8 @@ TRACE_EVENT(fl_getdevinfo,
),
TP_fast_assign(
- __assign_str(mds_addr, server->nfs_client->cl_hostname);
- __assign_str(ds_ips, ds_remotestr);
+ __assign_str(mds_addr);
+ __assign_str(ds_ips);
memcpy(__entry->deviceid, deviceid->data,
NFS4_DEVICEID4_SIZE);
),
@@ -2083,9 +2083,7 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
be32_to_cpu(hdr->args.stateid.seqid);
__entry->stateid_hash =
nfs_stateid_hash(&hdr->args.stateid);
- __assign_str(dstaddr, hdr->ds_clp ?
- rpc_peeraddr2str(hdr->ds_clp->cl_rpcclient,
- RPC_DISPLAY_ADDR) : "unknown");
+ __assign_str(dstaddr);
),
TP_printk(
@@ -2139,9 +2137,7 @@ TRACE_EVENT(ff_layout_commit_error,
__entry->dev = inode->i_sb->s_dev;
__entry->offset = data->args.offset;
__entry->count = data->args.count;
- __assign_str(dstaddr, data->ds_clp ?
- rpc_peeraddr2str(data->ds_clp->cl_rpcclient,
- RPC_DISPLAY_ADDR) : "unknown");
+ __assign_str(dstaddr);
),
TP_printk(
@@ -2579,7 +2575,7 @@ DECLARE_EVENT_CLASS(nfs4_xattr_event,
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk(
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index afedb449b54f..1e710654af11 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -409,7 +409,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event,
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
__entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry));
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -457,7 +457,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done,
__entry->error = error < 0 ? -error : 0;
__entry->flags = flags;
__entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry));
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -512,7 +512,7 @@ TRACE_EVENT(nfs_atomic_open_enter,
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
__entry->fmode = (__force unsigned long)ctx->mode;
- __assign_str(name, ctx->dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -551,7 +551,7 @@ TRACE_EVENT(nfs_atomic_open_exit,
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
__entry->fmode = (__force unsigned long)ctx->mode;
- __assign_str(name, ctx->dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -587,7 +587,7 @@ TRACE_EVENT(nfs_create_enter,
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -623,7 +623,7 @@ TRACE_EVENT(nfs_create_exit,
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -654,7 +654,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event,
TP_fast_assign(
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -693,7 +693,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event_done,
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->error = error < 0 ? -error : 0;
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -747,7 +747,7 @@ TRACE_EVENT(nfs_link_enter,
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->dir = NFS_FILEID(dir);
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -783,7 +783,7 @@ TRACE_EVENT(nfs_link_exit,
__entry->fileid = NFS_FILEID(inode);
__entry->dir = NFS_FILEID(dir);
__entry->error = error < 0 ? -error : 0;
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -819,8 +819,8 @@ DECLARE_EVENT_CLASS(nfs_rename_event,
__entry->dev = old_dir->i_sb->s_dev;
__entry->old_dir = NFS_FILEID(old_dir);
__entry->new_dir = NFS_FILEID(new_dir);
- __assign_str(old_name, old_dentry->d_name.name);
- __assign_str(new_name, new_dentry->d_name.name);
+ __assign_str(old_name);
+ __assign_str(new_name);
),
TP_printk(
@@ -868,8 +868,8 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done,
__entry->error = -error;
__entry->old_dir = NFS_FILEID(old_dir);
__entry->new_dir = NFS_FILEID(new_dir);
- __assign_str(old_name, old_dentry->d_name.name);
- __assign_str(new_name, new_dentry->d_name.name);
+ __assign_str(old_name);
+ __assign_str(new_name);
),
TP_printk(
@@ -1636,8 +1636,8 @@ TRACE_EVENT(nfs_mount_assign,
),
TP_fast_assign(
- __assign_str(option, option);
- __assign_str(value, value);
+ __assign_str(option);
+ __assign_str(value);
),
TP_printk("option %s=%s",
@@ -1657,7 +1657,7 @@ TRACE_EVENT(nfs_mount_option,
),
TP_fast_assign(
- __assign_str(option, param->key);
+ __assign_str(option);
),
TP_printk("option %s", __get_str(option))
@@ -1675,7 +1675,7 @@ TRACE_EVENT(nfs_mount_path,
),
TP_fast_assign(
- __assign_str(path, path);
+ __assign_str(path);
),
TP_printk("path='%s'", __get_str(path))
@@ -1710,9 +1710,8 @@ DECLARE_EVENT_CLASS(nfs_xdr_event,
__entry->xid = be32_to_cpu(rqstp->rq_xid);
__entry->version = task->tk_client->cl_vers;
__entry->error = error;
- __assign_str(program,
- task->tk_client->cl_program->name);
- __assign_str(procedure, task->tk_msg.rpc_proc->p_name);
+ __assign_str(program);
+ __assign_str(procedure);
),
TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a5cc6199127f..b5834728f31b 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2705,38 +2705,28 @@ pnfs_layout_return_unused_byclid(struct nfs_client *clp,
&range);
}
+/* Check if we have we have a valid layout but if there isn't an intersection
+ * between the request and the pgio->pg_lseg, put this pgio->pg_lseg away.
+ */
void
-pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio)
+pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
{
if (pgio->pg_lseg == NULL ||
- test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags))
+ (test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags) &&
+ pnfs_lseg_request_intersecting(pgio->pg_lseg, req)))
return;
pnfs_put_lseg(pgio->pg_lseg);
pgio->pg_lseg = NULL;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout);
-/*
- * Check for any intersection between the request and the pgio->pg_lseg,
- * and if none, put this pgio->pg_lseg away.
- */
-void
-pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
-{
- if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) {
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
- }
-}
-EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range);
-
void
pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
u64 rd_size;
- pnfs_generic_pg_check_layout(pgio);
- pnfs_generic_pg_check_range(pgio, req);
+ pnfs_generic_pg_check_layout(pgio, req);
if (pgio->pg_lseg == NULL) {
if (pgio->pg_dreq == NULL)
rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
@@ -2766,8 +2756,7 @@ void
pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req, u64 wb_size)
{
- pnfs_generic_pg_check_layout(pgio);
- pnfs_generic_pg_check_range(pgio, req);
+ pnfs_generic_pg_check_layout(pgio, req);
if (pgio->pg_lseg == NULL) {
pgio->pg_lseg =
pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index db57a85500ee..fa5beeaaf5da 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -257,8 +257,7 @@ void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
void unset_pnfs_layoutdriver(struct nfs_server *);
-void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio);
-void pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req);
+void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio, struct nfs_page *req);
void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ad3a321ae997..d105e5b2659d 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -695,6 +695,7 @@ static int nfs_have_delegation(struct inode *inode, fmode_t flags)
static const struct inode_operations nfs_dir_inode_operations = {
.create = nfs_create,
.lookup = nfs_lookup,
+ .atomic_open = nfs_atomic_open_v23,
.link = nfs_link,
.unlink = nfs_unlink,
.symlink = nfs_symlink,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index dc03f98f7616..cbbd4866b0b7 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -901,6 +901,16 @@ static struct nfs_server *nfs_try_mount_request(struct fs_context *fc)
rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS];
unsigned int authlist_len = ARRAY_SIZE(authlist);
+ /* make sure 'nolock'/'lock' override the 'local_lock' mount option */
+ if (ctx->lock_status) {
+ if (ctx->lock_status == NFS_LOCK_NOLOCK) {
+ ctx->flags |= NFS_MOUNT_NONLM;
+ ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL);
+ } else {
+ ctx->flags &= ~NFS_MOUNT_NONLM;
+ ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL);
+ }
+ }
status = nfs_request_mount(fc, ctx->mntfh, authlist, &authlist_len);
if (status)
return ERR_PTR(status);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 7b641095a665..50b3135d07ac 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -334,21 +334,25 @@ static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
static int export_stats_init(struct export_stats *stats)
{
stats->start_time = ktime_get_seconds();
- return nfsd_percpu_counters_init(stats->counter, EXP_STATS_COUNTERS_NUM);
+ return percpu_counter_init_many(stats->counter, 0, GFP_KERNEL,
+ EXP_STATS_COUNTERS_NUM);
}
static void export_stats_reset(struct export_stats *stats)
{
- if (stats)
- nfsd_percpu_counters_reset(stats->counter,
- EXP_STATS_COUNTERS_NUM);
+ if (stats) {
+ int i;
+
+ for (i = 0; i < EXP_STATS_COUNTERS_NUM; i++)
+ percpu_counter_set(&stats->counter[i], 0);
+ }
}
static void export_stats_destroy(struct export_stats *stats)
{
if (stats)
- nfsd_percpu_counters_destroy(stats->counter,
- EXP_STATS_COUNTERS_NUM);
+ percpu_counter_destroy_many(stats->counter,
+ EXP_STATS_COUNTERS_NUM);
}
static void svc_export_put(struct kref *ref)
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ddd3e0d9cfa6..ad9083ca144b 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -159,8 +159,8 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode)
do {
fsnotify_group_lock(nfsd_file_fsnotify_group);
- mark = fsnotify_find_mark(&inode->i_fsnotify_marks,
- nfsd_file_fsnotify_group);
+ mark = fsnotify_find_inode_mark(inode,
+ nfsd_file_fsnotify_group);
if (mark) {
nfm = nfsd_file_mark_get(container_of(mark,
struct nfsd_file_mark,
diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c
index 0e1d635ec5f9..62d2586d9902 100644
--- a/fs/nfsd/netlink.c
+++ b/fs/nfsd/netlink.c
@@ -10,6 +10,36 @@
#include <uapi/linux/nfsd_netlink.h>
+/* Common nested types */
+const struct nla_policy nfsd_sock_nl_policy[NFSD_A_SOCK_TRANSPORT_NAME + 1] = {
+ [NFSD_A_SOCK_ADDR] = { .type = NLA_BINARY, },
+ [NFSD_A_SOCK_TRANSPORT_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1] = {
+ [NFSD_A_VERSION_MAJOR] = { .type = NLA_U32, },
+ [NFSD_A_VERSION_MINOR] = { .type = NLA_U32, },
+ [NFSD_A_VERSION_ENABLED] = { .type = NLA_FLAG, },
+};
+
+/* NFSD_CMD_THREADS_SET - do */
+static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_SCOPE + 1] = {
+ [NFSD_A_SERVER_THREADS] = { .type = NLA_U32, },
+ [NFSD_A_SERVER_GRACETIME] = { .type = NLA_U32, },
+ [NFSD_A_SERVER_LEASETIME] = { .type = NLA_U32, },
+ [NFSD_A_SERVER_SCOPE] = { .type = NLA_NUL_STRING, },
+};
+
+/* NFSD_CMD_VERSION_SET - do */
+static const struct nla_policy nfsd_version_set_nl_policy[NFSD_A_SERVER_PROTO_VERSION + 1] = {
+ [NFSD_A_SERVER_PROTO_VERSION] = NLA_POLICY_NESTED(nfsd_version_nl_policy),
+};
+
+/* NFSD_CMD_LISTENER_SET - do */
+static const struct nla_policy nfsd_listener_set_nl_policy[NFSD_A_SERVER_SOCK_ADDR + 1] = {
+ [NFSD_A_SERVER_SOCK_ADDR] = NLA_POLICY_NESTED(nfsd_sock_nl_policy),
+};
+
/* Ops table for nfsd */
static const struct genl_split_ops nfsd_nl_ops[] = {
{
@@ -19,6 +49,42 @@ static const struct genl_split_ops nfsd_nl_ops[] = {
.done = nfsd_nl_rpc_status_get_done,
.flags = GENL_CMD_CAP_DUMP,
},
+ {
+ .cmd = NFSD_CMD_THREADS_SET,
+ .doit = nfsd_nl_threads_set_doit,
+ .policy = nfsd_threads_set_nl_policy,
+ .maxattr = NFSD_A_SERVER_SCOPE,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NFSD_CMD_THREADS_GET,
+ .doit = nfsd_nl_threads_get_doit,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NFSD_CMD_VERSION_SET,
+ .doit = nfsd_nl_version_set_doit,
+ .policy = nfsd_version_set_nl_policy,
+ .maxattr = NFSD_A_SERVER_PROTO_VERSION,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NFSD_CMD_VERSION_GET,
+ .doit = nfsd_nl_version_get_doit,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NFSD_CMD_LISTENER_SET,
+ .doit = nfsd_nl_listener_set_doit,
+ .policy = nfsd_listener_set_nl_policy,
+ .maxattr = NFSD_A_SERVER_SOCK_ADDR,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NFSD_CMD_LISTENER_GET,
+ .doit = nfsd_nl_listener_get_doit,
+ .flags = GENL_CMD_CAP_DO,
+ },
};
struct genl_family nfsd_nl_family __ro_after_init = {
diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h
index d83dd6bdee92..e3724637d64d 100644
--- a/fs/nfsd/netlink.h
+++ b/fs/nfsd/netlink.h
@@ -11,11 +11,21 @@
#include <uapi/linux/nfsd_netlink.h>
+/* Common nested types */
+extern const struct nla_policy nfsd_sock_nl_policy[NFSD_A_SOCK_TRANSPORT_NAME + 1];
+extern const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1];
+
int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb);
int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb);
int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
struct netlink_callback *cb);
+int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info);
+int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info);
+int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info);
+int nfsd_nl_version_get_doit(struct sk_buff *skb, struct genl_info *info);
+int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info);
+int nfsd_nl_listener_get_doit(struct sk_buff *skb, struct genl_info *info);
extern struct genl_family nfsd_nl_family;
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index d4be519b5734..14ec15656320 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -218,6 +218,7 @@ struct nfsd_net {
/* Simple check to find out if a given net was properly initialized */
#define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl)
+extern bool nfsd_support_version(int vers);
extern void nfsd_netns_free_versions(struct nfsd_net *nn);
extern unsigned int nfsd_net_id;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index e88aca0c6e8e..d756f443fc44 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -978,12 +978,12 @@ static int max_cb_time(struct net *net)
return max(((u32)nn->nfsd4_lease)/10, 1u) * HZ;
}
-static struct workqueue_struct *callback_wq;
-
static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
{
- trace_nfsd_cb_queue(cb->cb_clp, cb);
- return queue_work(callback_wq, &cb->cb_work);
+ struct nfs4_client *clp = cb->cb_clp;
+
+ trace_nfsd_cb_queue(clp, cb);
+ return queue_work(clp->cl_callback_wq, &cb->cb_work);
}
static void nfsd41_cb_inflight_begin(struct nfs4_client *clp)
@@ -1153,7 +1153,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp)
void nfsd4_probe_callback_sync(struct nfs4_client *clp)
{
nfsd4_probe_callback(clp);
- flush_workqueue(callback_wq);
+ flush_workqueue(clp->cl_callback_wq);
}
void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
@@ -1372,19 +1372,6 @@ static const struct rpc_call_ops nfsd4_cb_ops = {
.rpc_release = nfsd4_cb_release,
};
-int nfsd4_create_callback_queue(void)
-{
- callback_wq = alloc_ordered_workqueue("nfsd4_callbacks", 0);
- if (!callback_wq)
- return -ENOMEM;
- return 0;
-}
-
-void nfsd4_destroy_callback_queue(void)
-{
- destroy_workqueue(callback_wq);
-}
-
/* must be called under the state lock */
void nfsd4_shutdown_callback(struct nfs4_client *clp)
{
@@ -1398,7 +1385,7 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
* client, destroy the rpc client, and stop:
*/
nfsd4_run_cb(&clp->cl_cb_null);
- flush_workqueue(callback_wq);
+ flush_workqueue(clp->cl_callback_wq);
nfsd41_cb_inflight_wait_complete(clp);
}
@@ -1420,9 +1407,9 @@ static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
/*
* Note there isn't a lot of locking in this code; instead we depend on
- * the fact that it is run from the callback_wq, which won't run two
- * work items at once. So, for example, callback_wq handles all access
- * of cl_cb_client and all calls to rpc_create or rpc_shutdown_client.
+ * the fact that it is run from clp->cl_callback_wq, which won't run two
+ * work items at once. So, for example, clp->cl_callback_wq handles all
+ * access of cl_cb_client and all calls to rpc_create or rpc_shutdown_client.
*/
static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
{
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 2927b1263f08..46bd20fe5c0f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1737,7 +1737,7 @@ static void cleanup_async_copy(struct nfsd4_copy *copy)
nfs4_put_copy(copy);
}
-static void nfsd4_send_cb_offload(struct nfsd4_copy *copy, __be32 nfserr)
+static void nfsd4_send_cb_offload(struct nfsd4_copy *copy)
{
struct nfsd4_cb_offload *cbo;
@@ -1747,12 +1747,12 @@ static void nfsd4_send_cb_offload(struct nfsd4_copy *copy, __be32 nfserr)
memcpy(&cbo->co_res, &copy->cp_res, sizeof(copy->cp_res));
memcpy(&cbo->co_fh, &copy->fh, sizeof(copy->fh));
- cbo->co_nfserr = nfserr;
+ cbo->co_nfserr = copy->nfserr;
nfsd4_init_cb(&cbo->co_cb, copy->cp_clp, &nfsd4_cb_offload_ops,
NFSPROC4_CLNT_CB_OFFLOAD);
trace_nfsd_cb_offload(copy->cp_clp, &cbo->co_res.cb_stateid,
- &cbo->co_fh, copy->cp_count, nfserr);
+ &cbo->co_fh, copy->cp_count, copy->nfserr);
nfsd4_run_cb(&cbo->co_cb);
}
@@ -1766,7 +1766,6 @@ static void nfsd4_send_cb_offload(struct nfsd4_copy *copy, __be32 nfserr)
static int nfsd4_do_async_copy(void *data)
{
struct nfsd4_copy *copy = (struct nfsd4_copy *)data;
- __be32 nfserr;
trace_nfsd_copy_do_async(copy);
if (nfsd4_ssc_is_inter(copy)) {
@@ -1777,24 +1776,25 @@ static int nfsd4_do_async_copy(void *data)
if (IS_ERR(filp)) {
switch (PTR_ERR(filp)) {
case -EBADF:
- nfserr = nfserr_wrong_type;
+ copy->nfserr = nfserr_wrong_type;
break;
default:
- nfserr = nfserr_offload_denied;
+ copy->nfserr = nfserr_offload_denied;
}
/* ss_mnt will be unmounted by the laundromat */
goto do_callback;
}
- nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file,
- false);
+ copy->nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file,
+ false);
nfsd4_cleanup_inter_ssc(copy->ss_nsui, filp, copy->nf_dst);
} else {
- nfserr = nfsd4_do_copy(copy, copy->nf_src->nf_file,
- copy->nf_dst->nf_file, false);
+ copy->nfserr = nfsd4_do_copy(copy, copy->nf_src->nf_file,
+ copy->nf_dst->nf_file, false);
}
do_callback:
- nfsd4_send_cb_offload(copy, nfserr);
+ set_bit(NFSD4_COPY_F_COMPLETED, &copy->cp_flags);
+ nfsd4_send_cb_offload(copy);
cleanup_async_copy(copy);
return 0;
}
@@ -1807,6 +1807,13 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfsd4_copy *async_copy = NULL;
+ /*
+ * Currently, async COPY is not reliable. Force all COPY
+ * requests to be synchronous to avoid client application
+ * hangs waiting for COPY completion.
+ */
+ nfsd4_copy_set_sync(copy, true);
+
copy->cp_clp = cstate->clp;
if (nfsd4_ssc_is_inter(copy)) {
trace_nfsd_copy_inter(copy);
@@ -2003,11 +2010,16 @@ nfsd4_offload_status(struct svc_rqst *rqstp,
struct nfsd4_copy *copy;
struct nfs4_client *clp = cstate->clp;
+ os->completed = false;
spin_lock(&clp->async_lock);
copy = find_async_copy_locked(clp, &os->stateid);
- if (copy)
+ if (copy) {
os->count = copy->cp_res.wr_bytes_written;
- else
+ if (test_bit(NFSD4_COPY_F_COMPLETED, &copy->cp_flags)) {
+ os->completed = true;
+ os->status = copy->nfserr;
+ }
+ } else
status = nfserr_bad_stateid;
spin_unlock(&clp->async_lock);
@@ -2154,6 +2166,29 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status == nfserr_same ? nfs_ok : status;
}
+static __be32
+nfsd4_get_dir_delegation(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation;
+
+ /*
+ * RFC 8881, section 18.39.3 says:
+ *
+ * "The server may refuse to grant the delegation. In that case, the
+ * server will return NFS4ERR_DIRDELEG_UNAVAIL."
+ *
+ * This is sub-optimal, since it means that the server would need to
+ * abort compound processing just because the delegation wasn't
+ * available. RFC8881bis should change this to allow the server to
+ * return NFS4_OK with a non-fatal status of GDD4_UNAVAIL in this
+ * situation.
+ */
+ gdd->gddrnf_status = GDD4_UNAVAIL;
+ return nfs_ok;
+}
+
#ifdef CONFIG_NFSD_PNFS
static const struct nfsd4_layout_ops *
nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
@@ -3082,6 +3117,18 @@ static u32 nfsd4_copy_notify_rsize(const struct svc_rqst *rqstp,
* sizeof(__be32);
}
+static u32 nfsd4_get_dir_delegation_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size +
+ 1 /* gddr_status */ +
+ op_encode_verifier_maxsz +
+ op_encode_stateid_maxsz +
+ 2 /* gddr_notification */ +
+ 2 /* gddr_child_attributes */ +
+ 2 /* gddr_dir_attributes */);
+}
+
#ifdef CONFIG_NFSD_PNFS
static u32 nfsd4_getdeviceinfo_rsize(const struct svc_rqst *rqstp,
const struct nfsd4_op *op)
@@ -3470,6 +3517,12 @@ static const struct nfsd4_operation nfsd4_ops[] = {
.op_get_currentstateid = nfsd4_get_freestateid,
.op_rsize_bop = nfsd4_only_status_rsize,
},
+ [OP_GET_DIR_DELEGATION] = {
+ .op_func = nfsd4_get_dir_delegation,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_GET_DIR_DELEGATION",
+ .op_rsize_bop = nfsd4_get_dir_delegation_rsize,
+ },
#ifdef CONFIG_NFSD_PNFS
[OP_GETDEVICEINFO] = {
.op_func = nfsd4_getdeviceinfo,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 84d4093ca713..a20c2c9d7d45 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -541,7 +541,7 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner)
}
static struct nfs4_openowner *
-find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open,
+find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open,
struct nfs4_client *clp)
{
struct nfs4_stateowner *so;
@@ -558,18 +558,6 @@ find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open,
return NULL;
}
-static struct nfs4_openowner *
-find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open,
- struct nfs4_client *clp)
-{
- struct nfs4_openowner *oo;
-
- spin_lock(&clp->cl_lock);
- oo = find_openstateowner_str_locked(hashval, open, clp);
- spin_unlock(&clp->cl_lock);
- return oo;
-}
-
static inline u32
opaque_hashval(const void *ptr, int nbytes)
{
@@ -1409,11 +1397,16 @@ static void
recalculate_deny_mode(struct nfs4_file *fp)
{
struct nfs4_ol_stateid *stp;
+ u32 old_deny;
spin_lock(&fp->fi_lock);
+ old_deny = fp->fi_share_deny;
fp->fi_share_deny = 0;
- list_for_each_entry(stp, &fp->fi_stateids, st_perfile)
+ list_for_each_entry(stp, &fp->fi_stateids, st_perfile) {
fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap);
+ if (fp->fi_share_deny == old_deny)
+ break;
+ }
spin_unlock(&fp->fi_lock);
}
@@ -2245,6 +2238,10 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name,
GFP_KERNEL);
if (!clp->cl_ownerstr_hashtbl)
goto err_no_hashtbl;
+ clp->cl_callback_wq = alloc_ordered_workqueue("nfsd4_callbacks", 0);
+ if (!clp->cl_callback_wq)
+ goto err_no_callback_wq;
+
for (i = 0; i < OWNER_HASH_SIZE; i++)
INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[i]);
INIT_LIST_HEAD(&clp->cl_sessions);
@@ -2267,6 +2264,8 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name,
spin_lock_init(&clp->cl_lock);
rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
return clp;
+err_no_callback_wq:
+ kfree(clp->cl_ownerstr_hashtbl);
err_no_hashtbl:
kfree(clp->cl_name.data);
err_no_name:
@@ -2280,6 +2279,7 @@ static void __free_client(struct kref *k)
struct nfs4_client *clp = container_of(c, struct nfs4_client, cl_nfsdfs);
free_svc_cred(&clp->cl_cred);
+ destroy_workqueue(clp->cl_callback_wq);
kfree(clp->cl_ownerstr_hashtbl);
kfree(clp->cl_name.data);
kfree(clp->cl_nii_domain.data);
@@ -2352,7 +2352,11 @@ unhash_client(struct nfs4_client *clp)
static __be32 mark_client_expired_locked(struct nfs4_client *clp)
{
- if (atomic_read(&clp->cl_rpc_users))
+ int users = atomic_read(&clp->cl_rpc_users);
+
+ trace_nfsd_mark_client_expired(clp, users);
+
+ if (users)
return nfserr_jukebox;
unhash_client_locked(clp);
return nfs_ok;
@@ -3641,12 +3645,8 @@ out_nolock:
return status;
}
-static __be32
-check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
+static __be32 check_slot_seqid(u32 seqid, u32 slot_seqid, bool slot_inuse)
{
- dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid,
- slot_seqid);
-
/* The slot is in use, and no response has been sent. */
if (slot_inuse) {
if (seqid == slot_seqid)
@@ -3823,10 +3823,13 @@ nfsd4_create_session(struct svc_rqst *rqstp,
}
/* RFC 8881 Section 18.36.4 Phase 2: Sequence ID processing. */
- if (conf)
+ if (conf) {
cs_slot = &conf->cl_cs_slot;
- else
+ trace_nfsd_slot_seqid_conf(conf, cr_ses);
+ } else {
cs_slot = &unconf->cl_cs_slot;
+ trace_nfsd_slot_seqid_unconf(unconf, cr_ses);
+ }
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
switch (status) {
case nfs_ok:
@@ -4221,6 +4224,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* sr_highest_slotid and the sr_target_slot id to maxslots */
seq->maxslots = session->se_fchannel.maxreqs;
+ trace_nfsd_slot_seqid_sequence(clp, seq, slot);
status = check_slot_seqid(seq->seqid, slot->sl_seqid,
slot->sl_flags & NFSD4_SLOT_INUSE);
if (status == nfserr_replay_cache) {
@@ -4662,21 +4666,32 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
atomic_set(&nn->nfsd_courtesy_clients, 0);
}
+enum rp_lock {
+ RP_UNLOCKED,
+ RP_LOCKED,
+ RP_UNHASHED,
+};
+
static void init_nfs4_replay(struct nfs4_replay *rp)
{
rp->rp_status = nfserr_serverfault;
rp->rp_buflen = 0;
rp->rp_buf = rp->rp_ibuf;
- mutex_init(&rp->rp_mutex);
+ atomic_set(&rp->rp_locked, RP_UNLOCKED);
}
-static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
- struct nfs4_stateowner *so)
+static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
+ struct nfs4_stateowner *so)
{
if (!nfsd4_has_session(cstate)) {
- mutex_lock(&so->so_replay.rp_mutex);
+ wait_var_event(&so->so_replay.rp_locked,
+ atomic_cmpxchg(&so->so_replay.rp_locked,
+ RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
+ if (atomic_read(&so->so_replay.rp_locked) == RP_UNHASHED)
+ return -EAGAIN;
cstate->replay_owner = nfs4_get_stateowner(so);
}
+ return 0;
}
void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
@@ -4685,7 +4700,8 @@ void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
if (so != NULL) {
cstate->replay_owner = NULL;
- mutex_unlock(&so->so_replay.rp_mutex);
+ atomic_set(&so->so_replay.rp_locked, RP_UNLOCKED);
+ wake_up_var(&so->so_replay.rp_locked);
nfs4_put_stateowner(so);
}
}
@@ -4866,34 +4882,46 @@ nfsd4_find_and_lock_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
}
static struct nfs4_openowner *
-alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
- struct nfsd4_compound_state *cstate)
+find_or_alloc_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
+ struct nfsd4_compound_state *cstate)
{
struct nfs4_client *clp = cstate->clp;
- struct nfs4_openowner *oo, *ret;
+ struct nfs4_openowner *oo, *new = NULL;
- oo = alloc_stateowner(openowner_slab, &open->op_owner, clp);
- if (!oo)
- return NULL;
- oo->oo_owner.so_ops = &openowner_ops;
- oo->oo_owner.so_is_open_owner = 1;
- oo->oo_owner.so_seqid = open->op_seqid;
- oo->oo_flags = 0;
- if (nfsd4_has_session(cstate))
- oo->oo_flags |= NFS4_OO_CONFIRMED;
- oo->oo_time = 0;
- oo->oo_last_closed_stid = NULL;
- INIT_LIST_HEAD(&oo->oo_close_lru);
+retry:
spin_lock(&clp->cl_lock);
- ret = find_openstateowner_str_locked(strhashval, open, clp);
- if (ret == NULL) {
- hash_openowner(oo, clp, strhashval);
- ret = oo;
- } else
- nfs4_free_stateowner(&oo->oo_owner);
-
+ oo = find_openstateowner_str(strhashval, open, clp);
+ if (!oo && new) {
+ hash_openowner(new, clp, strhashval);
+ spin_unlock(&clp->cl_lock);
+ return new;
+ }
spin_unlock(&clp->cl_lock);
- return ret;
+
+ if (oo && !(oo->oo_flags & NFS4_OO_CONFIRMED)) {
+ /* Replace unconfirmed owners without checking for replay. */
+ release_openowner(oo);
+ oo = NULL;
+ }
+ if (oo) {
+ if (new)
+ nfs4_free_stateowner(&new->oo_owner);
+ return oo;
+ }
+
+ new = alloc_stateowner(openowner_slab, &open->op_owner, clp);
+ if (!new)
+ return NULL;
+ new->oo_owner.so_ops = &openowner_ops;
+ new->oo_owner.so_is_open_owner = 1;
+ new->oo_owner.so_seqid = open->op_seqid;
+ new->oo_flags = 0;
+ if (nfsd4_has_session(cstate))
+ new->oo_flags |= NFS4_OO_CONFIRMED;
+ new->oo_time = 0;
+ new->oo_last_closed_stid = NULL;
+ INIT_LIST_HEAD(&new->oo_close_lru);
+ goto retry;
}
static struct nfs4_ol_stateid *
@@ -4969,7 +4997,11 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
* Wait for the refcount to drop to 2. Since it has been unhashed,
* there should be no danger of the refcount going back up again at
* this point.
+ * Some threads with a reference might be waiting for rp_locked,
+ * so tell them to stop waiting.
*/
+ atomic_set(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED);
+ wake_up_var(&oo->oo_owner.so_replay.rp_locked);
wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);
release_all_access(s);
@@ -5342,27 +5374,19 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
clp = cstate->clp;
strhashval = ownerstr_hashval(&open->op_owner);
- oo = find_openstateowner_str(strhashval, open, clp);
+retry:
+ oo = find_or_alloc_open_stateowner(strhashval, open, cstate);
open->op_openowner = oo;
- if (!oo) {
- goto new_owner;
- }
- if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
- /* Replace unconfirmed owners without checking for replay. */
- release_openowner(oo);
- open->op_openowner = NULL;
- goto new_owner;
+ if (!oo)
+ return nfserr_jukebox;
+ if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) {
+ nfs4_put_stateowner(&oo->oo_owner);
+ goto retry;
}
status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
if (status)
return status;
- goto alloc_stateid;
-new_owner:
- oo = alloc_init_open_stateowner(strhashval, open, cstate);
- if (oo == NULL)
- return nfserr_jukebox;
- open->op_openowner = oo;
-alloc_stateid:
+
open->op_stp = nfs4_alloc_open_stateid(clp);
if (!open->op_stp)
return nfserr_jukebox;
@@ -6133,12 +6157,8 @@ out:
void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
struct nfsd4_open *open)
{
- if (open->op_openowner) {
- struct nfs4_stateowner *so = &open->op_openowner->oo_owner;
-
- nfsd4_cstate_assign_replay(cstate, so);
- nfs4_put_stateowner(so);
- }
+ if (open->op_openowner)
+ nfs4_put_stateowner(&open->op_openowner->oo_owner);
if (open->op_file)
kmem_cache_free(file_slab, open->op_file);
if (open->op_stp)
@@ -7202,12 +7222,16 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
trace_nfsd_preprocess(seqid, stateid);
*stpp = NULL;
+retry:
status = nfsd4_lookup_stateid(cstate, stateid,
typemask, statusmask, &s, nn);
if (status)
return status;
stp = openlockstateid(s);
- nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
+ if (nfsd4_cstate_assign_replay(cstate, stp->st_stateowner) == -EAGAIN) {
+ nfs4_put_stateowner(stp->st_stateowner);
+ goto retry;
+ }
status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
if (!status)
@@ -7349,7 +7373,7 @@ out:
return status;
}
-static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
+static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
{
struct nfs4_client *clp = s->st_stid.sc_client;
bool unhashed;
@@ -7366,11 +7390,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
list_for_each_entry(stp, &reaplist, st_locks)
nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
free_ol_stateid_reaplist(&reaplist);
+ return false;
} else {
spin_unlock(&clp->cl_lock);
free_ol_stateid_reaplist(&reaplist);
- if (unhashed)
- move_to_close_lru(s, clp->net);
+ return unhashed;
}
}
@@ -7386,6 +7410,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfs4_ol_stateid *stp;
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ bool need_move_to_close_list;
dprintk("NFSD: nfsd4_close on file %pd\n",
cstate->current_fh.fh_dentry);
@@ -7410,8 +7435,10 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
*/
nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
- nfsd4_close_open_stateid(stp);
+ need_move_to_close_list = nfsd4_close_open_stateid(stp);
mutex_unlock(&stp->st_mutex);
+ if (need_move_to_close_list)
+ move_to_close_lru(stp, net);
/* v4.1+ suggests that we send a special stateid in here, since the
* clients should just ignore this anyway. Since this is not useful
@@ -8625,12 +8652,6 @@ nfs4_state_start(void)
if (ret)
return ret;
- ret = nfsd4_create_callback_queue();
- if (ret) {
- rhltable_destroy(&nfs4_file_rhltable);
- return ret;
- }
-
set_max_delegations();
return 0;
}
@@ -8671,7 +8692,6 @@ nfs4_state_shutdown_net(struct net *net)
void
nfs4_state_shutdown(void)
{
- nfsd4_destroy_callback_queue();
rhltable_destroy(&nfs4_file_rhltable);
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index a644460f3a5e..c7bfd2180e3f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1732,6 +1732,35 @@ nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp,
return nfsd4_decode_stateid4(argp, &free_stateid->fr_stateid);
}
+static __be32
+nfsd4_decode_get_dir_delegation(struct nfsd4_compoundargs *argp,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation;
+ __be32 status;
+
+ memset(gdd, 0, sizeof(*gdd));
+
+ if (xdr_stream_decode_bool(argp->xdr, &gdd->gdda_signal_deleg_avail) < 0)
+ return nfserr_bad_xdr;
+ status = nfsd4_decode_bitmap4(argp, gdd->gdda_notification_types,
+ ARRAY_SIZE(gdd->gdda_notification_types));
+ if (status)
+ return status;
+ status = nfsd4_decode_nfstime4(argp, &gdd->gdda_child_attr_delay);
+ if (status)
+ return status;
+ status = nfsd4_decode_nfstime4(argp, &gdd->gdda_dir_attr_delay);
+ if (status)
+ return status;
+ status = nfsd4_decode_bitmap4(argp, gdd->gdda_child_attributes,
+ ARRAY_SIZE(gdd->gdda_child_attributes));
+ if (status)
+ return status;
+ return nfsd4_decode_bitmap4(argp, gdd->gdda_dir_attributes,
+ ARRAY_SIZE(gdd->gdda_dir_attributes));
+}
+
#ifdef CONFIG_NFSD_PNFS
static __be32
nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
@@ -2370,7 +2399,7 @@ static const nfsd4_dec nfsd4_dec_ops[] = {
[OP_CREATE_SESSION] = nfsd4_decode_create_session,
[OP_DESTROY_SESSION] = nfsd4_decode_destroy_session,
[OP_FREE_STATEID] = nfsd4_decode_free_stateid,
- [OP_GET_DIR_DELEGATION] = nfsd4_decode_notsupp,
+ [OP_GET_DIR_DELEGATION] = nfsd4_decode_get_dir_delegation,
#ifdef CONFIG_NFSD_PNFS
[OP_GETDEVICEINFO] = nfsd4_decode_getdeviceinfo,
[OP_GETDEVICELIST] = nfsd4_decode_notsupp,
@@ -4963,6 +4992,49 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfs_ok;
}
+static __be32
+nfsd4_encode_get_dir_delegation(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation;
+ struct xdr_stream *xdr = resp->xdr;
+ __be32 status = nfserr_resource;
+
+ switch(gdd->gddrnf_status) {
+ case GDD4_OK:
+ if (xdr_stream_encode_u32(xdr, GDD4_OK) != XDR_UNIT)
+ break;
+ status = nfsd4_encode_verifier4(xdr, &gdd->gddr_cookieverf);
+ if (status)
+ break;
+ status = nfsd4_encode_stateid4(xdr, &gdd->gddr_stateid);
+ if (status)
+ break;
+ status = nfsd4_encode_bitmap4(xdr, gdd->gddr_notification[0], 0, 0);
+ if (status)
+ break;
+ status = nfsd4_encode_bitmap4(xdr, gdd->gddr_child_attributes[0],
+ gdd->gddr_child_attributes[1],
+ gdd->gddr_child_attributes[2]);
+ if (status)
+ break;
+ status = nfsd4_encode_bitmap4(xdr, gdd->gddr_dir_attributes[0],
+ gdd->gddr_dir_attributes[1],
+ gdd->gddr_dir_attributes[2]);
+ break;
+ default:
+ pr_warn("nfsd: bad gddrnf_status (%u)\n", gdd->gddrnf_status);
+ gdd->gddrnf_will_signal_deleg_avail = 0;
+ fallthrough;
+ case GDD4_UNAVAIL:
+ if (xdr_stream_encode_u32(xdr, GDD4_UNAVAIL) != XDR_UNIT)
+ break;
+ status = nfsd4_encode_bool(xdr, gdd->gddrnf_will_signal_deleg_avail);
+ break;
+ }
+ return status;
+}
+
#ifdef CONFIG_NFSD_PNFS
static __be32
nfsd4_encode_device_addr4(struct xdr_stream *xdr,
@@ -5199,7 +5271,12 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
if (nfserr != nfs_ok)
return nfserr;
/* osr_complete<1> */
- if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ if (os->completed) {
+ if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT)
+ return nfserr_resource;
+ if (xdr_stream_encode_be32(xdr, os->status) != XDR_UNIT)
+ return nfserr_resource;
+ } else if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
return nfserr_resource;
return nfs_ok;
}
@@ -5579,7 +5656,7 @@ static const nfsd4_enc nfsd4_enc_ops[] = {
[OP_CREATE_SESSION] = nfsd4_encode_create_session,
[OP_DESTROY_SESSION] = nfsd4_encode_noop,
[OP_FREE_STATEID] = nfsd4_encode_noop,
- [OP_GET_DIR_DELEGATION] = nfsd4_encode_noop,
+ [OP_GET_DIR_DELEGATION] = nfsd4_encode_get_dir_delegation,
#ifdef CONFIG_NFSD_PNFS
[OP_GETDEVICEINFO] = nfsd4_encode_getdeviceinfo,
[OP_GETDEVICELIST] = nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index ecd18bffeebc..202140df8f82 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -15,6 +15,7 @@
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/gss_api.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/sunrpc/svc.h>
#include <linux/module.h>
#include <linux/fsnotify.h>
@@ -48,12 +49,10 @@ enum {
NFSD_MaxBlkSize,
NFSD_MaxConnections,
NFSD_Filecache,
-#ifdef CONFIG_NFSD_V4
NFSD_Leasetime,
NFSD_Gracetime,
NFSD_RecoveryDir,
NFSD_V4EndGrace,
-#endif
NFSD_MaxReserved
};
@@ -406,7 +405,9 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
if (newthreads < 0)
return -EINVAL;
trace_nfsd_ctl_threads(net, newthreads);
- rv = nfsd_svc(newthreads, net, file->f_cred);
+ mutex_lock(&nfsd_mutex);
+ rv = nfsd_svc(newthreads, net, file->f_cred, NULL);
+ mutex_unlock(&nfsd_mutex);
if (rv < 0)
return rv;
} else
@@ -1360,7 +1361,9 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
#ifdef CONFIG_NFSD_V4
[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
+#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
+#endif
[NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO},
#endif
/* last one */ {""}
@@ -1652,6 +1655,518 @@ int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb)
}
/**
+ * nfsd_nl_threads_set_doit - set the number of running threads
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * Return 0 on success or a negative errno.
+ */
+int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ int nthreads = 0, count = 0, nrpools, ret = -EOPNOTSUPP, rem;
+ struct net *net = genl_info_net(info);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ const struct nlattr *attr;
+ const char *scope = NULL;
+
+ if (GENL_REQ_ATTR_CHECK(info, NFSD_A_SERVER_THREADS))
+ return -EINVAL;
+
+ /* count number of SERVER_THREADS values */
+ nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+ if (nla_type(attr) == NFSD_A_SERVER_THREADS)
+ count++;
+ }
+
+ mutex_lock(&nfsd_mutex);
+
+ nrpools = nfsd_nrpools(net);
+ if (nrpools && count > nrpools)
+ count = nrpools;
+
+ /* XXX: make this handle non-global pool-modes */
+ if (count > 1)
+ goto out_unlock;
+
+ nthreads = nla_get_u32(info->attrs[NFSD_A_SERVER_THREADS]);
+ if (info->attrs[NFSD_A_SERVER_GRACETIME] ||
+ info->attrs[NFSD_A_SERVER_LEASETIME] ||
+ info->attrs[NFSD_A_SERVER_SCOPE]) {
+ ret = -EBUSY;
+ if (nn->nfsd_serv && nn->nfsd_serv->sv_nrthreads)
+ goto out_unlock;
+
+ ret = -EINVAL;
+ attr = info->attrs[NFSD_A_SERVER_GRACETIME];
+ if (attr) {
+ u32 gracetime = nla_get_u32(attr);
+
+ if (gracetime < 10 || gracetime > 3600)
+ goto out_unlock;
+
+ nn->nfsd4_grace = gracetime;
+ }
+
+ attr = info->attrs[NFSD_A_SERVER_LEASETIME];
+ if (attr) {
+ u32 leasetime = nla_get_u32(attr);
+
+ if (leasetime < 10 || leasetime > 3600)
+ goto out_unlock;
+
+ nn->nfsd4_lease = leasetime;
+ }
+
+ attr = info->attrs[NFSD_A_SERVER_SCOPE];
+ if (attr)
+ scope = nla_data(attr);
+ }
+
+ ret = nfsd_svc(nthreads, net, get_current_cred(), scope);
+
+out_unlock:
+ mutex_unlock(&nfsd_mutex);
+
+ return ret == nthreads ? 0 : ret;
+}
+
+/**
+ * nfsd_nl_threads_get_doit - get the number of running threads
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * Return 0 on success or a negative errno.
+ */
+int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ void *hdr;
+ int err;
+
+ skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = genlmsg_iput(skb, info);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto err_free_msg;
+ }
+
+ mutex_lock(&nfsd_mutex);
+
+ err = nla_put_u32(skb, NFSD_A_SERVER_GRACETIME,
+ nn->nfsd4_grace) ||
+ nla_put_u32(skb, NFSD_A_SERVER_LEASETIME,
+ nn->nfsd4_lease) ||
+ nla_put_string(skb, NFSD_A_SERVER_SCOPE,
+ nn->nfsd_name);
+ if (err)
+ goto err_unlock;
+
+ if (nn->nfsd_serv) {
+ int i;
+
+ for (i = 0; i < nfsd_nrpools(net); ++i) {
+ struct svc_pool *sp = &nn->nfsd_serv->sv_pools[i];
+
+ err = nla_put_u32(skb, NFSD_A_SERVER_THREADS,
+ atomic_read(&sp->sp_nrthreads));
+ if (err)
+ goto err_unlock;
+ }
+ } else {
+ err = nla_put_u32(skb, NFSD_A_SERVER_THREADS, 0);
+ if (err)
+ goto err_unlock;
+ }
+
+ mutex_unlock(&nfsd_mutex);
+
+ genlmsg_end(skb, hdr);
+
+ return genlmsg_reply(skb, info);
+
+err_unlock:
+ mutex_unlock(&nfsd_mutex);
+err_free_msg:
+ nlmsg_free(skb);
+
+ return err;
+}
+
+/**
+ * nfsd_nl_version_set_doit - set the nfs enabled versions
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * Return 0 on success or a negative errno.
+ */
+int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ const struct nlattr *attr;
+ struct nfsd_net *nn;
+ int i, rem;
+
+ if (GENL_REQ_ATTR_CHECK(info, NFSD_A_SERVER_PROTO_VERSION))
+ return -EINVAL;
+
+ mutex_lock(&nfsd_mutex);
+
+ nn = net_generic(genl_info_net(info), nfsd_net_id);
+ if (nn->nfsd_serv) {
+ mutex_unlock(&nfsd_mutex);
+ return -EBUSY;
+ }
+
+ /* clear current supported versions. */
+ nfsd_vers(nn, 2, NFSD_CLEAR);
+ nfsd_vers(nn, 3, NFSD_CLEAR);
+ for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++)
+ nfsd_minorversion(nn, i, NFSD_CLEAR);
+
+ nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+ struct nlattr *tb[NFSD_A_VERSION_MAX + 1];
+ u32 major, minor = 0;
+ bool enabled;
+
+ if (nla_type(attr) != NFSD_A_SERVER_PROTO_VERSION)
+ continue;
+
+ if (nla_parse_nested(tb, NFSD_A_VERSION_MAX, attr,
+ nfsd_version_nl_policy, info->extack) < 0)
+ continue;
+
+ if (!tb[NFSD_A_VERSION_MAJOR])
+ continue;
+
+ major = nla_get_u32(tb[NFSD_A_VERSION_MAJOR]);
+ if (tb[NFSD_A_VERSION_MINOR])
+ minor = nla_get_u32(tb[NFSD_A_VERSION_MINOR]);
+
+ enabled = nla_get_flag(tb[NFSD_A_VERSION_ENABLED]);
+
+ switch (major) {
+ case 4:
+ nfsd_minorversion(nn, minor, enabled ? NFSD_SET : NFSD_CLEAR);
+ break;
+ case 3:
+ case 2:
+ if (!minor)
+ nfsd_vers(nn, major, enabled ? NFSD_SET : NFSD_CLEAR);
+ break;
+ default:
+ break;
+ }
+ }
+
+ mutex_unlock(&nfsd_mutex);
+
+ return 0;
+}
+
+/**
+ * nfsd_nl_version_get_doit - get the enabled status for all supported nfs versions
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * Return 0 on success or a negative errno.
+ */
+int nfsd_nl_version_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nfsd_net *nn;
+ int i, err;
+ void *hdr;
+
+ skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = genlmsg_iput(skb, info);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto err_free_msg;
+ }
+
+ mutex_lock(&nfsd_mutex);
+ nn = net_generic(genl_info_net(info), nfsd_net_id);
+
+ for (i = 2; i <= 4; i++) {
+ int j;
+
+ for (j = 0; j <= NFSD_SUPPORTED_MINOR_VERSION; j++) {
+ struct nlattr *attr;
+
+ /* Don't record any versions the kernel doesn't have
+ * compiled in
+ */
+ if (!nfsd_support_version(i))
+ continue;
+
+ /* NFSv{2,3} does not support minor numbers */
+ if (i < 4 && j)
+ continue;
+
+ attr = nla_nest_start(skb,
+ NFSD_A_SERVER_PROTO_VERSION);
+ if (!attr) {
+ err = -EINVAL;
+ goto err_nfsd_unlock;
+ }
+
+ if (nla_put_u32(skb, NFSD_A_VERSION_MAJOR, i) ||
+ nla_put_u32(skb, NFSD_A_VERSION_MINOR, j)) {
+ err = -EINVAL;
+ goto err_nfsd_unlock;
+ }
+
+ /* Set the enabled flag if the version is enabled */
+ if (nfsd_vers(nn, i, NFSD_TEST) &&
+ (i < 4 || nfsd_minorversion(nn, j, NFSD_TEST)) &&
+ nla_put_flag(skb, NFSD_A_VERSION_ENABLED)) {
+ err = -EINVAL;
+ goto err_nfsd_unlock;
+ }
+
+ nla_nest_end(skb, attr);
+ }
+ }
+
+ mutex_unlock(&nfsd_mutex);
+ genlmsg_end(skb, hdr);
+
+ return genlmsg_reply(skb, info);
+
+err_nfsd_unlock:
+ mutex_unlock(&nfsd_mutex);
+err_free_msg:
+ nlmsg_free(skb);
+
+ return err;
+}
+
+/**
+ * nfsd_nl_listener_set_doit - set the nfs running sockets
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * Return 0 on success or a negative errno.
+ */
+int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct svc_xprt *xprt, *tmp;
+ const struct nlattr *attr;
+ struct svc_serv *serv;
+ LIST_HEAD(permsocks);
+ struct nfsd_net *nn;
+ int err, rem;
+
+ mutex_lock(&nfsd_mutex);
+
+ err = nfsd_create_serv(net);
+ if (err) {
+ mutex_unlock(&nfsd_mutex);
+ return err;
+ }
+
+ nn = net_generic(net, nfsd_net_id);
+ serv = nn->nfsd_serv;
+
+ spin_lock_bh(&serv->sv_lock);
+
+ /* Move all of the old listener sockets to a temp list */
+ list_splice_init(&serv->sv_permsocks, &permsocks);
+
+ /*
+ * Walk the list of server_socks from userland and move any that match
+ * back to sv_permsocks
+ */
+ nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+ struct nlattr *tb[NFSD_A_SOCK_MAX + 1];
+ const char *xcl_name;
+ struct sockaddr *sa;
+
+ if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR)
+ continue;
+
+ if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr,
+ nfsd_sock_nl_policy, info->extack) < 0)
+ continue;
+
+ if (!tb[NFSD_A_SOCK_ADDR] || !tb[NFSD_A_SOCK_TRANSPORT_NAME])
+ continue;
+
+ if (nla_len(tb[NFSD_A_SOCK_ADDR]) < sizeof(*sa))
+ continue;
+
+ xcl_name = nla_data(tb[NFSD_A_SOCK_TRANSPORT_NAME]);
+ sa = nla_data(tb[NFSD_A_SOCK_ADDR]);
+
+ /* Put back any matching sockets */
+ list_for_each_entry_safe(xprt, tmp, &permsocks, xpt_list) {
+ /* This shouldn't be possible */
+ if (WARN_ON_ONCE(xprt->xpt_net != net)) {
+ list_move(&xprt->xpt_list, &serv->sv_permsocks);
+ continue;
+ }
+
+ /* If everything matches, put it back */
+ if (!strcmp(xprt->xpt_class->xcl_name, xcl_name) &&
+ rpc_cmp_addr_port(sa, (struct sockaddr *)&xprt->xpt_local)) {
+ list_move(&xprt->xpt_list, &serv->sv_permsocks);
+ break;
+ }
+ }
+ }
+
+ /* For now, no removing old sockets while server is running */
+ if (serv->sv_nrthreads && !list_empty(&permsocks)) {
+ list_splice_init(&permsocks, &serv->sv_permsocks);
+ spin_unlock_bh(&serv->sv_lock);
+ err = -EBUSY;
+ goto out_unlock_mtx;
+ }
+
+ /* Close the remaining sockets on the permsocks list */
+ while (!list_empty(&permsocks)) {
+ xprt = list_first_entry(&permsocks, struct svc_xprt, xpt_list);
+ list_move(&xprt->xpt_list, &serv->sv_permsocks);
+
+ /*
+ * Newly-created sockets are born with the BUSY bit set. Clear
+ * it if there are no threads, since nothing can pick it up
+ * in that case.
+ */
+ if (!serv->sv_nrthreads)
+ clear_bit(XPT_BUSY, &xprt->xpt_flags);
+
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ spin_unlock_bh(&serv->sv_lock);
+ svc_xprt_close(xprt);
+ spin_lock_bh(&serv->sv_lock);
+ }
+
+ spin_unlock_bh(&serv->sv_lock);
+
+ /* walk list of addrs again, open any that still don't exist */
+ nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+ struct nlattr *tb[NFSD_A_SOCK_MAX + 1];
+ const char *xcl_name;
+ struct sockaddr *sa;
+ int ret;
+
+ if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR)
+ continue;
+
+ if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr,
+ nfsd_sock_nl_policy, info->extack) < 0)
+ continue;
+
+ if (!tb[NFSD_A_SOCK_ADDR] || !tb[NFSD_A_SOCK_TRANSPORT_NAME])
+ continue;
+
+ if (nla_len(tb[NFSD_A_SOCK_ADDR]) < sizeof(*sa))
+ continue;
+
+ xcl_name = nla_data(tb[NFSD_A_SOCK_TRANSPORT_NAME]);
+ sa = nla_data(tb[NFSD_A_SOCK_ADDR]);
+
+ xprt = svc_find_listener(serv, xcl_name, net, sa);
+ if (xprt) {
+ svc_xprt_put(xprt);
+ continue;
+ }
+
+ ret = svc_xprt_create_from_sa(serv, xcl_name, net, sa,
+ SVC_SOCK_ANONYMOUS,
+ get_current_cred());
+ /* always save the latest error */
+ if (ret < 0)
+ err = ret;
+ }
+
+ if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks))
+ nfsd_destroy_serv(net);
+
+out_unlock_mtx:
+ mutex_unlock(&nfsd_mutex);
+
+ return err;
+}
+
+/**
+ * nfsd_nl_listener_get_doit - get the nfs running listeners
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * Return 0 on success or a negative errno.
+ */
+int nfsd_nl_listener_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct svc_xprt *xprt;
+ struct svc_serv *serv;
+ struct nfsd_net *nn;
+ void *hdr;
+ int err;
+
+ skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = genlmsg_iput(skb, info);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto err_free_msg;
+ }
+
+ mutex_lock(&nfsd_mutex);
+ nn = net_generic(genl_info_net(info), nfsd_net_id);
+
+ /* no nfs server? Just send empty socket list */
+ if (!nn->nfsd_serv)
+ goto out_unlock_mtx;
+
+ serv = nn->nfsd_serv;
+ spin_lock_bh(&serv->sv_lock);
+ list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
+ struct nlattr *attr;
+
+ attr = nla_nest_start(skb, NFSD_A_SERVER_SOCK_ADDR);
+ if (!attr) {
+ err = -EINVAL;
+ goto err_serv_unlock;
+ }
+
+ if (nla_put_string(skb, NFSD_A_SOCK_TRANSPORT_NAME,
+ xprt->xpt_class->xcl_name) ||
+ nla_put(skb, NFSD_A_SOCK_ADDR,
+ sizeof(struct sockaddr_storage),
+ &xprt->xpt_local)) {
+ err = -EINVAL;
+ goto err_serv_unlock;
+ }
+
+ nla_nest_end(skb, attr);
+ }
+ spin_unlock_bh(&serv->sv_lock);
+out_unlock_mtx:
+ mutex_unlock(&nfsd_mutex);
+ genlmsg_end(skb, hdr);
+
+ return genlmsg_reply(skb, info);
+
+err_serv_unlock:
+ spin_unlock_bh(&serv->sv_lock);
+ mutex_unlock(&nfsd_mutex);
+err_free_msg:
+ nlmsg_free(skb);
+
+ return err;
+}
+
+/**
* nfsd_net_init - Prepare the nfsd_net portion of a new net namespace
* @net: a freshly-created network namespace
*
@@ -1672,7 +2187,8 @@ static __net_init int nfsd_net_init(struct net *net)
retval = nfsd_idmap_init(net);
if (retval)
goto out_idmap_error;
- retval = nfsd_stat_counters_init(nn);
+ retval = percpu_counter_init_many(nn->counter, 0, GFP_KERNEL,
+ NFSD_STATS_COUNTERS_NUM);
if (retval)
goto out_repcache_error;
memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats));
@@ -1704,7 +2220,7 @@ static __net_exit void nfsd_net_exit(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
nfsd_proc_stat_shutdown(net);
- nfsd_stat_counters_destroy(nn);
+ percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM);
nfsd_idmap_shutdown(net);
nfsd_export_shutdown(net);
nfsd_netns_free_versions(nn);
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 16c5a05f340e..8f4f239d9f8a 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -103,7 +103,7 @@ bool nfssvc_encode_voidres(struct svc_rqst *rqstp,
/*
* Function prototypes.
*/
-int nfsd_svc(int nrservs, struct net *net, const struct cred *cred);
+int nfsd_svc(int nrservs, struct net *net, const struct cred *cred, const char *scope);
int nfsd_dispatch(struct svc_rqst *rqstp);
int nfsd_nrthreads(struct net *);
@@ -230,7 +230,6 @@ void nfsd_lockd_shutdown(void);
#define nfserr_nospc cpu_to_be32(NFSERR_NOSPC)
#define nfserr_rofs cpu_to_be32(NFSERR_ROFS)
#define nfserr_mlink cpu_to_be32(NFSERR_MLINK)
-#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP)
#define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG)
#define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY)
#define nfserr_dquot cpu_to_be32(NFSERR_DQUOT)
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 40fecf7b224f..0b75305fb5f5 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -573,7 +573,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
_fh_update(fhp, exp, dentry);
if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
fh_put(fhp);
- return nfserr_opnotsupp;
+ return nfserr_stale;
}
return 0;
@@ -599,7 +599,7 @@ fh_update(struct svc_fh *fhp)
_fh_update(fhp, fhp->fh_export, dentry);
if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
- return nfserr_opnotsupp;
+ return nfserr_stale;
return 0;
out_bad:
printk(KERN_ERR "fh_update: fh not verified!\n");
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index c0d17b92b249..cd9a6a1a9fc8 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -133,8 +133,7 @@ struct svc_program nfsd_program = {
.pg_rpcbind_set = nfsd_rpcbind_set,
};
-static bool
-nfsd_support_version(int vers)
+bool nfsd_support_version(int vers)
{
if (vers >= NFSD_MINVERS && vers < NFSD_NRVERS)
return nfsd_version[vers] != NULL;
@@ -769,13 +768,14 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
* this is the first time nrservs is nonzero.
*/
int
-nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
+nfsd_svc(int nrservs, struct net *net, const struct cred *cred, const char *scope)
{
int error;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct svc_serv *serv;
- mutex_lock(&nfsd_mutex);
+ lockdep_assert_held(&nfsd_mutex);
+
dprintk("nfsd: creating service\n");
nrservs = max(nrservs, 0);
@@ -785,7 +785,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
if (nrservs == 0 && nn->nfsd_serv == NULL)
goto out;
- strscpy(nn->nfsd_name, utsname()->nodename,
+ strscpy(nn->nfsd_name, scope ? scope : utsname()->nodename,
sizeof(nn->nfsd_name));
error = nfsd_create_serv(net);
@@ -804,7 +804,6 @@ out_put:
if (serv->sv_nrthreads == 0)
nfsd_destroy_serv(net);
out:
- mutex_unlock(&nfsd_mutex);
return error;
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 2ed0fcf879fd..ffc217099d19 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -408,6 +408,8 @@ struct nfs4_client {
1 << NFSD4_CLIENT_CB_KILL)
#define NFSD4_CLIENT_CB_RECALL_ANY (6)
unsigned long cl_flags;
+
+ struct workqueue_struct *cl_callback_wq;
const struct cred *cl_cb_cred;
struct rpc_clnt *cl_cb_client;
u32 cl_cb_ident;
@@ -486,7 +488,7 @@ struct nfs4_replay {
unsigned int rp_buflen;
char *rp_buf;
struct knfsd_fh rp_openfh;
- struct mutex rp_mutex;
+ atomic_t rp_locked;
char rp_ibuf[NFSD4_REPLAY_ISIZE];
};
@@ -735,8 +737,6 @@ extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *
extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
extern bool nfsd4_run_cb(struct nfsd4_callback *cb);
-extern int nfsd4_create_callback_queue(void);
-extern void nfsd4_destroy_callback_queue(void);
extern void nfsd4_shutdown_callback(struct nfs4_client *);
extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name,
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index be52fb1e928e..bb22893f1157 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -73,48 +73,6 @@ static int nfsd_show(struct seq_file *seq, void *v)
DEFINE_PROC_SHOW_ATTRIBUTE(nfsd);
-int nfsd_percpu_counters_init(struct percpu_counter *counters, int num)
-{
- int i, err = 0;
-
- for (i = 0; !err && i < num; i++)
- err = percpu_counter_init(&counters[i], 0, GFP_KERNEL);
-
- if (!err)
- return 0;
-
- for (; i > 0; i--)
- percpu_counter_destroy(&counters[i-1]);
-
- return err;
-}
-
-void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num)
-{
- int i;
-
- for (i = 0; i < num; i++)
- percpu_counter_set(&counters[i], 0);
-}
-
-void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num)
-{
- int i;
-
- for (i = 0; i < num; i++)
- percpu_counter_destroy(&counters[i]);
-}
-
-int nfsd_stat_counters_init(struct nfsd_net *nn)
-{
- return nfsd_percpu_counters_init(nn->counter, NFSD_STATS_COUNTERS_NUM);
-}
-
-void nfsd_stat_counters_destroy(struct nfsd_net *nn)
-{
- nfsd_percpu_counters_destroy(nn->counter, NFSD_STATS_COUNTERS_NUM);
-}
-
void nfsd_proc_stat_init(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index d2753e975dfd..04aacb6c36e2 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -10,11 +10,6 @@
#include <uapi/linux/nfsd/stats.h>
#include <linux/percpu_counter.h>
-int nfsd_percpu_counters_init(struct percpu_counter *counters, int num);
-void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num);
-void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num);
-int nfsd_stat_counters_init(struct nfsd_net *nn);
-void nfsd_stat_counters_destroy(struct nfsd_net *nn);
void nfsd_proc_stat_init(struct net *net);
void nfsd_proc_stat_shutdown(struct net *net);
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 1cd2076210b1..77bbd23aa150 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -104,7 +104,7 @@ TRACE_EVENT(nfsd_compound,
TP_fast_assign(
__entry->xid = be32_to_cpu(rqst->rq_xid);
__entry->opcnt = opcnt;
- __assign_str(tag, tag);
+ __assign_str(tag);
),
TP_printk("xid=0x%08x opcnt=%u tag=%s",
__entry->xid, __entry->opcnt, __get_str(tag)
@@ -127,7 +127,7 @@ TRACE_EVENT(nfsd_compound_status,
__entry->args_opcnt = args_opcnt;
__entry->resp_opcnt = resp_opcnt;
__entry->status = be32_to_cpu(status);
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("op=%u/%u %s status=%d",
__entry->resp_opcnt, __entry->args_opcnt,
@@ -318,7 +318,7 @@ TRACE_EVENT(nfsd_exp_find_key,
TP_fast_assign(
__entry->fsidtype = key->ek_fsidtype;
memcpy(__entry->fsid, key->ek_fsid, 4*6);
- __assign_str(auth_domain, key->ek_client->name);
+ __assign_str(auth_domain);
__entry->status = status;
),
TP_printk("fsid=%x::%s domain=%s status=%d",
@@ -342,8 +342,8 @@ TRACE_EVENT(nfsd_expkey_update,
TP_fast_assign(
__entry->fsidtype = key->ek_fsidtype;
memcpy(__entry->fsid, key->ek_fsid, 4*6);
- __assign_str(auth_domain, key->ek_client->name);
- __assign_str(path, exp_path);
+ __assign_str(auth_domain);
+ __assign_str(path);
__entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags);
),
TP_printk("fsid=%x::%s domain=%s path=%s cache=%s",
@@ -365,8 +365,8 @@ TRACE_EVENT(nfsd_exp_get_by_name,
__field(int, status)
),
TP_fast_assign(
- __assign_str(path, key->ex_path.dentry->d_name.name);
- __assign_str(auth_domain, key->ex_client->name);
+ __assign_str(path);
+ __assign_str(auth_domain);
__entry->status = status;
),
TP_printk("path=%s domain=%s status=%d",
@@ -385,8 +385,8 @@ TRACE_EVENT(nfsd_export_update,
__field(bool, cache)
),
TP_fast_assign(
- __assign_str(path, key->ex_path.dentry->d_name.name);
- __assign_str(auth_domain, key->ex_client->name);
+ __assign_str(path);
+ __assign_str(auth_domain);
__entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags);
),
TP_printk("path=%s domain=%s cache=%s",
@@ -485,7 +485,7 @@ TRACE_EVENT(nfsd_dirent,
TP_fast_assign(
__entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0;
__entry->ino = ino;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("fh_hash=0x%08x ino=%llu name=%s",
__entry->fh_hash, __entry->ino, __get_str(name)
@@ -749,6 +749,76 @@ TRACE_EVENT_CONDITION(nfsd_seq4_status,
)
);
+DECLARE_EVENT_CLASS(nfsd_cs_slot_class,
+ TP_PROTO(
+ const struct nfs4_client *clp,
+ const struct nfsd4_create_session *cs
+ ),
+ TP_ARGS(clp, cs),
+ TP_STRUCT__entry(
+ __field(u32, seqid)
+ __field(u32, slot_seqid)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
+ ),
+ TP_fast_assign(
+ const struct nfsd4_clid_slot *slot = &clp->cl_cs_slot;
+
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen);
+ __entry->seqid = cs->seqid;
+ __entry->slot_seqid = slot->sl_seqid;
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x seqid=%u slot_seqid=%u",
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
+ __entry->seqid, __entry->slot_seqid
+ )
+);
+
+#define DEFINE_CS_SLOT_EVENT(name) \
+DEFINE_EVENT(nfsd_cs_slot_class, nfsd_##name, \
+ TP_PROTO( \
+ const struct nfs4_client *clp, \
+ const struct nfsd4_create_session *cs \
+ ), \
+ TP_ARGS(clp, cs))
+
+DEFINE_CS_SLOT_EVENT(slot_seqid_conf);
+DEFINE_CS_SLOT_EVENT(slot_seqid_unconf);
+
+TRACE_EVENT(nfsd_slot_seqid_sequence,
+ TP_PROTO(
+ const struct nfs4_client *clp,
+ const struct nfsd4_sequence *seq,
+ const struct nfsd4_slot *slot
+ ),
+ TP_ARGS(clp, seq, slot),
+ TP_STRUCT__entry(
+ __field(u32, seqid)
+ __field(u32, slot_seqid)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
+ __field(bool, in_use)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen);
+ __entry->seqid = seq->seqid;
+ __entry->slot_seqid = slot->sl_seqid;
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x seqid=%u slot_seqid=%u (%sin use)",
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
+ __entry->seqid, __entry->slot_seqid,
+ __entry->in_use ? "" : "not "
+ )
+);
+
DECLARE_EVENT_CLASS(nfsd_clientid_class,
TP_PROTO(const clientid_t *clid),
TP_ARGS(clid),
@@ -778,6 +848,30 @@ DEFINE_CLIENTID_EVENT(purged);
DEFINE_CLIENTID_EVENT(renew);
DEFINE_CLIENTID_EVENT(stale);
+TRACE_EVENT(nfsd_mark_client_expired,
+ TP_PROTO(
+ const struct nfs4_client *clp,
+ int cl_rpc_users
+ ),
+ TP_ARGS(clp, cl_rpc_users),
+ TP_STRUCT__entry(
+ __field(int, cl_rpc_users)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
+ ),
+ TP_fast_assign(
+ __entry->cl_rpc_users = cl_rpc_users;
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x cl_rpc_users=%d",
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
+ __entry->cl_rpc_users)
+);
+
DECLARE_EVENT_CLASS(nfsd_net_class,
TP_PROTO(const struct nfsd_net *nn),
TP_ARGS(nn),
@@ -906,7 +1000,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class,
__entry->flavor = clp->cl_cred.cr_flavor;
memcpy(__entry->verifier, (void *)&clp->cl_verifier,
NFS4_VERIFIER_SIZE);
- __assign_str(name, clp->cl_name.data);
+ __assign_str(name);
),
TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x",
__entry->addr, __get_str(name),
@@ -1425,7 +1519,7 @@ TRACE_EVENT(nfsd_cb_setup,
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
- __assign_str(netid, netid);
+ __assign_str(netid);
__entry->authflavor = authflavor;
__assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
clp->cl_cb_conn.cb_addrlen)
@@ -1534,7 +1628,7 @@ TRACE_EVENT(nfsd_cb_seq_status,
__entry->seq_status = cb->cb_seq_status;
),
TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
- " sessionid=%08x:%08x:%08x:%08x tk_status=%d seq_status=%d\n",
+ " sessionid=%08x:%08x:%08x:%08x tk_status=%d seq_status=%d",
__entry->task_id, __entry->client_id,
__entry->cl_boot, __entry->cl_id,
__entry->seqno, __entry->reserved,
@@ -1573,7 +1667,7 @@ TRACE_EVENT(nfsd_cb_free_slot,
__entry->slot_seqno = session->se_cb_seq_nr;
),
TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
- " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u\n",
+ " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u",
__entry->task_id, __entry->client_id,
__entry->cl_boot, __entry->cl_id,
__entry->seqno, __entry->reserved,
@@ -1770,7 +1864,7 @@ TRACE_EVENT(nfsd_ctl_unlock_ip,
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
- __assign_str(address, address);
+ __assign_str(address);
),
TP_printk("address=%s",
__get_str(address)
@@ -1789,7 +1883,7 @@ TRACE_EVENT(nfsd_ctl_unlock_fs,
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
- __assign_str(path, path);
+ __assign_str(path);
),
TP_printk("path=%s",
__get_str(path)
@@ -1813,8 +1907,8 @@ TRACE_EVENT(nfsd_ctl_filehandle,
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->maxsize = maxsize;
- __assign_str(domain, domain);
- __assign_str(path, path);
+ __assign_str(domain);
+ __assign_str(path);
),
TP_printk("domain=%s path=%s maxsize=%d",
__get_str(domain), __get_str(path), __entry->maxsize
@@ -1874,7 +1968,7 @@ TRACE_EVENT(nfsd_ctl_version,
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
- __assign_str(mesg, mesg);
+ __assign_str(mesg);
),
TP_printk("%s",
__get_str(mesg)
@@ -1915,7 +2009,7 @@ TRACE_EVENT(nfsd_ctl_ports_addxprt,
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->port = port;
- __assign_str(transport, transport);
+ __assign_str(transport);
),
TP_printk("transport=%s port=%d",
__get_str(transport), __entry->port
@@ -1976,9 +2070,9 @@ TRACE_EVENT(nfsd_ctl_time,
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->time = time;
- __assign_str(name, name);
+ __assign_str(name);
),
- TP_printk("file=%s time=%d\n",
+ TP_printk("file=%s time=%d",
__get_str(name), __entry->time
)
);
@@ -1995,7 +2089,7 @@ TRACE_EVENT(nfsd_ctl_recoverydir,
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
- __assign_str(recdir, recdir);
+ __assign_str(recdir);
),
TP_printk("recdir=%s",
__get_str(recdir)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 2e41eb4c3cec..29b1f3613800 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1422,7 +1422,7 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
* Callers expect new file metadata to be committed even
* if the attributes have not changed.
*/
- if (iap->ia_valid)
+ if (nfsd_attrs_valid(attrs))
status = nfsd_setattr(rqstp, resfhp, attrs, NULL);
else
status = nfserrno(commit_metadata(resfhp));
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index c60fdb6200fd..57cd70062048 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -60,6 +60,14 @@ static inline void nfsd_attrs_free(struct nfsd_attrs *attrs)
posix_acl_release(attrs->na_dpacl);
}
+static inline bool nfsd_attrs_valid(struct nfsd_attrs *attrs)
+{
+ struct iattr *iap = attrs->na_iattr;
+
+ return (iap->ia_valid || (attrs->na_seclabel &&
+ attrs->na_seclabel->len));
+}
+
__be32 nfserrno (int errno);
int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
struct svc_export **expp);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 415516c1b27e..fbdd42cde1fa 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -518,6 +518,24 @@ struct nfsd4_free_stateid {
stateid_t fr_stateid; /* request */
};
+struct nfsd4_get_dir_delegation {
+ /* request */
+ u32 gdda_signal_deleg_avail;
+ u32 gdda_notification_types[1];
+ struct timespec64 gdda_child_attr_delay;
+ struct timespec64 gdda_dir_attr_delay;
+ u32 gdda_child_attributes[3];
+ u32 gdda_dir_attributes[3];
+ /* response */
+ u32 gddrnf_status;
+ nfs4_verifier gddr_cookieverf;
+ stateid_t gddr_stateid;
+ u32 gddr_notification[1];
+ u32 gddr_child_attributes[3];
+ u32 gddr_dir_attributes[3];
+ bool gddrnf_will_signal_deleg_avail;
+};
+
/* also used for NVERIFY */
struct nfsd4_verify {
u32 ve_bmval[3]; /* request */
@@ -674,8 +692,10 @@ struct nfsd4_copy {
#define NFSD4_COPY_F_INTRA (1)
#define NFSD4_COPY_F_SYNCHRONOUS (2)
#define NFSD4_COPY_F_COMMITTED (3)
+#define NFSD4_COPY_F_COMPLETED (4)
/* response */
+ __be32 nfserr;
struct nfsd42_write_res cp_res;
struct knfsd_fh fh;
@@ -735,7 +755,8 @@ struct nfsd4_offload_status {
/* response */
u64 count;
- u32 status;
+ __be32 status;
+ bool completed;
};
struct nfsd4_copy_notify {
@@ -797,6 +818,7 @@ struct nfsd4_op {
struct nfsd4_reclaim_complete reclaim_complete;
struct nfsd4_test_stateid test_stateid;
struct nfsd4_free_stateid free_stateid;
+ struct nfsd4_get_dir_delegation get_dir_delegation;
struct nfsd4_getdeviceinfo getdeviceinfo;
struct nfsd4_layoutget layoutget;
struct nfsd4_layoutcommit layoutcommit;
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 65659fa0372e..a139970e4804 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1857,13 +1857,22 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
}
/**
- * nilfs_btree_convert_and_insert -
- * @bmap:
- * @key:
- * @ptr:
- * @keys:
- * @ptrs:
- * @n:
+ * nilfs_btree_convert_and_insert - Convert and insert entries into a B-tree
+ * @btree: NILFS B-tree structure
+ * @key: Key of the new entry to be inserted
+ * @ptr: Pointer (block number) associated with the key to be inserted
+ * @keys: Array of keys to be inserted in addition to @key
+ * @ptrs: Array of pointers associated with @keys
+ * @n: Number of keys and pointers in @keys and @ptrs
+ *
+ * This function is used to insert a new entry specified by @key and @ptr,
+ * along with additional entries specified by @keys and @ptrs arrays, into a
+ * NILFS B-tree.
+ * It prepares the necessary changes by allocating the required blocks and any
+ * necessary intermediate nodes. It converts configurations from other forms of
+ * block mapping (the one that currently exists is direct mapping) to a B-tree.
+ *
+ * Return: 0 on success or a negative error code on failure.
*/
int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
__u64 key, __u64 ptr,
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index aee40db7a036..a002a44ff161 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -174,7 +174,6 @@ Eend:
dir->i_ino, (folio->index << PAGE_SHIFT) + offs,
(unsigned long)le64_to_cpu(p->inode));
fail:
- folio_set_error(folio);
return false;
}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index bf9a11d58817..1c9ae36a03ab 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -175,6 +175,7 @@ int nilfs_init_gcinode(struct inode *inode)
/**
* nilfs_remove_all_gcinodes() - remove all unprocessed gc inodes
+ * @nilfs: NILFS filesystem instance
*/
void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs)
{
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 2e29b98ba8ba..728e90be3570 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -335,8 +335,8 @@ void __nilfs_error(struct super_block *sb, const char *function,
extern struct nilfs_super_block *
nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
-extern int nilfs_store_magic_and_option(struct super_block *,
- struct nilfs_super_block *, char *);
+extern int nilfs_store_magic(struct super_block *sb,
+ struct nilfs_super_block *sbp);
extern int nilfs_check_feature_compatibility(struct super_block *,
struct nilfs_super_block *);
extern void nilfs_set_log_cursor(struct nilfs_super_block *,
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 49a70c68bf3c..b638dc06df2f 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -563,6 +563,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
* checkpoint
* @nilfs: nilfs object
* @sb: super block instance
+ * @root: NILFS root instance
* @ri: pointer to a nilfs_recovery_info
*/
static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
@@ -698,9 +699,15 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
return;
bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize);
- BUG_ON(!bh);
+ if (WARN_ON(!bh))
+ return; /* should never happen */
+
+ lock_buffer(bh);
memset(bh->b_data, 0, bh->b_size);
+ set_buffer_uptodate(bh);
set_buffer_dirty(bh);
+ unlock_buffer(bh);
+
err = sync_dirty_buffer(bh);
if (unlikely(err))
nilfs_warn(nilfs->ns_sb,
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index aa5290cb7467..6be7dd423fbd 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1725,14 +1725,8 @@ static void nilfs_end_folio_io(struct folio *folio, int err)
return;
}
- if (!err) {
- if (!nilfs_folio_buffers_clean(folio))
- filemap_dirty_folio(folio->mapping, folio);
- folio_clear_error(folio);
- } else {
+ if (err || !nilfs_folio_buffers_clean(folio))
filemap_dirty_folio(folio->mapping, folio);
- folio_set_error(folio);
- }
folio_end_writeback(folio);
}
@@ -2790,7 +2784,7 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
if (!nilfs->ns_writer)
return -ENOMEM;
- inode_attach_wb(nilfs->ns_bdev->bd_inode, NULL);
+ inode_attach_wb(nilfs->ns_bdev->bd_mapping->host, NULL);
err = nilfs_segctor_start_thread(nilfs->ns_writer);
if (unlikely(err))
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index ac24ed109ce9..e835e1f5a712 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -29,13 +29,13 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/blkdev.h>
-#include <linux/parser.h>
#include <linux/crc32.h>
#include <linux/vfs.h>
#include <linux/writeback.h>
#include <linux/seq_file.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include "nilfs.h"
#include "export.h"
#include "mdt.h"
@@ -61,7 +61,6 @@ struct kmem_cache *nilfs_segbuf_cachep;
struct kmem_cache *nilfs_btree_path_cache;
static int nilfs_setup_super(struct super_block *sb, int is_mount);
-static int nilfs_remount(struct super_block *sb, int *flags, char *data);
void __nilfs_msg(struct super_block *sb, const char *fmt, ...)
{
@@ -702,105 +701,98 @@ static const struct super_operations nilfs_sops = {
.freeze_fs = nilfs_freeze,
.unfreeze_fs = nilfs_unfreeze,
.statfs = nilfs_statfs,
- .remount_fs = nilfs_remount,
.show_options = nilfs_show_options
};
enum {
- Opt_err_cont, Opt_err_panic, Opt_err_ro,
- Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
- Opt_discard, Opt_nodiscard, Opt_err,
+ Opt_err, Opt_barrier, Opt_snapshot, Opt_order, Opt_norecovery,
+ Opt_discard,
};
-static match_table_t tokens = {
- {Opt_err_cont, "errors=continue"},
- {Opt_err_panic, "errors=panic"},
- {Opt_err_ro, "errors=remount-ro"},
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_snapshot, "cp=%u"},
- {Opt_order, "order=%s"},
- {Opt_norecovery, "norecovery"},
- {Opt_discard, "discard"},
- {Opt_nodiscard, "nodiscard"},
- {Opt_err, NULL}
+static const struct constant_table nilfs_param_err[] = {
+ {"continue", NILFS_MOUNT_ERRORS_CONT},
+ {"panic", NILFS_MOUNT_ERRORS_PANIC},
+ {"remount-ro", NILFS_MOUNT_ERRORS_RO},
+ {}
};
-static int parse_options(char *options, struct super_block *sb, int is_remount)
-{
- struct the_nilfs *nilfs = sb->s_fs_info;
- char *p;
- substring_t args[MAX_OPT_ARGS];
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
+static const struct fs_parameter_spec nilfs_param_spec[] = {
+ fsparam_enum ("errors", Opt_err, nilfs_param_err),
+ fsparam_flag_no ("barrier", Opt_barrier),
+ fsparam_u64 ("cp", Opt_snapshot),
+ fsparam_string ("order", Opt_order),
+ fsparam_flag ("norecovery", Opt_norecovery),
+ fsparam_flag_no ("discard", Opt_discard),
+ {}
+};
- if (!*p)
- continue;
+struct nilfs_fs_context {
+ unsigned long ns_mount_opt;
+ __u64 cno;
+};
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_barrier:
- nilfs_set_opt(nilfs, BARRIER);
- break;
- case Opt_nobarrier:
+static int nilfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct nilfs_fs_context *nilfs = fc->fs_private;
+ int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, nilfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_barrier:
+ if (result.negated)
nilfs_clear_opt(nilfs, BARRIER);
- break;
- case Opt_order:
- if (strcmp(args[0].from, "relaxed") == 0)
- /* Ordered data semantics */
- nilfs_clear_opt(nilfs, STRICT_ORDER);
- else if (strcmp(args[0].from, "strict") == 0)
- /* Strict in-order semantics */
- nilfs_set_opt(nilfs, STRICT_ORDER);
- else
- return 0;
- break;
- case Opt_err_panic:
- nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_PANIC);
- break;
- case Opt_err_ro:
- nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_RO);
- break;
- case Opt_err_cont:
- nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_CONT);
- break;
- case Opt_snapshot:
- if (is_remount) {
- nilfs_err(sb,
- "\"%s\" option is invalid for remount",
- p);
- return 0;
- }
- break;
- case Opt_norecovery:
- nilfs_set_opt(nilfs, NORECOVERY);
- break;
- case Opt_discard:
- nilfs_set_opt(nilfs, DISCARD);
- break;
- case Opt_nodiscard:
- nilfs_clear_opt(nilfs, DISCARD);
- break;
- default:
- nilfs_err(sb, "unrecognized mount option \"%s\"", p);
- return 0;
+ else
+ nilfs_set_opt(nilfs, BARRIER);
+ break;
+ case Opt_order:
+ if (strcmp(param->string, "relaxed") == 0)
+ /* Ordered data semantics */
+ nilfs_clear_opt(nilfs, STRICT_ORDER);
+ else if (strcmp(param->string, "strict") == 0)
+ /* Strict in-order semantics */
+ nilfs_set_opt(nilfs, STRICT_ORDER);
+ else
+ return -EINVAL;
+ break;
+ case Opt_err:
+ nilfs->ns_mount_opt &= ~NILFS_MOUNT_ERROR_MODE;
+ nilfs->ns_mount_opt |= result.uint_32;
+ break;
+ case Opt_snapshot:
+ if (is_remount) {
+ struct super_block *sb = fc->root->d_sb;
+
+ nilfs_err(sb,
+ "\"%s\" option is invalid for remount",
+ param->key);
+ return -EINVAL;
+ }
+ if (result.uint_64 == 0) {
+ nilfs_err(NULL,
+ "invalid option \"cp=0\": invalid checkpoint number 0");
+ return -EINVAL;
}
+ nilfs->cno = result.uint_64;
+ break;
+ case Opt_norecovery:
+ nilfs_set_opt(nilfs, NORECOVERY);
+ break;
+ case Opt_discard:
+ if (result.negated)
+ nilfs_clear_opt(nilfs, DISCARD);
+ else
+ nilfs_set_opt(nilfs, DISCARD);
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
-}
-
-static inline void
-nilfs_set_default_options(struct super_block *sb,
- struct nilfs_super_block *sbp)
-{
- struct the_nilfs *nilfs = sb->s_fs_info;
- nilfs->ns_mount_opt =
- NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
+ return 0;
}
static int nilfs_setup_super(struct super_block *sb, int is_mount)
@@ -857,9 +849,8 @@ struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset);
}
-int nilfs_store_magic_and_option(struct super_block *sb,
- struct nilfs_super_block *sbp,
- char *data)
+int nilfs_store_magic(struct super_block *sb,
+ struct nilfs_super_block *sbp)
{
struct the_nilfs *nilfs = sb->s_fs_info;
@@ -870,14 +861,12 @@ int nilfs_store_magic_and_option(struct super_block *sb,
sb->s_flags |= SB_NOATIME;
#endif
- nilfs_set_default_options(sb, sbp);
-
nilfs->ns_resuid = le16_to_cpu(sbp->s_def_resuid);
nilfs->ns_resgid = le16_to_cpu(sbp->s_def_resgid);
nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval);
nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max);
- return !parse_options(data, sb, 0) ? -EINVAL : 0;
+ return 0;
}
int nilfs_check_feature_compatibility(struct super_block *sb,
@@ -1035,17 +1024,17 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
/**
* nilfs_fill_super() - initialize a super block instance
* @sb: super_block
- * @data: mount options
- * @silent: silent mode flag
+ * @fc: filesystem context
*
* This function is called exclusively by nilfs->ns_mount_mutex.
* So, the recovery process is protected from other simultaneous mounts.
*/
static int
-nilfs_fill_super(struct super_block *sb, void *data, int silent)
+nilfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct the_nilfs *nilfs;
struct nilfs_root *fsroot;
+ struct nilfs_fs_context *ctx = fc->fs_private;
__u64 cno;
int err;
@@ -1055,10 +1044,13 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_fs_info = nilfs;
- err = init_nilfs(nilfs, sb, (char *)data);
+ err = init_nilfs(nilfs, sb);
if (err)
goto failed_nilfs;
+ /* Copy in parsed mount options */
+ nilfs->ns_mount_opt = ctx->ns_mount_opt;
+
sb->s_op = &nilfs_sops;
sb->s_export_op = &nilfs_export_ops;
sb->s_root = NULL;
@@ -1117,34 +1109,25 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
return err;
}
-static int nilfs_remount(struct super_block *sb, int *flags, char *data)
+static int nilfs_reconfigure(struct fs_context *fc)
{
+ struct nilfs_fs_context *ctx = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
struct the_nilfs *nilfs = sb->s_fs_info;
- unsigned long old_sb_flags;
- unsigned long old_mount_opt;
int err;
sync_filesystem(sb);
- old_sb_flags = sb->s_flags;
- old_mount_opt = nilfs->ns_mount_opt;
-
- if (!parse_options(data, sb, 1)) {
- err = -EINVAL;
- goto restore_opts;
- }
- sb->s_flags = (sb->s_flags & ~SB_POSIXACL);
err = -EINVAL;
if (!nilfs_valid_fs(nilfs)) {
nilfs_warn(sb,
"couldn't remount because the filesystem is in an incomplete recovery state");
- goto restore_opts;
+ goto ignore_opts;
}
-
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb))
goto out;
- if (*flags & SB_RDONLY) {
+ if (fc->sb_flags & SB_RDONLY) {
sb->s_flags |= SB_RDONLY;
/*
@@ -1172,138 +1155,67 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
"couldn't remount RDWR because of unsupported optional features (%llx)",
(unsigned long long)features);
err = -EROFS;
- goto restore_opts;
+ goto ignore_opts;
}
sb->s_flags &= ~SB_RDONLY;
root = NILFS_I(d_inode(sb->s_root))->i_root;
err = nilfs_attach_log_writer(sb, root);
- if (err)
- goto restore_opts;
+ if (err) {
+ sb->s_flags |= SB_RDONLY;
+ goto ignore_opts;
+ }
down_write(&nilfs->ns_sem);
nilfs_setup_super(sb, true);
up_write(&nilfs->ns_sem);
}
out:
- return 0;
-
- restore_opts:
- sb->s_flags = old_sb_flags;
- nilfs->ns_mount_opt = old_mount_opt;
- return err;
-}
-
-struct nilfs_super_data {
- __u64 cno;
- int flags;
-};
-
-static int nilfs_parse_snapshot_option(const char *option,
- const substring_t *arg,
- struct nilfs_super_data *sd)
-{
- unsigned long long val;
- const char *msg = NULL;
- int err;
-
- if (!(sd->flags & SB_RDONLY)) {
- msg = "read-only option is not specified";
- goto parse_error;
- }
-
- err = kstrtoull(arg->from, 0, &val);
- if (err) {
- if (err == -ERANGE)
- msg = "too large checkpoint number";
- else
- msg = "malformed argument";
- goto parse_error;
- } else if (val == 0) {
- msg = "invalid checkpoint number 0";
- goto parse_error;
- }
- sd->cno = val;
- return 0;
-
-parse_error:
- nilfs_err(NULL, "invalid option \"%s\": %s", option, msg);
- return 1;
-}
-
-/**
- * nilfs_identify - pre-read mount options needed to identify mount instance
- * @data: mount options
- * @sd: nilfs_super_data
- */
-static int nilfs_identify(char *data, struct nilfs_super_data *sd)
-{
- char *p, *options = data;
- substring_t args[MAX_OPT_ARGS];
- int token;
- int ret = 0;
-
- do {
- p = strsep(&options, ",");
- if (p != NULL && *p) {
- token = match_token(p, tokens, args);
- if (token == Opt_snapshot)
- ret = nilfs_parse_snapshot_option(p, &args[0],
- sd);
- }
- if (!options)
- break;
- BUG_ON(options == data);
- *(options - 1) = ',';
- } while (!ret);
- return ret;
-}
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL);
+ /* Copy over parsed remount options */
+ nilfs->ns_mount_opt = ctx->ns_mount_opt;
-static int nilfs_set_bdev_super(struct super_block *s, void *data)
-{
- s->s_dev = *(dev_t *)data;
return 0;
-}
-static int nilfs_test_bdev_super(struct super_block *s, void *data)
-{
- return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
+ ignore_opts:
+ return err;
}
-static struct dentry *
-nilfs_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
+static int
+nilfs_get_tree(struct fs_context *fc)
{
- struct nilfs_super_data sd = { .flags = flags };
+ struct nilfs_fs_context *ctx = fc->fs_private;
struct super_block *s;
dev_t dev;
int err;
- if (nilfs_identify(data, &sd))
- return ERR_PTR(-EINVAL);
+ if (ctx->cno && !(fc->sb_flags & SB_RDONLY)) {
+ nilfs_err(NULL,
+ "invalid option \"cp=%llu\": read-only option is not specified",
+ ctx->cno);
+ return -EINVAL;
+ }
- err = lookup_bdev(dev_name, &dev);
+ err = lookup_bdev(fc->source, &dev);
if (err)
- return ERR_PTR(err);
+ return err;
- s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags,
- &dev);
+ s = sget_dev(fc, dev);
if (IS_ERR(s))
- return ERR_CAST(s);
+ return PTR_ERR(s);
if (!s->s_root) {
- err = setup_bdev_super(s, flags, NULL);
+ err = setup_bdev_super(s, fc->sb_flags, fc);
if (!err)
- err = nilfs_fill_super(s, data,
- flags & SB_SILENT ? 1 : 0);
+ err = nilfs_fill_super(s, fc);
if (err)
goto failed_super;
s->s_flags |= SB_ACTIVE;
- } else if (!sd.cno) {
+ } else if (!ctx->cno) {
if (nilfs_tree_is_busy(s->s_root)) {
- if ((flags ^ s->s_flags) & SB_RDONLY) {
+ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
nilfs_err(s,
"the device already has a %s mount.",
sb_rdonly(s) ? "read-only" : "read/write");
@@ -1312,37 +1224,75 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
}
} else {
/*
- * Try remount to setup mount states if the current
+ * Try reconfigure to setup mount states if the current
* tree is not mounted and only snapshots use this sb.
+ *
+ * Since nilfs_reconfigure() requires fc->root to be
+ * set, set it first and release it on failure.
*/
- err = nilfs_remount(s, &flags, data);
- if (err)
+ fc->root = dget(s->s_root);
+ err = nilfs_reconfigure(fc);
+ if (err) {
+ dput(fc->root);
+ fc->root = NULL; /* prevent double release */
goto failed_super;
+ }
+ return 0;
}
}
- if (sd.cno) {
+ if (ctx->cno) {
struct dentry *root_dentry;
- err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
+ err = nilfs_attach_snapshot(s, ctx->cno, &root_dentry);
if (err)
goto failed_super;
- return root_dentry;
+ fc->root = root_dentry;
+ return 0;
}
- return dget(s->s_root);
+ fc->root = dget(s->s_root);
+ return 0;
failed_super:
deactivate_locked_super(s);
- return ERR_PTR(err);
+ return err;
+}
+
+static void nilfs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations nilfs_context_ops = {
+ .parse_param = nilfs_parse_param,
+ .get_tree = nilfs_get_tree,
+ .reconfigure = nilfs_reconfigure,
+ .free = nilfs_free_fc,
+};
+
+static int nilfs_init_fs_context(struct fs_context *fc)
+{
+ struct nilfs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ns_mount_opt = NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
+ fc->fs_private = ctx;
+ fc->ops = &nilfs_context_ops;
+
+ return 0;
}
struct file_system_type nilfs_fs_type = {
.owner = THIS_MODULE,
.name = "nilfs2",
- .mount = nilfs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = nilfs_init_fs_context,
+ .parameters = nilfs_param_spec,
};
MODULE_ALIAS_FS("nilfs2");
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 2ae2c1bbf6d1..f41d7b6d432c 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -592,7 +592,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
struct nilfs_super_block **sbp = nilfs->ns_sbp;
struct buffer_head **sbh = nilfs->ns_sbh;
u64 sb2off, devsize = bdev_nr_bytes(nilfs->ns_bdev);
- int valid[2], swp = 0;
+ int valid[2], swp = 0, older;
if (devsize < NILFS_SEG_MIN_BLOCKS * NILFS_MIN_BLOCK_SIZE + 4096) {
nilfs_err(sb, "device size too small");
@@ -648,9 +648,25 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
if (swp)
nilfs_swap_super_block(nilfs);
+ /*
+ * Calculate the array index of the older superblock data.
+ * If one has been dropped, set index 0 pointing to the remaining one,
+ * otherwise set index 1 pointing to the old one (including if both
+ * are the same).
+ *
+ * Divided case valid[0] valid[1] swp -> older
+ * -------------------------------------------------------------
+ * Both SBs are invalid 0 0 N/A (Error)
+ * SB1 is invalid 0 1 1 0
+ * SB2 is invalid 1 0 0 0
+ * SB2 is newer 1 1 1 0
+ * SB2 is older or the same 1 1 0 1
+ */
+ older = valid[1] ^ swp;
+
nilfs->ns_sbwcount = 0;
nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
- nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq);
+ nilfs->ns_prot_seq = le64_to_cpu(sbp[older]->s_last_seq);
*sbpp = sbp[0];
return 0;
}
@@ -659,7 +675,6 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
* init_nilfs - initialize a NILFS instance.
* @nilfs: the_nilfs structure
* @sb: super block
- * @data: mount options
*
* init_nilfs() performs common initialization per block device (e.g.
* reading the super block, getting disk layout information, initializing
@@ -668,7 +683,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
* Return Value: On success, 0 is returned. On error, a negative error
* code is returned.
*/
-int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
+int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
{
struct nilfs_super_block *sbp;
int blocksize;
@@ -686,7 +701,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
if (err)
goto out;
- err = nilfs_store_magic_and_option(sb, sbp, data);
+ err = nilfs_store_magic(sb, sbp);
if (err)
goto failed_sbh;
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index cd4ae1b8ae16..85da0629415d 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -219,10 +219,6 @@ THE_NILFS_FNS(PURGING, purging)
#define nilfs_set_opt(nilfs, opt) \
((nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt)
#define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt)
-#define nilfs_write_opt(nilfs, mask, opt) \
- ((nilfs)->ns_mount_opt = \
- (((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \
- NILFS_MOUNT_##opt)) \
/**
* struct nilfs_root - nilfs root object
@@ -276,7 +272,7 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
struct the_nilfs *alloc_nilfs(struct super_block *sb);
void destroy_nilfs(struct the_nilfs *nilfs);
-int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data);
+int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs);
void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs);
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 3464fa7e8538..f3669403fabf 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -162,7 +162,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
if (!S_ISDIR(inode->i_mode))
return;
- fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
+ fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group);
if (!fsn_mark)
return;
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
@@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
fsnotify_group_lock(dnotify_group);
/* add the new_fsn_mark or find an old one. */
- fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
+ fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group);
if (fsn_mark) {
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
spin_lock(&fsn_mark->lock);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index fbdc63cc10d9..9ec313e9f6e1 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1076,7 +1076,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
}
static int fanotify_remove_mark(struct fsnotify_group *group,
- fsnotify_connp_t *connp, __u32 mask,
+ void *obj, unsigned int obj_type, __u32 mask,
unsigned int flags, __u32 umask)
{
struct fsnotify_mark *fsn_mark = NULL;
@@ -1084,7 +1084,7 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
int destroy_mark;
fsnotify_group_lock(group);
- fsn_mark = fsnotify_find_mark(connp, group);
+ fsn_mark = fsnotify_find_mark(obj, obj_type, group);
if (!fsn_mark) {
fsnotify_group_unlock(group);
return -ENOENT;
@@ -1105,30 +1105,6 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
return 0;
}
-static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
- struct vfsmount *mnt, __u32 mask,
- unsigned int flags, __u32 umask)
-{
- return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
- mask, flags, umask);
-}
-
-static int fanotify_remove_sb_mark(struct fsnotify_group *group,
- struct super_block *sb, __u32 mask,
- unsigned int flags, __u32 umask)
-{
- return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask,
- flags, umask);
-}
-
-static int fanotify_remove_inode_mark(struct fsnotify_group *group,
- struct inode *inode, __u32 mask,
- unsigned int flags, __u32 umask)
-{
- return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask,
- flags, umask);
-}
-
static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
unsigned int fan_flags)
{
@@ -1249,7 +1225,7 @@ static int fanotify_set_mark_fsid(struct fsnotify_group *group,
}
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
- fsnotify_connp_t *connp,
+ void *obj,
unsigned int obj_type,
unsigned int fan_flags,
struct fan_fsid *fsid)
@@ -1288,7 +1264,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0;
}
- ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0);
+ ret = fsnotify_add_mark_locked(mark, obj, obj_type, 0);
if (ret)
goto out_put_mark;
@@ -1344,7 +1320,7 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
}
static int fanotify_add_mark(struct fsnotify_group *group,
- fsnotify_connp_t *connp, unsigned int obj_type,
+ void *obj, unsigned int obj_type,
__u32 mask, unsigned int fan_flags,
struct fan_fsid *fsid)
{
@@ -1353,9 +1329,9 @@ static int fanotify_add_mark(struct fsnotify_group *group,
int ret = 0;
fsnotify_group_lock(group);
- fsn_mark = fsnotify_find_mark(connp, group);
+ fsn_mark = fsnotify_find_mark(obj, obj_type, group);
if (!fsn_mark) {
- fsn_mark = fanotify_add_new_mark(group, connp, obj_type,
+ fsn_mark = fanotify_add_new_mark(group, obj, obj_type,
fan_flags, fsid);
if (IS_ERR(fsn_mark)) {
fsnotify_group_unlock(group);
@@ -1392,42 +1368,6 @@ out:
return ret;
}
-static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
- struct vfsmount *mnt, __u32 mask,
- unsigned int flags, struct fan_fsid *fsid)
-{
- return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
-}
-
-static int fanotify_add_sb_mark(struct fsnotify_group *group,
- struct super_block *sb, __u32 mask,
- unsigned int flags, struct fan_fsid *fsid)
-{
- return fanotify_add_mark(group, &sb->s_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
-}
-
-static int fanotify_add_inode_mark(struct fsnotify_group *group,
- struct inode *inode, __u32 mask,
- unsigned int flags, struct fan_fsid *fsid)
-{
- pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
-
- /*
- * If some other task has this inode open for write we should not add
- * an ignore mask, unless that ignore mask is supposed to survive
- * modification changes anyway.
- */
- if ((flags & FANOTIFY_MARK_IGNORE_BITS) &&
- !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
- inode_is_open_for_write(inode))
- return 0;
-
- return fanotify_add_mark(group, &inode->i_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
-}
-
static struct fsnotify_event *fanotify_alloc_overflow_event(void)
{
struct fanotify_event *oevent;
@@ -1576,13 +1516,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
INIT_LIST_HEAD(&group->fanotify_data.access_list);
switch (class) {
case FAN_CLASS_NOTIF:
- group->priority = FS_PRIO_0;
+ group->priority = FSNOTIFY_PRIO_NORMAL;
break;
case FAN_CLASS_CONTENT:
- group->priority = FS_PRIO_1;
+ group->priority = FSNOTIFY_PRIO_CONTENT;
break;
case FAN_CLASS_PRE_CONTENT:
- group->priority = FS_PRIO_2;
+ group->priority = FSNOTIFY_PRIO_PRE_CONTENT;
break;
default:
fd = -EINVAL;
@@ -1750,6 +1690,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
unsigned int obj_type, fid_mode;
+ void *obj;
u32 umask = 0;
int ret;
@@ -1833,12 +1774,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
goto fput_and_out;
/*
- * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not
- * allowed to set permissions events.
+ * Permission events require minimum priority FAN_CLASS_CONTENT.
*/
ret = -EINVAL;
if (mask & FANOTIFY_PERM_EVENTS &&
- group->priority == FS_PRIO_0)
+ group->priority < FSNOTIFY_PRIO_CONTENT)
goto fput_and_out;
if (mask & FAN_FS_ERROR &&
@@ -1908,17 +1848,34 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
/* inode held in place by reference to path; group by fget on fd */
- if (mark_type == FAN_MARK_INODE)
+ if (mark_type == FAN_MARK_INODE) {
inode = path.dentry->d_inode;
- else
+ obj = inode;
+ } else {
mnt = path.mnt;
+ if (mark_type == FAN_MARK_MOUNT)
+ obj = mnt;
+ else
+ obj = mnt->mnt_sb;
+ }
- ret = mnt ? -EINVAL : -EISDIR;
- /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
- if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE &&
- (mnt || S_ISDIR(inode->i_mode)) &&
- !(flags & FAN_MARK_IGNORED_SURV_MODIFY))
- goto path_put_and_out;
+ /*
+ * If some other task has this inode open for write we should not add
+ * an ignore mask, unless that ignore mask is supposed to survive
+ * modification changes anyway.
+ */
+ if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) &&
+ !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) {
+ ret = mnt ? -EINVAL : -EISDIR;
+ /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
+ if (ignore == FAN_MARK_IGNORE &&
+ (mnt || S_ISDIR(inode->i_mode)))
+ goto path_put_and_out;
+
+ ret = 0;
+ if (inode && inode_is_open_for_write(inode))
+ goto path_put_and_out;
+ }
/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
if (mnt || !S_ISDIR(inode->i_mode)) {
@@ -1936,26 +1893,12 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
/* create/update an inode mark */
switch (mark_cmd) {
case FAN_MARK_ADD:
- if (mark_type == FAN_MARK_MOUNT)
- ret = fanotify_add_vfsmount_mark(group, mnt, mask,
- flags, fsid);
- else if (mark_type == FAN_MARK_FILESYSTEM)
- ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
- flags, fsid);
- else
- ret = fanotify_add_inode_mark(group, inode, mask,
- flags, fsid);
+ ret = fanotify_add_mark(group, obj, obj_type, mask, flags,
+ fsid);
break;
case FAN_MARK_REMOVE:
- if (mark_type == FAN_MARK_MOUNT)
- ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
- flags, umask);
- else if (mark_type == FAN_MARK_FILESYSTEM)
- ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
- flags, umask);
- else
- ret = fanotify_remove_inode_mark(group, inode, mask,
- flags, umask);
+ ret = fanotify_remove_mark(group, obj, obj_type, mask, flags,
+ umask);
break;
default:
ret = -EINVAL;
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 5c430736ec12..dec553034027 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -41,29 +41,25 @@ static void show_fdinfo(struct seq_file *m, struct file *f,
#if defined(CONFIG_EXPORTFS)
static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
{
- struct {
- struct file_handle handle;
- u8 pad[MAX_HANDLE_SZ];
- } f;
+ DEFINE_FLEX(struct file_handle, f, f_handle, handle_bytes, MAX_HANDLE_SZ);
int size, ret, i;
- f.handle.handle_bytes = sizeof(f.pad);
- size = f.handle.handle_bytes >> 2;
+ size = f->handle_bytes >> 2;
- ret = exportfs_encode_fid(inode, (struct fid *)f.handle.f_handle, &size);
+ ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size);
if ((ret == FILEID_INVALID) || (ret < 0)) {
WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
return;
}
- f.handle.handle_type = ret;
- f.handle.handle_bytes = size * sizeof(u32);
+ f->handle_type = ret;
+ f->handle_bytes = size * sizeof(u32);
seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:",
- f.handle.handle_bytes, f.handle.handle_type);
+ f->handle_bytes, f->handle_type);
- for (i = 0; i < f.handle.handle_bytes; i++)
- seq_printf(m, "%02x", (int)f.handle.f_handle[i]);
+ for (i = 0; i < f->handle_bytes; i++)
+ seq_printf(m, "%02x", (int)f->f_handle[i]);
}
#else
static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 2fc105a72a8f..ff69ae24c4e8 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -89,11 +89,25 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
void fsnotify_sb_delete(struct super_block *sb)
{
+ struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
+
+ /* Were any marks ever added to any object on this sb? */
+ if (!sbinfo)
+ return;
+
fsnotify_unmount_inodes(sb);
fsnotify_clear_marks_by_sb(sb);
/* Wait for outstanding object references from connectors */
- wait_var_event(&sb->s_fsnotify_connectors,
- !atomic_long_read(&sb->s_fsnotify_connectors));
+ wait_var_event(fsnotify_sb_watched_objects(sb),
+ !atomic_long_read(fsnotify_sb_watched_objects(sb)));
+ WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT));
+ WARN_ON(fsnotify_sb_has_priority_watchers(sb,
+ FSNOTIFY_PRIO_PRE_CONTENT));
+}
+
+void fsnotify_sb_free(struct super_block *sb)
+{
+ kfree(sb->s_fsnotify_info);
}
/*
@@ -489,6 +503,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
{
const struct path *path = fsnotify_data_path(data, data_type);
struct super_block *sb = fsnotify_data_sb(data, data_type);
+ struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
struct fsnotify_iter_info iter_info = {};
struct mount *mnt = NULL;
struct inode *inode2 = NULL;
@@ -525,7 +540,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
* SRCU because we have no references to any objects and do not
* need SRCU to keep them "alive".
*/
- if (!sb->s_fsnotify_marks &&
+ if ((!sbinfo || !sbinfo->sb_marks) &&
(!mnt || !mnt->mnt_fsnotify_marks) &&
(!inode || !inode->i_fsnotify_marks) &&
(!inode2 || !inode2->i_fsnotify_marks))
@@ -552,8 +567,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
- iter_info.marks[FSNOTIFY_ITER_TYPE_SB] =
- fsnotify_first_mark(&sb->s_fsnotify_marks);
+ if (sbinfo) {
+ iter_info.marks[FSNOTIFY_ITER_TYPE_SB] =
+ fsnotify_first_mark(&sbinfo->sb_marks);
+ }
if (mnt) {
iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] =
fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index fde74eb333cc..2d059f789ee3 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -9,39 +9,58 @@
#include "../mount.h"
+/*
+ * fsnotify_connp_t is what we embed in objects which connector can be attached
+ * to.
+ */
+typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t;
+
static inline struct inode *fsnotify_conn_inode(
struct fsnotify_mark_connector *conn)
{
- return container_of(conn->obj, struct inode, i_fsnotify_marks);
+ return conn->obj;
}
static inline struct mount *fsnotify_conn_mount(
struct fsnotify_mark_connector *conn)
{
- return container_of(conn->obj, struct mount, mnt_fsnotify_marks);
+ return real_mount(conn->obj);
}
static inline struct super_block *fsnotify_conn_sb(
struct fsnotify_mark_connector *conn)
{
- return container_of(conn->obj, struct super_block, s_fsnotify_marks);
+ return conn->obj;
}
-static inline struct super_block *fsnotify_connector_sb(
- struct fsnotify_mark_connector *conn)
+static inline struct super_block *fsnotify_object_sb(void *obj,
+ enum fsnotify_obj_type obj_type)
{
- switch (conn->type) {
+ switch (obj_type) {
case FSNOTIFY_OBJ_TYPE_INODE:
- return fsnotify_conn_inode(conn)->i_sb;
+ return ((struct inode *)obj)->i_sb;
case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
- return fsnotify_conn_mount(conn)->mnt.mnt_sb;
+ return ((struct vfsmount *)obj)->mnt_sb;
case FSNOTIFY_OBJ_TYPE_SB:
- return fsnotify_conn_sb(conn);
+ return (struct super_block *)obj;
default:
return NULL;
}
}
+static inline struct super_block *fsnotify_connector_sb(
+ struct fsnotify_mark_connector *conn)
+{
+ return fsnotify_object_sb(conn->obj, conn->type);
+}
+
+static inline fsnotify_connp_t *fsnotify_sb_marks(struct super_block *sb)
+{
+ struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
+
+ return sbinfo ? &sbinfo->sb_marks : NULL;
+}
+
/* destroy all events sitting in this groups notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);
@@ -67,7 +86,7 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
/* run the list of all marks associated with sb and destroy them */
static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
{
- fsnotify_destroy_marks(&sb->s_fsnotify_marks);
+ fsnotify_destroy_marks(fsnotify_sb_marks(sb));
}
/*
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 85d8fdd55329..4ffc30606e0b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -544,7 +544,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
int create = (arg & IN_MASK_CREATE);
int ret;
- fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
+ fsn_mark = fsnotify_find_inode_mark(inode, group);
if (!fsn_mark)
return -ENOENT;
else if (create) {
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d6944ff86ffa..c3eefa70633c 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -97,6 +97,21 @@ void fsnotify_get_mark(struct fsnotify_mark *mark)
refcount_inc(&mark->refcnt);
}
+static fsnotify_connp_t *fsnotify_object_connp(void *obj,
+ enum fsnotify_obj_type obj_type)
+{
+ switch (obj_type) {
+ case FSNOTIFY_OBJ_TYPE_INODE:
+ return &((struct inode *)obj)->i_fsnotify_marks;
+ case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
+ return &real_mount(obj)->mnt_fsnotify_marks;
+ case FSNOTIFY_OBJ_TYPE_SB:
+ return fsnotify_sb_marks(obj);
+ default:
+ return NULL;
+ }
+}
+
static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn)
{
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
@@ -116,10 +131,69 @@ __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn)
return *fsnotify_conn_mask_p(conn);
}
+static void fsnotify_get_sb_watched_objects(struct super_block *sb)
+{
+ atomic_long_inc(fsnotify_sb_watched_objects(sb));
+}
+
+static void fsnotify_put_sb_watched_objects(struct super_block *sb)
+{
+ if (atomic_long_dec_and_test(fsnotify_sb_watched_objects(sb)))
+ wake_up_var(fsnotify_sb_watched_objects(sb));
+}
+
static void fsnotify_get_inode_ref(struct inode *inode)
{
ihold(inode);
- atomic_long_inc(&inode->i_sb->s_fsnotify_connectors);
+ fsnotify_get_sb_watched_objects(inode->i_sb);
+}
+
+static void fsnotify_put_inode_ref(struct inode *inode)
+{
+ fsnotify_put_sb_watched_objects(inode->i_sb);
+ iput(inode);
+}
+
+/*
+ * Grab or drop watched objects reference depending on whether the connector
+ * is attached and has any marks attached.
+ */
+static void fsnotify_update_sb_watchers(struct super_block *sb,
+ struct fsnotify_mark_connector *conn)
+{
+ struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
+ bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED;
+ struct fsnotify_mark *first_mark = NULL;
+ unsigned int highest_prio = 0;
+
+ if (conn->obj)
+ first_mark = hlist_entry_safe(conn->list.first,
+ struct fsnotify_mark, obj_list);
+ if (first_mark)
+ highest_prio = first_mark->group->priority;
+ if (WARN_ON(highest_prio >= __FSNOTIFY_PRIO_NUM))
+ highest_prio = 0;
+
+ /*
+ * If the highest priority of group watching this object is prio,
+ * then watched object has a reference on counters [0..prio].
+ * Update priority >= 1 watched objects counters.
+ */
+ for (unsigned int p = conn->prio + 1; p <= highest_prio; p++)
+ atomic_long_inc(&sbinfo->watched_objects[p]);
+ for (unsigned int p = conn->prio; p > highest_prio; p--)
+ atomic_long_dec(&sbinfo->watched_objects[p]);
+ conn->prio = highest_prio;
+
+ /* Update priority >= 0 (a.k.a total) watched objects counter */
+ BUILD_BUG_ON(FSNOTIFY_PRIO_NORMAL != 0);
+ if (first_mark && !is_watched) {
+ conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED;
+ fsnotify_get_sb_watched_objects(sb);
+ } else if (!first_mark && is_watched) {
+ conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED;
+ fsnotify_put_sb_watched_objects(sb);
+ }
}
/*
@@ -213,35 +287,12 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
}
}
-static void fsnotify_put_inode_ref(struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
-
- iput(inode);
- if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
- wake_up_var(&sb->s_fsnotify_connectors);
-}
-
-static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn)
-{
- struct super_block *sb = fsnotify_connector_sb(conn);
-
- if (sb)
- atomic_long_inc(&sb->s_fsnotify_connectors);
-}
-
-static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn)
-{
- struct super_block *sb = fsnotify_connector_sb(conn);
-
- if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
- wake_up_var(&sb->s_fsnotify_connectors);
-}
-
static void *fsnotify_detach_connector_from_object(
struct fsnotify_mark_connector *conn,
unsigned int *type)
{
+ fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type);
+ struct super_block *sb = fsnotify_connector_sb(conn);
struct inode *inode = NULL;
*type = conn->type;
@@ -261,10 +312,10 @@ static void *fsnotify_detach_connector_from_object(
fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
}
- fsnotify_put_sb_connectors(conn);
- rcu_assign_pointer(*(conn->obj), NULL);
+ rcu_assign_pointer(*connp, NULL);
conn->obj = NULL;
conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
+ fsnotify_update_sb_watchers(sb, conn);
return inode;
}
@@ -316,6 +367,11 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
objp = fsnotify_detach_connector_from_object(conn, &type);
free_conn = true;
} else {
+ struct super_block *sb = fsnotify_connector_sb(conn);
+
+ /* Update watched objects after detaching mark */
+ if (sb)
+ fsnotify_update_sb_watchers(sb, conn);
objp = __fsnotify_recalc_mask(conn);
type = conn->type;
}
@@ -536,8 +592,28 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
return -1;
}
+static int fsnotify_attach_info_to_sb(struct super_block *sb)
+{
+ struct fsnotify_sb_info *sbinfo;
+
+ /* sb info is freed on fsnotify_sb_delete() */
+ sbinfo = kzalloc(sizeof(*sbinfo), GFP_KERNEL);
+ if (!sbinfo)
+ return -ENOMEM;
+
+ /*
+ * cmpxchg() provides the barrier so that callers of fsnotify_sb_info()
+ * will observe an initialized structure
+ */
+ if (cmpxchg(&sb->s_fsnotify_info, NULL, sbinfo)) {
+ /* Someone else created sbinfo for us */
+ kfree(sbinfo);
+ }
+ return 0;
+}
+
static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
- unsigned int obj_type)
+ void *obj, unsigned int obj_type)
{
struct fsnotify_mark_connector *conn;
@@ -547,10 +623,9 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
spin_lock_init(&conn->lock);
INIT_HLIST_HEAD(&conn->list);
conn->flags = 0;
+ conn->prio = 0;
conn->type = obj_type;
- conn->obj = connp;
- conn->flags = 0;
- fsnotify_get_sb_connectors(conn);
+ conn->obj = obj;
/*
* cmpxchg() provides the barrier so that readers of *connp can see
@@ -558,10 +633,8 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
*/
if (cmpxchg(connp, NULL, conn)) {
/* Someone else created list structure for us */
- fsnotify_put_sb_connectors(conn);
kmem_cache_free(fsnotify_mark_connector_cachep, conn);
}
-
return 0;
}
@@ -598,24 +671,36 @@ out:
* to which group and for which inodes. These marks are ordered according to
* priority, highest number first, and then by the group's location in memory.
*/
-static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
- fsnotify_connp_t *connp,
+static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj,
unsigned int obj_type, int add_flags)
{
+ struct super_block *sb = fsnotify_object_sb(obj, obj_type);
struct fsnotify_mark *lmark, *last = NULL;
struct fsnotify_mark_connector *conn;
+ fsnotify_connp_t *connp;
int cmp;
int err = 0;
if (WARN_ON(!fsnotify_valid_obj_type(obj_type)))
return -EINVAL;
+ /*
+ * Attach the sb info before attaching a connector to any object on sb.
+ * The sb info will remain attached as long as sb lives.
+ */
+ if (!fsnotify_sb_info(sb)) {
+ err = fsnotify_attach_info_to_sb(sb);
+ if (err)
+ return err;
+ }
+
+ connp = fsnotify_object_connp(obj, obj_type);
restart:
spin_lock(&mark->lock);
conn = fsnotify_grab_connector(connp);
if (!conn) {
spin_unlock(&mark->lock);
- err = fsnotify_attach_connector_to_object(connp, obj_type);
+ err = fsnotify_attach_connector_to_object(connp, obj, obj_type);
if (err)
return err;
goto restart;
@@ -649,6 +734,7 @@ restart:
/* mark should be the last entry. last is the current last entry */
hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
added:
+ fsnotify_update_sb_watchers(sb, conn);
/*
* Since connector is attached to object using cmpxchg() we are
* guaranteed that connector initialization is fully visible by anyone
@@ -667,7 +753,7 @@ out_err:
* event types should be delivered to which group.
*/
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
- fsnotify_connp_t *connp, unsigned int obj_type,
+ void *obj, unsigned int obj_type,
int add_flags)
{
struct fsnotify_group *group = mark->group;
@@ -688,7 +774,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_get_mark(mark); /* for g_list */
spin_unlock(&mark->lock);
- ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags);
+ ret = fsnotify_add_mark_list(mark, obj, obj_type, add_flags);
if (ret)
goto err;
@@ -706,14 +792,14 @@ err:
return ret;
}
-int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
+int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj,
unsigned int obj_type, int add_flags)
{
int ret;
struct fsnotify_group *group = mark->group;
fsnotify_group_lock(group);
- ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags);
+ ret = fsnotify_add_mark_locked(mark, obj, obj_type, add_flags);
fsnotify_group_unlock(group);
return ret;
}
@@ -723,12 +809,16 @@ EXPORT_SYMBOL_GPL(fsnotify_add_mark);
* Given a list of marks, find the mark associated with given group. If found
* take a reference to that mark and return it, else return NULL.
*/
-struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp,
+struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type,
struct fsnotify_group *group)
{
+ fsnotify_connp_t *connp = fsnotify_object_connp(obj, obj_type);
struct fsnotify_mark_connector *conn;
struct fsnotify_mark *mark;
+ if (!connp)
+ return NULL;
+
conn = fsnotify_grab_connector(connp);
if (!conn)
return NULL;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b82185075de7..f0467d3b3c88 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2283,8 +2283,6 @@ unlock:
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
out:
- if (ret < 0)
- ret = -EIO;
return ret;
}
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 960080753d3b..2b8fa3e782fb 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1784,6 +1784,9 @@ static int o2net_accept_one(struct socket *sock, int *more)
struct o2nm_node *node = NULL;
struct o2nm_node *local_node = NULL;
struct o2net_sock_container *sc = NULL;
+ struct proto_accept_arg arg = {
+ .flags = O_NONBLOCK,
+ };
struct o2net_node *nn;
unsigned int nofs_flag;
@@ -1802,7 +1805,7 @@ static int o2net_accept_one(struct socket *sock, int *more)
new_sock->type = sock->type;
new_sock->ops = sock->ops;
- ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, false);
+ ret = sock->ops->accept(sock, new_sock, &arg);
if (ret < 0)
goto out;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 5c04dde99981..2018501b2249 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1274,7 +1274,7 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
{
struct dlm_query_nodeinfo *qn;
struct dlm_ctxt *dlm = NULL;
- int locked = 0, status = -EINVAL;
+ int status = -EINVAL;
qn = (struct dlm_query_nodeinfo *) msg->buf;
@@ -1290,12 +1290,11 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
}
spin_lock(&dlm->spinlock);
- locked = 1;
if (dlm->joining_node != qn->qn_nodenum) {
mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
"joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
dlm->joining_node);
- goto bail;
+ goto unlock;
}
/* Support for node query was added in 1.1 */
@@ -1305,14 +1304,14 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
"but active dlm protocol is %d.%d\n", qn->qn_nodenum,
qn->qn_domain, dlm->dlm_locking_proto.pv_major,
dlm->dlm_locking_proto.pv_minor);
- goto bail;
+ goto unlock;
}
status = dlm_match_nodes(dlm, qn);
+unlock:
+ spin_unlock(&dlm->spinlock);
bail:
- if (locked)
- spin_unlock(&dlm->spinlock);
spin_unlock(&dlm_domain_lock);
return status;
@@ -1528,7 +1527,6 @@ static void dlm_send_join_asserts(struct dlm_ctxt *dlm,
{
int status, node, live;
- status = 0;
node = -1;
while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
node + 1)) < O2NM_MAX_NODES) {
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index b8b6a191b5cb..96b684763b39 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -255,9 +255,9 @@ static struct dentry *ocfs2_fh_to_dentry(struct super_block *sb,
if (fh_len < 3 || fh_type > 2)
return NULL;
- handle.ih_blkno = (u64)le32_to_cpu(fid->raw[0]) << 32;
- handle.ih_blkno |= (u64)le32_to_cpu(fid->raw[1]);
- handle.ih_generation = le32_to_cpu(fid->raw[2]);
+ handle.ih_blkno = (u64)le32_to_cpu((__force __le32)fid->raw[0]) << 32;
+ handle.ih_blkno |= (u64)le32_to_cpu((__force __le32)fid->raw[1]);
+ handle.ih_generation = le32_to_cpu((__force __le32)fid->raw[2]);
return ocfs2_get_dentry(sb, &handle);
}
@@ -269,9 +269,9 @@ static struct dentry *ocfs2_fh_to_parent(struct super_block *sb,
if (fh_type != 2 || fh_len < 6)
return NULL;
- parent.ih_blkno = (u64)le32_to_cpu(fid->raw[3]) << 32;
- parent.ih_blkno |= (u64)le32_to_cpu(fid->raw[4]);
- parent.ih_generation = le32_to_cpu(fid->raw[5]);
+ parent.ih_blkno = (u64)le32_to_cpu((__force __le32)fid->raw[3]) << 32;
+ parent.ih_blkno |= (u64)le32_to_cpu((__force __le32)fid->raw[4]);
+ parent.ih_generation = le32_to_cpu((__force __le32)fid->raw[5]);
return ocfs2_get_dentry(sb, &parent);
}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0da8e7bd3261..ccc57038a977 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1936,6 +1936,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
inode_lock(inode);
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
+ inode_dio_wait(inode);
/*
* This prevents concurrent writes on other nodes
*/
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 999111bfc271..2cc5c99fe941 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1621,6 +1621,7 @@ static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info
}
static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci)
+__acquires(&oi->ip_lock)
{
struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
@@ -1628,6 +1629,7 @@ static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci)
}
static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci)
+__releases(&oi->ip_lock)
{
struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index b1550ba73f96..71beef7f8a60 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -125,6 +125,7 @@ int ocfs2_fileattr_set(struct mnt_idmap *idmap,
ocfs2_inode->ip_attr = flags;
ocfs2_set_inode_flags(inode);
+ inode_set_ctime_current(inode);
status = ocfs2_mark_inode_dirty(handle, inode, bh);
if (status < 0)
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index c803c10dd97e..5df34561c551 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -212,14 +212,15 @@ static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
unsigned int num_clusters)
{
- spin_lock(&osb->osb_lock);
- if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
- osb->local_alloc_state == OCFS2_LA_THROTTLED)
- if (num_clusters >= osb->local_alloc_default_bits) {
+ if (num_clusters >= osb->local_alloc_default_bits) {
+ spin_lock(&osb->osb_lock);
+ if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
+ osb->local_alloc_state == OCFS2_LA_THROTTLED) {
cancel_delayed_work(&osb->la_enable_wq);
osb->local_alloc_state = OCFS2_LA_ENABLED;
}
- spin_unlock(&osb->osb_lock);
+ spin_unlock(&osb->osb_lock);
+ }
}
void ocfs2_la_enable_worker(struct work_struct *work)
@@ -335,7 +336,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
"found = %u, set = %u, taken = %u, off = %u\n",
num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
le32_to_cpu(alloc->id1.bitmap1.i_total),
- OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
+ le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off));
status = -EINVAL;
goto bail;
@@ -863,14 +864,8 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
numfound = bitoff = startoff = 0;
left = le32_to_cpu(alloc->id1.bitmap1.i_total);
- while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
- if (bitoff == left) {
- /* mlog(0, "bitoff (%d) == left", bitoff); */
- break;
- }
- /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
- "numfound = %d\n", bitoff, startoff, numfound);*/
-
+ while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) <
+ left) {
/* Ok, we found a zero bit... is it contig. or do we
* start over?*/
if (bitoff == startoff) {
@@ -976,9 +971,9 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
start = count = 0;
left = le32_to_cpu(alloc->id1.bitmap1.i_total);
- while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start))
- != -1) {
- if ((bit_off < left) && (bit_off == start)) {
+ while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) <
+ left) {
+ if (bit_off == start) {
count++;
start++;
continue;
@@ -1002,8 +997,7 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
goto bail;
}
}
- if (bit_off >= left)
- break;
+
count = 1;
start = bit_off + 1;
}
@@ -1220,7 +1214,7 @@ retry_enospc:
OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
trace_ocfs2_local_alloc_new_window_result(
- OCFS2_LOCAL_ALLOC(alloc)->la_bm_off,
+ le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off),
le32_to_cpu(alloc->id1.bitmap1.i_total));
bail:
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 1f9ed117e78b..f9d6a4f9ca92 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -685,7 +685,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
}
ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
- goal_bit, len);
+ goal_bit, len, 0, 0);
if (ret) {
ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
le16_to_cpu(gd->bg_chain));
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 9221a33f917b..4d1ea8703fcd 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -566,7 +566,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
fe->i_last_eb_blk = 0;
strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
- ktime_get_real_ts64(&ts);
+ ktime_get_coarse_real_ts64(&ts);
fe->i_atime = fe->i_ctime = fe->i_mtime =
cpu_to_le64(ts.tv_sec);
fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
@@ -797,6 +797,7 @@ static int ocfs2_link(struct dentry *old_dentry,
ocfs2_set_links_count(fe, inode->i_nlink);
fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, fe_bh);
err = ocfs2_add_entry(handle, dentry, inode,
@@ -993,6 +994,7 @@ static int ocfs2_unlink(struct inode *dir,
drop_nlink(inode);
drop_nlink(inode);
ocfs2_set_links_count(fe, inode->i_nlink);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, fe_bh);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7aebdbf5cc0a..c93689b568fe 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -883,7 +883,8 @@ struct ocfs2_group_desc
__le16 bg_free_bits_count; /* Free bits count */
__le16 bg_chain; /* What chain I am in. */
/*10*/ __le32 bg_generation;
- __le32 bg_reserved1;
+ __le16 bg_contig_free_bits; /* max contig free bits length */
+ __le16 bg_reserved1;
__le64 bg_next_group; /* Next group in my list, in
blocks */
/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 9898c11bdfa1..60e208b01c8d 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -82,7 +82,7 @@ DECLARE_EVENT_CLASS(ocfs2__string,
__string(name,name)
),
TP_fast_assign(
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%s", __get_str(name))
);
@@ -1289,7 +1289,7 @@ DECLARE_EVENT_CLASS(ocfs2__file_ops,
__entry->dentry = dentry;
__entry->ino = ino;
__entry->d_len = d_len;
- __assign_str(d_name, d_name);
+ __assign_str(d_name);
__entry->para = para;
),
TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file,
@@ -1425,7 +1425,7 @@ TRACE_EVENT(ocfs2_setattr,
__entry->dentry = dentry;
__entry->ino = ino;
__entry->d_len = d_len;
- __assign_str(d_name, d_name);
+ __assign_str(d_name);
__entry->ia_valid = ia_valid;
__entry->ia_mode = ia_mode;
__entry->ia_uid = ia_uid;
@@ -1683,7 +1683,7 @@ TRACE_EVENT(ocfs2_parse_options,
),
TP_fast_assign(
__entry->is_remount = is_remount;
- __assign_str(options, options);
+ __assign_str(options);
),
TP_printk("%d %s", __entry->is_remount, __get_str(options))
);
@@ -1718,8 +1718,8 @@ TRACE_EVENT(ocfs2_initialize_super,
__field(int, cluster_bits)
),
TP_fast_assign(
- __assign_str(label, label);
- __assign_str(uuid_str, uuid_str);
+ __assign_str(label);
+ __assign_str(uuid_str);
__entry->root_dir = root_dir;
__entry->system_dir = system_dir;
__entry->cluster_bits = cluster_bits;
@@ -1746,7 +1746,7 @@ TRACE_EVENT(ocfs2_init_xattr_set_ctxt,
__field(int, credits)
),
TP_fast_assign(
- __assign_str(name, name);
+ __assign_str(name);
__entry->meta = meta;
__entry->clusters = clusters;
__entry->credits = credits;
@@ -1770,7 +1770,7 @@ DECLARE_EVENT_CLASS(ocfs2__xattr_find,
),
TP_fast_assign(
__entry->ino = ino;
- __assign_str(name, name);
+ __assign_str(name);
__entry->name_index = name_index;
__entry->hash = hash;
__entry->location = location;
@@ -2019,7 +2019,7 @@ TRACE_EVENT(ocfs2_sync_dquot_helper,
__entry->dq_id = dq_id;
__entry->dq_type = dq_type;
__entry->type = type;
- __assign_str(s_id, s_id);
+ __assign_str(s_id);
),
TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type,
__entry->type, __get_str(s_id))
@@ -2060,7 +2060,7 @@ TRACE_EVENT(ocfs2_dx_dir_search,
TP_fast_assign(
__entry->ino = ino;
__entry->namelen = namelen;
- __assign_str(name, name);
+ __assign_str(name);
__entry->major_hash = major_hash;
__entry->minor_hash = minor_hash;
__entry->blkno = blkno;
@@ -2088,7 +2088,7 @@ TRACE_EVENT(ocfs2_find_files_on_disk,
),
TP_fast_assign(
__entry->namelen = namelen;
- __assign_str(name, name);
+ __assign_str(name);
__entry->blkno = blkno;
__entry->dir = dir;
),
@@ -2107,7 +2107,7 @@ TRACE_EVENT(ocfs2_check_dir_for_entry,
TP_fast_assign(
__entry->dir = dir;
__entry->namelen = namelen;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%llu %.*s", __entry->dir,
__entry->namelen, __get_str(name))
@@ -2135,7 +2135,7 @@ TRACE_EVENT(ocfs2_dx_dir_index_root_block,
__entry->major_hash = major_hash;
__entry->minor_hash = minor_hash;
__entry->namelen = namelen;
- __assign_str(name, name);
+ __assign_str(name);
__entry->num_used = num_used;
),
TP_printk("%llu %x %x %.*s %u", __entry->dir,
@@ -2171,7 +2171,7 @@ DECLARE_EVENT_CLASS(ocfs2__dentry_ops,
__entry->dir = dir;
__entry->dentry = dentry;
__entry->name_len = name_len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->dir_blkno = dir_blkno;
__entry->extra = extra;
),
@@ -2217,7 +2217,7 @@ TRACE_EVENT(ocfs2_mknod,
__entry->dir = dir;
__entry->dentry = dentry;
__entry->name_len = name_len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->dir_blkno = dir_blkno;
__entry->dev = dev;
__entry->mode = mode;
@@ -2241,9 +2241,9 @@ TRACE_EVENT(ocfs2_link,
TP_fast_assign(
__entry->ino = ino;
__entry->old_len = old_len;
- __assign_str(old_name, old_name);
+ __assign_str(old_name);
__entry->name_len = name_len;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%llu %.*s %.*s", __entry->ino,
__entry->old_len, __get_str(old_name),
@@ -2279,9 +2279,9 @@ TRACE_EVENT(ocfs2_rename,
__entry->new_dir = new_dir;
__entry->new_dentry = new_dentry;
__entry->old_len = old_len;
- __assign_str(old_name, old_name);
+ __assign_str(old_name);
__entry->new_len = new_len;
- __assign_str(new_name, new_name);
+ __assign_str(new_name);
),
TP_printk("%p %p %p %p %.*s %.*s",
__entry->old_dir, __entry->old_dentry,
@@ -2301,7 +2301,7 @@ TRACE_EVENT(ocfs2_rename_target_exists,
),
TP_fast_assign(
__entry->new_len = new_len;
- __assign_str(new_name, new_name);
+ __assign_str(new_name);
),
TP_printk("%.*s", __entry->new_len, __get_str(new_name))
);
@@ -2344,7 +2344,7 @@ TRACE_EVENT(ocfs2_symlink_begin,
__entry->dentry = dentry;
__entry->symname = symname;
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry,
__entry->symname, __entry->len, __get_str(name))
@@ -2360,7 +2360,7 @@ TRACE_EVENT(ocfs2_blkno_stringify,
),
TP_fast_assign(
__entry->blkno = blkno;
- __assign_str(name, name);
+ __assign_str(name);
__entry->namelen = namelen;
),
TP_printk("%llu %s %d", __entry->blkno, __get_str(name),
@@ -2381,7 +2381,7 @@ TRACE_EVENT(ocfs2_orphan_del,
),
TP_fast_assign(
__entry->dir = dir;
- __assign_str(name, name);
+ __assign_str(name);
__entry->namelen = namelen;
),
TP_printk("%llu %s %d", __entry->dir, __get_str(name),
@@ -2403,7 +2403,7 @@ TRACE_EVENT(ocfs2_dentry_revalidate,
TP_fast_assign(
__entry->dentry = dentry;
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name))
);
@@ -2420,7 +2420,7 @@ TRACE_EVENT(ocfs2_dentry_revalidate_negative,
),
TP_fast_assign(
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->pgen = pgen;
__entry->gen = gen;
),
@@ -2445,7 +2445,7 @@ TRACE_EVENT(ocfs2_find_local_alias,
),
TP_fast_assign(
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%.*s", __entry->len, __get_str(name))
);
@@ -2462,7 +2462,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock,
),
TP_fast_assign(
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->parent = parent;
__entry->fsdata = fsdata;
),
@@ -2480,7 +2480,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock_found,
__field(unsigned long long, ino)
),
TP_fast_assign(
- __assign_str(name, name);
+ __assign_str(name);
__entry->parent = parent;
__entry->ino = ino;
),
@@ -2527,7 +2527,7 @@ TRACE_EVENT(ocfs2_get_parent,
TP_fast_assign(
__entry->child = child;
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->ino = ino;
),
TP_printk("%p %.*s %llu", __entry->child, __entry->len,
@@ -2551,7 +2551,7 @@ TRACE_EVENT(ocfs2_encode_fh_begin,
TP_fast_assign(
__entry->dentry = dentry;
__entry->name_len = name_len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->fh = fh;
__entry->len = len;
__entry->connectable = connectable;
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3f80a56d0d60..1f303b1adf1a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -630,7 +630,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
rb->rf_records.rl_count =
cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
spin_lock(&osb->osb_lock);
- rb->rf_generation = osb->s_next_generation++;
+ rb->rf_generation = cpu_to_le32(osb->s_next_generation++);
spin_unlock(&osb->osb_lock);
ocfs2_journal_dirty(handle, new_bh);
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index a9d1296d736d..1fe61974d9f0 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -414,7 +414,7 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
start = search_start;
while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
- start)) != -1) {
+ start)) < resmap->m_bitmap_len) {
/* Search reached end of the region */
if (offset >= (search_start + search_len))
break;
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index d65d43c61857..c4a4016d3866 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -91,6 +91,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
u16 old_bg_clusters;
+ u16 contig_bits;
+ __le16 old_bg_contig_free_bits;
trace_ocfs2_update_last_group_and_inode(new_clusters,
first_new_cluster);
@@ -122,6 +124,11 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
}
+ contig_bits = ocfs2_find_max_contig_free_bits(group->bg_bitmap,
+ le16_to_cpu(group->bg_bits), 0);
+ old_bg_contig_free_bits = group->bg_contig_free_bits;
+ group->bg_contig_free_bits = cpu_to_le16(contig_bits);
+
ocfs2_journal_dirty(handle, group_bh);
/* update the inode accordingly. */
@@ -160,6 +167,7 @@ out_rollback:
le16_add_cpu(&group->bg_free_bits_count, backups);
le16_add_cpu(&group->bg_bits, -1 * num_bits);
le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
+ group->bg_contig_free_bits = old_bg_contig_free_bits;
}
out:
if (ret)
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 166c8918c825..f7b483f0de2a 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -50,6 +50,10 @@ struct ocfs2_suballoc_result {
u64 sr_blkno; /* The first allocated block */
unsigned int sr_bit_offset; /* The bit in the bg */
unsigned int sr_bits; /* How many bits we claimed */
+ unsigned int sr_max_contig_bits; /* The length for contiguous
+ * free bits, only available
+ * for cluster group
+ */
};
static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
@@ -1272,6 +1276,26 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
return ret;
}
+u16 ocfs2_find_max_contig_free_bits(void *bitmap,
+ u16 total_bits, u16 start)
+{
+ u16 offset, free_bits;
+ u16 contig_bits = 0;
+
+ while (start < total_bits) {
+ offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start);
+ if (offset == total_bits)
+ break;
+
+ start = ocfs2_find_next_bit(bitmap, total_bits, offset);
+ free_bits = start - offset;
+ if (contig_bits < free_bits)
+ contig_bits = free_bits;
+ }
+
+ return contig_bits;
+}
+
static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
struct buffer_head *bg_bh,
unsigned int bits_wanted,
@@ -1280,6 +1304,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
{
void *bitmap;
u16 best_offset, best_size;
+ u16 prev_best_size = 0;
int offset, start, found, status = 0;
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
@@ -1290,10 +1315,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
found = start = best_offset = best_size = 0;
bitmap = bg->bg_bitmap;
- while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
- if (offset == total_bits)
- break;
-
+ while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) <
+ total_bits) {
if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
/* We found a zero, but we can't use it as it
* hasn't been put to disk yet! */
@@ -1308,6 +1331,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
/* got a zero after some ones */
found = 1;
start = offset + 1;
+ prev_best_size = best_size;
}
if (found > best_size) {
best_size = found;
@@ -1320,6 +1344,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
}
}
+ /* best_size will be allocated, we save prev_best_size */
+ res->sr_max_contig_bits = prev_best_size;
if (best_size) {
res->sr_bit_offset = best_offset;
res->sr_bits = best_size;
@@ -1337,11 +1363,16 @@ int ocfs2_block_group_set_bits(handle_t *handle,
struct ocfs2_group_desc *bg,
struct buffer_head *group_bh,
unsigned int bit_off,
- unsigned int num_bits)
+ unsigned int num_bits,
+ unsigned int max_contig_bits,
+ int fastpath)
{
int status;
void *bitmap = bg->bg_bitmap;
int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
+ unsigned int start = bit_off + num_bits;
+ u16 contig_bits;
+ struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
/* All callers get the descriptor via
* ocfs2_read_group_descriptor(). Any corruption is a code bug. */
@@ -1373,6 +1404,29 @@ int ocfs2_block_group_set_bits(handle_t *handle,
while(num_bits--)
ocfs2_set_bit(bit_off++, bitmap);
+ /*
+ * this is optimize path, caller set old contig value
+ * in max_contig_bits to bypass finding action.
+ */
+ if (fastpath) {
+ bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
+ } else if (ocfs2_is_cluster_bitmap(alloc_inode)) {
+ /*
+ * Usually, the block group bitmap allocates only 1 bit
+ * at a time, while the cluster group allocates n bits
+ * each time. Therefore, we only save the contig bits for
+ * the cluster group.
+ */
+ contig_bits = ocfs2_find_max_contig_free_bits(bitmap,
+ le16_to_cpu(bg->bg_bits), start);
+ if (contig_bits > max_contig_bits)
+ max_contig_bits = contig_bits;
+ bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
+ ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits);
+ } else {
+ bg->bg_contig_free_bits = 0;
+ }
+
ocfs2_journal_dirty(handle, group_bh);
bail:
@@ -1486,7 +1540,12 @@ static int ocfs2_cluster_group_search(struct inode *inode,
BUG_ON(!ocfs2_is_cluster_bitmap(inode));
- if (gd->bg_free_bits_count) {
+ if (le16_to_cpu(gd->bg_contig_free_bits) &&
+ le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted)
+ return -ENOSPC;
+
+ /* ->bg_contig_free_bits may un-initialized, so compare again */
+ if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) {
max_bits = le16_to_cpu(gd->bg_bits);
/* Tail groups in cluster bitmaps which aren't cpg
@@ -1530,13 +1589,6 @@ static int ocfs2_cluster_group_search(struct inode *inode,
* of bits. */
if (min_bits <= res->sr_bits)
search = 0; /* success */
- else if (res->sr_bits) {
- /*
- * Don't show bits which we'll be returning
- * for allocation to the local alloc bitmap.
- */
- ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
- }
}
return search;
@@ -1555,7 +1607,7 @@ static int ocfs2_block_group_search(struct inode *inode,
BUG_ON(min_bits != 1);
BUG_ON(ocfs2_is_cluster_bitmap(inode));
- if (bg->bg_free_bits_count) {
+ if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) {
ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
group_bh, bits_wanted,
le16_to_cpu(bg->bg_bits),
@@ -1715,7 +1767,8 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
}
ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
- res->sr_bit_offset, res->sr_bits);
+ res->sr_bit_offset, res->sr_bits,
+ res->sr_max_contig_bits, 0);
if (ret < 0) {
ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
res->sr_bits,
@@ -1849,7 +1902,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
bg,
group_bh,
res->sr_bit_offset,
- res->sr_bits);
+ res->sr_bits,
+ res->sr_max_contig_bits,
+ 0);
if (status < 0) {
ocfs2_rollback_alloc_dinode_counts(alloc_inode,
ac->ac_bh, res->sr_bits, chain);
@@ -1951,7 +2006,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
if (i == victim)
continue;
- if (!cl->cl_recs[i].c_free)
+ if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted)
continue;
ac->ac_chain = i;
@@ -2163,7 +2218,9 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle,
bg,
bg_bh,
res->sr_bit_offset,
- res->sr_bits);
+ res->sr_bits,
+ res->sr_max_contig_bits,
+ 0);
if (ret < 0) {
ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
ac->ac_bh, res->sr_bits, chain);
@@ -2382,11 +2439,13 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
struct buffer_head *group_bh,
unsigned int bit_off,
unsigned int num_bits,
+ unsigned int max_contig_bits,
void (*undo_fn)(unsigned int bit,
unsigned long *bmap))
{
int status;
unsigned int tmp;
+ u16 contig_bits;
struct ocfs2_group_desc *undo_bg = NULL;
struct journal_head *jh;
@@ -2433,6 +2492,20 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
num_bits);
}
+ /*
+ * TODO: even 'num_bits == 1' (the worst case, release 1 cluster),
+ * we still need to rescan whole bitmap.
+ */
+ if (ocfs2_is_cluster_bitmap(alloc_inode)) {
+ contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
+ le16_to_cpu(bg->bg_bits), 0);
+ if (contig_bits > max_contig_bits)
+ max_contig_bits = contig_bits;
+ bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
+ } else {
+ bg->bg_contig_free_bits = 0;
+ }
+
if (undo_fn)
spin_unlock(&jh->b_state_lock);
@@ -2459,6 +2532,7 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
struct ocfs2_chain_list *cl = &fe->id2.i_chain;
struct buffer_head *group_bh = NULL;
struct ocfs2_group_desc *group;
+ __le16 old_bg_contig_free_bits = 0;
/* The alloc_bh comes from ocfs2_free_dinode() or
* ocfs2_free_clusters(). The callers have all locked the
@@ -2483,9 +2557,11 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
+ if (ocfs2_is_cluster_bitmap(alloc_inode))
+ old_bg_contig_free_bits = group->bg_contig_free_bits;
status = ocfs2_block_group_clear_bits(handle, alloc_inode,
group, group_bh,
- start_bit, count, undo_fn);
+ start_bit, count, 0, undo_fn);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -2496,7 +2572,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
if (status < 0) {
mlog_errno(status);
ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
- start_bit, count);
+ start_bit, count,
+ le16_to_cpu(old_bg_contig_free_bits), 1);
goto bail;
}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 9c74eace3adc..b481b834857d 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -79,12 +79,16 @@ void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
struct buffer_head *di_bh,
u32 num_bits,
u16 chain);
+u16 ocfs2_find_max_contig_free_bits(void *bitmap,
+ u16 total_bits, u16 start);
int ocfs2_block_group_set_bits(handle_t *handle,
struct inode *alloc_inode,
struct ocfs2_group_desc *bg,
struct buffer_head *group_bh,
unsigned int bit_off,
- unsigned int num_bits);
+ unsigned int num_bits,
+ unsigned int max_contig_bits,
+ int fastpath);
int ocfs2_claim_metadata(handle_t *handle,
struct ocfs2_alloc_context *ac,
diff --git a/fs/open.c b/fs/open.c
index ee8460c83c77..89cafb572061 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -902,10 +902,10 @@ cleanup_inode:
}
static int do_dentry_open(struct file *f,
- struct inode *inode,
int (*open)(struct inode *, struct file *))
{
static const struct file_operations empty_fops = {};
+ struct inode *inode = f->f_path.dentry->d_inode;
int error;
path_get(&f->f_path);
@@ -1047,7 +1047,7 @@ int finish_open(struct file *file, struct dentry *dentry,
BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
file->f_path.dentry = dentry;
- return do_dentry_open(file, d_backing_inode(dentry), open);
+ return do_dentry_open(file, open);
}
EXPORT_SYMBOL(finish_open);
@@ -1086,7 +1086,7 @@ EXPORT_SYMBOL(file_path);
int vfs_open(const struct path *path, struct file *file)
{
file->f_path = *path;
- return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
+ return do_dentry_open(file, NULL);
}
struct file *dentry_open(const struct path *path, int flags,
@@ -1155,7 +1155,6 @@ EXPORT_SYMBOL(dentry_create);
* kernel_file_open - open a file for kernel internal use
* @path: path of the file to open
* @flags: open flags
- * @inode: the inode
* @cred: credentials for open
*
* Open a file for use by in-kernel consumers. The file is not accounted
@@ -1165,7 +1164,7 @@ EXPORT_SYMBOL(dentry_create);
* Return: Opened file on success, an error pointer on failure.
*/
struct file *kernel_file_open(const struct path *path, int flags,
- struct inode *inode, const struct cred *cred)
+ const struct cred *cred)
{
struct file *f;
int error;
@@ -1175,7 +1174,7 @@ struct file *kernel_file_open(const struct path *path, int flags,
return f;
f->f_path = *path;
- error = do_dentry_open(f, inode, NULL);
+ error = do_dentry_open(f, NULL);
if (error) {
fput(f);
f = ERR_PTR(error);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 0f8b4a719237..116f542442dd 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -14,6 +14,7 @@
#include <linux/posix_acl_xattr.h>
#include <linux/atomic.h>
#include <linux/ratelimit.h>
+#include <linux/backing-file.h>
#include "overlayfs.h"
static unsigned short ovl_redirect_max = 256;
@@ -260,14 +261,13 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry)
* may not use to instantiate the new dentry.
*/
static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
- struct dentry *newdentry, bool hardlink)
+ struct dentry *newdentry, bool hardlink, struct file *tmpfile)
{
struct ovl_inode_params oip = {
.upperdentry = newdentry,
.newinode = inode,
};
- ovl_dir_modified(dentry->d_parent, false);
ovl_dentry_set_upper_alias(dentry);
ovl_dentry_init_reval(dentry, newdentry, NULL);
@@ -295,6 +295,9 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
inc_nlink(inode);
}
+ if (tmpfile)
+ d_mark_tmpfile(tmpfile, inode);
+
d_instantiate(dentry, inode);
if (inode != oip.newinode) {
pr_warn_ratelimited("newly created inode found in cache (%pd2)\n",
@@ -327,9 +330,6 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct dentry *newdentry;
int err;
- if (!attr->hardlink && !IS_POSIXACL(udir))
- attr->mode &= ~current_umask();
-
inode_lock_nested(udir, I_MUTEX_PARENT);
newdentry = ovl_create_real(ofs, udir,
ovl_lookup_upper(ofs, dentry->d_name.name,
@@ -345,7 +345,8 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
ovl_set_opaque(dentry, newdentry);
}
- err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink);
+ ovl_dir_modified(dentry->d_parent, false);
+ err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink, NULL);
if (err)
goto out_cleanup;
out_unlock:
@@ -529,7 +530,8 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out_cleanup;
}
- err = ovl_instantiate(dentry, inode, newdentry, hardlink);
+ ovl_dir_modified(dentry->d_parent, false);
+ err = ovl_instantiate(dentry, inode, newdentry, hardlink, NULL);
if (err) {
ovl_cleanup(ofs, udir, newdentry);
dput(newdentry);
@@ -551,12 +553,35 @@ out_cleanup:
goto out_dput;
}
+static int ovl_setup_cred_for_create(struct dentry *dentry, struct inode *inode,
+ umode_t mode, const struct cred *old_cred)
+{
+ int err;
+ struct cred *override_cred;
+
+ override_cred = prepare_creds();
+ if (!override_cred)
+ return -ENOMEM;
+
+ override_cred->fsuid = inode->i_uid;
+ override_cred->fsgid = inode->i_gid;
+ err = security_dentry_create_files_as(dentry, mode, &dentry->d_name,
+ old_cred, override_cred);
+ if (err) {
+ put_cred(override_cred);
+ return err;
+ }
+ put_cred(override_creds(override_cred));
+ put_cred(override_cred);
+
+ return 0;
+}
+
static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
struct ovl_cattr *attr, bool origin)
{
int err;
const struct cred *old_cred;
- struct cred *override_cred;
struct dentry *parent = dentry->d_parent;
old_cred = ovl_override_creds(dentry->d_sb);
@@ -572,10 +597,6 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
}
if (!attr->hardlink) {
- err = -ENOMEM;
- override_cred = prepare_creds();
- if (!override_cred)
- goto out_revert_creds;
/*
* In the creation cases(create, mkdir, mknod, symlink),
* ovl should transfer current's fs{u,g}id to underlying
@@ -589,17 +610,9 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
* create a new inode, so just use the ovl mounter's
* fs{u,g}id.
*/
- override_cred->fsuid = inode->i_uid;
- override_cred->fsgid = inode->i_gid;
- err = security_dentry_create_files_as(dentry,
- attr->mode, &dentry->d_name, old_cred,
- override_cred);
- if (err) {
- put_cred(override_cred);
+ err = ovl_setup_cred_for_create(dentry, inode, attr->mode, old_cred);
+ if (err)
goto out_revert_creds;
- }
- put_cred(override_creds(override_cred));
- put_cred(override_cred);
}
if (!ovl_dentry_is_whiteout(dentry))
@@ -1290,6 +1303,100 @@ out:
return err;
}
+static int ovl_create_tmpfile(struct file *file, struct dentry *dentry,
+ struct inode *inode, umode_t mode)
+{
+ const struct cred *old_cred;
+ struct path realparentpath;
+ struct file *realfile;
+ struct dentry *newdentry;
+ /* It's okay to set O_NOATIME, since the owner will be current fsuid */
+ int flags = file->f_flags | OVL_OPEN_FLAGS;
+ int err;
+
+ err = ovl_copy_up(dentry->d_parent);
+ if (err)
+ return err;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = ovl_setup_cred_for_create(dentry, inode, mode, old_cred);
+ if (err)
+ goto out_revert_creds;
+
+ ovl_path_upper(dentry->d_parent, &realparentpath);
+ realfile = backing_tmpfile_open(&file->f_path, flags, &realparentpath,
+ mode, current_cred());
+ err = PTR_ERR_OR_ZERO(realfile);
+ pr_debug("tmpfile/open(%pd2, 0%o) = %i\n", realparentpath.dentry, mode, err);
+ if (err)
+ goto out_revert_creds;
+
+ /* ovl_instantiate() consumes the newdentry reference on success */
+ newdentry = dget(realfile->f_path.dentry);
+ err = ovl_instantiate(dentry, inode, newdentry, false, file);
+ if (!err) {
+ file->private_data = realfile;
+ } else {
+ dput(newdentry);
+ fput(realfile);
+ }
+out_revert_creds:
+ revert_creds(old_cred);
+ return err;
+}
+
+static int ovl_dummy_open(struct inode *inode, struct file *file)
+{
+ return 0;
+}
+
+static int ovl_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
+ struct file *file, umode_t mode)
+{
+ int err;
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode;
+
+ if (!OVL_FS(dentry->d_sb)->tmpfile)
+ return -EOPNOTSUPP;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ return err;
+
+ err = -ENOMEM;
+ inode = ovl_new_inode(dentry->d_sb, mode, 0);
+ if (!inode)
+ goto drop_write;
+
+ inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
+ err = ovl_create_tmpfile(file, dentry, inode, inode->i_mode);
+ if (err)
+ goto put_inode;
+
+ /*
+ * Check if the preallocated inode was actually used. Having something
+ * else assigned to the dentry shouldn't happen as that would indicate
+ * that the backing tmpfile "leaked" out of overlayfs.
+ */
+ err = -EIO;
+ if (WARN_ON(inode != d_inode(dentry)))
+ goto put_realfile;
+
+ /* inode reference was transferred to dentry */
+ inode = NULL;
+ err = finish_open(file, dentry, ovl_dummy_open);
+put_realfile:
+ /* Without FMODE_OPENED ->release() won't be called on @file */
+ if (!(file->f_mode & FMODE_OPENED))
+ fput(file->private_data);
+put_inode:
+ iput(inode);
+drop_write:
+ ovl_drop_write(dentry);
+ return err;
+}
+
const struct inode_operations ovl_dir_inode_operations = {
.lookup = ovl_lookup,
.mkdir = ovl_mkdir,
@@ -1310,4 +1417,5 @@ const struct inode_operations ovl_dir_inode_operations = {
.update_time = ovl_update_time,
.fileattr_get = ovl_fileattr_get,
.fileattr_set = ovl_fileattr_set,
+ .tmpfile = ovl_tmpfile,
};
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 05536964d37f..1a411cae57ed 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -24,9 +24,6 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
return 'm';
}
-/* No atime modification on underlying */
-#define OVL_OPEN_FLAGS (O_NOATIME)
-
static struct file *ovl_open_realfile(const struct file *file,
const struct path *realpath)
{
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index c63b31a460be..35fd3e3e1778 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -8,7 +8,6 @@
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/xattr.h>
-#include <linux/posix_acl.h>
#include <linux/ratelimit.h>
#include <linux/fiemap.h>
#include <linux/fileattr.h>
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index ee949f3e7c77..0bfe35da4b7b 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -175,6 +175,9 @@ static inline int ovl_metadata_digest_size(const struct ovl_metacopy *metacopy)
return (int)metacopy->len - OVL_METACOPY_MIN_SIZE;
}
+/* No atime modification on underlying */
+#define OVL_OPEN_FLAGS (O_NOATIME)
+
extern const char *const ovl_xattr_table[][2];
static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox)
{
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index d285d1d7baad..edc9216f6e27 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1376,7 +1376,7 @@ int ovl_ensure_verity_loaded(struct path *datapath)
* If this inode was not yet opened, the verity info hasn't been
* loaded yet, so we need to do that here to force it into memory.
*/
- filp = kernel_file_open(datapath, O_RDONLY, inode, current_cred());
+ filp = kernel_file_open(datapath, O_RDONLY, current_cred());
if (IS_ERR(filp))
return PTR_ERR(filp);
fput(filp);
diff --git a/fs/pidfs.c b/fs/pidfs.c
index a63d5d24aa02..dbb9d854d1c5 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -169,6 +169,24 @@ static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
return -EOPNOTSUPP;
}
+
+/*
+ * User space expects pidfs inodes to have no file type in st_mode.
+ *
+ * In particular, 'lsof' has this legacy logic:
+ *
+ * type = s->st_mode & S_IFMT;
+ * switch (type) {
+ * ...
+ * case 0:
+ * if (!strcmp(p, "anon_inode"))
+ * Lf->ntype = Ntype = N_ANON_INODE;
+ *
+ * to detect our old anon_inode logic.
+ *
+ * Rather than mess with our internal sane inode data, just fix it
+ * up here in getattr() by masking off the format bits.
+ */
static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags)
@@ -176,6 +194,7 @@ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct inode *inode = d_inode(path->dentry);
generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ stat->mode &= ~S_IFMT;
return 0;
}
@@ -199,12 +218,13 @@ static const struct super_operations pidfs_sops = {
.statfs = simple_statfs,
};
+/*
+ * 'lsof' has knowledge of out historical anon_inode use, and expects
+ * the pidfs dentry name to start with 'anon_inode'.
+ */
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
- struct inode *inode = d_inode(dentry);
- struct pid *pid = inode->i_private;
-
- return dynamic_dname(buffer, buflen, "pidfd:[%llu]", pid->ino);
+ return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
}
static const struct dentry_operations pidfs_dentry_operations = {
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index f4b1c8b42a51..586bbc84ca04 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -39,10 +39,8 @@ static int seq_show(struct seq_file *m, void *v)
spin_lock(&files->file_lock);
file = files_lookup_fd_locked(files, fd);
if (file) {
- struct fdtable *fdt = files_fdtable(files);
-
f_flags = file->f_flags;
- if (close_on_exec(fd, fdt))
+ if (close_on_exec(fd, files))
f_flags |= O_CLOEXEC;
get_file(file);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index dcd513dccf55..d19434e2a58e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -451,15 +451,13 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
- typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
+ if (pde->proc_ops->proc_get_unmapped_area)
+ return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags);
- get_area = pde->proc_ops->proc_get_unmapped_area;
#ifdef CONFIG_MMU
- if (!get_area)
- get_area = current->mm->get_unmapped_area;
+ return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags);
#endif
- if (get_area)
- return get_area(file, orig_addr, len, pgoff, flags);
+
return orig_addr;
}
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 45af9a989d40..245171d9164b 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -89,8 +89,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SwapTotal: ", i.totalswap);
show_val_kb(m, "SwapFree: ", i.freeswap);
#ifdef CONFIG_ZSWAP
- seq_printf(m, "Zswap: %8lu kB\n",
- (unsigned long)(zswap_pool_total_size >> 10));
+ show_val_kb(m, "Zswap: ", zswap_total_pages());
seq_printf(m, "Zswapped: %8lu kB\n",
(unsigned long)atomic_read(&zswap_stored_pages) <<
(PAGE_SHIFT - 10));
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 9223856c934b..2fb64bdb64eb 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -107,10 +107,13 @@ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
return ((kflags >> kbit) & 1) << ubit;
}
-u64 stable_page_flags(struct page *page)
+u64 stable_page_flags(const struct page *page)
{
- u64 k;
- u64 u;
+ const struct folio *folio;
+ unsigned long k;
+ unsigned long mapping;
+ bool is_anon;
+ u64 u = 0;
/*
* pseudo flag: KPF_NOPAGE
@@ -118,49 +121,47 @@ u64 stable_page_flags(struct page *page)
*/
if (!page)
return 1 << KPF_NOPAGE;
+ folio = page_folio(page);
- k = page->flags;
- u = 0;
+ k = folio->flags;
+ mapping = (unsigned long)folio->mapping;
+ is_anon = mapping & PAGE_MAPPING_ANON;
/*
* pseudo flags for the well known (anonymous) memory mapped pages
*/
if (page_mapped(page))
u |= 1 << KPF_MMAP;
- if (PageAnon(page))
+ if (is_anon) {
u |= 1 << KPF_ANON;
- if (PageKsm(page))
- u |= 1 << KPF_KSM;
+ if (mapping & PAGE_MAPPING_KSM)
+ u |= 1 << KPF_KSM;
+ }
/*
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
- if (PageHead(page))
- u |= 1 << KPF_COMPOUND_HEAD;
- if (PageTail(page))
+ if (page == &folio->page)
+ u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head);
+ else
u |= 1 << KPF_COMPOUND_TAIL;
- if (PageHuge(page))
+ if (folio_test_hugetlb(folio))
u |= 1 << KPF_HUGE;
/*
- * PageTransCompound can be true for non-huge compound pages (slab
- * pages or pages allocated by drivers with __GFP_COMP) because it
- * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
+ * We need to check PageLRU/PageAnon
* to make sure a given page is a thp, not a non-huge compound page.
*/
- else if (PageTransCompound(page)) {
- struct page *head = compound_head(page);
-
- if (PageLRU(head) || PageAnon(head))
+ else if (folio_test_large(folio)) {
+ if ((k & (1 << PG_lru)) || is_anon)
u |= 1 << KPF_THP;
- else if (is_huge_zero_page(head)) {
+ else if (is_huge_zero_folio(folio)) {
u |= 1 << KPF_ZERO_PAGE;
u |= 1 << KPF_THP;
}
} else if (is_zero_pfn(page_to_pfn(page)))
u |= 1 << KPF_ZERO_PAGE;
-
/*
* Caveats on high order pages: PG_buddy and PG_slab will only be set
* on the head page.
@@ -174,16 +175,17 @@ u64 stable_page_flags(struct page *page)
u |= 1 << KPF_OFFLINE;
if (PageTable(page))
u |= 1 << KPF_PGTABLE;
+ if (folio_test_slab(folio))
+ u |= 1 << KPF_SLAB;
- if (page_is_idle(page))
+#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
+ u |= kpf_copy_bit(k, KPF_IDLE, PG_idle);
+#else
+ if (folio_test_idle(folio))
u |= 1 << KPF_IDLE;
+#endif
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
-
- u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
- if (PageTail(page) && PageSlab(page))
- u |= 1 << KPF_SLAB;
-
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate);
@@ -194,7 +196,8 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active);
u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
- if (PageSwapCache(page))
+#define SWAPCACHE ((1 << PG_swapbacked) | (1 << PG_swapcache))
+ if ((k & SWAPCACHE) == SWAPCACHE)
u |= 1 << KPF_SWAPCACHE;
u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
@@ -202,7 +205,10 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked);
#ifdef CONFIG_MEMORY_FAILURE
- u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
+ if (u & (1 << KPF_HUGE))
+ u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
+ else
+ u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison);
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
@@ -228,7 +234,6 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
{
const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
- struct page *ppage;
unsigned long src = *ppos;
unsigned long pfn;
ssize_t ret = 0;
@@ -245,9 +250,9 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
* TODO: ZONE_DEVICE support requires to identify
* memmaps that were actually initialized.
*/
- ppage = pfn_to_online_page(pfn);
+ struct page *page = pfn_to_online_page(pfn);
- if (put_user(stable_page_flags(ppage), out)) {
+ if (put_user(stable_page_flags(page), out)) {
ret = -EFAULT;
break;
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 102f48668c35..e5a5f015ff03 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -411,14 +411,14 @@ struct mem_size_stats {
};
static void smaps_page_accumulate(struct mem_size_stats *mss,
- struct page *page, unsigned long size, unsigned long pss,
+ struct folio *folio, unsigned long size, unsigned long pss,
bool dirty, bool locked, bool private)
{
mss->pss += pss;
- if (PageAnon(page))
+ if (folio_test_anon(folio))
mss->pss_anon += pss;
- else if (PageSwapBacked(page))
+ else if (folio_test_swapbacked(folio))
mss->pss_shmem += pss;
else
mss->pss_file += pss;
@@ -426,7 +426,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
if (locked)
mss->pss_locked += pss;
- if (dirty || PageDirty(page)) {
+ if (dirty || folio_test_dirty(folio)) {
mss->pss_dirty += pss;
if (private)
mss->private_dirty += size;
@@ -444,6 +444,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
bool compound, bool young, bool dirty, bool locked,
bool migration)
{
+ struct folio *folio = page_folio(page);
int i, nr = compound ? compound_nr(page) : 1;
unsigned long size = nr * PAGE_SIZE;
@@ -451,27 +452,28 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
* First accumulate quantities that depend only on |size| and the type
* of the compound page.
*/
- if (PageAnon(page)) {
+ if (folio_test_anon(folio)) {
mss->anonymous += size;
- if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
+ if (!folio_test_swapbacked(folio) && !dirty &&
+ !folio_test_dirty(folio))
mss->lazyfree += size;
}
- if (PageKsm(page))
+ if (folio_test_ksm(folio))
mss->ksm += size;
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
- if (young || page_is_young(page) || PageReferenced(page))
+ if (young || folio_test_young(folio) || folio_test_referenced(folio))
mss->referenced += size;
/*
* Then accumulate quantities that may depend on sharing, or that may
* differ page-by-page.
*
- * page_count(page) == 1 guarantees the page is mapped exactly once.
+ * refcount == 1 guarantees the page is mapped exactly once.
* If any subpage of the compound page mapped with PTE it would elevate
- * page_count().
+ * the refcount.
*
* The page_mapcount() is called to get a snapshot of the mapcount.
* Without holding the page lock this snapshot can be slightly wrong as
@@ -480,9 +482,9 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
* especially for migration entries. Treat regular migration entries
* as mapcount == 1.
*/
- if ((page_count(page) == 1) || migration) {
- smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
- locked, true);
+ if ((folio_ref_count(folio) == 1) || migration) {
+ smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT,
+ dirty, locked, true);
return;
}
for (i = 0; i < nr; i++, page++) {
@@ -490,8 +492,8 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
unsigned long pss = PAGE_SIZE << PSS_SHIFT;
if (mapcount >= 2)
pss /= mapcount;
- smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked,
- mapcount < 2);
+ smaps_page_accumulate(mss, folio, PAGE_SIZE, pss,
+ dirty, locked, mapcount < 2);
}
}
@@ -576,6 +578,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ struct folio *folio;
bool migration = false;
if (pmd_present(*pmd)) {
@@ -590,11 +593,12 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
}
if (IS_ERR_OR_NULL(page))
return;
- if (PageAnon(page))
+ folio = page_folio(page);
+ if (folio_test_anon(folio))
mss->anonymous_thp += HPAGE_PMD_SIZE;
- else if (PageSwapBacked(page))
+ else if (folio_test_swapbacked(folio))
mss->shmem_thp += HPAGE_PMD_SIZE;
- else if (is_zone_device_page(page))
+ else if (folio_is_zone_device(folio))
/* pass */;
else
mss->file_thp += HPAGE_PMD_SIZE;
@@ -726,19 +730,20 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
- struct page *page = NULL;
- pte_t ptent = ptep_get(pte);
+ pte_t ptent = huge_ptep_get(pte);
+ struct folio *folio = NULL;
if (pte_present(ptent)) {
- page = vm_normal_page(vma, addr, ptent);
+ folio = page_folio(pte_page(ptent));
} else if (is_swap_pte(ptent)) {
swp_entry_t swpent = pte_to_swp_entry(ptent);
if (is_pfn_swap_entry(swpent))
- page = pfn_swap_entry_to_page(swpent);
+ folio = pfn_swap_entry_folio(swpent);
}
- if (page) {
- if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte))
+ if (folio) {
+ if (folio_likely_mapped_shared(folio) ||
+ hugetlb_pmd_shared(pte))
mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
else
mss->private_hugetlb += huge_page_size(hstate_vma(vma));
@@ -866,8 +871,8 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %8u\n",
- !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false,
- true, THP_ORDERS_ALL));
+ !!thp_vma_allowable_orders(vma, vma->vm_flags,
+ TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@@ -1161,7 +1166,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
- struct page *page;
+ struct folio *folio;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -1173,12 +1178,12 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
if (!pmd_present(*pmd))
goto out;
- page = pmd_page(*pmd);
+ folio = pmd_folio(*pmd);
/* Clear accessed and referenced bits. */
pmdp_test_and_clear_young(vma, addr, pmd);
- test_and_clear_page_young(page);
- ClearPageReferenced(page);
+ folio_test_clear_young(folio);
+ folio_clear_referenced(folio);
out:
spin_unlock(ptl);
return 0;
@@ -1200,14 +1205,14 @@ out:
if (!pte_present(ptent))
continue;
- page = vm_normal_page(vma, addr, ptent);
- if (!page)
+ folio = vm_normal_folio(vma, addr, ptent);
+ if (!folio)
continue;
/* Clear accessed and referenced bits. */
ptep_test_and_clear_young(vma, addr, pte);
- test_and_clear_page_young(page);
- ClearPageReferenced(page);
+ folio_test_clear_young(folio);
+ folio_clear_referenced(folio);
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
@@ -1574,12 +1579,13 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
pte = huge_ptep_get(ptep);
if (pte_present(pte)) {
- struct page *page = pte_page(pte);
+ struct folio *folio = page_folio(pte_page(pte));
- if (!PageAnon(page))
+ if (!folio_test_anon(folio))
flags |= PM_FILE;
- if (page_mapcount(page) == 1)
+ if (!folio_likely_mapped_shared(folio) &&
+ !hugetlb_pmd_shared(ptep))
flags |= PM_MMAP_EXCLUSIVE;
if (huge_pte_uffd_wp(pte))
@@ -2551,28 +2557,29 @@ struct numa_maps_private {
static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
unsigned long nr_pages)
{
+ struct folio *folio = page_folio(page);
int count = page_mapcount(page);
md->pages += nr_pages;
- if (pte_dirty || PageDirty(page))
+ if (pte_dirty || folio_test_dirty(folio))
md->dirty += nr_pages;
- if (PageSwapCache(page))
+ if (folio_test_swapcache(folio))
md->swapcache += nr_pages;
- if (PageActive(page) || PageUnevictable(page))
+ if (folio_test_active(folio) || folio_test_unevictable(folio))
md->active += nr_pages;
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
md->writeback += nr_pages;
- if (PageAnon(page))
+ if (folio_test_anon(folio))
md->anon += nr_pages;
if (count > md->mapcount_max)
md->mapcount_max = count;
- md->node[page_to_nid(page)] += nr_pages;
+ md->node[folio_nid(folio)] += nr_pages;
}
static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 1fb213f379a5..b52d85f8ad59 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -383,6 +383,8 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos)
/* leave now if filled buffer already */
if (!iov_iter_count(iter))
return acc;
+
+ cond_resched();
}
list_for_each_entry(m, &vmcore_list, list) {
@@ -1370,9 +1372,8 @@ static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data,
vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name);
vdd_hdr->n_type = NT_VMCOREDD;
- strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME,
- sizeof(vdd_hdr->name));
- memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name));
+ strscpy_pad(vdd_hdr->name, VMCOREDD_NOTE_NAME);
+ strscpy_pad(vdd_hdr->dump_name, data->dump_name);
}
/**
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index dacbee455c03..627eb2f72ef3 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -410,7 +410,7 @@ static inline int mark_all_dquot_dirty(struct dquot __rcu * const *dquots)
if (dquot)
/* Even in case of error we have to continue */
ret = mark_dquot_dirty(dquot);
- if (!err)
+ if (!err && ret < 0)
err = ret;
}
return err;
@@ -1737,7 +1737,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
if (reserve)
goto out_flush_warn;
- mark_all_dquot_dirty(dquots);
+ ret = mark_all_dquot_dirty(dquots);
out_flush_warn:
srcu_read_unlock(&dquot_srcu, index);
flush_warnings(warn);
@@ -1786,7 +1786,7 @@ int dquot_alloc_inode(struct inode *inode)
warn_put_all:
spin_unlock(&inode->i_lock);
if (ret == 0)
- mark_all_dquot_dirty(dquots);
+ ret = mark_all_dquot_dirty(dquots);
srcu_read_unlock(&dquot_srcu, index);
flush_warnings(warn);
return ret;
@@ -1990,7 +1990,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
qsize_t inode_usage = 1;
struct dquot __rcu **dquots;
struct dquot *transfer_from[MAXQUOTAS] = {};
- int cnt, index, ret = 0;
+ int cnt, index, ret = 0, err;
char is_valid[MAXQUOTAS] = {};
struct dquot_warn warn_to[MAXQUOTAS];
struct dquot_warn warn_from_inodes[MAXQUOTAS];
@@ -2087,8 +2087,12 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
* mark_all_dquot_dirty().
*/
index = srcu_read_lock(&dquot_srcu);
- mark_all_dquot_dirty((struct dquot __rcu **)transfer_from);
- mark_all_dquot_dirty((struct dquot __rcu **)transfer_to);
+ err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_from);
+ if (err < 0)
+ ret = err;
+ err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_to);
+ if (err < 0)
+ ret = err;
srcu_read_unlock(&dquot_srcu, index);
flush_warnings(warn_to);
@@ -2098,7 +2102,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
if (is_valid[cnt])
transfer_to[cnt] = transfer_from[cnt];
- return 0;
+ return ret;
over_quota:
/* Back out changes we already did */
for (cnt--; cnt >= 0; cnt--) {
@@ -2726,6 +2730,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
struct mem_dqblk *dm = &dquot->dq_dqb;
int check_blim = 0, check_ilim = 0;
struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
+ int ret;
if (di->d_fieldmask & ~VFS_QC_MASK)
return -EINVAL;
@@ -2807,8 +2812,9 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
else
set_bit(DQ_FAKE_B, &dquot->dq_flags);
spin_unlock(&dquot->dq_dqb_lock);
- mark_dquot_dirty(dquot);
-
+ ret = mark_dquot_dirty(dquot);
+ if (ret < 0)
+ return ret;
return 0;
}
@@ -3016,11 +3022,10 @@ static int __init dquot_init(void)
if (!dquot_hash)
panic("Cannot create dquot hash table");
- for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
- ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL);
- if (ret)
- panic("Cannot create dquot stat counters");
- }
+ ret = percpu_counter_init_many(dqstats.counter, 0, GFP_KERNEL,
+ _DQST_DQSTAT_LAST);
+ if (ret)
+ panic("Cannot create dquot stat counters");
/* Find power-of-two hlist_heads which can fit into allocation */
nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index c7a1aa3c882b..b45c7edc3225 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -35,7 +35,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file,
unsigned long addr, unsigned long len, unsigned long pgoff,
unsigned long flags)
{
- return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
}
const struct file_operations ramfs_file_operations = {
diff --git a/fs/read_write.c b/fs/read_write.c
index 2115d1f40bd5..ef6339391351 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -392,7 +392,7 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo
kiocb.ki_pos = (ppos ? *ppos : 0);
iov_iter_ubuf(&iter, ITER_DEST, buf, len);
- ret = call_read_iter(filp, &kiocb, &iter);
+ ret = filp->f_op->read_iter(&kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
if (ppos)
*ppos = kiocb.ki_pos;
@@ -494,7 +494,7 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
kiocb.ki_pos = (ppos ? *ppos : 0);
iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);
- ret = call_write_iter(filp, &kiocb, &iter);
+ ret = filp->f_op->write_iter(&kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
if (ret > 0 && ppos)
*ppos = kiocb.ki_pos;
@@ -736,9 +736,9 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
kiocb.ki_pos = (ppos ? *ppos : 0);
if (type == READ)
- ret = call_read_iter(filp, &kiocb, iter);
+ ret = filp->f_op->read_iter(&kiocb, iter);
else
- ret = call_write_iter(filp, &kiocb, iter);
+ ret = filp->f_op->write_iter(&kiocb, iter);
BUG_ON(ret == -EIOCBQUEUED);
if (ppos)
*ppos = kiocb.ki_pos;
@@ -799,7 +799,7 @@ ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
if (ret < 0)
return ret;
- ret = call_read_iter(file, iocb, iter);
+ ret = file->f_op->read_iter(iocb, iter);
out:
if (ret >= 0)
fsnotify_access(file);
@@ -860,7 +860,7 @@ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
return ret;
kiocb_start_write(iocb);
- ret = call_write_iter(file, iocb, iter);
+ ret = file->f_op->write_iter(iocb, iter);
if (ret != -EIOCBQUEUED)
kiocb_end_write(iocb);
if (ret > 0)
@@ -1667,6 +1667,7 @@ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
return 0;
}
+EXPORT_SYMBOL_GPL(generic_write_check_limits);
/* Like generic_write_checks(), but takes size of write instead of iter. */
int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
index e2f7a264e3ff..11e9ecf24b63 100644
--- a/fs/reiserfs/README
+++ b/fs/reiserfs/README
@@ -102,19 +102,9 @@ that start on a node aligned boundary (there are reasons to want to node
align files), and he invented and implemented indirect items and
unformatted nodes as the solution.
-Konstantin Shvachko, with the help of the Russian version of a VC,
-tried to put me in a position where I was forced into giving control
-of the project to him. (Fortunately, as the person paying the money
-for all salaries from my dayjob I owned all copyrights, and you can't
-really force takeovers of sole proprietorships.) This was something
-curious, because he never really understood the value of our project,
-why we should do what we do, or why innovation was possible in
-general, but he was sure that he ought to be controlling it. Every
-innovation had to be forced past him while he was with us. He added
-two years to the time required to complete reiserfs, and was a net
-loss for me. Mikhail Gilula was a brilliant innovator who also left
-in a destructive way that erased the value of his contributions, and
-that he was shown much generosity just makes it more painful.
+Konstantin Shvachko was taking part in the early days.
+
+Mikhail Gilula was a brilliant innovator that has shown much generosity.
Grigory Zaigralin was an extremely effective system administrator for
our group.
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 1d825459ee6e..c1daedc50f4c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2503,8 +2503,8 @@ out:
* start/recovery path as __block_write_full_folio, along with special
* code to handle reiserfs tails.
*/
-static int reiserfs_write_full_folio(struct folio *folio,
- struct writeback_control *wbc)
+static int reiserfs_write_folio(struct folio *folio,
+ struct writeback_control *wbc, void *data)
{
struct inode *inode = folio->mapping->host;
unsigned long end_index = inode->i_size >> PAGE_SHIFT;
@@ -2721,12 +2721,11 @@ static int reiserfs_read_folio(struct file *f, struct folio *folio)
return block_read_full_folio(folio, reiserfs_get_block);
}
-static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
+static int reiserfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- struct folio *folio = page_folio(page);
- struct inode *inode = folio->mapping->host;
- reiserfs_wait_on_write_block(inode->i_sb);
- return reiserfs_write_full_folio(folio, wbc);
+ reiserfs_wait_on_write_block(mapping->host->i_sb);
+ return write_cache_pages(mapping, wbc, reiserfs_write_folio, NULL);
}
static void reiserfs_truncate_failed_write(struct inode *inode)
@@ -3405,7 +3404,7 @@ out:
}
const struct address_space_operations reiserfs_address_space_operations = {
- .writepage = reiserfs_writepage,
+ .writepages = reiserfs_writepages,
.read_folio = reiserfs_read_folio,
.readahead = reiserfs_readahead,
.release_folio = reiserfs_release_folio,
@@ -3415,4 +3414,5 @@ const struct address_space_operations reiserfs_address_space_operations = {
.bmap = reiserfs_aop_bmap,
.direct_IO = reiserfs_direct_IO,
.dirty_folio = reiserfs_dirty_folio,
+ .migrate_folio = buffer_migrate_folio,
};
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index e539ccd39e1e..e477ee0ff35d 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2626,8 +2626,7 @@ static int journal_init_dev(struct super_block *super,
MAJOR(jdev), MINOR(jdev), result);
return result;
} else if (jdev != super->s_dev)
- set_blocksize(file_bdev(journal->j_bdev_file),
- super->s_blocksize);
+ set_blocksize(journal->j_bdev_file, super->s_blocksize);
return 0;
}
@@ -2643,7 +2642,7 @@ static int journal_init_dev(struct super_block *super,
return result;
}
- set_blocksize(file_bdev(journal->j_bdev_file), super->s_blocksize);
+ set_blocksize(journal->j_bdev_file, super->s_blocksize);
reiserfs_info(super,
"journal_init_dev: journal device: %pg\n",
file_bdev(journal->j_bdev_file));
diff --git a/fs/remap_range.c b/fs/remap_range.c
index de07f978ce3e..28246dfc8485 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -99,8 +99,7 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in,
return 0;
}
-static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
- bool write)
+int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write)
{
int mask = write ? MAY_WRITE : MAY_READ;
loff_t tmp;
@@ -118,6 +117,7 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
return fsnotify_file_area_perm(file, mask, &pos, len);
}
+EXPORT_SYMBOL_GPL(remap_verify_area);
/*
* Ensure that we don't remap a partial EOF block in the middle of something
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 5d4b0fd3a59e..262576573eb5 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -2465,7 +2465,8 @@ int
cifs_revalidate_mapping(struct inode *inode)
{
int rc;
- unsigned long *flags = &CIFS_I(inode)->flags;
+ struct cifsInodeInfo *cifs_inode = CIFS_I(inode);
+ unsigned long *flags = &cifs_inode->flags;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
/* swapfiles are not supposed to be shared */
@@ -2482,6 +2483,7 @@ cifs_revalidate_mapping(struct inode *inode)
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE)
goto skip_invalidate;
+ cifs_inode->netfs.zero_point = cifs_inode->netfs.remote_i_size;
rc = filemap_invalidate_inode(inode, true, 0, LLONG_MAX);
if (rc) {
cifs_dbg(VFS, "%s: invalidate inode %p failed with rc %d\n",
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index af97389e983e..36d47ce59631 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -518,7 +518,7 @@ DECLARE_EVENT_CLASS(smb3_inf_compound_enter_class,
__entry->xid = xid;
__entry->tid = tid;
__entry->sesid = sesid;
- __assign_str(path, full_path);
+ __assign_str(path);
),
TP_printk("xid=%u sid=0x%llx tid=0x%x path=%s",
__entry->xid, __entry->sesid, __entry->tid,
@@ -762,7 +762,7 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class,
),
TP_fast_assign(
__entry->xid = xid;
- __assign_str(func_name, func_name);
+ __assign_str(func_name);
__entry->rc = rc;
),
TP_printk("\t%s: xid=%u rc=%d",
@@ -815,7 +815,7 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class,
),
TP_fast_assign(
__entry->xid = xid;
- __assign_str(func_name, func_name);
+ __assign_str(func_name);
),
TP_printk("\t%s: xid=%u",
__get_str(func_name), __entry->xid)
@@ -852,7 +852,7 @@ DECLARE_EVENT_CLASS(smb3_tcon_class,
__entry->xid = xid;
__entry->tid = tid;
__entry->sesid = sesid;
- __assign_str(name, unc_name);
+ __assign_str(name);
__entry->rc = rc;
),
TP_printk("xid=%u sid=0x%llx tid=0x%x unc_name=%s rc=%d",
@@ -896,7 +896,7 @@ DECLARE_EVENT_CLASS(smb3_open_enter_class,
__entry->xid = xid;
__entry->tid = tid;
__entry->sesid = sesid;
- __assign_str(path, full_path);
+ __assign_str(path);
__entry->create_options = create_options;
__entry->desired_access = desired_access;
),
@@ -1098,7 +1098,7 @@ DECLARE_EVENT_CLASS(smb3_connect_class,
__entry->conn_id = conn_id;
pss = (struct sockaddr_storage *)__entry->dst_addr;
*pss = *dst_addr;
- __assign_str(hostname, hostname);
+ __assign_str(hostname);
),
TP_printk("conn_id=0x%llx server=%s addr=%pISpsfc",
__entry->conn_id,
@@ -1134,7 +1134,7 @@ DECLARE_EVENT_CLASS(smb3_connect_err_class,
__entry->rc = rc;
pss = (struct sockaddr_storage *)__entry->dst_addr;
*pss = *dst_addr;
- __assign_str(hostname, hostname);
+ __assign_str(hostname);
),
TP_printk("rc=%d conn_id=0x%llx server=%s addr=%pISpsfc",
__entry->rc,
@@ -1166,7 +1166,7 @@ DECLARE_EVENT_CLASS(smb3_reconnect_class,
TP_fast_assign(
__entry->currmid = currmid;
__entry->conn_id = conn_id;
- __assign_str(hostname, hostname);
+ __assign_str(hostname);
),
TP_printk("conn_id=0x%llx server=%s current_mid=%llu",
__entry->conn_id,
@@ -1255,7 +1255,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class,
TP_fast_assign(
__entry->currmid = currmid;
__entry->conn_id = conn_id;
- __assign_str(hostname, hostname);
+ __assign_str(hostname);
__entry->credits = credits;
__entry->credits_to_add = credits_to_add;
__entry->in_flight = in_flight;
diff --git a/fs/splice.c b/fs/splice.c
index 218e24b1ac40..60aed8de21f8 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -362,7 +362,7 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos,
iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
- ret = call_read_iter(in, &kiocb, &to);
+ ret = in->f_op->read_iter(&kiocb, &to);
if (ret > 0) {
keep = DIV_ROUND_UP(ret, PAGE_SIZE);
@@ -740,7 +740,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
init_sync_kiocb(&kiocb, out);
kiocb.ki_pos = sd.pos;
- ret = call_write_iter(out, &kiocb, &from);
+ ret = out->f_op->write_iter(&kiocb, &from);
sd.pos = kiocb.ki_pos;
if (ret <= 0)
break;
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index e8df6430444b..a8c1e7f9a609 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -375,8 +375,6 @@ void squashfs_fill_page(struct page *page, struct squashfs_cache_entry *buffer,
flush_dcache_page(page);
if (copied == avail)
SetPageUptodate(page);
- else
- SetPageError(page);
}
/* Copy data into page cache */
@@ -471,7 +469,7 @@ static int squashfs_read_folio(struct file *file, struct folio *folio)
res = read_blocklist(inode, index, &block);
if (res < 0)
- goto error_out;
+ goto out;
if (res == 0)
res = squashfs_readpage_sparse(page, expected);
@@ -483,8 +481,6 @@ static int squashfs_read_folio(struct file *file, struct folio *folio)
if (!res)
return 0;
-error_out:
- SetPageError(page);
out:
pageaddr = kmap_atomic(page);
memset(pageaddr, 0, PAGE_SIZE);
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 763a3f7a75f6..2a689ce71de9 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -106,14 +106,13 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
return 0;
mark_errored:
- /* Decompression failed, mark pages as errored. Target_page is
+ /* Decompression failed. Target_page is
* dealt with by the caller
*/
for (i = 0; i < pages; i++) {
if (page[i] == NULL || page[i] == target_page)
continue;
flush_dcache_page(page[i]);
- SetPageError(page[i]);
unlock_page(page[i]);
put_page(page[i]);
}
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 11e4539b9eae..65aae7e2a859 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -62,27 +62,21 @@
*/
static int get_dir_index_using_name(struct super_block *sb,
u64 *next_block, int *next_offset, u64 index_start,
- int index_offset, int i_count, const char *name,
- int len)
+ int index_offset, int i_count, const char *name)
{
struct squashfs_sb_info *msblk = sb->s_fs_info;
int i, length = 0, err;
unsigned int size;
struct squashfs_dir_index *index;
- char *str;
TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count);
- index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL);
+ index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
if (index == NULL) {
ERROR("Failed to allocate squashfs_dir_index\n");
goto out;
}
- str = &index->name[SQUASHFS_NAME_LEN + 1];
- strncpy(str, name, len);
- str[len] = '\0';
-
for (i = 0; i < i_count; i++) {
err = squashfs_read_metadata(sb, index, &index_start,
&index_offset, sizeof(*index));
@@ -101,7 +95,7 @@ static int get_dir_index_using_name(struct super_block *sb,
index->name[size] = '\0';
- if (strcmp(index->name, str) > 0)
+ if (strcmp(index->name, name) > 0)
break;
length = le32_to_cpu(index->index);
@@ -153,7 +147,7 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
length = get_dir_index_using_name(dir->i_sb, &block, &offset,
squashfs_i(dir)->dir_idx_start,
squashfs_i(dir)->dir_idx_offset,
- squashfs_i(dir)->dir_idx_cnt, name, len);
+ squashfs_i(dir)->dir_idx_cnt, name);
while (length < i_size_read(dir)) {
/*
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 2bf977a52c2c..6ef735bd841a 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -32,20 +32,19 @@
static int squashfs_symlink_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct super_block *sb = inode->i_sb;
struct squashfs_sb_info *msblk = sb->s_fs_info;
- int index = page->index << PAGE_SHIFT;
+ int index = folio_pos(folio);
u64 block = squashfs_i(inode)->start;
int offset = squashfs_i(inode)->offset;
int length = min_t(int, i_size_read(inode) - index, PAGE_SIZE);
- int bytes, copied;
+ int bytes, copied, error;
void *pageaddr;
struct squashfs_cache_entry *entry;
TRACE("Entered squashfs_symlink_readpage, page index %ld, start block "
- "%llx, offset %x\n", page->index, block, offset);
+ "%llx, offset %x\n", folio->index, block, offset);
/*
* Skip index bytes into symlink metadata.
@@ -57,14 +56,15 @@ static int squashfs_symlink_read_folio(struct file *file, struct folio *folio)
ERROR("Unable to read symlink [%llx:%x]\n",
squashfs_i(inode)->start,
squashfs_i(inode)->offset);
- goto error_out;
+ error = bytes;
+ goto out;
}
}
/*
* Read length bytes from symlink metadata. Squashfs_read_metadata
* is not used here because it can sleep and we want to use
- * kmap_atomic to map the page. Instead call the underlying
+ * kmap_local to map the folio. Instead call the underlying
* squashfs_cache_get routine. As length bytes may overlap metadata
* blocks, we may need to call squashfs_cache_get multiple times.
*/
@@ -75,29 +75,26 @@ static int squashfs_symlink_read_folio(struct file *file, struct folio *folio)
squashfs_i(inode)->start,
squashfs_i(inode)->offset);
squashfs_cache_put(entry);
- goto error_out;
+ error = entry->error;
+ goto out;
}
- pageaddr = kmap_atomic(page);
+ pageaddr = kmap_local_folio(folio, 0);
copied = squashfs_copy_data(pageaddr + bytes, entry, offset,
length - bytes);
if (copied == length - bytes)
memset(pageaddr + length, 0, PAGE_SIZE - length);
else
block = entry->next_index;
- kunmap_atomic(pageaddr);
+ kunmap_local(pageaddr);
squashfs_cache_put(entry);
}
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
- return 0;
-
-error_out:
- SetPageError(page);
- unlock_page(page);
- return 0;
+ flush_dcache_folio(folio);
+ error = 0;
+out:
+ folio_end_read(folio, error == 0);
+ return error;
}
diff --git a/fs/super.c b/fs/super.c
index 69ce6c600968..b72f1d288e95 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -274,6 +274,7 @@ static void destroy_super_work(struct work_struct *work)
{
struct super_block *s = container_of(work, struct super_block,
destroy_work);
+ fsnotify_sb_free(s);
security_sb_free(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 7cd64021d453..d1995e2d6c94 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -785,3 +785,30 @@ int sysfs_emit_at(char *buf, int at, const char *fmt, ...)
return len;
}
EXPORT_SYMBOL_GPL(sysfs_emit_at);
+
+/**
+ * sysfs_bin_attr_simple_read - read callback to simply copy from memory.
+ * @file: attribute file which is being read.
+ * @kobj: object to which the attribute belongs.
+ * @attr: attribute descriptor.
+ * @buf: destination buffer.
+ * @off: offset in bytes from which to read.
+ * @count: maximum number of bytes to read.
+ *
+ * Simple ->read() callback for bin_attributes backed by a buffer in memory.
+ * The @private and @size members in struct bin_attribute must be set to the
+ * buffer's location and size before the bin_attribute is created in sysfs.
+ *
+ * Bounds check for @off and @count is done in sysfs_kf_bin_read().
+ * Negative value check for @off is done in vfs_setpos() and default_llseek().
+ *
+ * Returns number of bytes written to @buf.
+ */
+ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf,
+ loff_t off, size_t count)
+{
+ memcpy(buf, attr->private + off, count);
+ return count;
+}
+EXPORT_SYMBOL_GPL(sysfs_bin_attr_simple_read);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 0ceac4b5937c..97c59585208c 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -39,7 +39,7 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
struct address_space *mapping = inode->i_mapping;
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
loff_t size;
unsigned int end;
vm_fault_t ret = VM_FAULT_LOCKED;
@@ -48,31 +48,31 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
filemap_invalidate_lock_shared(mapping);
- lock_page(page);
+ folio_lock(folio);
size = i_size_read(inode);
- if (page->mapping != inode->i_mapping || page_offset(page) >= size) {
- unlock_page(page);
+ if (folio->mapping != inode->i_mapping || folio_pos(folio) >= size) {
+ folio_unlock(folio);
ret = VM_FAULT_NOPAGE;
goto out_unlock;
}
/* Space is already allocated for in-ICB file */
if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
goto out_dirty;
- if (page->index == size >> PAGE_SHIFT)
+ if (folio->index == size >> PAGE_SHIFT)
end = size & ~PAGE_MASK;
else
end = PAGE_SIZE;
- err = __block_write_begin(page, 0, end, udf_get_block);
+ err = __block_write_begin(&folio->page, 0, end, udf_get_block);
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
ret = vmf_fs_error(err);
goto out_unlock;
}
- block_commit_write(page, 0, end);
+ block_commit_write(&folio->page, 0, end);
out_dirty:
- set_page_dirty(page);
- wait_for_stable_page(page);
+ folio_mark_dirty(folio);
+ folio_wait_stable(folio);
out_unlock:
filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(inode->i_sb);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 2f831a3a91af..2fb21c5ffccf 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -208,19 +208,14 @@ static int udf_writepages(struct address_space *mapping,
return write_cache_pages(mapping, wbc, udf_adinicb_writepage, NULL);
}
-static void udf_adinicb_readpage(struct page *page)
+static void udf_adinicb_read_folio(struct folio *folio)
{
- struct inode *inode = page->mapping->host;
- char *kaddr;
+ struct inode *inode = folio->mapping->host;
struct udf_inode_info *iinfo = UDF_I(inode);
loff_t isize = i_size_read(inode);
- kaddr = kmap_local_page(page);
- memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, isize);
- memset(kaddr + isize, 0, PAGE_SIZE - isize);
- flush_dcache_page(page);
- SetPageUptodate(page);
- kunmap_local(kaddr);
+ folio_fill_tail(folio, 0, iinfo->i_data + iinfo->i_lenEAttr, isize);
+ folio_mark_uptodate(folio);
}
static int udf_read_folio(struct file *file, struct folio *folio)
@@ -228,7 +223,7 @@ static int udf_read_folio(struct file *file, struct folio *folio)
struct udf_inode_info *iinfo = UDF_I(file_inode(file));
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
- udf_adinicb_readpage(&folio->page);
+ udf_adinicb_read_folio(folio);
folio_unlock(folio);
return 0;
}
@@ -254,7 +249,7 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct udf_inode_info *iinfo = UDF_I(file_inode(file));
- struct page *page;
+ struct folio *folio;
int ret;
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
@@ -266,12 +261,13 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
}
if (WARN_ON_ONCE(pos >= PAGE_SIZE))
return -EIO;
- page = grab_cache_page_write_begin(mapping, 0);
- if (!page)
- return -ENOMEM;
- *pagep = page;
- if (!PageUptodate(page))
- udf_adinicb_readpage(page);
+ folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ *pagep = &folio->page;
+ if (!folio_test_uptodate(folio))
+ udf_adinicb_read_folio(folio);
return 0;
}
@@ -280,17 +276,19 @@ static int udf_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = file_inode(file);
+ struct folio *folio;
loff_t last_pos;
if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB)
return generic_write_end(file, mapping, pos, len, copied, page,
fsdata);
+ folio = page_folio(page);
last_pos = pos + copied;
if (last_pos > inode->i_size)
i_size_write(inode, last_pos);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return copied;
}
@@ -341,7 +339,7 @@ const struct address_space_operations udf_aops = {
*/
int udf_expand_file_adinicb(struct inode *inode)
{
- struct page *page;
+ struct folio *folio;
struct udf_inode_info *iinfo = UDF_I(inode);
int err;
@@ -357,12 +355,13 @@ int udf_expand_file_adinicb(struct inode *inode)
return 0;
}
- page = find_or_create_page(inode->i_mapping, 0, GFP_KERNEL);
- if (!page)
- return -ENOMEM;
+ folio = __filemap_get_folio(inode->i_mapping, 0,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_KERNEL);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- if (!PageUptodate(page))
- udf_adinicb_readpage(page);
+ if (!folio_test_uptodate(folio))
+ udf_adinicb_read_folio(folio);
down_write(&iinfo->i_data_sem);
memset(iinfo->i_data + iinfo->i_lenEAttr, 0x00,
iinfo->i_lenAlloc);
@@ -371,22 +370,22 @@ int udf_expand_file_adinicb(struct inode *inode)
iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
else
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
- set_page_dirty(page);
- unlock_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
up_write(&iinfo->i_data_sem);
err = filemap_fdatawrite(inode->i_mapping);
if (err) {
/* Restore everything back so that we don't lose data... */
- lock_page(page);
+ folio_lock(folio);
down_write(&iinfo->i_data_sem);
- memcpy_to_page(page, 0, iinfo->i_data + iinfo->i_lenEAttr,
- inode->i_size);
- unlock_page(page);
+ memcpy_from_folio(iinfo->i_data + iinfo->i_lenEAttr,
+ folio, 0, inode->i_size);
+ folio_unlock(folio);
iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
iinfo->i_lenAlloc = inode->i_size;
up_write(&iinfo->i_data_sem);
}
- put_page(page);
+ folio_put(folio);
mark_inode_dirty(inode);
return err;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 2217f7ed7a49..9381a66c6ce5 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -630,7 +630,7 @@ static int udf_parse_param(struct fs_context *fc, struct fs_parameter *param)
if (!uopt->nls_map) {
errorf(fc, "iocharset %s not found",
param->string);
- return -EINVAL;;
+ return -EINVAL;
}
}
break;
@@ -895,7 +895,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
int ret;
struct timestamp *ts;
- outstr = kmalloc(128, GFP_KERNEL);
+ outstr = kzalloc(128, GFP_KERNEL);
if (!outstr)
return -ENOMEM;
@@ -921,11 +921,11 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
ret = udf_dstrCS0toChar(sb, outstr, 31, pvoldesc->volIdent, 32);
if (ret < 0) {
- strcpy(UDF_SB(sb)->s_volume_ident, "InvalidName");
+ strscpy_pad(UDF_SB(sb)->s_volume_ident, "InvalidName");
pr_warn("incorrect volume identification, setting to "
"'InvalidName'\n");
} else {
- strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret);
+ strscpy_pad(UDF_SB(sb)->s_volume_ident, outstr);
}
udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index f7eaf7b14594..fe03745d09b1 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -99,18 +99,17 @@ static int udf_pc_to_char(struct super_block *sb, unsigned char *from,
static int udf_symlink_filler(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct buffer_head *bh = NULL;
unsigned char *symlink;
int err = 0;
- unsigned char *p = page_address(page);
+ unsigned char *p = folio_address(folio);
struct udf_inode_info *iinfo = UDF_I(inode);
/* We don't support symlinks longer than one block */
if (inode->i_size > inode->i_sb->s_blocksize) {
err = -ENAMETOOLONG;
- goto out_unlock;
+ goto out;
}
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
@@ -120,24 +119,15 @@ static int udf_symlink_filler(struct file *file, struct folio *folio)
if (!bh) {
if (!err)
err = -EFSCORRUPTED;
- goto out_err;
+ goto out;
}
symlink = bh->b_data;
}
err = udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p, PAGE_SIZE);
brelse(bh);
- if (err)
- goto out_err;
-
- SetPageUptodate(page);
- unlock_page(page);
- return 0;
-
-out_err:
- SetPageError(page);
-out_unlock:
- unlock_page(page);
+out:
+ folio_end_read(folio, err == 0);
return err;
}
@@ -147,12 +137,12 @@ static int udf_symlink_getattr(struct mnt_idmap *idmap,
{
struct dentry *dentry = path->dentry;
struct inode *inode = d_backing_inode(dentry);
- struct page *page;
+ struct folio *folio;
generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
- page = read_mapping_page(inode->i_mapping, 0, NULL);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ folio = read_mapping_folio(inode->i_mapping, 0, NULL);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
/*
* UDF uses non-trivial encoding of symlinks so i_size does not match
* number of characters reported by readlink(2) which apparently some
@@ -162,8 +152,8 @@ static int udf_symlink_getattr(struct mnt_idmap *idmap,
* let's report the length of string returned by readlink(2) for
* st_size.
*/
- stat->size = strlen(page_address(page));
- put_page(page);
+ stat->size = strlen(folio_address(folio));
+ folio_put(folio);
return 0;
}
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index 758163af39c2..78ecc633606f 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -46,13 +46,18 @@ udf_disk_stamp_to_time(struct timespec64 *dest, struct timestamp src)
dest->tv_sec = mktime64(year, src.month, src.day, src.hour, src.minute,
src.second);
dest->tv_sec -= offset * 60;
- dest->tv_nsec = 1000 * (src.centiseconds * 10000 +
- src.hundredsOfMicroseconds * 100 + src.microseconds);
+
/*
* Sanitize nanosecond field since reportedly some filesystems are
* recorded with bogus sub-second values.
*/
- dest->tv_nsec %= NSEC_PER_SEC;
+ if (src.centiseconds < 100 && src.hundredsOfMicroseconds < 100 &&
+ src.microseconds < 100) {
+ dest->tv_nsec = 1000 * (src.centiseconds * 10000 +
+ src.hundredsOfMicroseconds * 100 + src.microseconds);
+ } else {
+ dest->tv_nsec = 0;
+ }
}
void
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index 0e51c0025a16..e309afe2b2bb 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -18,13 +18,13 @@ ifdef REGENERATE_UTF8DATA
quiet_cmd_utf8data = GEN $@
cmd_utf8data = $< \
- -a $(srctree)/$(src)/DerivedAge.txt \
- -c $(srctree)/$(src)/DerivedCombiningClass.txt \
- -p $(srctree)/$(src)/DerivedCoreProperties.txt \
- -d $(srctree)/$(src)/UnicodeData.txt \
- -f $(srctree)/$(src)/CaseFolding.txt \
- -n $(srctree)/$(src)/NormalizationCorrections.txt \
- -t $(srctree)/$(src)/NormalizationTest.txt \
+ -a $(src)/DerivedAge.txt \
+ -c $(src)/DerivedCombiningClass.txt \
+ -p $(src)/DerivedCoreProperties.txt \
+ -d $(src)/UnicodeData.txt \
+ -f $(src)/CaseFolding.txt \
+ -n $(src)/NormalizationCorrections.txt \
+ -t $(src)/NormalizationTest.txt \
-o $@
$(obj)/utf8data.c: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 2a564f813314..eee7320ab0b0 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -658,7 +658,10 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
struct userfaultfd_fork_ctx *fctx;
octx = vma->vm_userfaultfd_ctx.ctx;
- if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+ if (!octx)
+ return 0;
+
+ if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
vma_start_write(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 76674ad5833e..c50447548d65 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -4,8 +4,8 @@
# All Rights Reserved.
#
-ccflags-y += -I $(srctree)/$(src) # needed for trace events
-ccflags-y += -I $(srctree)/$(src)/libxfs
+ccflags-y += -I $(src) # needed for trace events
+ccflags-y += -I $(src)/libxfs
obj-$(CONFIG_XFS_FS) += xfs.o
@@ -34,6 +34,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_dir2_node.o \
xfs_dir2_sf.o \
xfs_dquot_buf.o \
+ xfs_exchmaps.o \
xfs_ialloc.o \
xfs_ialloc_btree.o \
xfs_iext_tree.o \
@@ -41,6 +42,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_inode_buf.o \
xfs_log_rlimit.o \
xfs_ag_resv.o \
+ xfs_parent.o \
xfs_rmap.o \
xfs_rmap_btree.o \
xfs_refcount.o \
@@ -49,6 +51,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_symlink_remote.o \
xfs_trans_inode.o \
xfs_trans_resv.o \
+ xfs_trans_space.o \
xfs_types.o \
)
# xfs_rtbitmap is shared with libxfs
@@ -67,6 +70,7 @@ xfs-y += xfs_aops.o \
xfs_dir2_readdir.o \
xfs_discard.o \
xfs_error.o \
+ xfs_exchrange.o \
xfs_export.o \
xfs_extent_busy.o \
xfs_file.o \
@@ -74,6 +78,7 @@ xfs-y += xfs_aops.o \
xfs_fsmap.o \
xfs_fsops.o \
xfs_globals.o \
+ xfs_handle.o \
xfs_health.o \
xfs_icache.o \
xfs_ioctl.o \
@@ -101,6 +106,7 @@ xfs-y += xfs_log.o \
xfs_buf_item.o \
xfs_buf_item_recover.o \
xfs_dquot_item_recover.o \
+ xfs_exchmaps_item.o \
xfs_extfree_item.o \
xfs_attr_item.o \
xfs_icreate_item.o \
@@ -157,11 +163,13 @@ xfs-y += $(addprefix scrub/, \
common.o \
dabtree.o \
dir.o \
+ dirtree.o \
fscounters.o \
health.o \
ialloc.o \
inode.o \
iscan.o \
+ listxattr.o \
nlinks.o \
parent.o \
readdir.o \
@@ -170,6 +178,7 @@ xfs-y += $(addprefix scrub/, \
scrub.o \
symlink.o \
xfarray.o \
+ xfblob.o \
xfile.o \
)
@@ -191,23 +200,32 @@ ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
agheader_repair.o \
alloc_repair.o \
+ attr_repair.o \
bmap_repair.o \
cow_repair.o \
+ dir_repair.o \
+ dirtree_repair.o \
+ findparent.o \
fscounters_repair.o \
ialloc_repair.o \
inode_repair.o \
newbt.o \
nlinks_repair.o \
+ orphanage.o \
+ parent_repair.o \
rcbag_btree.o \
rcbag.o \
reap.o \
refcount_repair.o \
repair.o \
rmap_repair.o \
+ symlink_repair.o \
+ tempfile.o \
)
xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
rtbitmap_repair.o \
+ rtsummary_repair.o \
)
xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index dc1873f76bff..240e079cb3fb 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -194,7 +194,7 @@ xfs_initialize_perag_data(
pag = xfs_perag_get(mp, index);
error = xfs_alloc_read_agf(pag, NULL, 0, NULL);
if (!error)
- error = xfs_ialloc_read_agi(pag, NULL, NULL);
+ error = xfs_ialloc_read_agi(pag, NULL, 0, NULL);
if (error) {
xfs_perag_put(pag);
return error;
@@ -931,7 +931,7 @@ xfs_ag_shrink_space(
int error, err2;
ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1);
- error = xfs_ialloc_read_agi(pag, *tpp, &agibp);
+ error = xfs_ialloc_read_agi(pag, *tpp, 0, &agibp);
if (error)
return error;
@@ -963,9 +963,7 @@ xfs_ag_shrink_space(
* Disable perag reservations so it doesn't cause the allocation request
* to fail. We'll reestablish reservation before we return.
*/
- error = xfs_ag_resv_free(pag);
- if (error)
- return error;
+ xfs_ag_resv_free(pag);
/* internal log shouldn't also show up in the free space btrees */
error = xfs_alloc_vextent_exact_bno(&args,
@@ -1062,7 +1060,7 @@ xfs_ag_extend_space(
ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1);
- error = xfs_ialloc_read_agi(pag, tp, &bp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, &bp);
if (error)
return error;
@@ -1119,7 +1117,7 @@ xfs_ag_get_geometry(
int error;
/* Lock the AG headers. */
- error = xfs_ialloc_read_agi(pag, NULL, &agi_bp);
+ error = xfs_ialloc_read_agi(pag, NULL, 0, &agi_bp);
if (error)
return error;
error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp);
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index da1057bd0e60..216423df939e 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -126,14 +126,13 @@ xfs_ag_resv_needed(
}
/* Clean out a reservation */
-static int
+static void
__xfs_ag_resv_free(
struct xfs_perag *pag,
enum xfs_ag_resv_type type)
{
struct xfs_ag_resv *resv;
xfs_extlen_t oldresv;
- int error;
trace_xfs_ag_resv_free(pag, type, 0);
@@ -149,30 +148,19 @@ __xfs_ag_resv_free(
oldresv = resv->ar_orig_reserved;
else
oldresv = resv->ar_reserved;
- error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
+ xfs_add_fdblocks(pag->pag_mount, oldresv);
resv->ar_reserved = 0;
resv->ar_asked = 0;
resv->ar_orig_reserved = 0;
-
- if (error)
- trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
- error, _RET_IP_);
- return error;
}
/* Free a per-AG reservation. */
-int
+void
xfs_ag_resv_free(
struct xfs_perag *pag)
{
- int error;
- int err2;
-
- error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
- err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
- if (err2 && !error)
- error = err2;
- return error;
+ __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
+ __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
}
static int
@@ -216,7 +204,7 @@ __xfs_ag_resv_init(
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
error = -ENOSPC;
else
- error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
+ error = xfs_dec_fdblocks(mp, hidden_space, true);
if (error) {
trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
error, _RET_IP_);
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
index b74b210008ea..ff20ed93de77 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.h
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -6,7 +6,7 @@
#ifndef __XFS_AG_RESV_H__
#define __XFS_AG_RESV_H__
-int xfs_ag_resv_free(struct xfs_perag *pag);
+void xfs_ag_resv_free(struct xfs_perag *pag);
int xfs_ag_resv_init(struct xfs_perag *pag, struct xfs_trans *tp);
bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 9da52e92172a..6cb8b2ddc541 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -79,7 +79,7 @@ xfs_prealloc_blocks(
}
/*
- * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
+ * The number of blocks per AG that we withhold from xfs_dec_fdblocks to
* guarantee that we can refill the AGFL prior to allocating space in a nearly
* full AG. Although the space described by the free space btrees, the
* blocks used by the freesp btrees themselves, and the blocks owned by the
@@ -89,7 +89,7 @@ xfs_prealloc_blocks(
* until the fs goes down, we subtract this many AG blocks from the incore
* fdblocks to ensure user allocation does not overcommit the space the
* filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to
- * withhold space from xfs_mod_fdblocks, so we do not account for that here.
+ * withhold space from xfs_dec_fdblocks, so we do not account for that here.
*/
#define XFS_ALLOCBT_AGFL_RESERVE 4
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 673a4b6d2e8d..430cd3244c14 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -26,6 +26,7 @@
#include "xfs_trace.h"
#include "xfs_attr_item.h"
#include "xfs_xattr.h"
+#include "xfs_parent.h"
struct kmem_cache *xfs_attr_intent_cache;
@@ -87,6 +88,8 @@ xfs_attr_is_leaf(
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec imap;
+ ASSERT(!xfs_need_iread_extents(ifp));
+
if (ifp->if_nextents != 1 || ifp->if_format != XFS_DINODE_FMT_EXTENTS)
return false;
@@ -224,11 +227,21 @@ int
xfs_attr_get_ilocked(
struct xfs_da_args *args)
{
+ int error;
+
xfs_assert_ilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
if (!xfs_inode_hasattr(args->dp))
return -ENOATTR;
+ /*
+ * The incore attr fork iext tree must be loaded for xfs_attr_is_leaf
+ * to work correctly.
+ */
+ error = xfs_iread_extents(args->trans, args->dp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
return xfs_attr_shortform_getvalue(args);
if (xfs_attr_is_leaf(args->dp))
@@ -264,9 +277,11 @@ xfs_attr_get(
if (xfs_is_shutdown(args->dp->i_mount))
return -EIO;
+ if (!args->owner)
+ args->owner = args->dp->i_ino;
args->geo = args->dp->i_mount->m_attr_geo;
args->whichfork = XFS_ATTR_FORK;
- args->hashval = xfs_da_hashname(args->name, args->namelen);
+ xfs_attr_sethash(args);
/* Entirely possible to look up a name which doesn't exist */
args->op_flags = XFS_DA_OP_OKNOENT;
@@ -363,7 +378,7 @@ xfs_attr_try_sf_addname(
* Commit the shortform mods, and we're done.
* NOTE: this is also the error path (EEXIST, etc).
*/
- if (!error && !(args->op_flags & XFS_DA_OP_NOTIME))
+ if (!error)
xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
if (xfs_has_wsync(dp->i_mount))
@@ -401,6 +416,50 @@ out:
return error;
}
+/* Compute the hash value for a user/root/secure extended attribute */
+xfs_dahash_t
+xfs_attr_hashname(
+ const uint8_t *name,
+ int namelen)
+{
+ return xfs_da_hashname(name, namelen);
+}
+
+/* Compute the hash value for any extended attribute from any namespace. */
+xfs_dahash_t
+xfs_attr_hashval(
+ struct xfs_mount *mp,
+ unsigned int attr_flags,
+ const uint8_t *name,
+ int namelen,
+ const void *value,
+ int valuelen)
+{
+ ASSERT(xfs_attr_check_namespace(attr_flags));
+
+ if (attr_flags & XFS_ATTR_PARENT)
+ return xfs_parent_hashattr(mp, name, namelen, value, valuelen);
+
+ return xfs_attr_hashname(name, namelen);
+}
+
+/*
+ * PPTR_REPLACE operations require the caller to set the old and new names and
+ * values explicitly. Update the canonical fields to the new name and value
+ * here now that the removal phase has finished.
+ */
+static void
+xfs_attr_update_pptr_replace_args(
+ struct xfs_da_args *args)
+{
+ ASSERT(args->new_namelen > 0);
+ args->name = args->new_name;
+ args->namelen = args->new_namelen;
+ args->value = args->new_value;
+ args->valuelen = args->new_valuelen;
+ xfs_attr_sethash(args);
+}
+
/*
* Handle the state change on completion of a multi-state attr operation.
*
@@ -418,14 +477,15 @@ xfs_attr_complete_op(
enum xfs_delattr_state replace_state)
{
struct xfs_da_args *args = attr->xattri_da_args;
- bool do_replace = args->op_flags & XFS_DA_OP_REPLACE;
+
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
+ replace_state = XFS_DAS_DONE;
+ else if (xfs_attr_intent_op(attr) == XFS_ATTRI_OP_FLAGS_PPTR_REPLACE)
+ xfs_attr_update_pptr_replace_args(args);
args->op_flags &= ~XFS_DA_OP_REPLACE;
args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
- if (do_replace)
- return replace_state;
-
- return XFS_DAS_DONE;
+ return replace_state;
}
static int
@@ -647,8 +707,8 @@ xfs_attr_leaf_remove_attr(
int forkoff;
int error;
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
- &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
+ args->blkno, &bp);
if (error)
return error;
@@ -679,7 +739,7 @@ xfs_attr_leaf_shrink(
if (!xfs_attr_is_leaf(dp))
return 0;
- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, &bp);
if (error)
return error;
@@ -868,6 +928,11 @@ xfs_attr_lookup(
return -ENOATTR;
}
+ /* Prerequisite for xfs_attr_is_leaf */
+ error = xfs_iread_extents(args->trans, args->dp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
if (xfs_attr_is_leaf(dp)) {
error = xfs_attr_leaf_hasname(args, &bp);
@@ -883,74 +948,72 @@ xfs_attr_lookup(
return error;
}
-static void
-xfs_attr_defer_add(
- struct xfs_da_args *args,
- unsigned int op_flags)
+int
+xfs_attr_add_fork(
+ struct xfs_inode *ip, /* incore inode pointer */
+ int size, /* space new attribute needs */
+ int rsvd) /* xact may use reserved blks */
{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp; /* transaction pointer */
+ unsigned int blks; /* space reservation */
+ int error; /* error return value */
- struct xfs_attr_intent *new;
+ ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
- new = kmem_cache_zalloc(xfs_attr_intent_cache,
- GFP_KERNEL | __GFP_NOFAIL);
- new->xattri_op_flags = op_flags;
- new->xattri_da_args = args;
+ blks = XFS_ADDAFORK_SPACE_RES(mp);
- switch (op_flags) {
- case XFS_ATTRI_OP_FLAGS_SET:
- new->xattri_dela_state = xfs_attr_init_add_state(args);
- break;
- case XFS_ATTRI_OP_FLAGS_REPLACE:
- new->xattri_dela_state = xfs_attr_init_replace_state(args);
- break;
- case XFS_ATTRI_OP_FLAGS_REMOVE:
- new->xattri_dela_state = xfs_attr_init_remove_state(args);
- break;
- default:
- ASSERT(0);
- }
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0,
+ rsvd, &tp);
+ if (error)
+ return error;
+
+ if (xfs_inode_has_attr_fork(ip))
+ goto trans_cancel;
+
+ error = xfs_bmap_add_attrfork(tp, ip, size, rsvd);
+ if (error)
+ goto trans_cancel;
+
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
- xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type);
- trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
+trans_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
}
/*
- * Note: If args->value is NULL the attribute will be removed, just like the
- * Linux ->setattr API.
+ * Make a change to the xattr structure.
+ *
+ * The caller must have initialized @args, attached dquots, and must not hold
+ * any ILOCKs. Reserved data blocks may be used if @rsvd is set.
+ *
+ * Returns -EEXIST for XFS_ATTRUPDATE_CREATE if the name already exists.
+ * Returns -ENOATTR for XFS_ATTRUPDATE_REMOVE if the name does not exist.
+ * Returns 0 on success, or a negative errno if something else went wrong.
*/
int
xfs_attr_set(
- struct xfs_da_args *args)
+ struct xfs_da_args *args,
+ enum xfs_attr_update op,
+ bool rsvd)
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
struct xfs_trans_res tres;
- bool rsvd = (args->attr_filter & XFS_ATTR_ROOT);
int error, local;
int rmt_blks = 0;
unsigned int total;
- if (xfs_is_shutdown(dp->i_mount))
- return -EIO;
-
- error = xfs_qm_dqattach(dp);
- if (error)
- return error;
-
- args->geo = mp->m_attr_geo;
- args->whichfork = XFS_ATTR_FORK;
- args->hashval = xfs_da_hashname(args->name, args->namelen);
+ ASSERT(!args->trans);
- /*
- * We have no control over the attribute names that userspace passes us
- * to remove, so we have to allow the name lookup prior to attribute
- * removal to fail as well. Preserve the logged flag, since we need
- * to pass that through to the logging code.
- */
- args->op_flags = XFS_DA_OP_OKNOENT |
- (args->op_flags & XFS_DA_OP_LOGGED);
-
- if (args->value) {
+ switch (op) {
+ case XFS_ATTRUPDATE_UPSERT:
+ case XFS_ATTRUPDATE_CREATE:
+ case XFS_ATTRUPDATE_REPLACE:
XFS_STATS_INC(mp, xs_attr_set);
args->total = xfs_attr_calc_size(args, &local);
@@ -963,16 +1026,18 @@ xfs_attr_set(
xfs_attr_sf_entsize_byname(args->namelen,
args->valuelen);
- error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+ error = xfs_attr_add_fork(dp, sf_size, rsvd);
if (error)
return error;
}
if (!local)
rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen);
- } else {
+ break;
+ case XFS_ATTRUPDATE_REMOVE:
XFS_STATS_INC(mp, xs_attr_remove);
- rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
+ rmt_blks = xfs_attr3_max_rmt_blocks(mp);
+ break;
}
/*
@@ -984,12 +1049,9 @@ xfs_attr_set(
if (error)
return error;
- if (args->value || xfs_inode_hasattr(dp)) {
- error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
+ if (op != XFS_ATTRUPDATE_REMOVE || xfs_inode_hasattr(dp)) {
+ error = xfs_iext_count_extend(args->trans, dp, XFS_ATTR_FORK,
XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(args->trans, dp,
- XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
if (error)
goto out_trans_cancel;
}
@@ -997,26 +1059,26 @@ xfs_attr_set(
error = xfs_attr_lookup(args);
switch (error) {
case -EEXIST:
- if (!args->value) {
+ if (op == XFS_ATTRUPDATE_REMOVE) {
/* if no value, we are performing a remove operation */
- xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REMOVE);
+ xfs_attr_defer_add(args, XFS_ATTR_DEFER_REMOVE);
break;
}
/* Pure create fails if the attr already exists */
- if (args->attr_flags & XATTR_CREATE)
+ if (op == XFS_ATTRUPDATE_CREATE)
goto out_trans_cancel;
- xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REPLACE);
+ xfs_attr_defer_add(args, XFS_ATTR_DEFER_REPLACE);
break;
case -ENOATTR:
/* Can't remove what isn't there. */
- if (!args->value)
+ if (op == XFS_ATTRUPDATE_REMOVE)
goto out_trans_cancel;
/* Pure replace fails if no existing attr to replace. */
- if (args->attr_flags & XATTR_REPLACE)
+ if (op == XFS_ATTRUPDATE_REPLACE)
goto out_trans_cancel;
- xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_SET);
+ xfs_attr_defer_add(args, XFS_ATTR_DEFER_SET);
break;
default:
goto out_trans_cancel;
@@ -1029,8 +1091,7 @@ xfs_attr_set(
if (xfs_has_wsync(mp))
xfs_trans_set_sync(args->trans);
- if (!(args->op_flags & XFS_DA_OP_NOTIME))
- xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
+ xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
/*
* Commit the last in the sequence of transactions.
@@ -1039,6 +1100,7 @@ xfs_attr_set(
error = xfs_trans_commit(args->trans);
out_unlock:
xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ args->trans = NULL;
return error;
out_trans_cancel:
@@ -1051,7 +1113,7 @@ out_trans_cancel:
* External routines when attribute list is inside the inode
*========================================================================*/
-static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
+int xfs_attr_sf_totsize(struct xfs_inode *dp)
{
struct xfs_attr_sf_hdr *sf = dp->i_af.if_data;
@@ -1154,7 +1216,7 @@ xfs_attr_leaf_try_add(
struct xfs_buf *bp;
int error;
- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, &bp);
if (error)
return error;
@@ -1202,7 +1264,7 @@ xfs_attr_leaf_hasname(
{
int error = 0;
- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, bp);
if (error)
return error;
@@ -1511,12 +1573,23 @@ out_release:
return error;
}
+/* Enforce that there is at most one namespace bit per attr. */
+inline bool xfs_attr_check_namespace(unsigned int attr_flags)
+{
+ return hweight32(attr_flags & XFS_ATTR_NSP_ONDISK_MASK) < 2;
+}
+
/* Returns true if the attribute entry name is valid. */
bool
xfs_attr_namecheck(
+ unsigned int attr_flags,
const void *name,
size_t length)
{
+ /* Only one namespace bit allowed. */
+ if (!xfs_attr_check_namespace(attr_flags))
+ return false;
+
/*
* MAXNAMELEN includes the trailing null, but (name/length) leave it
* out, so use >= for the length check.
@@ -1524,6 +1597,10 @@ xfs_attr_namecheck(
if (length >= MAXNAMELEN)
return false;
+ /* Parent pointers have their own validation. */
+ if (attr_flags & XFS_ATTR_PARENT)
+ return xfs_parent_namecheck(attr_flags, name, length);
+
/* There shouldn't be any nulls here */
return !memchr(name, 0, length);
}
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 81be9b3e4004..088cb7b30168 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -47,8 +47,9 @@ struct xfs_attrlist_cursor_kern {
/* void; state communicated via *context */
-typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int,
- unsigned char *, int, int);
+typedef void (*put_listent_func_t)(struct xfs_attr_list_context *context,
+ int flags, unsigned char *name, int namelen, void *value,
+ int valuelen);
struct xfs_attr_list_context {
struct xfs_trans *tp;
@@ -510,8 +511,8 @@ struct xfs_attr_intent {
struct xfs_da_args *xattri_da_args;
/*
- * Shared buffer containing the attr name and value so that the logging
- * code can share large memory buffers between log items.
+ * Shared buffer containing the attr name, new name, and value so that
+ * the logging code can share large memory buffers between log items.
*/
struct xfs_attri_log_nameval *xattri_nameval;
@@ -529,6 +530,11 @@ struct xfs_attr_intent {
struct xfs_bmbt_irec xattri_map;
};
+static inline unsigned int
+xfs_attr_intent_op(const struct xfs_attr_intent *attr)
+{
+ return attr->xattri_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+}
/*========================================================================
* Function prototypes for the kernel.
@@ -544,10 +550,20 @@ int xfs_inode_hasattr(struct xfs_inode *ip);
bool xfs_attr_is_leaf(struct xfs_inode *ip);
int xfs_attr_get_ilocked(struct xfs_da_args *args);
int xfs_attr_get(struct xfs_da_args *args);
-int xfs_attr_set(struct xfs_da_args *args);
+
+enum xfs_attr_update {
+ XFS_ATTRUPDATE_REMOVE, /* remove attr */
+ XFS_ATTRUPDATE_UPSERT, /* set value, replace any existing attr */
+ XFS_ATTRUPDATE_CREATE, /* set value, fail if attr already exists */
+ XFS_ATTRUPDATE_REPLACE, /* set value, fail if attr does not exist */
+};
+
+int xfs_attr_set(struct xfs_da_args *args, enum xfs_attr_update op, bool rsvd);
int xfs_attr_set_iter(struct xfs_attr_intent *attr);
int xfs_attr_remove_iter(struct xfs_attr_intent *attr);
-bool xfs_attr_namecheck(const void *name, size_t length);
+bool xfs_attr_check_namespace(unsigned int attr_flags);
+bool xfs_attr_namecheck(unsigned int attr_flags, const void *name,
+ size_t length);
int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres,
unsigned int *total);
@@ -590,7 +606,6 @@ xfs_attr_init_add_state(struct xfs_da_args *args)
static inline enum xfs_delattr_state
xfs_attr_init_remove_state(struct xfs_da_args *args)
{
- args->op_flags |= XFS_DA_OP_REMOVE;
if (xfs_attr_is_shortform(args->dp))
return XFS_DAS_SF_REMOVE;
if (xfs_attr_is_leaf(args->dp))
@@ -614,8 +629,25 @@ xfs_attr_init_replace_state(struct xfs_da_args *args)
return xfs_attr_init_add_state(args);
}
+xfs_dahash_t xfs_attr_hashname(const uint8_t *name, int namelen);
+
+xfs_dahash_t xfs_attr_hashval(struct xfs_mount *mp, unsigned int attr_flags,
+ const uint8_t *name, int namelen, const void *value,
+ int valuelen);
+
+/* Set the hash value for any extended attribute from any namespace. */
+static inline void xfs_attr_sethash(struct xfs_da_args *args)
+{
+ args->hashval = xfs_attr_hashval(args->dp->i_mount, args->attr_filter,
+ args->name, args->namelen,
+ args->value, args->valuelen);
+}
+
extern struct kmem_cache *xfs_attr_intent_cache;
int __init xfs_attr_intent_init_cache(void);
void xfs_attr_intent_destroy_cache(void);
+int xfs_attr_sf_totsize(struct xfs_inode *dp);
+int xfs_attr_add_fork(struct xfs_inode *ip, int size, int rsvd);
+
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index ac904cc1a97b..b9e98950eb3d 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -388,6 +388,27 @@ xfs_attr3_leaf_verify(
return NULL;
}
+xfs_failaddr_t
+xfs_attr3_leaf_header_check(
+ struct xfs_buf *bp,
+ xfs_ino_t owner)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_attr3_leafblock *hdr3 = bp->b_addr;
+
+ if (hdr3->hdr.info.hdr.magic !=
+ cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+ return __this_address;
+
+ if (be64_to_cpu(hdr3->hdr.info.owner) != owner)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
static void
xfs_attr3_leaf_write_verify(
struct xfs_buf *bp)
@@ -448,16 +469,30 @@ int
xfs_attr3_leaf_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
xfs_dablk_t bno,
struct xfs_buf **bpp)
{
+ xfs_failaddr_t fa;
int err;
err = xfs_da_read_buf(tp, dp, bno, 0, bpp, XFS_ATTR_FORK,
&xfs_attr3_leaf_buf_ops);
- if (!err && tp && *bpp)
+ if (err || !(*bpp))
+ return err;
+
+ fa = xfs_attr3_leaf_header_check(*bpp, owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
+ return -EFSCORRUPTED;
+ }
+
+ if (tp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
- return err;
+ return 0;
}
/*========================================================================
@@ -472,28 +507,57 @@ xfs_attr3_leaf_read(
* INCOMPLETE flag will not be set in attr->attr_filter, but rather
* XFS_DA_OP_RECOVERY will be set in args->op_flags.
*/
+static inline unsigned int xfs_attr_match_mask(const struct xfs_da_args *args)
+{
+ if (args->op_flags & XFS_DA_OP_RECOVERY)
+ return XFS_ATTR_NSP_ONDISK_MASK;
+ return XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE;
+}
+
+static inline bool
+xfs_attr_parent_match(
+ const struct xfs_da_args *args,
+ const void *value,
+ unsigned int valuelen)
+{
+ ASSERT(args->value != NULL);
+
+ /* Parent pointers do not use remote values */
+ if (!value)
+ return false;
+
+ /*
+ * The only value we support is a parent rec. However, we'll accept
+ * any valuelen so that offline repair can delete ATTR_PARENT values
+ * that are not parent pointers.
+ */
+ if (valuelen != args->valuelen)
+ return false;
+
+ return memcmp(args->value, value, valuelen) == 0;
+}
+
static bool
xfs_attr_match(
struct xfs_da_args *args,
- uint8_t namelen,
- unsigned char *name,
- int flags)
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen)
{
+ unsigned int mask = xfs_attr_match_mask(args);
if (args->namelen != namelen)
return false;
+ if ((args->attr_filter & mask) != (attr_flags & mask))
+ return false;
if (memcmp(args->name, name, namelen) != 0)
return false;
- /* Recovery ignores the INCOMPLETE flag. */
- if ((args->op_flags & XFS_DA_OP_RECOVERY) &&
- args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK))
- return true;
+ if (attr_flags & XFS_ATTR_PARENT)
+ return xfs_attr_parent_match(args, value, valuelen);
- /* All remaining matches need to be filtered by INCOMPLETE state. */
- if (args->attr_filter !=
- (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE)))
- return false;
return true;
}
@@ -504,6 +568,13 @@ xfs_attr_copy_value(
int valuelen)
{
/*
+ * Parent pointer lookups require the caller to specify the name and
+ * value, so don't copy anything.
+ */
+ if (args->attr_filter & XFS_ATTR_PARENT)
+ return 0;
+
+ /*
* No copy if all we have to do is get the length
*/
if (!args->valuelen) {
@@ -711,8 +782,9 @@ xfs_attr_sf_findname(
for (sfe = xfs_attr_sf_firstentry(sf);
sfe < xfs_attr_sf_endptr(sf);
sfe = xfs_attr_sf_nextentry(sfe)) {
- if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
- sfe->flags))
+ if (xfs_attr_match(args, sfe->flags, sfe->nameval,
+ sfe->namelen, &sfe->nameval[sfe->namelen],
+ sfe->valuelen))
return sfe;
}
@@ -819,7 +891,8 @@ xfs_attr_sf_removename(
*/
if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
- !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) {
+ !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) &&
+ !xfs_has_parent(mp)) {
xfs_attr_fork_remove(dp, args->trans);
} else {
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
@@ -828,7 +901,8 @@ xfs_attr_sf_removename(
ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) ||
(args->op_flags & XFS_DA_OP_ADDNAME) ||
!xfs_has_attr2(mp) ||
- dp->i_df.if_format == XFS_DINODE_FMT_BTREE);
+ dp->i_df.if_format == XFS_DINODE_FMT_BTREE ||
+ xfs_has_parent(mp));
xfs_trans_log_inode(args->trans, dp,
XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
@@ -904,6 +978,7 @@ xfs_attr_shortform_to_leaf(
nargs.whichfork = XFS_ATTR_FORK;
nargs.trans = args->trans;
nargs.op_flags = XFS_DA_OP_OKNOENT;
+ nargs.owner = args->owner;
sfe = xfs_attr_sf_firstentry(sf);
for (i = 0; i < sf->count; i++) {
@@ -911,9 +986,13 @@ xfs_attr_shortform_to_leaf(
nargs.namelen = sfe->namelen;
nargs.value = &sfe->nameval[nargs.namelen];
nargs.valuelen = sfe->valuelen;
- nargs.hashval = xfs_da_hashname(sfe->nameval,
- sfe->namelen);
nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK;
+ if (!xfs_attr_check_namespace(sfe->flags)) {
+ xfs_da_mark_sick(args);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ xfs_attr_sethash(&nargs);
error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
ASSERT(error == -ENOATTR);
error = xfs_attr3_leaf_add(bp, &nargs);
@@ -1027,7 +1106,7 @@ xfs_attr_shortform_verify(
* one namespace flag per xattr, so we can just count the
* bits (i.e. hweight) here.
*/
- if (hweight8(sfep->flags & XFS_ATTR_NSP_ONDISK_MASK) > 1)
+ if (!xfs_attr_check_namespace(sfep->flags))
return __this_address;
sfep = next_sfep;
@@ -1106,6 +1185,7 @@ xfs_attr3_leaf_to_shortform(
nargs.whichfork = XFS_ATTR_FORK;
nargs.trans = args->trans;
nargs.op_flags = XFS_DA_OP_OKNOENT;
+ nargs.owner = args->owner;
for (i = 0; i < ichdr.count; entry++, i++) {
if (entry->flags & XFS_ATTR_INCOMPLETE)
@@ -1158,7 +1238,7 @@ xfs_attr3_leaf_to_node(
error = xfs_da_grow_inode(args, &blkno);
if (error)
goto out;
- error = xfs_attr3_leaf_read(args->trans, dp, 0, &bp1);
+ error = xfs_attr3_leaf_read(args->trans, dp, args->owner, 0, &bp1);
if (error)
goto out;
@@ -1237,7 +1317,7 @@ xfs_attr3_leaf_create(
ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
- hdr3->owner = cpu_to_be64(dp->i_ino);
+ hdr3->owner = cpu_to_be64(args->owner);
uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
@@ -1993,7 +2073,7 @@ xfs_attr3_leaf_toosmall(
if (blkno == 0)
continue;
error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
- blkno, &bp);
+ state->args->owner, blkno, &bp);
if (error)
return error;
@@ -2401,18 +2481,23 @@ xfs_attr3_leaf_lookup_int(
*/
if (entry->flags & XFS_ATTR_LOCAL) {
name_loc = xfs_attr3_leaf_name_local(leaf, probe);
- if (!xfs_attr_match(args, name_loc->namelen,
- name_loc->nameval, entry->flags))
+ if (!xfs_attr_match(args, entry->flags,
+ name_loc->nameval, name_loc->namelen,
+ &name_loc->nameval[name_loc->namelen],
+ be16_to_cpu(name_loc->valuelen)))
continue;
args->index = probe;
return -EEXIST;
} else {
+ unsigned int valuelen;
+
name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
- if (!xfs_attr_match(args, name_rmt->namelen,
- name_rmt->name, entry->flags))
+ valuelen = be32_to_cpu(name_rmt->valuelen);
+ if (!xfs_attr_match(args, entry->flags, name_rmt->name,
+ name_rmt->namelen, NULL, valuelen))
continue;
args->index = probe;
- args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtvaluelen = valuelen;
args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
args->rmtblkcnt = xfs_attr3_rmt_blocks(
args->dp->i_mount,
@@ -2715,7 +2800,8 @@ xfs_attr3_leaf_clearflag(
/*
* Set up the operation.
*/
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
+ args->blkno, &bp);
if (error)
return error;
@@ -2779,7 +2865,8 @@ xfs_attr3_leaf_setflag(
/*
* Set up the operation.
*/
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
+ args->blkno, &bp);
if (error)
return error;
@@ -2838,7 +2925,8 @@ xfs_attr3_leaf_flipflags(
/*
* Read the block containing the "old" attr
*/
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp1);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
+ args->blkno, &bp1);
if (error)
return error;
@@ -2846,8 +2934,8 @@ xfs_attr3_leaf_flipflags(
* Read the block containing the "new" attr, if it is different
*/
if (args->blkno2 != args->blkno) {
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
- &bp2);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
+ args->blkno2, &bp2);
if (error)
return error;
} else {
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 9b9948639c0f..bac219589896 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -98,12 +98,14 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
struct xfs_buf *leaf2_bp);
int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t bno, struct xfs_buf **bpp);
+ xfs_ino_t owner, xfs_dablk_t bno, struct xfs_buf **bpp);
void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo,
struct xfs_attr3_icleaf_hdr *to,
struct xfs_attr_leafblock *from);
void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo,
struct xfs_attr_leafblock *to,
struct xfs_attr3_icleaf_hdr *from);
+xfs_failaddr_t xfs_attr3_leaf_header_check(struct xfs_buf *bp,
+ xfs_ino_t owner);
#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index ff0412828772..4c44ce1c8a64 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -43,19 +43,32 @@
* the logging system and therefore never have a log item.
*/
-/*
- * Each contiguous block has a header, so it is not just a simple attribute
- * length to FSB conversion.
- */
-int
+/* How many bytes can be stored in a remote value buffer? */
+inline unsigned int
+xfs_attr3_rmt_buf_space(
+ struct xfs_mount *mp)
+{
+ unsigned int blocksize = mp->m_attr_geo->blksize;
+
+ if (xfs_has_crc(mp))
+ return blocksize - sizeof(struct xfs_attr3_rmt_hdr);
+
+ return blocksize;
+}
+
+/* Compute number of fsblocks needed to store a remote attr value */
+unsigned int
xfs_attr3_rmt_blocks(
- struct xfs_mount *mp,
- int attrlen)
+ struct xfs_mount *mp,
+ unsigned int attrlen)
{
- if (xfs_has_crc(mp)) {
- int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
- return (attrlen + buflen - 1) / buflen;
- }
+ /*
+ * Each contiguous block has a header, so it is not just a simple
+ * attribute length to FSB conversion.
+ */
+ if (xfs_has_crc(mp))
+ return howmany(attrlen, xfs_attr3_rmt_buf_space(mp));
+
return XFS_B_TO_FSB(mp, attrlen);
}
@@ -92,7 +105,6 @@ xfs_attr3_rmt_verify(
struct xfs_mount *mp,
struct xfs_buf *bp,
void *ptr,
- int fsbsize,
xfs_daddr_t bno)
{
struct xfs_attr3_rmt_hdr *rmt = ptr;
@@ -103,7 +115,7 @@ xfs_attr3_rmt_verify(
return __this_address;
if (be64_to_cpu(rmt->rm_blkno) != bno)
return __this_address;
- if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
+ if (be32_to_cpu(rmt->rm_bytes) > mp->m_attr_geo->blksize - sizeof(*rmt))
return __this_address;
if (be32_to_cpu(rmt->rm_offset) +
be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX)
@@ -122,9 +134,9 @@ __xfs_attr3_rmt_read_verify(
{
struct xfs_mount *mp = bp->b_mount;
char *ptr;
- int len;
+ unsigned int len;
xfs_daddr_t bno;
- int blksize = mp->m_attr_geo->blksize;
+ unsigned int blksize = mp->m_attr_geo->blksize;
/* no verification of non-crc buffers */
if (!xfs_has_crc(mp))
@@ -141,7 +153,7 @@ __xfs_attr3_rmt_read_verify(
*failaddr = __this_address;
return -EFSBADCRC;
}
- *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
+ *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, bno);
if (*failaddr)
return -EFSCORRUPTED;
len -= blksize;
@@ -186,7 +198,7 @@ xfs_attr3_rmt_write_verify(
{
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- int blksize = mp->m_attr_geo->blksize;
+ unsigned int blksize = mp->m_attr_geo->blksize;
char *ptr;
int len;
xfs_daddr_t bno;
@@ -203,7 +215,7 @@ xfs_attr3_rmt_write_verify(
while (len > 0) {
struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
- fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
+ fa = xfs_attr3_rmt_verify(mp, bp, ptr, bno);
if (fa) {
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
@@ -280,30 +292,30 @@ xfs_attr_rmtval_copyout(
struct xfs_mount *mp,
struct xfs_buf *bp,
struct xfs_inode *dp,
- int *offset,
- int *valuelen,
+ xfs_ino_t owner,
+ unsigned int *offset,
+ unsigned int *valuelen,
uint8_t **dst)
{
char *src = bp->b_addr;
- xfs_ino_t ino = dp->i_ino;
xfs_daddr_t bno = xfs_buf_daddr(bp);
- int len = BBTOB(bp->b_length);
- int blksize = mp->m_attr_geo->blksize;
+ unsigned int len = BBTOB(bp->b_length);
+ unsigned int blksize = mp->m_attr_geo->blksize;
ASSERT(len >= blksize);
while (len > 0 && *valuelen > 0) {
- int hdr_size = 0;
- int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+ unsigned int hdr_size = 0;
+ unsigned int byte_cnt = xfs_attr3_rmt_buf_space(mp);
byte_cnt = min(*valuelen, byte_cnt);
if (xfs_has_crc(mp)) {
- if (xfs_attr3_rmt_hdr_ok(src, ino, *offset,
+ if (xfs_attr3_rmt_hdr_ok(src, owner, *offset,
byte_cnt, bno)) {
xfs_alert(mp,
"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
- bno, *offset, byte_cnt, ino);
+ bno, *offset, byte_cnt, owner);
xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
@@ -330,20 +342,20 @@ xfs_attr_rmtval_copyin(
struct xfs_mount *mp,
struct xfs_buf *bp,
xfs_ino_t ino,
- int *offset,
- int *valuelen,
+ unsigned int *offset,
+ unsigned int *valuelen,
uint8_t **src)
{
char *dst = bp->b_addr;
xfs_daddr_t bno = xfs_buf_daddr(bp);
- int len = BBTOB(bp->b_length);
- int blksize = mp->m_attr_geo->blksize;
+ unsigned int len = BBTOB(bp->b_length);
+ unsigned int blksize = mp->m_attr_geo->blksize;
ASSERT(len >= blksize);
while (len > 0 && *valuelen > 0) {
- int hdr_size;
- int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+ unsigned int hdr_size;
+ unsigned int byte_cnt = xfs_attr3_rmt_buf_space(mp);
byte_cnt = min(*valuelen, byte_cnt);
hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
@@ -389,12 +401,12 @@ xfs_attr_rmtval_get(
struct xfs_buf *bp;
xfs_dablk_t lblkno = args->rmtblkno;
uint8_t *dst = args->value;
- int valuelen;
+ unsigned int valuelen;
int nmap;
int error;
- int blkcnt = args->rmtblkcnt;
+ unsigned int blkcnt = args->rmtblkcnt;
int i;
- int offset = 0;
+ unsigned int offset = 0;
trace_xfs_attr_rmtval_get(args);
@@ -427,8 +439,7 @@ xfs_attr_rmtval_get(
return error;
error = xfs_attr_rmtval_copyout(mp, bp, args->dp,
- &offset, &valuelen,
- &dst);
+ args->owner, &offset, &valuelen, &dst);
xfs_buf_relse(bp);
if (error)
return error;
@@ -453,7 +464,7 @@ xfs_attr_rmt_find_hole(
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
int error;
- int blkcnt;
+ unsigned int blkcnt;
xfs_fileoff_t lfileoff = 0;
/*
@@ -482,11 +493,11 @@ xfs_attr_rmtval_set_value(
struct xfs_bmbt_irec map;
xfs_dablk_t lblkno;
uint8_t *src = args->value;
- int blkcnt;
- int valuelen;
+ unsigned int blkcnt;
+ unsigned int valuelen;
int nmap;
int error;
- int offset = 0;
+ unsigned int offset = 0;
/*
* Roll through the "value", copying the attribute value to the
@@ -522,8 +533,8 @@ xfs_attr_rmtval_set_value(
return error;
bp->b_ops = &xfs_attr3_rmt_buf_ops;
- xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
- &valuelen, &src);
+ xfs_attr_rmtval_copyin(mp, bp, args->owner, &offset, &valuelen,
+ &src);
error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
xfs_buf_relse(bp);
@@ -626,7 +637,6 @@ xfs_attr_rmtval_set_blk(
if (error)
return error;
- ASSERT(nmap == 1);
ASSERT((map->br_startblock != DELAYSTARTBLOCK) &&
(map->br_startblock != HOLESTARTBLOCK));
@@ -646,7 +656,7 @@ xfs_attr_rmtval_invalidate(
struct xfs_da_args *args)
{
xfs_dablk_t lblkno;
- int blkcnt;
+ unsigned int blkcnt;
int error;
/*
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index d097ec6c4dc3..e3c6c7d774bf 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -6,7 +6,13 @@
#ifndef __XFS_ATTR_REMOTE_H__
#define __XFS_ATTR_REMOTE_H__
-int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
+unsigned int xfs_attr3_rmt_blocks(struct xfs_mount *mp, unsigned int attrlen);
+
+/* Number of rmt blocks needed to store the maximally sized attr value */
+static inline unsigned int xfs_attr3_max_rmt_blocks(struct xfs_mount *mp)
+{
+ return xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
+}
int xfs_attr_rmtval_get(struct xfs_da_args *args);
int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index bc4422223024..73bdc0e55682 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -16,6 +16,7 @@ typedef struct xfs_attr_sf_sort {
uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
xfs_dahash_t hash; /* this entry's hash value */
unsigned char *name; /* name value, pointer into buffer */
+ void *value;
} xfs_attr_sf_sort_t;
#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 656c95a22f2e..3b3206d312d6 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -779,7 +779,7 @@ xfs_bmap_local_to_extents_empty(
}
-STATIC int /* error */
+int /* error */
xfs_bmap_local_to_extents(
xfs_trans_t *tp, /* transaction pointer */
xfs_inode_t *ip, /* incore inode pointer */
@@ -789,7 +789,8 @@ xfs_bmap_local_to_extents(
void (*init_fn)(struct xfs_trans *tp,
struct xfs_buf *bp,
struct xfs_inode *ip,
- struct xfs_ifork *ifp))
+ struct xfs_ifork *ifp, void *priv),
+ void *priv)
{
int error = 0;
int flags; /* logging flags returned */
@@ -850,7 +851,7 @@ xfs_bmap_local_to_extents(
* log here. Note that init_fn must also set the buffer log item type
* correctly.
*/
- init_fn(tp, bp, ip, ifp);
+ init_fn(tp, bp, ip, ifp, priv);
/* account for the change in fork size */
xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
@@ -976,13 +977,14 @@ xfs_bmap_add_attrfork_local(
dargs.total = dargs.geo->fsbcount;
dargs.whichfork = XFS_DATA_FORK;
dargs.trans = tp;
+ dargs.owner = ip->i_ino;
return xfs_dir2_sf_to_block(&dargs);
}
if (S_ISLNK(VFS_I(ip)->i_mode))
return xfs_bmap_local_to_extents(tp, ip, 1, flags,
- XFS_DATA_FORK,
- xfs_symlink_local_to_remote);
+ XFS_DATA_FORK, xfs_symlink_local_to_remote,
+ NULL);
/* should only be called for types that support local format data */
ASSERT(0);
@@ -1023,40 +1025,29 @@ xfs_bmap_set_attrforkoff(
}
/*
- * Convert inode from non-attributed to attributed.
- * Must not be in a transaction, ip must not be locked.
+ * Convert inode from non-attributed to attributed. Caller must hold the
+ * ILOCK_EXCL and the file cannot have an attr fork.
*/
int /* error code */
xfs_bmap_add_attrfork(
- xfs_inode_t *ip, /* incore inode pointer */
+ struct xfs_trans *tp,
+ struct xfs_inode *ip, /* incore inode pointer */
int size, /* space new attribute needs */
int rsvd) /* xact may use reserved blks */
{
- xfs_mount_t *mp; /* mount structure */
- xfs_trans_t *tp; /* transaction pointer */
- int blks; /* space reservation */
+ struct xfs_mount *mp = tp->t_mountp;
int version = 1; /* superblock attr version */
int logflags; /* logging flags */
int error; /* error return value */
- ASSERT(xfs_inode_has_attr_fork(ip) == 0);
-
- mp = ip->i_mount;
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
-
- blks = XFS_ADDAFORK_SPACE_RES(mp);
-
- error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0,
- rsvd, &tp);
- if (error)
- return error;
- if (xfs_inode_has_attr_fork(ip))
- goto trans_cancel;
+ ASSERT(!xfs_inode_has_attr_fork(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_bmap_set_attrforkoff(ip, size, &version);
if (error)
- goto trans_cancel;
+ return error;
xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
logflags = 0;
@@ -1077,7 +1068,7 @@ xfs_bmap_add_attrfork(
if (logflags)
xfs_trans_log_inode(tp, ip, logflags);
if (error)
- goto trans_cancel;
+ return error;
if (!xfs_has_attr(mp) ||
(!xfs_has_attr2(mp) && version == 2)) {
bool log_sb = false;
@@ -1096,14 +1087,7 @@ xfs_bmap_add_attrfork(
xfs_log_sb(tp);
}
- error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
-
-trans_cancel:
- xfs_trans_cancel(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
+ return 0;
}
/*
@@ -1586,6 +1570,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+ ASSERT(da_new <= da_old);
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -1616,6 +1601,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+ ASSERT(da_new <= da_old);
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1650,6 +1636,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+ ASSERT(da_new <= da_old);
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -1684,6 +1671,7 @@ xfs_bmap_add_extent_delay_real(
goto done;
}
}
+ ASSERT(da_new <= da_old);
break;
case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -1722,6 +1710,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+ ASSERT(da_new <= da_old);
break;
case BMAP_LEFT_FILLING:
@@ -1812,6 +1801,7 @@ xfs_bmap_add_extent_delay_real(
xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
xfs_iext_next(ifp, &bma->icur);
xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT);
+ ASSERT(da_new <= da_old);
break;
case BMAP_RIGHT_FILLING:
@@ -1861,6 +1851,7 @@ xfs_bmap_add_extent_delay_real(
PREV.br_blockcount = temp;
xfs_iext_insert(bma->ip, &bma->icur, &PREV, state);
xfs_iext_next(ifp, &bma->icur);
+ ASSERT(da_new <= da_old);
break;
case 0:
@@ -1975,7 +1966,7 @@ xfs_bmap_add_extent_delay_real(
}
if (da_new != da_old)
- xfs_mod_delalloc(mp, (int64_t)da_new - da_old);
+ xfs_mod_delalloc(bma->ip, 0, (int64_t)da_new - da_old);
if (bma->cur) {
da_new += bma->cur->bc_bmap.allocated;
@@ -1983,11 +1974,10 @@ xfs_bmap_add_extent_delay_real(
}
/* adjust for changes in reserved delayed indirect blocks */
- if (da_new != da_old) {
- ASSERT(state == 0 || da_new < da_old);
- error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new),
- false);
- }
+ if (da_new < da_old)
+ xfs_add_fdblocks(mp, da_old - da_new);
+ else if (da_new > da_old)
+ error = xfs_dec_fdblocks(mp, da_new - da_old, true);
xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
done:
@@ -2688,12 +2678,12 @@ xfs_bmap_add_extent_hole_delay(
}
if (oldlen != newlen) {
ASSERT(oldlen > newlen);
- xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen),
- false);
+ xfs_add_fdblocks(ip->i_mount, oldlen - newlen);
+
/*
* Nothing to do for disk quota accounting here.
*/
- xfs_mod_delalloc(ip->i_mount, (int64_t)newlen - oldlen);
+ xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
}
}
@@ -3370,7 +3360,7 @@ xfs_bmap_alloc_account(
* yet.
*/
if (ap->wasdel) {
- xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length);
+ xfs_mod_delalloc(ap->ip, -(int64_t)ap->length, 0);
return;
}
@@ -3394,7 +3384,7 @@ xfs_bmap_alloc_account(
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
if (ap->wasdel) {
ap->ip->i_delayed_blks -= ap->length;
- xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length);
+ xfs_mod_delalloc(ap->ip, -(int64_t)ap->length, 0);
fld = isrt ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_DELBCOUNT;
} else {
fld = isrt ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
@@ -4066,6 +4056,7 @@ xfs_bmapi_reserve_delalloc(
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_extlen_t alen;
xfs_extlen_t indlen;
+ uint64_t fdblocks;
int error;
xfs_fileoff_t aoff = off;
@@ -4108,17 +4099,21 @@ xfs_bmapi_reserve_delalloc(
indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
ASSERT(indlen > 0);
- error = xfs_mod_fdblocks(mp, -((int64_t)alen), false);
- if (error)
- goto out_unreserve_quota;
+ fdblocks = indlen;
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ error = xfs_dec_frextents(mp, xfs_rtb_to_rtx(mp, alen));
+ if (error)
+ goto out_unreserve_quota;
+ } else {
+ fdblocks += alen;
+ }
- error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false);
+ error = xfs_dec_fdblocks(mp, fdblocks, false);
if (error)
- goto out_unreserve_blocks;
-
+ goto out_unreserve_frextents;
ip->i_delayed_blks += alen;
- xfs_mod_delalloc(ip->i_mount, alen + indlen);
+ xfs_mod_delalloc(ip, alen, indlen);
got->br_startoff = aoff;
got->br_startblock = nullstartblock(indlen);
@@ -4139,8 +4134,9 @@ xfs_bmapi_reserve_delalloc(
return 0;
-out_unreserve_blocks:
- xfs_mod_fdblocks(mp, alen, false);
+out_unreserve_frextents:
+ if (XFS_IS_REALTIME_INODE(ip))
+ xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, alen));
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
xfs_quota_unreserve_blkres(ip, alen);
@@ -4191,26 +4187,10 @@ xfs_bmapi_allocate(
struct xfs_mount *mp = bma->ip->i_mount;
int whichfork = xfs_bmapi_whichfork(bma->flags);
struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
- int tmp_logflags = 0;
int error;
ASSERT(bma->length > 0);
-
- /*
- * For the wasdelay case, we could also just allocate the stuff asked
- * for in this bmap call but that wouldn't be as good.
- */
- if (bma->wasdel) {
- bma->length = (xfs_extlen_t)bma->got.br_blockcount;
- bma->offset = bma->got.br_startoff;
- if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
- bma->prev.br_startoff = NULLFILEOFF;
- } else {
- bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN);
- if (!bma->eof)
- bma->length = XFS_FILBLKS_MIN(bma->length,
- bma->got.br_startoff - bma->offset);
- }
+ ASSERT(bma->length <= XFS_MAX_BMBT_EXTLEN);
if (bma->flags & XFS_BMAPI_CONTIG)
bma->minlen = bma->length;
@@ -4226,8 +4206,15 @@ xfs_bmapi_allocate(
} else {
error = xfs_bmap_alloc_userdata(bma);
}
- if (error || bma->blkno == NULLFSBLOCK)
+ if (error)
return error;
+ if (bma->blkno == NULLFSBLOCK)
+ return -ENOSPC;
+
+ if (WARN_ON_ONCE(!xfs_valid_startblock(bma->ip, bma->blkno))) {
+ xfs_bmap_mark_sick(bma->ip, whichfork);
+ return -EFSCORRUPTED;
+ }
if (bma->flags & XFS_BMAPI_ZERO) {
error = xfs_zero_extent(bma->ip, bma->blkno, bma->length);
@@ -4260,8 +4247,6 @@ xfs_bmapi_allocate(
error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
whichfork, &bma->icur, &bma->cur, &bma->got,
&bma->logflags, bma->flags);
-
- bma->logflags |= tmp_logflags;
if (error)
return error;
@@ -4406,6 +4391,15 @@ xfs_bmapi_finish(
* extent state if necessary. Details behaviour is controlled by the flags
* parameter. Only allocates blocks from a single allocation group, to avoid
* locking problems.
+ *
+ * Returns 0 on success and places the extent mappings in mval. nmaps is used
+ * as an input/output parameter where the caller specifies the maximum number
+ * of mappings that may be returned and xfs_bmapi_write passes back the number
+ * of mappings (including existing mappings) it found.
+ *
+ * Returns a negative error code on failure, including -ENOSPC when it could not
+ * allocate any blocks and -ENOSR when it did allocate blocks to convert a
+ * delalloc range, but those blocks were before the passed in range.
*/
int
xfs_bmapi_write(
@@ -4524,20 +4518,33 @@ xfs_bmapi_write(
* allocation length request (which can be 64 bits in
* length) and the bma length request, which is
* xfs_extlen_t and therefore 32 bits. Hence we have to
- * check for 32-bit overflows and handle them here.
+ * be careful and do the min() using the larger type to
+ * avoid overflows.
*/
- if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN)
- bma.length = XFS_MAX_BMBT_EXTLEN;
- else
- bma.length = len;
+ bma.length = XFS_FILBLKS_MIN(len, XFS_MAX_BMBT_EXTLEN);
+
+ if (wasdelay) {
+ bma.length = XFS_FILBLKS_MIN(bma.length,
+ bma.got.br_blockcount -
+ (bno - bma.got.br_startoff));
+ } else {
+ if (!eof)
+ bma.length = XFS_FILBLKS_MIN(bma.length,
+ bma.got.br_startoff - bno);
+ }
- ASSERT(len > 0);
ASSERT(bma.length > 0);
error = xfs_bmapi_allocate(&bma);
- if (error)
+ if (error) {
+ /*
+ * If we already allocated space in a previous
+ * iteration return what we go so far when
+ * running out of space.
+ */
+ if (error == -ENOSPC && bma.nallocs)
+ break;
goto error0;
- if (bma.blkno == NULLFSBLOCK)
- break;
+ }
/*
* If this is a CoW allocation, record the data in
@@ -4575,7 +4582,6 @@ xfs_bmapi_write(
if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got))
eof = true;
}
- *nmap = n;
error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
whichfork);
@@ -4586,7 +4592,22 @@ xfs_bmapi_write(
ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork));
xfs_bmapi_finish(&bma, whichfork, 0);
xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
- orig_nmap, *nmap);
+ orig_nmap, n);
+
+ /*
+ * When converting delayed allocations, xfs_bmapi_allocate ignores
+ * the passed in bno and always converts from the start of the found
+ * delalloc extent.
+ *
+ * To avoid a successful return with *nmap set to 0, return the magic
+ * -ENOSR error code for this particular case so that the caller can
+ * handle it.
+ */
+ if (!n) {
+ ASSERT(bma.nallocs >= *nmap);
+ return -ENOSR;
+ }
+ *nmap = n;
return 0;
error0:
xfs_bmapi_finish(&bma, whichfork, error);
@@ -4599,8 +4620,8 @@ error0:
* invocations to allocate the target offset if a large enough physical extent
* is not available.
*/
-int
-xfs_bmapi_convert_delalloc(
+static int
+xfs_bmapi_convert_one_delalloc(
struct xfs_inode *ip,
int whichfork,
xfs_off_t offset,
@@ -4630,11 +4651,8 @@ xfs_bmapi_convert_delalloc(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_iext_count_may_overflow(ip, whichfork,
+ error = xfs_iext_count_extend(tp, ip, whichfork,
XFS_IEXT_ADD_NOSPLIT_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip,
- XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;
@@ -4657,19 +4675,25 @@ xfs_bmapi_convert_delalloc(
if (!isnullstartblock(bma.got.br_startblock)) {
xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
xfs_iomap_inode_sequence(ip, flags));
- *seq = READ_ONCE(ifp->if_seq);
+ if (seq)
+ *seq = READ_ONCE(ifp->if_seq);
goto out_trans_cancel;
}
bma.tp = tp;
bma.ip = ip;
bma.wasdel = true;
- bma.offset = bma.got.br_startoff;
- bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount,
- XFS_MAX_BMBT_EXTLEN);
bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
/*
+ * Always allocate convert from the start of the delalloc extent even if
+ * that is outside the passed in range to create large contiguous
+ * extents on disk.
+ */
+ bma.offset = bma.got.br_startoff;
+ bma.length = bma.got.br_blockcount;
+
+ /*
* When we're converting the delalloc reservations backing dirty pages
* in the page cache, we must be careful about how we create the new
* extents:
@@ -4693,22 +4717,14 @@ xfs_bmapi_convert_delalloc(
if (error)
goto out_finish;
- error = -ENOSPC;
- if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
- goto out_finish;
- if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) {
- xfs_bmap_mark_sick(ip, whichfork);
- error = -EFSCORRUPTED;
- goto out_finish;
- }
-
XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
XFS_STATS_INC(mp, xs_xstrat_quick);
ASSERT(!isnullstartblock(bma.got.br_startblock));
xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
xfs_iomap_inode_sequence(ip, flags));
- *seq = READ_ONCE(ifp->if_seq);
+ if (seq)
+ *seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK)
xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
@@ -4731,6 +4747,36 @@ out_trans_cancel:
return error;
}
+/*
+ * Pass in a dellalloc extent and convert it to real extents, return the real
+ * extent that maps offset_fsb in iomap.
+ */
+int
+xfs_bmapi_convert_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ loff_t offset,
+ struct iomap *iomap,
+ unsigned int *seq)
+{
+ int error;
+
+ /*
+ * Attempt to allocate whatever delalloc extent currently backs offset
+ * and put the result into iomap. Allocate in a loop because it may
+ * take several attempts to allocate real blocks for a contiguous
+ * delalloc extent if free space is sufficiently fragmented.
+ */
+ do {
+ error = xfs_bmapi_convert_one_delalloc(ip, whichfork, offset,
+ iomap, seq);
+ if (error)
+ return error;
+ } while (iomap->offset + iomap->length <= offset);
+
+ return 0;
+}
+
int
xfs_bmapi_remap(
struct xfs_trans *tp,
@@ -4822,32 +4868,18 @@ error0:
* ores == 1). The number of stolen blocks is returned. The availability and
* subsequent accounting of stolen blocks is the responsibility of the caller.
*/
-static xfs_filblks_t
+static void
xfs_bmap_split_indlen(
xfs_filblks_t ores, /* original res. */
xfs_filblks_t *indlen1, /* ext1 worst indlen */
- xfs_filblks_t *indlen2, /* ext2 worst indlen */
- xfs_filblks_t avail) /* stealable blocks */
+ xfs_filblks_t *indlen2) /* ext2 worst indlen */
{
xfs_filblks_t len1 = *indlen1;
xfs_filblks_t len2 = *indlen2;
xfs_filblks_t nres = len1 + len2; /* new total res. */
- xfs_filblks_t stolen = 0;
xfs_filblks_t resfactor;
/*
- * Steal as many blocks as we can to try and satisfy the worst case
- * indlen for both new extents.
- */
- if (ores < nres && avail)
- stolen = XFS_FILBLKS_MIN(nres - ores, avail);
- ores += stolen;
-
- /* nothing else to do if we've satisfied the new reservation */
- if (ores >= nres)
- return stolen;
-
- /*
* We can't meet the total required reservation for the two extents.
* Calculate the percent of the overall shortage between both extents
* and apply this percentage to each of the requested indlen values.
@@ -4891,11 +4923,9 @@ xfs_bmap_split_indlen(
*indlen1 = len1;
*indlen2 = len2;
-
- return stolen;
}
-int
+void
xfs_bmap_del_extent_delay(
struct xfs_inode *ip,
int whichfork,
@@ -4908,9 +4938,9 @@ xfs_bmap_del_extent_delay(
struct xfs_bmbt_irec new;
int64_t da_old, da_new, da_diff = 0;
xfs_fileoff_t del_endoff, got_endoff;
- xfs_filblks_t got_indlen, new_indlen, stolen;
+ xfs_filblks_t got_indlen, new_indlen, stolen = 0;
uint32_t state = xfs_bmap_fork_to_state(whichfork);
- int error = 0;
+ uint64_t fdblocks;
bool isrt;
XFS_STATS_INC(mp, xs_del_exlist);
@@ -4925,18 +4955,12 @@ xfs_bmap_del_extent_delay(
ASSERT(got->br_startoff <= del->br_startoff);
ASSERT(got_endoff >= del_endoff);
- if (isrt)
- xfs_mod_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount));
-
/*
* Update the inode delalloc counter now and wait to update the
* sb counters as we might have to borrow some blocks for the
* indirect block accounting.
*/
- ASSERT(!isrt);
- error = xfs_quota_unreserve_blkres(ip, del->br_blockcount);
- if (error)
- return error;
+ xfs_quota_unreserve_blkres(ip, del->br_blockcount);
ip->i_delayed_blks -= del->br_blockcount;
if (got->br_startoff == del->br_startoff)
@@ -4990,8 +5014,24 @@ xfs_bmap_del_extent_delay(
new_indlen = xfs_bmap_worst_indlen(ip, new.br_blockcount);
WARN_ON_ONCE(!got_indlen || !new_indlen);
- stolen = xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen,
- del->br_blockcount);
+ /*
+ * Steal as many blocks as we can to try and satisfy the worst
+ * case indlen for both new extents.
+ *
+ * However, we can't just steal reservations from the data
+ * blocks if this is an RT inodes as the data and metadata
+ * blocks come from different pools. We'll have to live with
+ * under-filled indirect reservation in this case.
+ */
+ da_new = got_indlen + new_indlen;
+ if (da_new > da_old && !isrt) {
+ stolen = XFS_FILBLKS_MIN(da_new - da_old,
+ del->br_blockcount);
+ da_old += stolen;
+ }
+ if (da_new > da_old)
+ xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen);
+ da_new = got_indlen + new_indlen;
got->br_startblock = nullstartblock((int)got_indlen);
@@ -5003,20 +5043,21 @@ xfs_bmap_del_extent_delay(
xfs_iext_next(ifp, icur);
xfs_iext_insert(ip, icur, &new, state);
- da_new = got_indlen + new_indlen - stolen;
del->br_blockcount -= stolen;
break;
}
ASSERT(da_old >= da_new);
da_diff = da_old - da_new;
- if (!isrt)
- da_diff += del->br_blockcount;
- if (da_diff) {
- xfs_mod_fdblocks(mp, da_diff, false);
- xfs_mod_delalloc(mp, -da_diff);
- }
- return error;
+ fdblocks = da_diff;
+
+ if (isrt)
+ xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount));
+ else
+ fdblocks += del->br_blockcount;
+
+ xfs_add_fdblocks(mp, fdblocks);
+ xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
}
void
@@ -5107,8 +5148,7 @@ xfs_bmap_del_extent_real(
{
xfs_fsblock_t del_endblock=0; /* first block past del */
xfs_fileoff_t del_endoff; /* first offset past del */
- int do_fx; /* free extent at end of routine */
- int error; /* error return value */
+ int error = 0; /* error return value */
struct xfs_bmbt_irec got; /* current extent entry */
xfs_fileoff_t got_endoff; /* first offset past got */
int i; /* temp state */
@@ -5151,20 +5191,10 @@ xfs_bmap_del_extent_real(
return -ENOSPC;
*logflagsp = XFS_ILOG_CORE;
- if (xfs_ifork_is_realtime(ip, whichfork)) {
- if (!(bflags & XFS_BMAPI_REMAP)) {
- error = xfs_rtfree_blocks(tp, del->br_startblock,
- del->br_blockcount);
- if (error)
- return error;
- }
-
- do_fx = 0;
+ if (xfs_ifork_is_realtime(ip, whichfork))
qfield = XFS_TRANS_DQ_RTBCOUNT;
- } else {
- do_fx = 1;
+ else
qfield = XFS_TRANS_DQ_BCOUNT;
- }
nblks = del->br_blockcount;
del_endblock = del->br_startblock + del->br_blockcount;
@@ -5312,18 +5342,29 @@ xfs_bmap_del_extent_real(
/*
* If we need to, add to list of extents to delete.
*/
- if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
+ if (!(bflags & XFS_BMAPI_REMAP)) {
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
xfs_refcount_decrease_extent(tp, del);
+ } else if (xfs_ifork_is_realtime(ip, whichfork)) {
+ /*
+ * Ensure the bitmap and summary inodes are locked
+ * and joined to the transaction before modifying them.
+ */
+ if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) {
+ tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED;
+ xfs_rtbitmap_lock(tp, mp);
+ }
+ error = xfs_rtfree_blocks(tp, del->br_startblock,
+ del->br_blockcount);
} else {
error = xfs_free_extent_later(tp, del->br_startblock,
del->br_blockcount, NULL,
XFS_AG_RESV_NONE,
((bflags & XFS_BMAPI_NODISCARD) ||
del->br_state == XFS_EXT_UNWRITTEN));
- if (error)
- return error;
}
+ if (error)
+ return error;
}
/*
@@ -5414,16 +5455,6 @@ __xfs_bunmapi(
} else
cur = NULL;
- if (isrt) {
- /*
- * Synchronize by locking the bitmap inode.
- */
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
- xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
- xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
- }
-
extno = 0;
while (end != (xfs_fileoff_t)-1 && end >= start &&
(nexts == 0 || extno < nexts)) {
@@ -5584,18 +5615,16 @@ __xfs_bunmapi(
delete:
if (wasdel) {
- error = xfs_bmap_del_extent_delay(ip, whichfork, &icur,
- &got, &del);
+ xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
} else {
error = xfs_bmap_del_extent_real(ip, tp, &icur, cur,
&del, &tmp_logflags, whichfork,
flags);
logflags |= tmp_logflags;
+ if (error)
+ goto error0;
}
- if (error)
- goto error0;
-
end = del.br_startoff - 1;
nodelete:
/*
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index f7662595309d..667b0c2b33d1 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -158,7 +158,7 @@ static inline bool xfs_bmap_is_real_extent(const struct xfs_bmbt_irec *irec)
* Return true if the extent is a real, allocated extent, or false if it is a
* delayed allocation, and unwritten extent or a hole.
*/
-static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec)
+static inline bool xfs_bmap_is_written_extent(const struct xfs_bmbt_irec *irec)
{
return xfs_bmap_is_real_extent(irec) &&
irec->br_state != XFS_EXT_UNWRITTEN;
@@ -176,9 +176,16 @@ int xfs_bmap_longest_free_extent(struct xfs_perag *pag,
void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp);
-int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+int xfs_bmap_add_attrfork(struct xfs_trans *tp, struct xfs_inode *ip,
+ int size, int rsvd);
void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork);
+int xfs_bmap_local_to_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_extlen_t total, int *logflagsp, int whichfork,
+ void (*init_fn)(struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_inode *ip, struct xfs_ifork *ifp,
+ void *priv),
+ void *priv);
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
@@ -195,7 +202,7 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extnum_t nexts, int *done);
-int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
+void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del);
void xfs_bmap_del_extent_cow(struct xfs_inode *ip,
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 718d071bb21a..16a529a88780 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -252,6 +252,51 @@ xfs_da3_node_verify(
return NULL;
}
+xfs_failaddr_t
+xfs_da3_node_header_check(
+ struct xfs_buf *bp,
+ xfs_ino_t owner)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
+
+ if (hdr3->hdr.magic != cpu_to_be16(XFS_DA3_NODE_MAGIC))
+ return __this_address;
+
+ if (be64_to_cpu(hdr3->owner) != owner)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+xfs_failaddr_t
+xfs_da3_header_check(
+ struct xfs_buf *bp,
+ xfs_ino_t owner)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_da_blkinfo *hdr = bp->b_addr;
+
+ if (!xfs_has_crc(mp))
+ return NULL;
+
+ switch (hdr->magic) {
+ case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+ return xfs_attr3_leaf_header_check(bp, owner);
+ case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+ return xfs_da3_node_header_check(bp, owner);
+ case cpu_to_be16(XFS_DIR3_LEAF1_MAGIC):
+ case cpu_to_be16(XFS_DIR3_LEAFN_MAGIC):
+ return xfs_dir3_leaf_header_check(bp, owner);
+ }
+
+ ASSERT(0);
+ return NULL;
+}
+
static void
xfs_da3_node_write_verify(
struct xfs_buf *bp)
@@ -486,7 +531,7 @@ xfs_da3_node_create(
memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr));
ichdr.magic = XFS_DA3_NODE_MAGIC;
hdr3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
- hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
+ hdr3->info.owner = cpu_to_be64(args->owner);
uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid);
} else {
ichdr.magic = XFS_DA_NODE_MAGIC;
@@ -1199,6 +1244,7 @@ xfs_da3_root_join(
struct xfs_da3_icnode_hdr oldroothdr;
int error;
struct xfs_inode *dp = state->args->dp;
+ xfs_failaddr_t fa;
trace_xfs_da_root_join(state->args);
@@ -1225,6 +1271,13 @@ xfs_da3_root_join(
error = xfs_da3_node_read(args->trans, dp, child, &bp, args->whichfork);
if (error)
return error;
+ fa = xfs_da3_header_check(bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ xfs_trans_brelse(args->trans, bp);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
/*
@@ -1259,6 +1312,7 @@ xfs_da3_node_toosmall(
struct xfs_da_blkinfo *info;
xfs_dablk_t blkno;
struct xfs_buf *bp;
+ xfs_failaddr_t fa;
struct xfs_da3_icnode_hdr nodehdr;
int count;
int forward;
@@ -1333,6 +1387,13 @@ xfs_da3_node_toosmall(
state->args->whichfork);
if (error)
return error;
+ fa = xfs_da3_node_header_check(bp, state->args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ xfs_trans_brelse(state->args->trans, bp);
+ xfs_da_mark_sick(state->args);
+ return -EFSCORRUPTED;
+ }
node = bp->b_addr;
xfs_da3_node_hdr_from_disk(dp->i_mount, &thdr, node);
@@ -1591,6 +1652,7 @@ xfs_da3_node_lookup_int(
struct xfs_da_node_entry *btree;
struct xfs_da3_icnode_hdr nodehdr;
struct xfs_da_args *args;
+ xfs_failaddr_t fa;
xfs_dablk_t blkno;
xfs_dahash_t hashval;
xfs_dahash_t btreehashval;
@@ -1629,6 +1691,12 @@ xfs_da3_node_lookup_int(
if (magic == XFS_ATTR_LEAF_MAGIC ||
magic == XFS_ATTR3_LEAF_MAGIC) {
+ fa = xfs_attr3_leaf_header_check(blk->bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(blk->bp, fa);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
blk->magic = XFS_ATTR_LEAF_MAGIC;
blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
break;
@@ -1636,6 +1704,12 @@ xfs_da3_node_lookup_int(
if (magic == XFS_DIR2_LEAFN_MAGIC ||
magic == XFS_DIR3_LEAFN_MAGIC) {
+ fa = xfs_dir3_leaf_header_check(blk->bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(blk->bp, fa);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
blk->magic = XFS_DIR2_LEAFN_MAGIC;
blk->hashval = xfs_dir2_leaf_lasthash(args->dp,
blk->bp, NULL);
@@ -1648,6 +1722,13 @@ xfs_da3_node_lookup_int(
return -EFSCORRUPTED;
}
+ fa = xfs_da3_node_header_check(blk->bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(blk->bp, fa);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
+
blk->magic = XFS_DA_NODE_MAGIC;
/*
@@ -1820,6 +1901,7 @@ xfs_da3_blk_link(
struct xfs_da_blkinfo *tmp_info;
struct xfs_da_args *args;
struct xfs_buf *bp;
+ xfs_failaddr_t fa;
int before = 0;
int error;
struct xfs_inode *dp = state->args->dp;
@@ -1863,6 +1945,13 @@ xfs_da3_blk_link(
&bp, args->whichfork);
if (error)
return error;
+ fa = xfs_da3_header_check(bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ xfs_trans_brelse(args->trans, bp);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
ASSERT(bp != NULL);
tmp_info = bp->b_addr;
ASSERT(tmp_info->magic == old_info->magic);
@@ -1884,6 +1973,13 @@ xfs_da3_blk_link(
&bp, args->whichfork);
if (error)
return error;
+ fa = xfs_da3_header_check(bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ xfs_trans_brelse(args->trans, bp);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
ASSERT(bp != NULL);
tmp_info = bp->b_addr;
ASSERT(tmp_info->magic == old_info->magic);
@@ -1913,6 +2009,7 @@ xfs_da3_blk_unlink(
struct xfs_da_blkinfo *tmp_info;
struct xfs_da_args *args;
struct xfs_buf *bp;
+ xfs_failaddr_t fa;
int error;
/*
@@ -1943,6 +2040,13 @@ xfs_da3_blk_unlink(
&bp, args->whichfork);
if (error)
return error;
+ fa = xfs_da3_header_check(bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ xfs_trans_brelse(args->trans, bp);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
ASSERT(bp != NULL);
tmp_info = bp->b_addr;
ASSERT(tmp_info->magic == save_info->magic);
@@ -1960,6 +2064,13 @@ xfs_da3_blk_unlink(
&bp, args->whichfork);
if (error)
return error;
+ fa = xfs_da3_header_check(bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ xfs_trans_brelse(args->trans, bp);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
ASSERT(bp != NULL);
tmp_info = bp->b_addr;
ASSERT(tmp_info->magic == save_info->magic);
@@ -1996,6 +2107,7 @@ xfs_da3_path_shift(
struct xfs_da_node_entry *btree;
struct xfs_da3_icnode_hdr nodehdr;
struct xfs_buf *bp;
+ xfs_failaddr_t fa;
xfs_dablk_t blkno = 0;
int level;
int error;
@@ -2074,6 +2186,12 @@ xfs_da3_path_shift(
switch (be16_to_cpu(info->magic)) {
case XFS_DA_NODE_MAGIC:
case XFS_DA3_NODE_MAGIC:
+ fa = xfs_da3_node_header_check(blk->bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(blk->bp, fa);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
blk->magic = XFS_DA_NODE_MAGIC;
xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr,
bp->b_addr);
@@ -2087,6 +2205,12 @@ xfs_da3_path_shift(
break;
case XFS_ATTR_LEAF_MAGIC:
case XFS_ATTR3_LEAF_MAGIC:
+ fa = xfs_attr3_leaf_header_check(blk->bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(blk->bp, fa);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
blk->magic = XFS_ATTR_LEAF_MAGIC;
ASSERT(level == path->active-1);
blk->index = 0;
@@ -2094,6 +2218,12 @@ xfs_da3_path_shift(
break;
case XFS_DIR2_LEAFN_MAGIC:
case XFS_DIR3_LEAFN_MAGIC:
+ fa = xfs_dir3_leaf_header_check(blk->bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(blk->bp, fa);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
blk->magic = XFS_DIR2_LEAFN_MAGIC;
ASSERT(level == path->active-1);
blk->index = 0;
@@ -2167,8 +2297,8 @@ xfs_da_grow_inode_int(
struct xfs_inode *dp = args->dp;
int w = args->whichfork;
xfs_rfsblock_t nblks = dp->i_nblocks;
- struct xfs_bmbt_irec map, *mapp;
- int nmap, error, got, i, mapi;
+ struct xfs_bmbt_irec map, *mapp = &map;
+ int nmap, error, got, i, mapi = 1;
/*
* Find a spot in the file space to put the new block.
@@ -2184,14 +2314,7 @@ xfs_da_grow_inode_int(
error = xfs_bmapi_write(tp, dp, *bno, count,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
args->total, &map, &nmap);
- if (error)
- return error;
-
- ASSERT(nmap <= 1);
- if (nmap == 1) {
- mapp = &map;
- mapi = 1;
- } else if (nmap == 0 && count > 1) {
+ if (error == -ENOSPC && count > 1) {
xfs_fileoff_t b;
int c;
@@ -2209,16 +2332,13 @@ xfs_da_grow_inode_int(
args->total, &mapp[mapi], &nmap);
if (error)
goto out_free_map;
- if (nmap < 1)
- break;
mapi += nmap;
b = mapp[mapi - 1].br_startoff +
mapp[mapi - 1].br_blockcount;
}
- } else {
- mapi = 0;
- mapp = NULL;
}
+ if (error)
+ goto out_free_map;
/*
* Count the blocks we got, make sure it matches the total.
@@ -2290,6 +2410,7 @@ xfs_da3_swap_lastblock(
struct xfs_buf *last_buf;
struct xfs_buf *sib_buf;
struct xfs_buf *par_buf;
+ xfs_failaddr_t fa;
xfs_dahash_t dead_hash;
xfs_fileoff_t lastoff;
xfs_dablk_t dead_blkno;
@@ -2326,6 +2447,14 @@ xfs_da3_swap_lastblock(
error = xfs_da3_node_read(tp, dp, last_blkno, &last_buf, w);
if (error)
return error;
+ fa = xfs_da3_header_check(last_buf, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(last_buf, fa);
+ xfs_trans_brelse(tp, last_buf);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
+
/*
* Copy the last block into the dead buffer and log it.
*/
@@ -2364,6 +2493,13 @@ xfs_da3_swap_lastblock(
error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w);
if (error)
goto done;
+ fa = xfs_da3_header_check(sib_buf, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(sib_buf, fa);
+ xfs_da_mark_sick(args);
+ error = -EFSCORRUPTED;
+ goto done;
+ }
sib_info = sib_buf->b_addr;
if (XFS_IS_CORRUPT(mp,
be32_to_cpu(sib_info->forw) != last_blkno ||
@@ -2385,6 +2521,13 @@ xfs_da3_swap_lastblock(
error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w);
if (error)
goto done;
+ fa = xfs_da3_header_check(sib_buf, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(sib_buf, fa);
+ xfs_da_mark_sick(args);
+ error = -EFSCORRUPTED;
+ goto done;
+ }
sib_info = sib_buf->b_addr;
if (XFS_IS_CORRUPT(mp,
be32_to_cpu(sib_info->back) != last_blkno ||
@@ -2408,6 +2551,13 @@ xfs_da3_swap_lastblock(
error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w);
if (error)
goto done;
+ fa = xfs_da3_node_header_check(par_buf, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(par_buf, fa);
+ xfs_da_mark_sick(args);
+ error = -EFSCORRUPTED;
+ goto done;
+ }
par_node = par_buf->b_addr;
xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
if (XFS_IS_CORRUPT(mp,
@@ -2457,6 +2607,13 @@ xfs_da3_swap_lastblock(
error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w);
if (error)
goto done;
+ fa = xfs_da3_node_header_check(par_buf, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(par_buf, fa);
+ xfs_da_mark_sick(args);
+ error = -EFSCORRUPTED;
+ goto done;
+ }
par_node = par_buf->b_addr;
xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) {
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 706baf36e175..354d5d65043e 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -54,17 +54,24 @@ enum xfs_dacmp {
*/
typedef struct xfs_da_args {
struct xfs_da_geometry *geo; /* da block geometry */
- const uint8_t *name; /* string (maybe not NULL terminated) */
- int namelen; /* length of string (maybe no NULL) */
- uint8_t filetype; /* filetype of inode for directories */
+ const uint8_t *name; /* string (maybe not NULL terminated) */
+ const uint8_t *new_name; /* new attr name */
void *value; /* set of bytes (maybe contain NULLs) */
- int valuelen; /* length of value */
- unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */
- unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */
- xfs_dahash_t hashval; /* hash value of name */
- xfs_ino_t inumber; /* input/output inode number */
+ void *new_value; /* new xattr value (may contain NULLs) */
struct xfs_inode *dp; /* directory inode to manipulate */
struct xfs_trans *trans; /* current trans (changes over time) */
+
+ xfs_ino_t inumber; /* input/output inode number */
+ xfs_ino_t owner; /* inode that owns the dir/attr data */
+
+ int valuelen; /* length of value */
+ int new_valuelen; /* length of new_value */
+ uint8_t filetype; /* filetype of inode for directories */
+ uint8_t op_flags; /* operation flags */
+ uint8_t attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */
+ short namelen; /* length of string (maybe no NULL) */
+ short new_namelen; /* length of new attr name */
+ xfs_dahash_t hashval; /* hash value of name */
xfs_extlen_t total; /* total blocks needed, for 1st bmap */
int whichfork; /* data or attribute fork */
xfs_dablk_t blkno; /* blkno of attr leaf of interest */
@@ -77,7 +84,6 @@ typedef struct xfs_da_args {
xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
int rmtblkcnt2; /* remote attr value block count */
int rmtvaluelen2; /* remote attr value length in bytes */
- uint32_t op_flags; /* operation flags */
enum xfs_dacmp cmpresult; /* name compare result for lookups */
} xfs_da_args_t;
@@ -89,10 +95,8 @@ typedef struct xfs_da_args {
#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */
#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */
#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */
-#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */
-#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */
-#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */
-#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */
+#define XFS_DA_OP_RECOVERY (1u << 5) /* Log recovery operation */
+#define XFS_DA_OP_LOGGED (1u << 6) /* Use intent items to track op */
#define XFS_DA_OP_FLAGS \
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
@@ -100,8 +104,6 @@ typedef struct xfs_da_args {
{ XFS_DA_OP_ADDNAME, "ADDNAME" }, \
{ XFS_DA_OP_OKNOENT, "OKNOENT" }, \
{ XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
- { XFS_DA_OP_NOTIME, "NOTIME" }, \
- { XFS_DA_OP_REMOVE, "REMOVE" }, \
{ XFS_DA_OP_RECOVERY, "RECOVERY" }, \
{ XFS_DA_OP_LOGGED, "LOGGED" }
@@ -235,6 +237,8 @@ void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from);
void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp,
struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from);
+xfs_failaddr_t xfs_da3_header_check(struct xfs_buf *bp, xfs_ino_t owner);
+xfs_failaddr_t xfs_da3_node_header_check(struct xfs_buf *bp, xfs_ino_t owner);
extern struct kmem_cache *xfs_da_state_cache;
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 060e5c96b70f..86de99e2f757 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -714,12 +714,30 @@ struct xfs_attr3_leafblock {
#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
+#define XFS_ATTR_PARENT_BIT 3 /* parent pointer attrs */
#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT)
#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT)
#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_PARENT (1u << XFS_ATTR_PARENT_BIT)
#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT)
-#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
+
+#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | \
+ XFS_ATTR_SECURE | \
+ XFS_ATTR_PARENT)
+
+/* Private attr namespaces not exposed to userspace */
+#define XFS_ATTR_PRIVATE_NSP_MASK (XFS_ATTR_PARENT)
+
+#define XFS_ATTR_ONDISK_MASK (XFS_ATTR_NSP_ONDISK_MASK | \
+ XFS_ATTR_LOCAL | \
+ XFS_ATTR_INCOMPLETE)
+
+#define XFS_ATTR_NAMESPACE_STR \
+ { XFS_ATTR_LOCAL, "local" }, \
+ { XFS_ATTR_ROOT, "root" }, \
+ { XFS_ATTR_SECURE, "secure" }, \
+ { XFS_ATTR_PARENT, "parent" }
/*
* Alignment for namelist and valuelist entries (since they are mixed
@@ -862,9 +880,7 @@ struct xfs_attr3_rmt_hdr {
#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
-#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \
- ((bufsize) - (xfs_has_crc((mp)) ? \
- sizeof(struct xfs_attr3_rmt_hdr) : 0))
+unsigned int xfs_attr3_rmt_buf_space(struct xfs_mount *mp);
/* Number of bytes in a directory block. */
static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp)
@@ -875,4 +891,17 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp)
xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp,
struct xfs_da3_blkinfo *hdr3);
+/*
+ * Parent pointer attribute format definition
+ *
+ * The xattr name contains the dirent name.
+ * The xattr value encodes the parent inode number and generation to ease
+ * opening parents by handle.
+ * The xattr hashval is xfs_dir2_namehash() ^ p_ino
+ */
+struct xfs_parent_rec {
+ __be64 p_ino;
+ __be32 p_gen;
+} __packed;
+
#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index c13276095cc0..4a078e07e1a0 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -27,6 +27,7 @@
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_trans_priv.h"
+#include "xfs_exchmaps.h"
static struct kmem_cache *xfs_defer_pending_cache;
@@ -1091,7 +1092,11 @@ xfs_defer_ops_continue(
ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
/* Lock the captured resources to the new transaction. */
- if (dfc->dfc_held.dr_inos == 2)
+ if (dfc->dfc_held.dr_inos > 2) {
+ xfs_sort_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos);
+ xfs_lock_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos,
+ XFS_ILOCK_EXCL);
+ } else if (dfc->dfc_held.dr_inos == 2)
xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
else if (dfc->dfc_held.dr_inos == 1)
@@ -1176,6 +1181,10 @@ xfs_defer_init_item_caches(void)
error = xfs_attr_intent_init_cache();
if (error)
goto err;
+ error = xfs_exchmaps_intent_init_cache();
+ if (error)
+ goto err;
+
return 0;
err:
xfs_defer_destroy_item_caches();
@@ -1186,6 +1195,7 @@ err:
void
xfs_defer_destroy_item_caches(void)
{
+ xfs_exchmaps_intent_destroy_cache();
xfs_attr_intent_destroy_cache();
xfs_extfree_intent_destroy_cache();
xfs_bmap_intent_destroy_cache();
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 18a9fb92dde8..8b338031e487 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -72,12 +72,18 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
extern const struct xfs_defer_op_type xfs_attr_defer_type;
-
+extern const struct xfs_defer_op_type xfs_exchmaps_defer_type;
/*
* Deferred operation item relogging limits.
*/
-#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
+
+/*
+ * Rename w/ parent pointers can require up to 5 inodes with deferred ops to
+ * be joined to the transaction: src_dp, target_dp, src_ip, target_ip, and wip.
+ * These inodes are locked in sorted order by their inode numbers
+ */
+#define XFS_DEFER_OPS_NR_INODES 5
#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
/* Resources that must be held across a transaction roll. */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 4821519efad4..457f9a38f850 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -250,11 +250,68 @@ xfs_dir_init(
args->geo = dp->i_mount->m_dir_geo;
args->dp = dp;
args->trans = tp;
+ args->owner = dp->i_ino;
error = xfs_dir2_sf_create(args, pdp->i_ino);
kfree(args);
return error;
}
+enum xfs_dir2_fmt
+xfs_dir2_format(
+ struct xfs_da_args *args,
+ int *error)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ xfs_fileoff_t eof;
+
+ xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
+
+ *error = 0;
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
+ return XFS_DIR2_FMT_SF;
+
+ *error = xfs_bmap_last_offset(dp, &eof, XFS_DATA_FORK);
+ if (*error)
+ return XFS_DIR2_FMT_ERROR;
+
+ if (eof == XFS_B_TO_FSB(mp, geo->blksize)) {
+ if (XFS_IS_CORRUPT(mp, dp->i_disk_size != geo->blksize)) {
+ xfs_da_mark_sick(args);
+ *error = -EFSCORRUPTED;
+ return XFS_DIR2_FMT_ERROR;
+ }
+ return XFS_DIR2_FMT_BLOCK;
+ }
+ if (eof == geo->leafblk + geo->fsbcount)
+ return XFS_DIR2_FMT_LEAF;
+ return XFS_DIR2_FMT_NODE;
+}
+
+int
+xfs_dir_createname_args(
+ struct xfs_da_args *args)
+{
+ int error;
+
+ if (!args->inumber)
+ args->op_flags |= XFS_DA_OP_JUSTCHECK;
+
+ switch (xfs_dir2_format(args, &error)) {
+ case XFS_DIR2_FMT_SF:
+ return xfs_dir2_sf_addname(args);
+ case XFS_DIR2_FMT_BLOCK:
+ return xfs_dir2_block_addname(args);
+ case XFS_DIR2_FMT_LEAF:
+ return xfs_dir2_leaf_addname(args);
+ case XFS_DIR2_FMT_NODE:
+ return xfs_dir2_node_addname(args);
+ default:
+ return error;
+ }
+}
+
/*
* Enter a name in a directory, or check for available space.
* If inum is 0, only the available space test is performed.
@@ -269,7 +326,6 @@ xfs_dir_createname(
{
struct xfs_da_args *args;
int rval;
- bool v;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
@@ -295,31 +351,9 @@ xfs_dir_createname(
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
- if (!inum)
- args->op_flags |= XFS_DA_OP_JUSTCHECK;
-
- if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- rval = xfs_dir2_sf_addname(args);
- goto out_free;
- }
-
- rval = xfs_dir2_isblock(args, &v);
- if (rval)
- goto out_free;
- if (v) {
- rval = xfs_dir2_block_addname(args);
- goto out_free;
- }
+ args->owner = dp->i_ino;
- rval = xfs_dir2_isleaf(args, &v);
- if (rval)
- goto out_free;
- if (v)
- rval = xfs_dir2_leaf_addname(args);
- else
- rval = xfs_dir2_node_addname(args);
-
-out_free:
+ rval = xfs_dir_createname_args(args);
kfree(args);
return rval;
}
@@ -350,6 +384,34 @@ xfs_dir_cilookup_result(
return -EEXIST;
}
+int
+xfs_dir_lookup_args(
+ struct xfs_da_args *args)
+{
+ int error;
+
+ switch (xfs_dir2_format(args, &error)) {
+ case XFS_DIR2_FMT_SF:
+ error = xfs_dir2_sf_lookup(args);
+ break;
+ case XFS_DIR2_FMT_BLOCK:
+ error = xfs_dir2_block_lookup(args);
+ break;
+ case XFS_DIR2_FMT_LEAF:
+ error = xfs_dir2_leaf_lookup(args);
+ break;
+ case XFS_DIR2_FMT_NODE:
+ error = xfs_dir2_node_lookup(args);
+ break;
+ default:
+ break;
+ }
+
+ if (error != -EEXIST)
+ return error;
+ return 0;
+}
+
/*
* Lookup a name in a directory, give back the inode number.
* If ci_name is not NULL, returns the actual name in ci_name if it differs
@@ -366,7 +428,6 @@ xfs_dir_lookup(
{
struct xfs_da_args *args;
int rval;
- bool v;
int lock_mode;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
@@ -383,34 +444,12 @@ xfs_dir_lookup(
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
args->op_flags = XFS_DA_OP_OKNOENT;
+ args->owner = dp->i_ino;
if (ci_name)
args->op_flags |= XFS_DA_OP_CILOOKUP;
lock_mode = xfs_ilock_data_map_shared(dp);
- if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- rval = xfs_dir2_sf_lookup(args);
- goto out_check_rval;
- }
-
- rval = xfs_dir2_isblock(args, &v);
- if (rval)
- goto out_free;
- if (v) {
- rval = xfs_dir2_block_lookup(args);
- goto out_check_rval;
- }
-
- rval = xfs_dir2_isleaf(args, &v);
- if (rval)
- goto out_free;
- if (v)
- rval = xfs_dir2_leaf_lookup(args);
- else
- rval = xfs_dir2_node_lookup(args);
-
-out_check_rval:
- if (rval == -EEXIST)
- rval = 0;
+ rval = xfs_dir_lookup_args(args);
if (!rval) {
*inum = args->inumber;
if (ci_name) {
@@ -418,12 +457,31 @@ out_check_rval:
ci_name->len = args->valuelen;
}
}
-out_free:
xfs_iunlock(dp, lock_mode);
kfree(args);
return rval;
}
+int
+xfs_dir_removename_args(
+ struct xfs_da_args *args)
+{
+ int error;
+
+ switch (xfs_dir2_format(args, &error)) {
+ case XFS_DIR2_FMT_SF:
+ return xfs_dir2_sf_removename(args);
+ case XFS_DIR2_FMT_BLOCK:
+ return xfs_dir2_block_removename(args);
+ case XFS_DIR2_FMT_LEAF:
+ return xfs_dir2_leaf_removename(args);
+ case XFS_DIR2_FMT_NODE:
+ return xfs_dir2_node_removename(args);
+ default:
+ return error;
+ }
+}
+
/*
* Remove an entry from a directory.
*/
@@ -431,13 +489,12 @@ int
xfs_dir_removename(
struct xfs_trans *tp,
struct xfs_inode *dp,
- struct xfs_name *name,
+ const struct xfs_name *name,
xfs_ino_t ino,
xfs_extlen_t total) /* bmap's total block count */
{
struct xfs_da_args *args;
int rval;
- bool v;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_remove);
@@ -456,30 +513,30 @@ xfs_dir_removename(
args->total = total;
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
+ args->owner = dp->i_ino;
+ rval = xfs_dir_removename_args(args);
+ kfree(args);
+ return rval;
+}
- if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- rval = xfs_dir2_sf_removename(args);
- goto out_free;
- }
+int
+xfs_dir_replace_args(
+ struct xfs_da_args *args)
+{
+ int error;
- rval = xfs_dir2_isblock(args, &v);
- if (rval)
- goto out_free;
- if (v) {
- rval = xfs_dir2_block_removename(args);
- goto out_free;
+ switch (xfs_dir2_format(args, &error)) {
+ case XFS_DIR2_FMT_SF:
+ return xfs_dir2_sf_replace(args);
+ case XFS_DIR2_FMT_BLOCK:
+ return xfs_dir2_block_replace(args);
+ case XFS_DIR2_FMT_LEAF:
+ return xfs_dir2_leaf_replace(args);
+ case XFS_DIR2_FMT_NODE:
+ return xfs_dir2_node_replace(args);
+ default:
+ return error;
}
-
- rval = xfs_dir2_isleaf(args, &v);
- if (rval)
- goto out_free;
- if (v)
- rval = xfs_dir2_leaf_removename(args);
- else
- rval = xfs_dir2_node_removename(args);
-out_free:
- kfree(args);
- return rval;
}
/*
@@ -495,7 +552,6 @@ xfs_dir_replace(
{
struct xfs_da_args *args;
int rval;
- bool v;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
@@ -517,28 +573,8 @@ xfs_dir_replace(
args->total = total;
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
-
- if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- rval = xfs_dir2_sf_replace(args);
- goto out_free;
- }
-
- rval = xfs_dir2_isblock(args, &v);
- if (rval)
- goto out_free;
- if (v) {
- rval = xfs_dir2_block_replace(args);
- goto out_free;
- }
-
- rval = xfs_dir2_isleaf(args, &v);
- if (rval)
- goto out_free;
- if (v)
- rval = xfs_dir2_leaf_replace(args);
- else
- rval = xfs_dir2_node_replace(args);
-out_free:
+ args->owner = dp->i_ino;
+ rval = xfs_dir_replace_args(args);
kfree(args);
return rval;
}
@@ -607,57 +643,6 @@ xfs_dir2_grow_inode(
}
/*
- * See if the directory is a single-block form directory.
- */
-int
-xfs_dir2_isblock(
- struct xfs_da_args *args,
- bool *isblock)
-{
- struct xfs_mount *mp = args->dp->i_mount;
- xfs_fileoff_t eof;
- int error;
-
- error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK);
- if (error)
- return error;
-
- *isblock = false;
- if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize)
- return 0;
-
- *isblock = true;
- if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) {
- xfs_da_mark_sick(args);
- return -EFSCORRUPTED;
- }
- return 0;
-}
-
-/*
- * See if the directory is a single-leaf form directory.
- */
-int
-xfs_dir2_isleaf(
- struct xfs_da_args *args,
- bool *isleaf)
-{
- xfs_fileoff_t eof;
- int error;
-
- error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK);
- if (error)
- return error;
-
- *isleaf = false;
- if (eof != args->geo->leafblk + args->geo->fsbcount)
- return 0;
-
- *isleaf = true;
- return 0;
-}
-
-/*
* Remove the given block from the directory.
* This routine is used for data and free blocks, leaf/node are done
* by xfs_da_shrink_inode.
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 8497d041f316..6dbe6e9ecb49 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -36,6 +36,16 @@ xfs_dir2_samename(
return !memcmp(n1->name, n2->name, n1->len);
}
+enum xfs_dir2_fmt {
+ XFS_DIR2_FMT_SF,
+ XFS_DIR2_FMT_BLOCK,
+ XFS_DIR2_FMT_LEAF,
+ XFS_DIR2_FMT_NODE,
+ XFS_DIR2_FMT_ERROR,
+};
+
+enum xfs_dir2_fmt xfs_dir2_format(struct xfs_da_args *args, int *error);
+
/*
* Convert inode mode to directory entry filetype
*/
@@ -58,7 +68,7 @@ extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
const struct xfs_name *name, xfs_ino_t *inum,
struct xfs_name *ci_name);
extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, xfs_ino_t ino,
+ const struct xfs_name *name, xfs_ino_t ino,
xfs_extlen_t tot);
extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
const struct xfs_name *name, xfs_ino_t inum,
@@ -66,6 +76,11 @@ extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name);
+int xfs_dir_lookup_args(struct xfs_da_args *args);
+int xfs_dir_createname_args(struct xfs_da_args *args);
+int xfs_dir_removename_args(struct xfs_da_args *args);
+int xfs_dir_replace_args(struct xfs_da_args *args);
+
/*
* Direct call from the bmap code, bypassing the generic directory layer.
*/
@@ -74,8 +89,6 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
/*
* Interface routines used by userspace utilities
*/
-extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock);
-extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf);
extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
struct xfs_buf *bp);
@@ -101,6 +114,10 @@ extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+xfs_failaddr_t xfs_dir3_leaf_header_check(struct xfs_buf *bp, xfs_ino_t owner);
+xfs_failaddr_t xfs_dir3_data_header_check(struct xfs_buf *bp, xfs_ino_t owner);
+xfs_failaddr_t xfs_dir3_block_header_check(struct xfs_buf *bp, xfs_ino_t owner);
+
extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index a2da007adb46..0f93ed1a4a74 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -115,17 +115,20 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
.verify_struct = xfs_dir3_block_verify,
};
-static xfs_failaddr_t
+xfs_failaddr_t
xfs_dir3_block_header_check(
- struct xfs_inode *dp,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ xfs_ino_t owner)
{
- struct xfs_mount *mp = dp->i_mount;
+ struct xfs_mount *mp = bp->b_mount;
if (xfs_has_crc(mp)) {
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
- if (be64_to_cpu(hdr3->owner) != dp->i_ino)
+ if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
+ return __this_address;
+
+ if (be64_to_cpu(hdr3->owner) != owner)
return __this_address;
}
@@ -136,6 +139,7 @@ int
xfs_dir3_block_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
struct xfs_buf **bpp)
{
struct xfs_mount *mp = dp->i_mount;
@@ -148,7 +152,7 @@ xfs_dir3_block_read(
return err;
/* Check things that we can't do in the verifier. */
- fa = xfs_dir3_block_header_check(dp, *bpp);
+ fa = xfs_dir3_block_header_check(*bpp, owner);
if (fa) {
__xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
@@ -163,12 +167,13 @@ xfs_dir3_block_read(
static void
xfs_dir3_block_init(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- struct xfs_buf *bp,
- struct xfs_inode *dp)
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
{
- struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
bp->b_ops = &xfs_dir3_block_buf_ops;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
@@ -177,7 +182,7 @@ xfs_dir3_block_init(
memset(hdr3, 0, sizeof(*hdr3));
hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
- hdr3->owner = cpu_to_be64(dp->i_ino);
+ hdr3->owner = cpu_to_be64(args->owner);
uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
return;
@@ -382,7 +387,7 @@ xfs_dir2_block_addname(
tp = args->trans;
/* Read the (one and only) directory block into bp. */
- error = xfs_dir3_block_read(tp, dp, &bp);
+ error = xfs_dir3_block_read(tp, dp, args->owner, &bp);
if (error)
return error;
@@ -697,7 +702,7 @@ xfs_dir2_block_lookup_int(
dp = args->dp;
tp = args->trans;
- error = xfs_dir3_block_read(tp, dp, &bp);
+ error = xfs_dir3_block_read(tp, dp, args->owner, &bp);
if (error)
return error;
@@ -981,7 +986,8 @@ xfs_dir2_leaf_to_block(
* Read the data block if we don't already have it, give up if it fails.
*/
if (!dbp) {
- error = xfs_dir3_data_read(tp, dp, args->geo->datablk, 0, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->owner,
+ args->geo->datablk, 0, &dbp);
if (error)
return error;
}
@@ -1009,7 +1015,7 @@ xfs_dir2_leaf_to_block(
/*
* Start converting it to block form.
*/
- xfs_dir3_block_init(mp, tp, dbp, dp);
+ xfs_dir3_block_init(args, dbp);
needlog = 1;
needscan = 0;
@@ -1129,7 +1135,7 @@ xfs_dir2_sf_to_block(
error = xfs_dir3_data_init(args, blkno, &bp);
if (error)
goto out_free;
- xfs_dir3_block_init(mp, tp, bp, dp);
+ xfs_dir3_block_init(args, bp);
hdr = bp->b_addr;
/*
@@ -1169,7 +1175,7 @@ xfs_dir2_sf_to_block(
* Create entry for .
*/
dep = bp->b_addr + offset;
- dep->inumber = cpu_to_be64(dp->i_ino);
+ dep->inumber = cpu_to_be64(args->owner);
dep->namelen = 1;
dep->name[0] = '.';
xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 7a6d965bea71..ea0b9628df18 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -395,17 +395,20 @@ static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
.verify_write = xfs_dir3_data_write_verify,
};
-static xfs_failaddr_t
+xfs_failaddr_t
xfs_dir3_data_header_check(
- struct xfs_inode *dp,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ xfs_ino_t owner)
{
- struct xfs_mount *mp = dp->i_mount;
+ struct xfs_mount *mp = bp->b_mount;
if (xfs_has_crc(mp)) {
struct xfs_dir3_data_hdr *hdr3 = bp->b_addr;
- if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
+ if (hdr3->hdr.magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+ return __this_address;
+
+ if (be64_to_cpu(hdr3->hdr.owner) != owner)
return __this_address;
}
@@ -416,6 +419,7 @@ int
xfs_dir3_data_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
xfs_dablk_t bno,
unsigned int flags,
struct xfs_buf **bpp)
@@ -429,7 +433,7 @@ xfs_dir3_data_read(
return err;
/* Check things that we can't do in the verifier. */
- fa = xfs_dir3_data_header_check(dp, *bpp);
+ fa = xfs_dir3_data_header_check(*bpp, owner);
if (fa) {
__xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
@@ -725,7 +729,7 @@ xfs_dir3_data_init(
memset(hdr3, 0, sizeof(*hdr3));
hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
- hdr3->owner = cpu_to_be64(dp->i_ino);
+ hdr3->owner = cpu_to_be64(args->owner);
uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
} else
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 08dda5ce9d91..71c2f22a3f6e 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -208,6 +208,29 @@ xfs_dir3_leaf_verify(
return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr, true);
}
+xfs_failaddr_t
+xfs_dir3_leaf_header_check(
+ struct xfs_buf *bp,
+ xfs_ino_t owner)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_leaf *hdr3 = bp->b_addr;
+
+ if (hdr3->hdr.info.hdr.magic !=
+ cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) &&
+ hdr3->hdr.info.hdr.magic !=
+ cpu_to_be16(XFS_DIR3_LEAFN_MAGIC))
+ return __this_address;
+
+ if (be64_to_cpu(hdr3->hdr.info.owner) != owner)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
static void
xfs_dir3_leaf_read_verify(
struct xfs_buf *bp)
@@ -271,32 +294,60 @@ int
xfs_dir3_leaf_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
+ xfs_failaddr_t fa;
int err;
err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK,
&xfs_dir3_leaf1_buf_ops);
- if (!err && tp && *bpp)
+ if (err || !(*bpp))
+ return err;
+
+ fa = xfs_dir3_leaf_header_check(*bpp, owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
+ return -EFSCORRUPTED;
+ }
+
+ if (tp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
- return err;
+ return 0;
}
int
xfs_dir3_leafn_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
+ xfs_failaddr_t fa;
int err;
err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK,
&xfs_dir3_leafn_buf_ops);
- if (!err && tp && *bpp)
+ if (err || !(*bpp))
+ return err;
+
+ fa = xfs_dir3_leaf_header_check(*bpp, owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
+ return -EFSCORRUPTED;
+ }
+
+ if (tp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
- return err;
+ return 0;
}
/*
@@ -304,12 +355,12 @@ xfs_dir3_leafn_read(
*/
static void
xfs_dir3_leaf_init(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
- xfs_ino_t owner,
uint16_t type)
{
+ struct xfs_mount *mp = args->dp->i_mount;
+ struct xfs_trans *tp = args->trans;
struct xfs_dir2_leaf *leaf = bp->b_addr;
ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
@@ -323,7 +374,7 @@ xfs_dir3_leaf_init(
? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
: cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
leaf3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
- leaf3->info.owner = cpu_to_be64(owner);
+ leaf3->info.owner = cpu_to_be64(args->owner);
uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid);
} else {
memset(leaf, 0, sizeof(*leaf));
@@ -356,7 +407,6 @@ xfs_dir3_leaf_get_buf(
{
struct xfs_inode *dp = args->dp;
struct xfs_trans *tp = args->trans;
- struct xfs_mount *mp = dp->i_mount;
struct xfs_buf *bp;
int error;
@@ -369,7 +419,7 @@ xfs_dir3_leaf_get_buf(
if (error)
return error;
- xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
+ xfs_dir3_leaf_init(args, bp, magic);
xfs_dir3_leaf_log_header(args, bp);
if (magic == XFS_DIR2_LEAF1_MAGIC)
xfs_dir3_leaf_log_tail(args, bp);
@@ -647,7 +697,8 @@ xfs_dir2_leaf_addname(
trace_xfs_dir2_leaf_addname(args);
- error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, args->owner, args->geo->leafblk,
+ &lbp);
if (error)
return error;
@@ -834,9 +885,9 @@ xfs_dir2_leaf_addname(
* Already had space in some data block.
* Just read that one in.
*/
- error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(args->geo, use_block),
- 0, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->owner,
+ xfs_dir2_db_to_da(args->geo, use_block), 0,
+ &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1238,7 +1289,8 @@ xfs_dir2_leaf_lookup_int(
tp = args->trans;
mp = dp->i_mount;
- error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, args->owner, args->geo->leafblk,
+ &lbp);
if (error)
return error;
@@ -1276,9 +1328,9 @@ xfs_dir2_leaf_lookup_int(
if (newdb != curdb) {
if (dbp)
xfs_trans_brelse(tp, dbp);
- error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(args->geo, newdb),
- 0, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->owner,
+ xfs_dir2_db_to_da(args->geo, newdb), 0,
+ &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1318,9 +1370,9 @@ xfs_dir2_leaf_lookup_int(
ASSERT(cidb != -1);
if (cidb != curdb) {
xfs_trans_brelse(tp, dbp);
- error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(args->geo, cidb),
- 0, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->owner,
+ xfs_dir2_db_to_da(args->geo, cidb), 0,
+ &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1614,7 +1666,8 @@ xfs_dir2_leaf_trim_data(
/*
* Read the offending data block. We need its buffer.
*/
- error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(geo, db), 0, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->owner,
+ xfs_dir2_db_to_da(geo, db), 0, &dbp);
if (error)
return error;
@@ -1753,7 +1806,8 @@ xfs_dir2_node_to_leaf(
/*
* Read the freespace block.
*/
- error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp);
+ error = xfs_dir2_free_read(tp, dp, args->owner, args->geo->freeblk,
+ &fbp);
if (error)
return error;
xfs_dir2_free_hdr_from_disk(mp, &freehdr, fbp->b_addr);
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index be0b8834028c..fe8d4fa13128 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -175,11 +175,11 @@ const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
/* Everything ok in the free block header? */
static xfs_failaddr_t
xfs_dir3_free_header_check(
- struct xfs_inode *dp,
- xfs_dablk_t fbno,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ xfs_ino_t owner,
+ xfs_dablk_t fbno)
{
- struct xfs_mount *mp = dp->i_mount;
+ struct xfs_mount *mp = bp->b_mount;
int maxbests = mp->m_dir_geo->free_max_bests;
unsigned int firstdb;
@@ -195,7 +195,7 @@ xfs_dir3_free_header_check(
return __this_address;
if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused))
return __this_address;
- if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
+ if (be64_to_cpu(hdr3->hdr.owner) != owner)
return __this_address;
} else {
struct xfs_dir2_free_hdr *hdr = bp->b_addr;
@@ -214,6 +214,7 @@ static int
__xfs_dir3_free_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
xfs_dablk_t fbno,
unsigned int flags,
struct xfs_buf **bpp)
@@ -227,7 +228,7 @@ __xfs_dir3_free_read(
return err;
/* Check things that we can't do in the verifier. */
- fa = xfs_dir3_free_header_check(dp, fbno, *bpp);
+ fa = xfs_dir3_free_header_check(*bpp, owner, fbno);
if (fa) {
__xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
@@ -299,20 +300,23 @@ int
xfs_dir2_free_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
- return __xfs_dir3_free_read(tp, dp, fbno, 0, bpp);
+ return __xfs_dir3_free_read(tp, dp, owner, fbno, 0, bpp);
}
static int
xfs_dir2_free_try_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
- return __xfs_dir3_free_read(tp, dp, fbno, XFS_DABUF_MAP_HOLE_OK, bpp);
+ return __xfs_dir3_free_read(tp, dp, owner, fbno, XFS_DABUF_MAP_HOLE_OK,
+ bpp);
}
static int
@@ -349,7 +353,7 @@ xfs_dir3_free_get_buf(
hdr.magic = XFS_DIR3_FREE_MAGIC;
hdr3->hdr.blkno = cpu_to_be64(xfs_buf_daddr(bp));
- hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
+ hdr3->hdr.owner = cpu_to_be64(args->owner);
uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid);
} else
hdr.magic = XFS_DIR2_FREE_MAGIC;
@@ -717,7 +721,7 @@ xfs_dir2_leafn_lookup_for_addname(
if (curbp)
xfs_trans_brelse(tp, curbp);
- error = xfs_dir2_free_read(tp, dp,
+ error = xfs_dir2_free_read(tp, dp, args->owner,
xfs_dir2_db_to_da(args->geo,
newfdb),
&curbp);
@@ -863,7 +867,7 @@ xfs_dir2_leafn_lookup_for_entry(
ASSERT(state->extravalid);
curbp = state->extrablk.bp;
} else {
- error = xfs_dir3_data_read(tp, dp,
+ error = xfs_dir3_data_read(tp, dp, args->owner,
xfs_dir2_db_to_da(args->geo,
newdb),
0, &curbp);
@@ -1356,8 +1360,8 @@ xfs_dir2_leafn_remove(
* read in the free block.
*/
fdb = xfs_dir2_db_to_fdb(geo, db);
- error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(geo, fdb),
- &fbp);
+ error = xfs_dir2_free_read(tp, dp, args->owner,
+ xfs_dir2_db_to_da(geo, fdb), &fbp);
if (error)
return error;
free = fbp->b_addr;
@@ -1562,7 +1566,8 @@ xfs_dir2_leafn_toosmall(
/*
* Read the sibling leaf block.
*/
- error = xfs_dir3_leafn_read(state->args->trans, dp, blkno, &bp);
+ error = xfs_dir3_leafn_read(state->args->trans, dp,
+ state->args->owner, blkno, &bp);
if (error)
return error;
@@ -1715,7 +1720,7 @@ xfs_dir2_node_add_datablk(
* that was just allocated.
*/
fbno = xfs_dir2_db_to_fdb(args->geo, *dbno);
- error = xfs_dir2_free_try_read(tp, dp,
+ error = xfs_dir2_free_try_read(tp, dp, args->owner,
xfs_dir2_db_to_da(args->geo, fbno), &fbp);
if (error)
return error;
@@ -1862,7 +1867,7 @@ xfs_dir2_node_find_freeblk(
* so this might not succeed. This should be really rare, so
* there's no reason to avoid it.
*/
- error = xfs_dir2_free_try_read(tp, dp,
+ error = xfs_dir2_free_try_read(tp, dp, args->owner,
xfs_dir2_db_to_da(args->geo, fbno),
&fbp);
if (error)
@@ -1948,9 +1953,8 @@ xfs_dir2_node_addname_int(
&freehdr, &findex);
} else {
/* Read the data block in. */
- error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(args->geo, dbno),
- 0, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->owner,
+ xfs_dir2_db_to_da(args->geo, dbno), 0, &dbp);
}
if (error)
return error;
@@ -2302,7 +2306,7 @@ xfs_dir2_node_trim_free(
/*
* Read the freespace block.
*/
- error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
+ error = xfs_dir2_free_try_read(tp, dp, args->owner, fo, &bp);
if (error)
return error;
/*
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 1db2e60ba827..3befb32509fa 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -50,8 +50,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
/* xfs_dir2_block.c */
-extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_buf **bpp);
+int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_ino_t owner, struct xfs_buf **bpp);
extern int xfs_dir2_block_addname(struct xfs_da_args *args);
extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
extern int xfs_dir2_block_removename(struct xfs_da_args *args);
@@ -78,7 +78,8 @@ extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
extern xfs_failaddr_t __xfs_dir3_data_check(struct xfs_inode *dp,
struct xfs_buf *bp);
int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp);
+ xfs_ino_t owner, xfs_dablk_t bno, unsigned int flags,
+ struct xfs_buf **bpp);
int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
unsigned int flags);
@@ -95,9 +96,9 @@ void xfs_dir2_leaf_hdr_from_disk(struct xfs_mount *mp,
void xfs_dir2_leaf_hdr_to_disk(struct xfs_mount *mp, struct xfs_dir2_leaf *to,
struct xfs_dir3_icleaf_hdr *from);
int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t fbno, struct xfs_buf **bpp);
+ xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp);
int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t fbno, struct xfs_buf **bpp);
+ xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp);
extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
struct xfs_buf *dbp);
extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
@@ -154,8 +155,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args);
extern int xfs_dir2_node_replace(struct xfs_da_args *args);
extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
int *rvalp);
-extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t fbno, struct xfs_buf **bpp);
+int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp);
/* xfs_dir2_sf.c */
xfs_ino_t xfs_dir2_sf_get_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr,
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 01a9e86b3037..7002d7676a78 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -63,7 +63,8 @@
#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41
#define XFS_ERRTAG_WB_DELAY_MS 42
#define XFS_ERRTAG_WRITE_DELAY_MS 43
-#define XFS_ERRTAG_MAX 44
+#define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44
+#define XFS_ERRTAG_MAX 45
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -111,5 +112,6 @@
#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
#define XFS_RANDOM_WB_DELAY_MS 3000
#define XFS_RANDOM_WRITE_DELAY_MS 3000
+#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c
new file mode 100644
index 000000000000..2021396651de
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_exchmaps.c
@@ -0,0 +1,1235 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_icache.h"
+#include "xfs_quota.h"
+#include "xfs_exchmaps.h"
+#include "xfs_trace.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_error.h"
+#include "xfs_errortag.h"
+#include "xfs_health.h"
+#include "xfs_exchmaps_item.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_dir2.h"
+#include "xfs_symlink_remote.h"
+
+struct kmem_cache *xfs_exchmaps_intent_cache;
+
+/* bmbt mappings adjacent to a pair of records. */
+struct xfs_exchmaps_adjacent {
+ struct xfs_bmbt_irec left1;
+ struct xfs_bmbt_irec right1;
+ struct xfs_bmbt_irec left2;
+ struct xfs_bmbt_irec right2;
+};
+
+#define ADJACENT_INIT { \
+ .left1 = { .br_startblock = HOLESTARTBLOCK }, \
+ .right1 = { .br_startblock = HOLESTARTBLOCK }, \
+ .left2 = { .br_startblock = HOLESTARTBLOCK }, \
+ .right2 = { .br_startblock = HOLESTARTBLOCK }, \
+}
+
+/* Information to reset reflink flag / CoW fork state after an exchange. */
+
+/*
+ * If the reflink flag is set on either inode, make sure it has an incore CoW
+ * fork, since all reflink inodes must have them. If there's a CoW fork and it
+ * has mappings in it, make sure the inodes are tagged appropriately so that
+ * speculative preallocations can be GC'd if we run low of space.
+ */
+static inline void
+xfs_exchmaps_ensure_cowfork(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *cfork;
+
+ if (xfs_is_reflink_inode(ip))
+ xfs_ifork_init_cow(ip);
+
+ cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
+ if (!cfork)
+ return;
+ if (cfork->if_bytes > 0)
+ xfs_inode_set_cowblocks_tag(ip);
+ else
+ xfs_inode_clear_cowblocks_tag(ip);
+}
+
+/*
+ * Adjust the on-disk inode size upwards if needed so that we never add
+ * mappings into the file past EOF. This is crucial so that log recovery won't
+ * get confused by the sudden appearance of post-eof mappings.
+ */
+STATIC void
+xfs_exchmaps_update_size(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ xfs_fsize_t new_isize)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_fsize_t len;
+
+ if (new_isize < 0)
+ return;
+
+ len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount),
+ new_isize);
+
+ if (len <= ip->i_disk_size)
+ return;
+
+ trace_xfs_exchmaps_update_inode_size(ip, len);
+
+ ip->i_disk_size = len;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/* Advance the incore state tracking after exchanging a mapping. */
+static inline void
+xmi_advance(
+ struct xfs_exchmaps_intent *xmi,
+ const struct xfs_bmbt_irec *irec)
+{
+ xmi->xmi_startoff1 += irec->br_blockcount;
+ xmi->xmi_startoff2 += irec->br_blockcount;
+ xmi->xmi_blockcount -= irec->br_blockcount;
+}
+
+/* Do we still have more mappings to exchange? */
+static inline bool
+xmi_has_more_exchange_work(const struct xfs_exchmaps_intent *xmi)
+{
+ return xmi->xmi_blockcount > 0;
+}
+
+/* Do we have post-operation cleanups to perform? */
+static inline bool
+xmi_has_postop_work(const struct xfs_exchmaps_intent *xmi)
+{
+ return xmi->xmi_flags & (XFS_EXCHMAPS_CLEAR_INO1_REFLINK |
+ XFS_EXCHMAPS_CLEAR_INO2_REFLINK |
+ __XFS_EXCHMAPS_INO2_SHORTFORM);
+}
+
+/* Check all mappings to make sure we can actually exchange them. */
+int
+xfs_exchmaps_check_forks(
+ struct xfs_mount *mp,
+ const struct xfs_exchmaps_req *req)
+{
+ struct xfs_ifork *ifp1, *ifp2;
+ int whichfork = xfs_exchmaps_reqfork(req);
+
+ /* No fork? */
+ ifp1 = xfs_ifork_ptr(req->ip1, whichfork);
+ ifp2 = xfs_ifork_ptr(req->ip2, whichfork);
+ if (!ifp1 || !ifp2)
+ return -EINVAL;
+
+ /* We don't know how to exchange local format forks. */
+ if (ifp1->if_format == XFS_DINODE_FMT_LOCAL ||
+ ifp2->if_format == XFS_DINODE_FMT_LOCAL)
+ return -EINVAL;
+
+ return 0;
+}
+
+#ifdef CONFIG_XFS_QUOTA
+/* Log the actual updates to the quota accounting. */
+static inline void
+xfs_exchmaps_update_quota(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi,
+ struct xfs_bmbt_irec *irec1,
+ struct xfs_bmbt_irec *irec2)
+{
+ int64_t ip1_delta = 0, ip2_delta = 0;
+ unsigned int qflag;
+
+ qflag = XFS_IS_REALTIME_INODE(xmi->xmi_ip1) ? XFS_TRANS_DQ_RTBCOUNT :
+ XFS_TRANS_DQ_BCOUNT;
+
+ if (xfs_bmap_is_real_extent(irec1)) {
+ ip1_delta -= irec1->br_blockcount;
+ ip2_delta += irec1->br_blockcount;
+ }
+
+ if (xfs_bmap_is_real_extent(irec2)) {
+ ip1_delta += irec2->br_blockcount;
+ ip2_delta -= irec2->br_blockcount;
+ }
+
+ xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip1, qflag, ip1_delta);
+ xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip2, qflag, ip2_delta);
+}
+#else
+# define xfs_exchmaps_update_quota(tp, xmi, irec1, irec2) ((void)0)
+#endif
+
+/* Decide if we want to skip this mapping from file1. */
+static inline bool
+xfs_exchmaps_can_skip_mapping(
+ struct xfs_exchmaps_intent *xmi,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = xmi->xmi_ip1->i_mount;
+
+ /* Do not skip this mapping if the caller did not tell us to. */
+ if (!(xmi->xmi_flags & XFS_EXCHMAPS_INO1_WRITTEN))
+ return false;
+
+ /* Do not skip mapped, written mappings. */
+ if (xfs_bmap_is_written_extent(irec))
+ return false;
+
+ /*
+ * The mapping is unwritten or a hole. It cannot be a delalloc
+ * reservation because we already excluded those. It cannot be an
+ * unwritten extent with dirty page cache because we flushed the page
+ * cache. For files where the allocation unit is 1FSB (files on the
+ * data dev, rt files if the extent size is 1FSB), we can safely
+ * skip this mapping.
+ */
+ if (!xfs_inode_has_bigrtalloc(xmi->xmi_ip1))
+ return true;
+
+ /*
+ * For a realtime file with a multi-fsb allocation unit, the decision
+ * is trickier because we can only swap full allocation units.
+ * Unwritten mappings can appear in the middle of an rtx if the rtx is
+ * partially written, but they can also appear for preallocations.
+ *
+ * If the mapping is a hole, skip it entirely. Holes should align with
+ * rtx boundaries.
+ */
+ if (!xfs_bmap_is_real_extent(irec))
+ return true;
+
+ /*
+ * All mappings below this point are unwritten.
+ *
+ * - If the beginning is not aligned to an rtx, trim the end of the
+ * mapping so that it does not cross an rtx boundary, and swap it.
+ *
+ * - If both ends are aligned to an rtx, skip the entire mapping.
+ */
+ if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) {
+ xfs_fileoff_t new_end;
+
+ new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize);
+ irec->br_blockcount = min(irec->br_blockcount,
+ new_end - irec->br_startoff);
+ return false;
+ }
+ if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize))
+ return true;
+
+ /*
+ * All mappings below this point are unwritten, start on an rtx
+ * boundary, and do not end on an rtx boundary.
+ *
+ * - If the mapping is longer than one rtx, trim the end of the mapping
+ * down to an rtx boundary and skip it.
+ *
+ * - The mapping is shorter than one rtx. Swap it.
+ */
+ if (irec->br_blockcount > mp->m_sb.sb_rextsize) {
+ xfs_fileoff_t new_end;
+
+ new_end = rounddown_64(irec->br_startoff + irec->br_blockcount,
+ mp->m_sb.sb_rextsize);
+ irec->br_blockcount = new_end - irec->br_startoff;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Walk forward through the file ranges in @xmi until we find two different
+ * mappings to exchange. If there is work to do, return the mappings;
+ * otherwise we've reached the end of the range and xmi_blockcount will be
+ * zero.
+ *
+ * If the walk skips over a pair of mappings to the same storage, save them as
+ * the left records in @adj (if provided) so that the simulation phase can
+ * avoid an extra lookup.
+ */
+static int
+xfs_exchmaps_find_mappings(
+ struct xfs_exchmaps_intent *xmi,
+ struct xfs_bmbt_irec *irec1,
+ struct xfs_bmbt_irec *irec2,
+ struct xfs_exchmaps_adjacent *adj)
+{
+ int nimaps;
+ int bmap_flags;
+ int error;
+
+ bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_whichfork(xmi));
+
+ for (; xmi_has_more_exchange_work(xmi); xmi_advance(xmi, irec1)) {
+ /* Read mapping from the first file */
+ nimaps = 1;
+ error = xfs_bmapi_read(xmi->xmi_ip1, xmi->xmi_startoff1,
+ xmi->xmi_blockcount, irec1, &nimaps,
+ bmap_flags);
+ if (error)
+ return error;
+ if (nimaps != 1 ||
+ irec1->br_startblock == DELAYSTARTBLOCK ||
+ irec1->br_startoff != xmi->xmi_startoff1) {
+ /*
+ * We should never get no mapping or a delalloc mapping
+ * or something that doesn't match what we asked for,
+ * since the caller flushed both inodes and we hold the
+ * ILOCKs for both inodes.
+ */
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ if (xfs_exchmaps_can_skip_mapping(xmi, irec1)) {
+ trace_xfs_exchmaps_mapping1_skip(xmi->xmi_ip1, irec1);
+ continue;
+ }
+
+ /* Read mapping from the second file */
+ nimaps = 1;
+ error = xfs_bmapi_read(xmi->xmi_ip2, xmi->xmi_startoff2,
+ irec1->br_blockcount, irec2, &nimaps,
+ bmap_flags);
+ if (error)
+ return error;
+ if (nimaps != 1 ||
+ irec2->br_startblock == DELAYSTARTBLOCK ||
+ irec2->br_startoff != xmi->xmi_startoff2) {
+ /*
+ * We should never get no mapping or a delalloc mapping
+ * or something that doesn't match what we asked for,
+ * since the caller flushed both inodes and we hold the
+ * ILOCKs for both inodes.
+ */
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ /*
+ * We can only exchange as many blocks as the smaller of the
+ * two mapping maps.
+ */
+ irec1->br_blockcount = min(irec1->br_blockcount,
+ irec2->br_blockcount);
+
+ trace_xfs_exchmaps_mapping1(xmi->xmi_ip1, irec1);
+ trace_xfs_exchmaps_mapping2(xmi->xmi_ip2, irec2);
+
+ /* We found something to exchange, so return it. */
+ if (irec1->br_startblock != irec2->br_startblock)
+ return 0;
+
+ /*
+ * Two mappings pointing to the same physical block must not
+ * have different states; that's filesystem corruption. Move
+ * on to the next mapping if they're both holes or both point
+ * to the same physical space extent.
+ */
+ if (irec1->br_state != irec2->br_state) {
+ xfs_bmap_mark_sick(xmi->xmi_ip1,
+ xfs_exchmaps_whichfork(xmi));
+ xfs_bmap_mark_sick(xmi->xmi_ip2,
+ xfs_exchmaps_whichfork(xmi));
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Save the mappings if we're estimating work and skipping
+ * these identical mappings.
+ */
+ if (adj) {
+ memcpy(&adj->left1, irec1, sizeof(*irec1));
+ memcpy(&adj->left2, irec2, sizeof(*irec2));
+ }
+ }
+
+ return 0;
+}
+
+/* Exchange these two mappings. */
+static void
+xfs_exchmaps_one_step(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi,
+ struct xfs_bmbt_irec *irec1,
+ struct xfs_bmbt_irec *irec2)
+{
+ int whichfork = xfs_exchmaps_whichfork(xmi);
+
+ xfs_exchmaps_update_quota(tp, xmi, irec1, irec2);
+
+ /* Remove both mappings. */
+ xfs_bmap_unmap_extent(tp, xmi->xmi_ip1, whichfork, irec1);
+ xfs_bmap_unmap_extent(tp, xmi->xmi_ip2, whichfork, irec2);
+
+ /*
+ * Re-add both mappings. We exchange the file offsets between the two
+ * maps and add the opposite map, which has the effect of filling the
+ * logical offsets we just unmapped, but with with the physical mapping
+ * information exchanged.
+ */
+ swap(irec1->br_startoff, irec2->br_startoff);
+ xfs_bmap_map_extent(tp, xmi->xmi_ip1, whichfork, irec2);
+ xfs_bmap_map_extent(tp, xmi->xmi_ip2, whichfork, irec1);
+
+ /* Make sure we're not adding mappings past EOF. */
+ if (whichfork == XFS_DATA_FORK) {
+ xfs_exchmaps_update_size(tp, xmi->xmi_ip1, irec2,
+ xmi->xmi_isize1);
+ xfs_exchmaps_update_size(tp, xmi->xmi_ip2, irec1,
+ xmi->xmi_isize2);
+ }
+
+ /*
+ * Advance our cursor and exit. The caller (either defer ops or log
+ * recovery) will log the XMD item, and if *blockcount is nonzero, it
+ * will log a new XMI item for the remainder and call us back.
+ */
+ xmi_advance(xmi, irec1);
+}
+
+/* Convert inode2's leaf attr fork back to shortform, if possible.. */
+STATIC int
+xfs_exchmaps_attr_to_sf(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi)
+{
+ struct xfs_da_args args = {
+ .dp = xmi->xmi_ip2,
+ .geo = tp->t_mountp->m_attr_geo,
+ .whichfork = XFS_ATTR_FORK,
+ .trans = tp,
+ .owner = xmi->xmi_ip2->i_ino,
+ };
+ struct xfs_buf *bp;
+ int forkoff;
+ int error;
+
+ if (!xfs_attr_is_leaf(xmi->xmi_ip2))
+ return 0;
+
+ error = xfs_attr3_leaf_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, 0,
+ &bp);
+ if (error)
+ return error;
+
+ forkoff = xfs_attr_shortform_allfit(bp, xmi->xmi_ip2);
+ if (forkoff == 0)
+ return 0;
+
+ return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
+}
+
+/* Convert inode2's block dir fork back to shortform, if possible.. */
+STATIC int
+xfs_exchmaps_dir_to_sf(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi)
+{
+ struct xfs_da_args args = {
+ .dp = xmi->xmi_ip2,
+ .geo = tp->t_mountp->m_dir_geo,
+ .whichfork = XFS_DATA_FORK,
+ .trans = tp,
+ .owner = xmi->xmi_ip2->i_ino,
+ };
+ struct xfs_dir2_sf_hdr sfh;
+ struct xfs_buf *bp;
+ int size;
+ int error = 0;
+
+ if (xfs_dir2_format(&args, &error) != XFS_DIR2_FMT_BLOCK)
+ return error;
+
+ error = xfs_dir3_block_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, &bp);
+ if (error)
+ return error;
+
+ size = xfs_dir2_block_sfsize(xmi->xmi_ip2, bp->b_addr, &sfh);
+ if (size > xfs_inode_data_fork_size(xmi->xmi_ip2))
+ return 0;
+
+ return xfs_dir2_block_to_sf(&args, bp, size, &sfh);
+}
+
+/* Convert inode2's remote symlink target back to shortform, if possible. */
+STATIC int
+xfs_exchmaps_link_to_sf(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi)
+{
+ struct xfs_inode *ip = xmi->xmi_ip2;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ char *buf;
+ int error;
+
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
+ ip->i_disk_size > xfs_inode_data_fork_size(ip))
+ return 0;
+
+ /* Read the current symlink target into a buffer. */
+ buf = kmalloc(ip->i_disk_size + 1,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+ if (!buf) {
+ ASSERT(0);
+ return -ENOMEM;
+ }
+
+ error = xfs_symlink_remote_read(ip, buf);
+ if (error)
+ goto free;
+
+ /* Remove the blocks. */
+ error = xfs_symlink_remote_truncate(tp, ip);
+ if (error)
+ goto free;
+
+ /* Convert fork to local format and log our changes. */
+ xfs_idestroy_fork(ifp);
+ ifp->if_bytes = 0;
+ ifp->if_format = XFS_DINODE_FMT_LOCAL;
+ xfs_init_local_fork(ip, XFS_DATA_FORK, buf, ip->i_disk_size);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+free:
+ kfree(buf);
+ return error;
+}
+
+/* Clear the reflink flag after an exchange. */
+static inline void
+xfs_exchmaps_clear_reflink(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ trace_xfs_reflink_unset_inode_flag(ip);
+
+ ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/* Finish whatever work might come after an exchange operation. */
+static int
+xfs_exchmaps_do_postop_work(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi)
+{
+ if (xmi->xmi_flags & __XFS_EXCHMAPS_INO2_SHORTFORM) {
+ int error = 0;
+
+ if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)
+ error = xfs_exchmaps_attr_to_sf(tp, xmi);
+ else if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode))
+ error = xfs_exchmaps_dir_to_sf(tp, xmi);
+ else if (S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
+ error = xfs_exchmaps_link_to_sf(tp, xmi);
+ xmi->xmi_flags &= ~__XFS_EXCHMAPS_INO2_SHORTFORM;
+ if (error)
+ return error;
+ }
+
+ if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO1_REFLINK) {
+ xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip1);
+ xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
+ }
+
+ if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO2_REFLINK) {
+ xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip2);
+ xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
+ }
+
+ return 0;
+}
+
+/* Finish one step in a mapping exchange operation, possibly relogging. */
+int
+xfs_exchmaps_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi)
+{
+ struct xfs_bmbt_irec irec1, irec2;
+ int error;
+
+ if (xmi_has_more_exchange_work(xmi)) {
+ /*
+ * If the operation state says that some range of the files
+ * have not yet been exchanged, look for mappings in that range
+ * to exchange. If we find some mappings, exchange them.
+ */
+ error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, NULL);
+ if (error)
+ return error;
+
+ if (xmi_has_more_exchange_work(xmi))
+ xfs_exchmaps_one_step(tp, xmi, &irec1, &irec2);
+
+ /*
+ * If the caller asked us to exchange the file sizes after the
+ * exchange and either we just exchanged the last mappings in
+ * the range or we didn't find anything to exchange, update the
+ * ondisk file sizes.
+ */
+ if ((xmi->xmi_flags & XFS_EXCHMAPS_SET_SIZES) &&
+ !xmi_has_more_exchange_work(xmi)) {
+ xmi->xmi_ip1->i_disk_size = xmi->xmi_isize1;
+ xmi->xmi_ip2->i_disk_size = xmi->xmi_isize2;
+
+ xfs_trans_log_inode(tp, xmi->xmi_ip1, XFS_ILOG_CORE);
+ xfs_trans_log_inode(tp, xmi->xmi_ip2, XFS_ILOG_CORE);
+ }
+ } else if (xmi_has_postop_work(xmi)) {
+ /*
+ * Now that we're finished with the exchange operation,
+ * complete the post-op cleanup work.
+ */
+ error = xfs_exchmaps_do_postop_work(tp, xmi);
+ if (error)
+ return error;
+ }
+
+ if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
+ return -EIO;
+
+ /* If we still have work to do, ask for a new transaction. */
+ if (xmi_has_more_exchange_work(xmi) || xmi_has_postop_work(xmi)) {
+ trace_xfs_exchmaps_defer(tp->t_mountp, xmi);
+ return -EAGAIN;
+ }
+
+ /*
+ * If we reach here, we've finished all the exchange work and the post
+ * operation work. The last thing we need to do before returning to
+ * the caller is to make sure that COW forks are set up correctly.
+ */
+ if (!(xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)) {
+ xfs_exchmaps_ensure_cowfork(xmi->xmi_ip1);
+ xfs_exchmaps_ensure_cowfork(xmi->xmi_ip2);
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the amount of bmbt blocks we should reserve for each file. In the
+ * worst case, each exchange will fill a hole with a new mapping, which could
+ * result in a btree split every time we add a new leaf block.
+ */
+static inline uint64_t
+xfs_exchmaps_bmbt_blocks(
+ struct xfs_mount *mp,
+ const struct xfs_exchmaps_req *req)
+{
+ return howmany_64(req->nr_exchanges,
+ XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)) *
+ XFS_EXTENTADD_SPACE_RES(mp, xfs_exchmaps_reqfork(req));
+}
+
+/* Compute the space we should reserve for the rmap btree expansions. */
+static inline uint64_t
+xfs_exchmaps_rmapbt_blocks(
+ struct xfs_mount *mp,
+ const struct xfs_exchmaps_req *req)
+{
+ if (!xfs_has_rmapbt(mp))
+ return 0;
+ if (XFS_IS_REALTIME_INODE(req->ip1))
+ return 0;
+
+ return howmany_64(req->nr_exchanges,
+ XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) *
+ XFS_RMAPADD_SPACE_RES(mp);
+}
+
+/* Estimate the bmbt and rmapbt overhead required to exchange mappings. */
+int
+xfs_exchmaps_estimate_overhead(
+ struct xfs_exchmaps_req *req)
+{
+ struct xfs_mount *mp = req->ip1->i_mount;
+ xfs_filblks_t bmbt_blocks;
+ xfs_filblks_t rmapbt_blocks;
+ xfs_filblks_t resblks = req->resblks;
+
+ /*
+ * Compute the number of bmbt and rmapbt blocks we might need to handle
+ * the estimated number of exchanges.
+ */
+ bmbt_blocks = xfs_exchmaps_bmbt_blocks(mp, req);
+ rmapbt_blocks = xfs_exchmaps_rmapbt_blocks(mp, req);
+
+ trace_xfs_exchmaps_overhead(mp, bmbt_blocks, rmapbt_blocks);
+
+ /* Make sure the change in file block count doesn't overflow. */
+ if (check_add_overflow(req->ip1_bcount, bmbt_blocks, &req->ip1_bcount))
+ return -EFBIG;
+ if (check_add_overflow(req->ip2_bcount, bmbt_blocks, &req->ip2_bcount))
+ return -EFBIG;
+
+ /*
+ * Add together the number of blocks we need to handle btree growth,
+ * then add it to the number of blocks we need to reserve to this
+ * transaction.
+ */
+ if (check_add_overflow(resblks, bmbt_blocks, &resblks))
+ return -ENOSPC;
+ if (check_add_overflow(resblks, bmbt_blocks, &resblks))
+ return -ENOSPC;
+ if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
+ return -ENOSPC;
+ if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
+ return -ENOSPC;
+
+ /* Can't actually reserve more than UINT_MAX blocks. */
+ if (req->resblks > UINT_MAX)
+ return -ENOSPC;
+
+ req->resblks = resblks;
+ trace_xfs_exchmaps_final_estimate(req);
+ return 0;
+}
+
+/* Decide if we can merge two real mappings. */
+static inline bool
+xmi_can_merge(
+ const struct xfs_bmbt_irec *b1,
+ const struct xfs_bmbt_irec *b2)
+{
+ /* Don't merge holes. */
+ if (b1->br_startblock == HOLESTARTBLOCK ||
+ b2->br_startblock == HOLESTARTBLOCK)
+ return false;
+
+ /* We don't merge holes. */
+ if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2))
+ return false;
+
+ if (b1->br_startoff + b1->br_blockcount == b2->br_startoff &&
+ b1->br_startblock + b1->br_blockcount == b2->br_startblock &&
+ b1->br_state == b2->br_state &&
+ b1->br_blockcount + b2->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ return true;
+
+ return false;
+}
+
+/*
+ * Decide if we can merge three mappings. Caller must ensure all three
+ * mappings must not be holes or delalloc reservations.
+ */
+static inline bool
+xmi_can_merge_all(
+ const struct xfs_bmbt_irec *l,
+ const struct xfs_bmbt_irec *m,
+ const struct xfs_bmbt_irec *r)
+{
+ xfs_filblks_t new_len;
+
+ new_len = l->br_blockcount + m->br_blockcount + r->br_blockcount;
+ return new_len <= XFS_MAX_BMBT_EXTLEN;
+}
+
+#define CLEFT_CONTIG 0x01
+#define CRIGHT_CONTIG 0x02
+#define CHOLE 0x04
+#define CBOTH_CONTIG (CLEFT_CONTIG | CRIGHT_CONTIG)
+
+#define NLEFT_CONTIG 0x10
+#define NRIGHT_CONTIG 0x20
+#define NHOLE 0x40
+#define NBOTH_CONTIG (NLEFT_CONTIG | NRIGHT_CONTIG)
+
+/* Estimate the effect of a single exchange on mapping count. */
+static inline int
+xmi_delta_nextents_step(
+ struct xfs_mount *mp,
+ const struct xfs_bmbt_irec *left,
+ const struct xfs_bmbt_irec *curr,
+ const struct xfs_bmbt_irec *new,
+ const struct xfs_bmbt_irec *right)
+{
+ bool lhole, rhole, chole, nhole;
+ unsigned int state = 0;
+ int ret = 0;
+
+ lhole = left->br_startblock == HOLESTARTBLOCK;
+ rhole = right->br_startblock == HOLESTARTBLOCK;
+ chole = curr->br_startblock == HOLESTARTBLOCK;
+ nhole = new->br_startblock == HOLESTARTBLOCK;
+
+ if (chole)
+ state |= CHOLE;
+ if (!lhole && !chole && xmi_can_merge(left, curr))
+ state |= CLEFT_CONTIG;
+ if (!rhole && !chole && xmi_can_merge(curr, right))
+ state |= CRIGHT_CONTIG;
+ if ((state & CBOTH_CONTIG) == CBOTH_CONTIG &&
+ !xmi_can_merge_all(left, curr, right))
+ state &= ~CRIGHT_CONTIG;
+
+ if (nhole)
+ state |= NHOLE;
+ if (!lhole && !nhole && xmi_can_merge(left, new))
+ state |= NLEFT_CONTIG;
+ if (!rhole && !nhole && xmi_can_merge(new, right))
+ state |= NRIGHT_CONTIG;
+ if ((state & NBOTH_CONTIG) == NBOTH_CONTIG &&
+ !xmi_can_merge_all(left, new, right))
+ state &= ~NRIGHT_CONTIG;
+
+ switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) {
+ case CLEFT_CONTIG | CRIGHT_CONTIG:
+ /*
+ * left/curr/right are the same mapping, so deleting curr
+ * causes 2 new mappings to be created.
+ */
+ ret += 2;
+ break;
+ case 0:
+ /*
+ * curr is not contiguous with any mapping, so we remove curr
+ * completely
+ */
+ ret--;
+ break;
+ case CHOLE:
+ /* hole, do nothing */
+ break;
+ case CLEFT_CONTIG:
+ case CRIGHT_CONTIG:
+ /* trim either left or right, no change */
+ break;
+ }
+
+ switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) {
+ case NLEFT_CONTIG | NRIGHT_CONTIG:
+ /*
+ * left/curr/right will become the same mapping, so adding
+ * curr causes the deletion of right.
+ */
+ ret--;
+ break;
+ case 0:
+ /* new is not contiguous with any mapping */
+ ret++;
+ break;
+ case NHOLE:
+ /* hole, do nothing. */
+ break;
+ case NLEFT_CONTIG:
+ case NRIGHT_CONTIG:
+ /* new is absorbed into left or right, no change */
+ break;
+ }
+
+ trace_xfs_exchmaps_delta_nextents_step(mp, left, curr, new, right, ret,
+ state);
+ return ret;
+}
+
+/* Make sure we don't overflow the extent (mapping) counters. */
+static inline int
+xmi_ensure_delta_nextents(
+ struct xfs_exchmaps_req *req,
+ struct xfs_inode *ip,
+ int64_t delta)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int whichfork = xfs_exchmaps_reqfork(req);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ uint64_t new_nextents;
+ xfs_extnum_t max_nextents;
+
+ if (delta < 0)
+ return 0;
+
+ /*
+ * It's always an error if the delta causes integer overflow. delta
+ * needs an explicit cast here to avoid warnings about implicit casts
+ * coded into the overflow check.
+ */
+ if (check_add_overflow(ifp->if_nextents, (uint64_t)delta,
+ &new_nextents))
+ return -EFBIG;
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
+ new_nextents > 10)
+ return -EFBIG;
+
+ /*
+ * We always promote both inodes to have large extent counts if the
+ * superblock feature is enabled, so we only need to check against the
+ * theoretical maximum.
+ */
+ max_nextents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
+ whichfork);
+ if (new_nextents > max_nextents)
+ return -EFBIG;
+
+ return 0;
+}
+
+/* Find the next mapping after irec. */
+static inline int
+xmi_next(
+ struct xfs_inode *ip,
+ int bmap_flags,
+ const struct xfs_bmbt_irec *irec,
+ struct xfs_bmbt_irec *nrec)
+{
+ xfs_fileoff_t off;
+ xfs_filblks_t blockcount;
+ int nimaps = 1;
+ int error;
+
+ off = irec->br_startoff + irec->br_blockcount;
+ blockcount = XFS_MAX_FILEOFF - off;
+ error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags);
+ if (error)
+ return error;
+ if (nrec->br_startblock == DELAYSTARTBLOCK ||
+ nrec->br_startoff != off) {
+ /*
+ * If we don't get the mapping we want, return a zero-length
+ * mapping, which our estimator function will pretend is a hole.
+ * We shouldn't get delalloc reservations.
+ */
+ nrec->br_startblock = HOLESTARTBLOCK;
+ }
+
+ return 0;
+}
+
+int __init
+xfs_exchmaps_intent_init_cache(void)
+{
+ xfs_exchmaps_intent_cache = kmem_cache_create("xfs_exchmaps_intent",
+ sizeof(struct xfs_exchmaps_intent),
+ 0, 0, NULL);
+
+ return xfs_exchmaps_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_exchmaps_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_exchmaps_intent_cache);
+ xfs_exchmaps_intent_cache = NULL;
+}
+
+/*
+ * Decide if we will exchange the reflink flags between the two files after the
+ * exchange. The only time we want to do this is if we're exchanging all
+ * mappings under EOF and the inode reflink flags have different states.
+ */
+static inline bool
+xmi_can_exchange_reflink_flags(
+ const struct xfs_exchmaps_req *req,
+ unsigned int reflink_state)
+{
+ struct xfs_mount *mp = req->ip1->i_mount;
+
+ if (hweight32(reflink_state) != 1)
+ return false;
+ if (req->startoff1 != 0 || req->startoff2 != 0)
+ return false;
+ if (req->blockcount != XFS_B_TO_FSB(mp, req->ip1->i_disk_size))
+ return false;
+ if (req->blockcount != XFS_B_TO_FSB(mp, req->ip2->i_disk_size))
+ return false;
+ return true;
+}
+
+
+/* Allocate and initialize a new incore intent item from a request. */
+struct xfs_exchmaps_intent *
+xfs_exchmaps_init_intent(
+ const struct xfs_exchmaps_req *req)
+{
+ struct xfs_exchmaps_intent *xmi;
+ unsigned int rs = 0;
+
+ xmi = kmem_cache_zalloc(xfs_exchmaps_intent_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ INIT_LIST_HEAD(&xmi->xmi_list);
+ xmi->xmi_ip1 = req->ip1;
+ xmi->xmi_ip2 = req->ip2;
+ xmi->xmi_startoff1 = req->startoff1;
+ xmi->xmi_startoff2 = req->startoff2;
+ xmi->xmi_blockcount = req->blockcount;
+ xmi->xmi_isize1 = xmi->xmi_isize2 = -1;
+ xmi->xmi_flags = req->flags & XFS_EXCHMAPS_PARAMS;
+
+ if (xfs_exchmaps_whichfork(xmi) == XFS_ATTR_FORK) {
+ xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;
+ return xmi;
+ }
+
+ if (req->flags & XFS_EXCHMAPS_SET_SIZES) {
+ xmi->xmi_flags |= XFS_EXCHMAPS_SET_SIZES;
+ xmi->xmi_isize1 = req->ip2->i_disk_size;
+ xmi->xmi_isize2 = req->ip1->i_disk_size;
+ }
+
+ /* Record the state of each inode's reflink flag before the op. */
+ if (xfs_is_reflink_inode(req->ip1))
+ rs |= 1;
+ if (xfs_is_reflink_inode(req->ip2))
+ rs |= 2;
+
+ /*
+ * Figure out if we're clearing the reflink flags (which effectively
+ * exchanges them) after the operation.
+ */
+ if (xmi_can_exchange_reflink_flags(req, rs)) {
+ if (rs & 1)
+ xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
+ if (rs & 2)
+ xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
+ }
+
+ if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode) ||
+ S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
+ xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;
+
+ return xmi;
+}
+
+/*
+ * Estimate the number of exchange operations and the number of file blocks
+ * in each file that will be affected by the exchange operation.
+ */
+int
+xfs_exchmaps_estimate(
+ struct xfs_exchmaps_req *req)
+{
+ struct xfs_exchmaps_intent *xmi;
+ struct xfs_bmbt_irec irec1, irec2;
+ struct xfs_exchmaps_adjacent adj = ADJACENT_INIT;
+ xfs_filblks_t ip1_blocks = 0, ip2_blocks = 0;
+ int64_t d_nexts1, d_nexts2;
+ int bmap_flags;
+ int error;
+
+ ASSERT(!(req->flags & ~XFS_EXCHMAPS_PARAMS));
+
+ bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_reqfork(req));
+ xmi = xfs_exchmaps_init_intent(req);
+
+ /*
+ * To guard against the possibility of overflowing the extent counters,
+ * we have to estimate an upper bound on the potential increase in that
+ * counter. We can split the mapping at each end of the range, and for
+ * each step of the exchange we can split the mapping that we're
+ * working on if the mappings do not align.
+ */
+ d_nexts1 = d_nexts2 = 3;
+
+ while (xmi_has_more_exchange_work(xmi)) {
+ /*
+ * Walk through the file ranges until we find something to
+ * exchange. Because we're simulating the exchange, pass in
+ * adj to capture skipped mappings for correct estimation of
+ * bmbt record merges.
+ */
+ error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, &adj);
+ if (error)
+ goto out_free;
+ if (!xmi_has_more_exchange_work(xmi))
+ break;
+
+ /* Update accounting. */
+ if (xfs_bmap_is_real_extent(&irec1))
+ ip1_blocks += irec1.br_blockcount;
+ if (xfs_bmap_is_real_extent(&irec2))
+ ip2_blocks += irec2.br_blockcount;
+ req->nr_exchanges++;
+
+ /* Read the next mappings from both files. */
+ error = xmi_next(req->ip1, bmap_flags, &irec1, &adj.right1);
+ if (error)
+ goto out_free;
+
+ error = xmi_next(req->ip2, bmap_flags, &irec2, &adj.right2);
+ if (error)
+ goto out_free;
+
+ /* Update extent count deltas. */
+ d_nexts1 += xmi_delta_nextents_step(req->ip1->i_mount,
+ &adj.left1, &irec1, &irec2, &adj.right1);
+
+ d_nexts2 += xmi_delta_nextents_step(req->ip1->i_mount,
+ &adj.left2, &irec2, &irec1, &adj.right2);
+
+ /* Now pretend we exchanged the mappings. */
+ if (xmi_can_merge(&adj.left2, &irec1))
+ adj.left2.br_blockcount += irec1.br_blockcount;
+ else
+ memcpy(&adj.left2, &irec1, sizeof(irec1));
+
+ if (xmi_can_merge(&adj.left1, &irec2))
+ adj.left1.br_blockcount += irec2.br_blockcount;
+ else
+ memcpy(&adj.left1, &irec2, sizeof(irec2));
+
+ xmi_advance(xmi, &irec1);
+ }
+
+ /* Account for the blocks that are being exchanged. */
+ if (XFS_IS_REALTIME_INODE(req->ip1) &&
+ xfs_exchmaps_reqfork(req) == XFS_DATA_FORK) {
+ req->ip1_rtbcount = ip1_blocks;
+ req->ip2_rtbcount = ip2_blocks;
+ } else {
+ req->ip1_bcount = ip1_blocks;
+ req->ip2_bcount = ip2_blocks;
+ }
+
+ /*
+ * Make sure that both forks have enough slack left in their extent
+ * counters that the exchange operation will not overflow.
+ */
+ trace_xfs_exchmaps_delta_nextents(req, d_nexts1, d_nexts2);
+ if (req->ip1 == req->ip2) {
+ error = xmi_ensure_delta_nextents(req, req->ip1,
+ d_nexts1 + d_nexts2);
+ } else {
+ error = xmi_ensure_delta_nextents(req, req->ip1, d_nexts1);
+ if (error)
+ goto out_free;
+ error = xmi_ensure_delta_nextents(req, req->ip2, d_nexts2);
+ }
+ if (error)
+ goto out_free;
+
+ trace_xfs_exchmaps_initial_estimate(req);
+ error = xfs_exchmaps_estimate_overhead(req);
+out_free:
+ kmem_cache_free(xfs_exchmaps_intent_cache, xmi);
+ return error;
+}
+
+/* Set the reflink flag before an operation. */
+static inline void
+xfs_exchmaps_set_reflink(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ trace_xfs_reflink_set_inode_flag(ip);
+
+ ip->i_diflags2 |= XFS_DIFLAG2_REFLINK;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
+ * If either file has shared blocks and we're exchanging data forks, we must
+ * flag the other file as having shared blocks so that we get the shared-block
+ * rmap functions if we need to fix up the rmaps.
+ */
+void
+xfs_exchmaps_ensure_reflink(
+ struct xfs_trans *tp,
+ const struct xfs_exchmaps_intent *xmi)
+{
+ unsigned int rs = 0;
+
+ if (xfs_is_reflink_inode(xmi->xmi_ip1))
+ rs |= 1;
+ if (xfs_is_reflink_inode(xmi->xmi_ip2))
+ rs |= 2;
+
+ if ((rs & 1) && !xfs_is_reflink_inode(xmi->xmi_ip2))
+ xfs_exchmaps_set_reflink(tp, xmi->xmi_ip2);
+
+ if ((rs & 2) && !xfs_is_reflink_inode(xmi->xmi_ip1))
+ xfs_exchmaps_set_reflink(tp, xmi->xmi_ip1);
+}
+
+/* Set the large extent count flag before an operation if needed. */
+static inline void
+xfs_exchmaps_ensure_large_extent_counts(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ if (xfs_inode_has_large_extent_counts(ip))
+ return;
+
+ ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/* Widen the extent counter fields of both inodes if necessary. */
+void
+xfs_exchmaps_upgrade_extent_counts(
+ struct xfs_trans *tp,
+ const struct xfs_exchmaps_intent *xmi)
+{
+ if (!xfs_has_large_extent_counts(tp->t_mountp))
+ return;
+
+ xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip1);
+ xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip2);
+}
+
+/*
+ * Schedule an exchange a range of mappings from one inode to another.
+ *
+ * The use of file mapping exchange log intent items ensures the operation can
+ * be resumed even if the system goes down. The caller must commit the
+ * transaction to start the work.
+ *
+ * The caller must ensure the inodes must be joined to the transaction and
+ * ILOCKd; they will still be joined to the transaction at exit.
+ */
+void
+xfs_exchange_mappings(
+ struct xfs_trans *tp,
+ const struct xfs_exchmaps_req *req)
+{
+ struct xfs_exchmaps_intent *xmi;
+
+ BUILD_BUG_ON(XFS_EXCHMAPS_INTERNAL_FLAGS & XFS_EXCHMAPS_LOGGED_FLAGS);
+
+ xfs_assert_ilocked(req->ip1, XFS_ILOCK_EXCL);
+ xfs_assert_ilocked(req->ip2, XFS_ILOCK_EXCL);
+ ASSERT(!(req->flags & ~XFS_EXCHMAPS_LOGGED_FLAGS));
+ if (req->flags & XFS_EXCHMAPS_SET_SIZES)
+ ASSERT(!(req->flags & XFS_EXCHMAPS_ATTR_FORK));
+ ASSERT(xfs_has_exchange_range(tp->t_mountp));
+
+ if (req->blockcount == 0)
+ return;
+
+ xmi = xfs_exchmaps_init_intent(req);
+ xfs_exchmaps_defer_add(tp, xmi);
+ xfs_exchmaps_ensure_reflink(tp, xmi);
+ xfs_exchmaps_upgrade_extent_counts(tp, xmi);
+}
diff --git a/fs/xfs/libxfs/xfs_exchmaps.h b/fs/xfs/libxfs/xfs_exchmaps.h
new file mode 100644
index 000000000000..fa822dff202a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_exchmaps.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_EXCHMAPS_H__
+#define __XFS_EXCHMAPS_H__
+
+/* In-core deferred operation info about a file mapping exchange request. */
+struct xfs_exchmaps_intent {
+ /* List of other incore deferred work. */
+ struct list_head xmi_list;
+
+ /* Inodes participating in the operation. */
+ struct xfs_inode *xmi_ip1;
+ struct xfs_inode *xmi_ip2;
+
+ /* File offset range information. */
+ xfs_fileoff_t xmi_startoff1;
+ xfs_fileoff_t xmi_startoff2;
+ xfs_filblks_t xmi_blockcount;
+
+ /* Set these file sizes after the operation, unless negative. */
+ xfs_fsize_t xmi_isize1;
+ xfs_fsize_t xmi_isize2;
+
+ uint64_t xmi_flags; /* XFS_EXCHMAPS_* flags */
+};
+
+/* Try to convert inode2 from block to short format at the end, if possible. */
+#define __XFS_EXCHMAPS_INO2_SHORTFORM (1ULL << 63)
+
+#define XFS_EXCHMAPS_INTERNAL_FLAGS (__XFS_EXCHMAPS_INO2_SHORTFORM)
+
+/* flags that can be passed to xfs_exchmaps_{estimate,mappings} */
+#define XFS_EXCHMAPS_PARAMS (XFS_EXCHMAPS_ATTR_FORK | \
+ XFS_EXCHMAPS_SET_SIZES | \
+ XFS_EXCHMAPS_INO1_WRITTEN)
+
+static inline int
+xfs_exchmaps_whichfork(const struct xfs_exchmaps_intent *xmi)
+{
+ if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)
+ return XFS_ATTR_FORK;
+ return XFS_DATA_FORK;
+}
+
+/* Parameters for a mapping exchange request. */
+struct xfs_exchmaps_req {
+ /* Inodes participating in the operation. */
+ struct xfs_inode *ip1;
+ struct xfs_inode *ip2;
+
+ /* File offset range information. */
+ xfs_fileoff_t startoff1;
+ xfs_fileoff_t startoff2;
+ xfs_filblks_t blockcount;
+
+ /* XFS_EXCHMAPS_* operation flags */
+ uint64_t flags;
+
+ /*
+ * Fields below this line are filled out by xfs_exchmaps_estimate;
+ * callers should initialize this part of the struct to zero.
+ */
+
+ /*
+ * Data device blocks to be moved out of ip1, and free space needed to
+ * handle the bmbt changes.
+ */
+ xfs_filblks_t ip1_bcount;
+
+ /*
+ * Data device blocks to be moved out of ip2, and free space needed to
+ * handle the bmbt changes.
+ */
+ xfs_filblks_t ip2_bcount;
+
+ /* rt blocks to be moved out of ip1. */
+ xfs_filblks_t ip1_rtbcount;
+
+ /* rt blocks to be moved out of ip2. */
+ xfs_filblks_t ip2_rtbcount;
+
+ /* Free space needed to handle the bmbt changes */
+ unsigned long long resblks;
+
+ /* Number of exchanges needed to complete the operation */
+ unsigned long long nr_exchanges;
+};
+
+static inline int
+xfs_exchmaps_reqfork(const struct xfs_exchmaps_req *req)
+{
+ if (req->flags & XFS_EXCHMAPS_ATTR_FORK)
+ return XFS_ATTR_FORK;
+ return XFS_DATA_FORK;
+}
+
+int xfs_exchmaps_estimate_overhead(struct xfs_exchmaps_req *req);
+int xfs_exchmaps_estimate(struct xfs_exchmaps_req *req);
+
+extern struct kmem_cache *xfs_exchmaps_intent_cache;
+
+int __init xfs_exchmaps_intent_init_cache(void);
+void xfs_exchmaps_intent_destroy_cache(void);
+
+struct xfs_exchmaps_intent *xfs_exchmaps_init_intent(
+ const struct xfs_exchmaps_req *req);
+void xfs_exchmaps_ensure_reflink(struct xfs_trans *tp,
+ const struct xfs_exchmaps_intent *xmi);
+void xfs_exchmaps_upgrade_extent_counts(struct xfs_trans *tp,
+ const struct xfs_exchmaps_intent *xmi);
+
+int xfs_exchmaps_finish_one(struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi);
+
+int xfs_exchmaps_check_forks(struct xfs_mount *mp,
+ const struct xfs_exchmaps_req *req);
+
+void xfs_exchange_mappings(struct xfs_trans *tp,
+ const struct xfs_exchmaps_req *req);
+
+#endif /* __XFS_EXCHMAPS_H__ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 2b2f9050fbfb..61f51becff4f 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -367,19 +367,23 @@ xfs_sb_has_ro_compat_feature(
return (sbp->sb_features_ro_compat & feature) != 0;
}
-#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
-#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
-#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
-#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */
-#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */
-#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
+#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
+#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
+#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */
+#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */
+#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
+#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */
+#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */
#define XFS_SB_FEAT_INCOMPAT_ALL \
- (XFS_SB_FEAT_INCOMPAT_FTYPE| \
- XFS_SB_FEAT_INCOMPAT_SPINODES| \
- XFS_SB_FEAT_INCOMPAT_META_UUID| \
- XFS_SB_FEAT_INCOMPAT_BIGTIME| \
- XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
- XFS_SB_FEAT_INCOMPAT_NREXT64)
+ (XFS_SB_FEAT_INCOMPAT_FTYPE | \
+ XFS_SB_FEAT_INCOMPAT_SPINODES | \
+ XFS_SB_FEAT_INCOMPAT_META_UUID | \
+ XFS_SB_FEAT_INCOMPAT_BIGTIME | \
+ XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR | \
+ XFS_SB_FEAT_INCOMPAT_NREXT64 | \
+ XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \
+ XFS_SB_FEAT_INCOMPAT_PARENT)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -898,6 +902,12 @@ static inline uint xfs_dinode_size(int version)
#define XFS_MAXLINK ((1U << 31) - 1U)
/*
+ * Any file that hits the maximum ondisk link count should be pinned to avoid
+ * a use-after-free situation.
+ */
+#define XFS_NLINK_PINNED (~0U)
+
+/*
* Values for di_format
*
* This enum is used in string mapping in xfs_trace.h; please keep the
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index ca1b17d01437..97996cb79aaa 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -239,6 +239,8 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */
#define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */
#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */
+#define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */
+#define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */
/*
* Minimum and maximum sizes need for growth checks.
@@ -409,6 +411,7 @@ struct xfs_bulkstat {
#define XFS_BS_SICK_XATTR (1 << 5) /* extended attributes */
#define XFS_BS_SICK_SYMLINK (1 << 6) /* symbolic link remote target */
#define XFS_BS_SICK_PARENT (1 << 7) /* parent pointers */
+#define XFS_BS_SICK_DIRTREE (1 << 8) /* directory tree structure */
/*
* Project quota id helpers (previously projid was 16bit only
@@ -632,7 +635,9 @@ typedef struct xfs_fsop_attrmulti_handlereq {
/*
* per machine unique filesystem identifier types.
*/
-typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */
+typedef struct xfs_fsid {
+ __u32 val[2]; /* file system id type */
+} xfs_fsid_t;
typedef struct xfs_fid {
__u16 fid_len; /* length of remainder */
@@ -715,9 +720,19 @@ struct xfs_scrub_metadata {
#define XFS_SCRUB_TYPE_QUOTACHECK 25 /* quota counters */
#define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */
#define XFS_SCRUB_TYPE_HEALTHY 27 /* everything checked out ok */
+#define XFS_SCRUB_TYPE_DIRTREE 28 /* directory tree structure */
/* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR 28
+#define XFS_SCRUB_TYPE_NR 29
+
+/*
+ * This special type code only applies to the vectored scrub implementation.
+ *
+ * If any of the previous scrub vectors recorded runtime errors or have
+ * sv_flags bits set that match the OFLAG bits in the barrier vector's
+ * sv_flags, set the barrier's sv_ret to -ECANCELED and return to userspace.
+ */
+#define XFS_SCRUB_TYPE_BARRIER (0xFFFFFFFF)
/* i: Repair this metadata. */
#define XFS_SCRUB_IFLAG_REPAIR (1u << 0)
@@ -763,6 +778,29 @@ struct xfs_scrub_metadata {
XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED)
#define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT)
+/* Vectored scrub calls to reduce the number of kernel transitions. */
+
+struct xfs_scrub_vec {
+ __u32 sv_type; /* XFS_SCRUB_TYPE_* */
+ __u32 sv_flags; /* XFS_SCRUB_FLAGS_* */
+ __s32 sv_ret; /* 0 or a negative error code */
+ __u32 sv_reserved; /* must be zero */
+};
+
+/* Vectored metadata scrub control structure. */
+struct xfs_scrub_vec_head {
+ __u64 svh_ino; /* inode number. */
+ __u32 svh_gen; /* inode generation. */
+ __u32 svh_agno; /* ag number. */
+ __u32 svh_flags; /* XFS_SCRUB_VEC_FLAGS_* */
+ __u16 svh_rest_us; /* wait this much time between vector items */
+ __u16 svh_nr; /* number of svh_vectors */
+ __u64 svh_reserved; /* must be zero */
+ __u64 svh_vectors; /* pointer to buffer of xfs_scrub_vec */
+};
+
+#define XFS_SCRUB_VEC_FLAGS_ALL (0)
+
/*
* ioctl limits
*/
@@ -772,6 +810,118 @@ struct xfs_scrub_metadata {
# define XFS_XATTR_LIST_MAX 65536
#endif
+/*
+ * Exchange part of file1 with part of the file that this ioctl that is being
+ * called against (which we'll call file2). Filesystems must be able to
+ * restart and complete the operation even after the system goes down.
+ */
+struct xfs_exchange_range {
+ __s32 file1_fd;
+ __u32 pad; /* must be zeroes */
+ __u64 file1_offset; /* file1 offset, bytes */
+ __u64 file2_offset; /* file2 offset, bytes */
+ __u64 length; /* bytes to exchange */
+
+ __u64 flags; /* see XFS_EXCHANGE_RANGE_* below */
+};
+
+/*
+ * Exchange file data all the way to the ends of both files, and then exchange
+ * the file sizes. This flag can be used to replace a file's contents with a
+ * different amount of data. length will be ignored.
+ */
+#define XFS_EXCHANGE_RANGE_TO_EOF (1ULL << 0)
+
+/* Flush all changes in file data and file metadata to disk before returning. */
+#define XFS_EXCHANGE_RANGE_DSYNC (1ULL << 1)
+
+/* Dry run; do all the parameter verification but do not change anything. */
+#define XFS_EXCHANGE_RANGE_DRY_RUN (1ULL << 2)
+
+/*
+ * Exchange only the parts of the two files where the file allocation units
+ * mapped to file1's range have been written to. This can accelerate
+ * scatter-gather atomic writes with a temp file if all writes are aligned to
+ * the file allocation unit.
+ */
+#define XFS_EXCHANGE_RANGE_FILE1_WRITTEN (1ULL << 3)
+
+#define XFS_EXCHANGE_RANGE_ALL_FLAGS (XFS_EXCHANGE_RANGE_TO_EOF | \
+ XFS_EXCHANGE_RANGE_DSYNC | \
+ XFS_EXCHANGE_RANGE_DRY_RUN | \
+ XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
+
+/* Iterating parent pointers of files. */
+
+/* target was the root directory */
+#define XFS_GETPARENTS_OFLAG_ROOT (1U << 0)
+
+/* Cursor is done iterating pptrs */
+#define XFS_GETPARENTS_OFLAG_DONE (1U << 1)
+
+#define XFS_GETPARENTS_OFLAGS_ALL (XFS_GETPARENTS_OFLAG_ROOT | \
+ XFS_GETPARENTS_OFLAG_DONE)
+
+#define XFS_GETPARENTS_IFLAGS_ALL (0)
+
+struct xfs_getparents_rec {
+ struct xfs_handle gpr_parent; /* Handle to parent */
+ __u32 gpr_reclen; /* Length of entire record */
+ __u32 gpr_reserved; /* zero */
+ char gpr_name[]; /* Null-terminated filename */
+};
+
+/* Iterate through this file's directory parent pointers */
+struct xfs_getparents {
+ /*
+ * Structure to track progress in iterating the parent pointers.
+ * Must be initialized to zeroes before the first ioctl call, and
+ * not touched by callers after that.
+ */
+ struct xfs_attrlist_cursor gp_cursor;
+
+ /* Input flags: XFS_GETPARENTS_IFLAG* */
+ __u16 gp_iflags;
+
+ /* Output flags: XFS_GETPARENTS_OFLAG* */
+ __u16 gp_oflags;
+
+ /* Size of the gp_buffer in bytes */
+ __u32 gp_bufsize;
+
+ /* Must be set to zero */
+ __u64 gp_reserved;
+
+ /* Pointer to a buffer in which to place xfs_getparents_rec */
+ __u64 gp_buffer;
+};
+
+static inline struct xfs_getparents_rec *
+xfs_getparents_first_rec(struct xfs_getparents *gp)
+{
+ return (struct xfs_getparents_rec *)(uintptr_t)gp->gp_buffer;
+}
+
+static inline struct xfs_getparents_rec *
+xfs_getparents_next_rec(struct xfs_getparents *gp,
+ struct xfs_getparents_rec *gpr)
+{
+ void *next = ((void *)gpr + gpr->gpr_reclen);
+ void *end = (void *)(uintptr_t)(gp->gp_buffer + gp->gp_bufsize);
+
+ if (next >= end)
+ return NULL;
+
+ return next;
+}
+
+/* Iterate through this file handle's directory parent pointers. */
+struct xfs_getparents_by_handle {
+ /* Handle to file whose parents we want. */
+ struct xfs_handle gph_handle;
+
+ struct xfs_getparents gph_request;
+};
/*
* ioctl commands that are used by Linux filesystems
@@ -808,6 +958,9 @@ struct xfs_scrub_metadata {
/* XFS_IOC_GETFSMAP ------ hoisted 59 */
#define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata)
#define XFS_IOC_AG_GEOMETRY _IOWR('X', 61, struct xfs_ag_geometry)
+#define XFS_IOC_GETPARENTS _IOWR('X', 62, struct xfs_getparents)
+#define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle)
+#define XFS_IOC_SCRUBV_METADATA _IOWR('X', 64, struct xfs_scrub_vec_head)
/*
* ioctl commands that replace IRIX syssgi()'s
@@ -843,6 +996,7 @@ struct xfs_scrub_metadata {
#define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom)
#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req)
#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req)
+#define XFS_IOC_EXCHANGE_RANGE _IOWR('X', 129, struct xfs_exchange_range)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 3c64b5f9bd68..b0edb4288e59 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -95,6 +95,7 @@ struct xfs_da_args;
/* Don't propagate sick status to ag health summary during inactivation */
#define XFS_SICK_INO_FORGET (1 << 12)
+#define XFS_SICK_INO_DIRTREE (1 << 13) /* directory tree structure */
/* Primary evidence of health problems in a given group. */
#define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \
@@ -125,7 +126,8 @@ struct xfs_da_args;
XFS_SICK_INO_DIR | \
XFS_SICK_INO_XATTR | \
XFS_SICK_INO_SYMLINK | \
- XFS_SICK_INO_PARENT)
+ XFS_SICK_INO_PARENT | \
+ XFS_SICK_INO_DIRTREE)
#define XFS_SICK_INO_ZAPPED (XFS_SICK_INO_BMBTD_ZAPPED | \
XFS_SICK_INO_BMBTA_ZAPPED | \
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index e5ac3e5430c4..14c81f227c5b 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1058,6 +1058,33 @@ xfs_inobt_first_free_inode(
}
/*
+ * If this AG has corrupt inodes, check if allocating this inode would fail
+ * with corruption errors. Returns 0 if we're clear, or EAGAIN to try again
+ * somewhere else.
+ */
+static int
+xfs_dialloc_check_ino(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ xfs_ino_t ino)
+{
+ struct xfs_imap imap;
+ struct xfs_buf *bp;
+ int error;
+
+ error = xfs_imap(pag, tp, ino, &imap, 0);
+ if (error)
+ return -EAGAIN;
+
+ error = xfs_imap_to_bp(pag->pag_mount, tp, &imap, &bp);
+ if (error)
+ return -EAGAIN;
+
+ xfs_trans_brelse(tp, bp);
+ return 0;
+}
+
+/*
* Allocate an inode using the inobt-only algorithm.
*/
STATIC int
@@ -1309,6 +1336,13 @@ alloc_inode:
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
+
+ if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
+ error = xfs_dialloc_check_ino(pag, tp, ino);
+ if (error)
+ goto error0;
+ }
+
rec.ir_free &= ~XFS_INOBT_MASK(offset);
rec.ir_freecount--;
error = xfs_inobt_update(cur, &rec);
@@ -1584,6 +1618,12 @@ xfs_dialloc_ag(
XFS_INODES_PER_CHUNK) == 0);
ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
+ if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
+ error = xfs_dialloc_check_ino(pag, tp, ino);
+ if (error)
+ goto error_cur;
+ }
+
/*
* Modify or remove the finobt record.
*/
@@ -1699,7 +1739,7 @@ xfs_dialloc_good_ag(
return false;
if (!xfs_perag_initialised_agi(pag)) {
- error = xfs_ialloc_read_agi(pag, tp, NULL);
+ error = xfs_ialloc_read_agi(pag, tp, 0, NULL);
if (error)
return false;
}
@@ -1768,7 +1808,7 @@ xfs_dialloc_try_ag(
* Then read in the AGI buffer and recheck with the AGI buffer
* lock held.
*/
- error = xfs_ialloc_read_agi(pag, *tpp, &agbp);
+ error = xfs_ialloc_read_agi(pag, *tpp, 0, &agbp);
if (error)
return error;
@@ -2286,7 +2326,7 @@ xfs_difree(
/*
* Get the allocation group header.
*/
- error = xfs_ialloc_read_agi(pag, tp, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
if (error) {
xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
__func__, error);
@@ -2332,7 +2372,7 @@ xfs_imap_lookup(
int error;
int i;
- error = xfs_ialloc_read_agi(pag, tp, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
if (error) {
xfs_alert(mp,
"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
@@ -2675,6 +2715,7 @@ int
xfs_read_agi(
struct xfs_perag *pag,
struct xfs_trans *tp,
+ xfs_buf_flags_t flags,
struct xfs_buf **agibpp)
{
struct xfs_mount *mp = pag->pag_mount;
@@ -2684,7 +2725,7 @@ xfs_read_agi(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops);
+ XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
if (error)
@@ -2704,6 +2745,7 @@ int
xfs_ialloc_read_agi(
struct xfs_perag *pag,
struct xfs_trans *tp,
+ int flags,
struct xfs_buf **agibpp)
{
struct xfs_buf *agibp;
@@ -2712,7 +2754,9 @@ xfs_ialloc_read_agi(
trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno);
- error = xfs_read_agi(pag, tp, &agibp);
+ error = xfs_read_agi(pag, tp,
+ (flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
+ &agibp);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index f1412183bb44..b549627e3a61 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -63,10 +63,11 @@ xfs_ialloc_log_agi(
struct xfs_buf *bp, /* allocation group header buffer */
uint32_t fields); /* bitmask of fields to log */
-int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
+int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, xfs_buf_flags_t flags,
struct xfs_buf **agibpp);
int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
- struct xfs_buf **agibpp);
+ int flags, struct xfs_buf **agibpp);
+#define XFS_IALLOC_FLAG_TRYLOCK (1U << 0) /* use trylock for buffer locking */
/*
* Lookup a record by ino in the btree given by cur.
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index cc661fca6ff5..42e9fd47f6c7 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -745,7 +745,7 @@ xfs_finobt_count_blocks(
struct xfs_btree_cur *cur;
int error;
- error = xfs_ialloc_read_agi(pag, tp, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
if (error)
return error;
@@ -768,7 +768,7 @@ xfs_finobt_read_blocks(
struct xfs_agi *agi;
int error;
- error = xfs_ialloc_read_agi(pag, tp, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index d0dcce462bf4..d79002343d0b 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -491,6 +491,14 @@ xfs_dinode_verify(
return __this_address;
}
+ if (dip->di_version > 1) {
+ if (dip->di_onlink)
+ return __this_address;
+ } else {
+ if (dip->di_nlink)
+ return __this_address;
+ }
+
/* don't allow invalid i_size */
di_size = be64_to_cpu(dip->di_size);
if (di_size & (1ULL << 63))
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 7d660a973909..9d11ae015909 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -765,53 +765,46 @@ xfs_ifork_verify_local_attr(
return 0;
}
+/*
+ * Check if the inode fork supports adding nr_to_add more extents.
+ *
+ * If it doesn't but we can upgrade it to large extent counters, do the upgrade.
+ * If we can't upgrade or are already using big counters but still can't fit the
+ * additional extents, return -EFBIG.
+ */
int
-xfs_iext_count_may_overflow(
+xfs_iext_count_extend(
+ struct xfs_trans *tp,
struct xfs_inode *ip,
int whichfork,
- int nr_to_add)
+ uint nr_to_add)
{
+ struct xfs_mount *mp = ip->i_mount;
+ bool has_large =
+ xfs_inode_has_large_extent_counts(ip);
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
- uint64_t max_exts;
uint64_t nr_exts;
+ ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR);
+
if (whichfork == XFS_COW_FORK)
return 0;
- max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip),
- whichfork);
-
- if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
- max_exts = 10;
-
+ /* no point in upgrading if if_nextents overflows */
nr_exts = ifp->if_nextents + nr_to_add;
- if (nr_exts < ifp->if_nextents || nr_exts > max_exts)
+ if (nr_exts < ifp->if_nextents)
return -EFBIG;
- return 0;
-}
-
-/*
- * Upgrade this inode's extent counter fields to be able to handle a potential
- * increase in the extent count by nr_to_add. Normally this is the same
- * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG.
- */
-int
-xfs_iext_count_upgrade(
- struct xfs_trans *tp,
- struct xfs_inode *ip,
- uint nr_to_add)
-{
- ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR);
-
- if (!xfs_has_large_extent_counts(ip->i_mount) ||
- xfs_inode_has_large_extent_counts(ip) ||
- XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
+ nr_exts > 10)
return -EFBIG;
- ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
+ if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) {
+ if (has_large || !xfs_has_large_extent_counts(mp))
+ return -EFBIG;
+ ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ }
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index bd53eb951b65..2373d12fd474 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -256,10 +256,8 @@ extern void xfs_ifork_init_cow(struct xfs_inode *ip);
int xfs_ifork_verify_local_data(struct xfs_inode *ip);
int xfs_ifork_verify_local_attr(struct xfs_inode *ip);
-int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
- int nr_to_add);
-int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
- uint nr_to_add);
+int xfs_iext_count_extend(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, uint nr_to_add);
bool xfs_ifork_is_realtime(struct xfs_inode *ip, int whichfork);
/* returns true if the fork has extents but they are not read in yet. */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 16872972e1e9..3e6682ed656b 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -115,10 +115,13 @@ struct xfs_unmount_log_format {
#define XLOG_REG_TYPE_BUD_FORMAT 26
#define XLOG_REG_TYPE_ATTRI_FORMAT 27
#define XLOG_REG_TYPE_ATTRD_FORMAT 28
-#define XLOG_REG_TYPE_ATTR_NAME 29
+#define XLOG_REG_TYPE_ATTR_NAME 29
#define XLOG_REG_TYPE_ATTR_VALUE 30
-#define XLOG_REG_TYPE_MAX 30
-
+#define XLOG_REG_TYPE_XMI_FORMAT 31
+#define XLOG_REG_TYPE_XMD_FORMAT 32
+#define XLOG_REG_TYPE_ATTR_NEWNAME 33
+#define XLOG_REG_TYPE_ATTR_NEWVALUE 34
+#define XLOG_REG_TYPE_MAX 34
/*
* Flags to log operation header
@@ -243,6 +246,8 @@ typedef struct xfs_trans_header {
#define XFS_LI_BUD 0x1245
#define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/
#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */
+#define XFS_LI_XMI 0x1248 /* mapping exchange intent */
+#define XFS_LI_XMD 0x1249 /* mapping exchange done */
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -260,7 +265,9 @@ typedef struct xfs_trans_header {
{ XFS_LI_BUI, "XFS_LI_BUI" }, \
{ XFS_LI_BUD, "XFS_LI_BUD" }, \
{ XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \
- { XFS_LI_ATTRD, "XFS_LI_ATTRD" }
+ { XFS_LI_ATTRD, "XFS_LI_ATTRD" }, \
+ { XFS_LI_XMI, "XFS_LI_XMI" }, \
+ { XFS_LI_XMD, "XFS_LI_XMD" }
/*
* Inode Log Item Format definitions.
@@ -879,6 +886,61 @@ struct xfs_bud_log_format {
};
/*
+ * XMI/XMD (file mapping exchange) log format definitions
+ */
+
+/* This is the structure used to lay out an mapping exchange log item. */
+struct xfs_xmi_log_format {
+ uint16_t xmi_type; /* xmi log item type */
+ uint16_t xmi_size; /* size of this item */
+ uint32_t __pad; /* must be zero */
+ uint64_t xmi_id; /* xmi identifier */
+
+ uint64_t xmi_inode1; /* inumber of first file */
+ uint64_t xmi_inode2; /* inumber of second file */
+ uint32_t xmi_igen1; /* generation of first file */
+ uint32_t xmi_igen2; /* generation of second file */
+ uint64_t xmi_startoff1; /* block offset into file1 */
+ uint64_t xmi_startoff2; /* block offset into file2 */
+ uint64_t xmi_blockcount; /* number of blocks */
+ uint64_t xmi_flags; /* XFS_EXCHMAPS_* */
+ uint64_t xmi_isize1; /* intended file1 size */
+ uint64_t xmi_isize2; /* intended file2 size */
+};
+
+/* Exchange mappings between extended attribute forks instead of data forks. */
+#define XFS_EXCHMAPS_ATTR_FORK (1ULL << 0)
+
+/* Set the file sizes when finished. */
+#define XFS_EXCHMAPS_SET_SIZES (1ULL << 1)
+
+/*
+ * Exchange the mappings of the two files only if the file allocation units
+ * mapped to file1's range have been written.
+ */
+#define XFS_EXCHMAPS_INO1_WRITTEN (1ULL << 2)
+
+/* Clear the reflink flag from inode1 after the operation. */
+#define XFS_EXCHMAPS_CLEAR_INO1_REFLINK (1ULL << 3)
+
+/* Clear the reflink flag from inode2 after the operation. */
+#define XFS_EXCHMAPS_CLEAR_INO2_REFLINK (1ULL << 4)
+
+#define XFS_EXCHMAPS_LOGGED_FLAGS (XFS_EXCHMAPS_ATTR_FORK | \
+ XFS_EXCHMAPS_SET_SIZES | \
+ XFS_EXCHMAPS_INO1_WRITTEN | \
+ XFS_EXCHMAPS_CLEAR_INO1_REFLINK | \
+ XFS_EXCHMAPS_CLEAR_INO2_REFLINK)
+
+/* This is the structure used to lay out an mapping exchange done log item. */
+struct xfs_xmd_log_format {
+ uint16_t xmd_type; /* xmd log item type */
+ uint16_t xmd_size; /* size of this item */
+ uint32_t __pad;
+ uint64_t xmd_xmi_id; /* id of corresponding xmi */
+};
+
+/*
* Dquot Log format definitions.
*
* The first two fields must be the type and size fitting into
@@ -966,6 +1028,9 @@ struct xfs_icreate_log {
#define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */
#define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */
#define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */
+#define XFS_ATTRI_OP_FLAGS_PPTR_SET 4 /* Set parent pointer */
+#define XFS_ATTRI_OP_FLAGS_PPTR_REMOVE 5 /* Remove parent pointer */
+#define XFS_ATTRI_OP_FLAGS_PPTR_REPLACE 6 /* Replace parent pointer */
#define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */
/*
@@ -974,6 +1039,7 @@ struct xfs_icreate_log {
*/
#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \
XFS_ATTR_SECURE | \
+ XFS_ATTR_PARENT | \
XFS_ATTR_INCOMPLETE)
/*
@@ -983,11 +1049,22 @@ struct xfs_icreate_log {
struct xfs_attri_log_format {
uint16_t alfi_type; /* attri log item type */
uint16_t alfi_size; /* size of this item */
- uint32_t __pad; /* pad to 64 bit aligned */
+ uint32_t alfi_igen; /* generation of alfi_ino for pptr ops */
uint64_t alfi_id; /* attri identifier */
uint64_t alfi_ino; /* the inode for this attr operation */
uint32_t alfi_op_flags; /* marks the op as a set or remove */
- uint32_t alfi_name_len; /* attr name length */
+ union {
+ uint32_t alfi_name_len; /* attr name length */
+ struct {
+ /*
+ * For PPTR_REPLACE, these are the lengths of the old
+ * and new attr names. The new and old values must
+ * have the same length.
+ */
+ uint16_t alfi_old_name_len;
+ uint16_t alfi_new_name_len;
+ };
+ };
uint32_t alfi_value_len; /* attr value length */
uint32_t alfi_attr_filter;/* attr filter flags */
};
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 9fe7a9564bca..521d327e4c89 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -75,6 +75,8 @@ extern const struct xlog_recover_item_ops xlog_cui_item_ops;
extern const struct xlog_recover_item_ops xlog_cud_item_ops;
extern const struct xlog_recover_item_ops xlog_attri_item_ops;
extern const struct xlog_recover_item_ops xlog_attrd_item_ops;
+extern const struct xlog_recover_item_ops xlog_xmi_item_ops;
+extern const struct xlog_recover_item_ops xlog_xmd_item_ops;
/*
* Macros, structures, prototypes for internal log manager use.
@@ -121,6 +123,8 @@ bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len);
int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino,
struct xfs_inode **ipp);
+int xlog_recover_iget_handle(struct xfs_mount *mp, xfs_ino_t ino, uint32_t gen,
+ struct xfs_inode **ipp);
void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type,
uint64_t intent_id);
int xlog_alloc_buf_cancel_table(struct xlog *log);
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index 9975b93a7412..d3bd6a86c8fe 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -17,6 +17,34 @@
#include "xfs_trace.h"
/*
+ * Shortly after enabling the large extents count feature in 2023, longstanding
+ * bugs were found in the code that computes the minimum log size. Luckily,
+ * the bugs resulted in over-estimates of that size, so there's no impact to
+ * existing users. However, we don't want to reduce the minimum log size
+ * because that can create the situation where a newer mkfs writes a new
+ * filesystem that an older kernel won't mount.
+ *
+ * Several years prior, we also discovered that the transaction reservations
+ * for rmap and reflink operations were unnecessarily large. That was fixed,
+ * but the minimum log size computation was left alone to avoid the
+ * compatibility problems noted above. Fix that too.
+ *
+ * Therefore, we only may correct the computation starting with filesystem
+ * features that didn't exist in 2023. In other words, only turn this on if
+ * the filesystem has parent pointers.
+ *
+ * This function can be called before the XFS_HAS_* flags have been set up,
+ * (e.g. mkfs) so we must check the ondisk superblock.
+ */
+static inline bool
+xfs_want_minlogsize_fixes(
+ struct xfs_sb *sb)
+{
+ return xfs_sb_is_v5(sb) &&
+ xfs_sb_has_incompat_feature(sb, XFS_SB_FEAT_INCOMPAT_PARENT);
+}
+
+/*
* Calculate the maximum length in bytes that would be required for a local
* attribute value as large attributes out of line are not logged.
*/
@@ -31,6 +59,15 @@ xfs_log_calc_max_attrsetm_res(
MAXNAMELEN - 1;
nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
nblks += XFS_B_TO_FSB(mp, size);
+
+ /*
+ * If the feature set is new enough, correct a unit conversion error in
+ * the xattr transaction reservation code that resulted in oversized
+ * minimum log size computations.
+ */
+ if (xfs_want_minlogsize_fixes(&mp->m_sb))
+ size = XFS_B_TO_FSB(mp, size);
+
nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK);
return M_RES(mp)->tr_attrsetm.tr_logres +
@@ -49,6 +86,15 @@ xfs_log_calc_trans_resv_for_minlogblocks(
unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
/*
+ * If the feature set is new enough, drop the oversized minimum log
+ * size computation introduced by the original reflink code.
+ */
+ if (xfs_want_minlogsize_fixes(&mp->m_sb)) {
+ xfs_trans_resv_calc(mp, resv);
+ return;
+ }
+
+ /*
* In the early days of rmap+reflink, we always set the rmap maxlevels
* to 9 even if the AG was small enough that it would never grow to
* that height. Transaction reservation sizes influence the minimum
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 81885a6a028e..e8cdd77d03fa 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -119,6 +119,7 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1);
XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_parent_rec, 12);
/* log structures */
XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88);
@@ -155,6 +156,11 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16);
XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16);
+ /* parent pointer ioctls */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_rec, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_getparents, 40);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_by_handle, 64);
+
/*
* The v5 superblock format extended several v4 header structures with
* additional data. While new fields are only accessible on v5
diff --git a/fs/xfs/libxfs/xfs_parent.c b/fs/xfs/libxfs/xfs_parent.c
new file mode 100644
index 000000000000..69366c44a701
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_parent.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022-2024 Oracle.
+ * All rights reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_da_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr_sf.h"
+#include "xfs_bmap.h"
+#include "xfs_defer.h"
+#include "xfs_log.h"
+#include "xfs_xattr.h"
+#include "xfs_parent.h"
+#include "xfs_trans_space.h"
+#include "xfs_attr_item.h"
+#include "xfs_health.h"
+
+struct kmem_cache *xfs_parent_args_cache;
+
+/*
+ * Parent pointer attribute handling.
+ *
+ * Because the attribute name is a filename component, it will never be longer
+ * than 255 bytes and must not contain nulls or slashes. These are roughly the
+ * same constraints that apply to attribute names.
+ *
+ * The attribute value must always be a struct xfs_parent_rec. This means the
+ * attribute will never be in remote format because 12 bytes is nowhere near
+ * xfs_attr_leaf_entsize_local_max() (~75% of block size).
+ *
+ * Creating a new parent attribute will always create a new attribute - there
+ * should never, ever be an existing attribute in the tree for a new inode.
+ * ENOSPC behavior is problematic - creating the inode without the parent
+ * pointer is effectively a corruption, so we allow parent attribute creation
+ * to dip into the reserve block pool to avoid unexpected ENOSPC errors from
+ * occurring.
+ */
+
+/* Return true if parent pointer attr name is valid. */
+bool
+xfs_parent_namecheck(
+ unsigned int attr_flags,
+ const void *name,
+ size_t length)
+{
+ /*
+ * Parent pointers always use logged operations, so there should never
+ * be incomplete xattrs.
+ */
+ if (attr_flags & XFS_ATTR_INCOMPLETE)
+ return false;
+
+ return xfs_dir2_namecheck(name, length);
+}
+
+/* Return true if parent pointer attr value is valid. */
+bool
+xfs_parent_valuecheck(
+ struct xfs_mount *mp,
+ const void *value,
+ size_t valuelen)
+{
+ const struct xfs_parent_rec *rec = value;
+
+ if (!xfs_has_parent(mp))
+ return false;
+
+ /* The xattr value must be a parent record. */
+ if (valuelen != sizeof(struct xfs_parent_rec))
+ return false;
+
+ /* The parent record must be local. */
+ if (value == NULL)
+ return false;
+
+ /* The parent inumber must be valid. */
+ if (!xfs_verify_dir_ino(mp, be64_to_cpu(rec->p_ino)))
+ return false;
+
+ return true;
+}
+
+/* Compute the attribute name hash for a parent pointer. */
+xfs_dahash_t
+xfs_parent_hashval(
+ struct xfs_mount *mp,
+ const uint8_t *name,
+ int namelen,
+ xfs_ino_t parent_ino)
+{
+ struct xfs_name xname = {
+ .name = name,
+ .len = namelen,
+ };
+
+ /*
+ * Use the same dirent name hash as would be used on the directory, but
+ * mix in the parent inode number to avoid collisions on hardlinked
+ * files with identical names but different parents.
+ */
+ return xfs_dir2_hashname(mp, &xname) ^
+ upper_32_bits(parent_ino) ^ lower_32_bits(parent_ino);
+}
+
+/* Compute the attribute name hash from the xattr components. */
+xfs_dahash_t
+xfs_parent_hashattr(
+ struct xfs_mount *mp,
+ const uint8_t *name,
+ int namelen,
+ const void *value,
+ int valuelen)
+{
+ const struct xfs_parent_rec *rec = value;
+
+ /* Requires a local attr value in xfs_parent_rec format */
+ if (valuelen != sizeof(struct xfs_parent_rec)) {
+ ASSERT(valuelen == sizeof(struct xfs_parent_rec));
+ return 0;
+ }
+
+ if (!value) {
+ ASSERT(value != NULL);
+ return 0;
+ }
+
+ return xfs_parent_hashval(mp, name, namelen, be64_to_cpu(rec->p_ino));
+}
+
+/*
+ * Initialize the parent pointer arguments structure. Caller must have zeroed
+ * the contents of @args. @tp is only required for updates.
+ */
+static void
+xfs_parent_da_args_init(
+ struct xfs_da_args *args,
+ struct xfs_trans *tp,
+ struct xfs_parent_rec *rec,
+ struct xfs_inode *child,
+ xfs_ino_t owner,
+ const struct xfs_name *parent_name)
+{
+ args->geo = child->i_mount->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->attr_filter = XFS_ATTR_PARENT;
+ args->op_flags = XFS_DA_OP_LOGGED | XFS_DA_OP_OKNOENT;
+ args->trans = tp;
+ args->dp = child;
+ args->owner = owner;
+ args->name = parent_name->name;
+ args->namelen = parent_name->len;
+ args->value = rec;
+ args->valuelen = sizeof(struct xfs_parent_rec);
+ xfs_attr_sethash(args);
+}
+
+/* Make sure the incore state is ready for a parent pointer query/update. */
+static inline int
+xfs_parent_iread_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *child)
+{
+ /* Parent pointers require that the attr fork must exist. */
+ if (XFS_IS_CORRUPT(child->i_mount, !xfs_inode_has_attr_fork(child))) {
+ xfs_inode_mark_sick(child, XFS_SICK_INO_PARENT);
+ return -EFSCORRUPTED;
+ }
+
+ return xfs_iread_extents(tp, child, XFS_ATTR_FORK);
+}
+
+/* Add a parent pointer to reflect a dirent addition. */
+int
+xfs_parent_addname(
+ struct xfs_trans *tp,
+ struct xfs_parent_args *ppargs,
+ struct xfs_inode *dp,
+ const struct xfs_name *parent_name,
+ struct xfs_inode *child)
+{
+ int error;
+
+ error = xfs_parent_iread_extents(tp, child);
+ if (error)
+ return error;
+
+ xfs_inode_to_parent_rec(&ppargs->rec, dp);
+ xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child,
+ child->i_ino, parent_name);
+ xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_SET);
+ return 0;
+}
+
+/* Remove a parent pointer to reflect a dirent removal. */
+int
+xfs_parent_removename(
+ struct xfs_trans *tp,
+ struct xfs_parent_args *ppargs,
+ struct xfs_inode *dp,
+ const struct xfs_name *parent_name,
+ struct xfs_inode *child)
+{
+ int error;
+
+ error = xfs_parent_iread_extents(tp, child);
+ if (error)
+ return error;
+
+ xfs_inode_to_parent_rec(&ppargs->rec, dp);
+ xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child,
+ child->i_ino, parent_name);
+ xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_REMOVE);
+ return 0;
+}
+
+/* Replace one parent pointer with another to reflect a rename. */
+int
+xfs_parent_replacename(
+ struct xfs_trans *tp,
+ struct xfs_parent_args *ppargs,
+ struct xfs_inode *old_dp,
+ const struct xfs_name *old_name,
+ struct xfs_inode *new_dp,
+ const struct xfs_name *new_name,
+ struct xfs_inode *child)
+{
+ int error;
+
+ error = xfs_parent_iread_extents(tp, child);
+ if (error)
+ return error;
+
+ xfs_inode_to_parent_rec(&ppargs->rec, old_dp);
+ xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child,
+ child->i_ino, old_name);
+
+ xfs_inode_to_parent_rec(&ppargs->new_rec, new_dp);
+ ppargs->args.new_name = new_name->name;
+ ppargs->args.new_namelen = new_name->len;
+ ppargs->args.new_value = &ppargs->new_rec;
+ ppargs->args.new_valuelen = sizeof(struct xfs_parent_rec);
+ xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_REPLACE);
+ return 0;
+}
+
+/*
+ * Extract parent pointer information from any parent pointer xattr into
+ * @parent_ino/gen. The last two parameters can be NULL pointers.
+ *
+ * Returns 0 if this is not a parent pointer xattr at all; or -EFSCORRUPTED for
+ * garbage.
+ */
+int
+xfs_parent_from_attr(
+ struct xfs_mount *mp,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ xfs_ino_t *parent_ino,
+ uint32_t *parent_gen)
+{
+ const struct xfs_parent_rec *rec = value;
+
+ ASSERT(attr_flags & XFS_ATTR_PARENT);
+
+ if (!xfs_parent_namecheck(attr_flags, name, namelen))
+ return -EFSCORRUPTED;
+ if (!xfs_parent_valuecheck(mp, value, valuelen))
+ return -EFSCORRUPTED;
+
+ if (parent_ino)
+ *parent_ino = be64_to_cpu(rec->p_ino);
+ if (parent_gen)
+ *parent_gen = be32_to_cpu(rec->p_gen);
+ return 0;
+}
+
+/*
+ * Look up a parent pointer record (@parent_name -> @pptr) of @ip.
+ *
+ * Caller must hold at least ILOCK_SHARED. The scratchpad need not be
+ * initialized.
+ *
+ * Returns 0 if the pointer is found, -ENOATTR if there is no match, or a
+ * negative errno.
+ */
+int
+xfs_parent_lookup(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ const struct xfs_name *parent_name,
+ struct xfs_parent_rec *pptr,
+ struct xfs_da_args *scratch)
+{
+ memset(scratch, 0, sizeof(struct xfs_da_args));
+ xfs_parent_da_args_init(scratch, tp, pptr, ip, ip->i_ino, parent_name);
+ return xfs_attr_get_ilocked(scratch);
+}
+
+/* Sanity-check a parent pointer before we try to perform repairs. */
+static inline bool
+xfs_parent_sanity_check(
+ struct xfs_mount *mp,
+ const struct xfs_name *parent_name,
+ const struct xfs_parent_rec *pptr)
+{
+ if (!xfs_parent_namecheck(XFS_ATTR_PARENT, parent_name->name,
+ parent_name->len))
+ return false;
+
+ if (!xfs_parent_valuecheck(mp, pptr, sizeof(*pptr)))
+ return false;
+
+ return true;
+}
+
+
+/*
+ * Attach the parent pointer (@parent_name -> @pptr) to @ip immediately.
+ * Caller must not have a transaction or hold the ILOCK. This is for
+ * specialized repair functions only. The scratchpad need not be initialized.
+ */
+int
+xfs_parent_set(
+ struct xfs_inode *ip,
+ xfs_ino_t owner,
+ const struct xfs_name *parent_name,
+ struct xfs_parent_rec *pptr,
+ struct xfs_da_args *scratch)
+{
+ if (!xfs_parent_sanity_check(ip->i_mount, parent_name, pptr)) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ memset(scratch, 0, sizeof(struct xfs_da_args));
+ xfs_parent_da_args_init(scratch, NULL, pptr, ip, owner, parent_name);
+ return xfs_attr_set(scratch, XFS_ATTRUPDATE_CREATE, false);
+}
+
+/*
+ * Remove the parent pointer (@parent_name -> @pptr) from @ip immediately.
+ * Caller must not have a transaction or hold the ILOCK. This is for
+ * specialized repair functions only. The scratchpad need not be initialized.
+ */
+int
+xfs_parent_unset(
+ struct xfs_inode *ip,
+ xfs_ino_t owner,
+ const struct xfs_name *parent_name,
+ struct xfs_parent_rec *pptr,
+ struct xfs_da_args *scratch)
+{
+ if (!xfs_parent_sanity_check(ip->i_mount, parent_name, pptr)) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ memset(scratch, 0, sizeof(struct xfs_da_args));
+ xfs_parent_da_args_init(scratch, NULL, pptr, ip, owner, parent_name);
+ return xfs_attr_set(scratch, XFS_ATTRUPDATE_REMOVE, false);
+}
diff --git a/fs/xfs/libxfs/xfs_parent.h b/fs/xfs/libxfs/xfs_parent.h
new file mode 100644
index 000000000000..b8036527cdc7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_parent.h
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022-2024 Oracle.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_PARENT_H__
+#define __XFS_PARENT_H__
+
+/* Metadata validators */
+bool xfs_parent_namecheck(unsigned int attr_flags, const void *name,
+ size_t length);
+bool xfs_parent_valuecheck(struct xfs_mount *mp, const void *value,
+ size_t valuelen);
+
+xfs_dahash_t xfs_parent_hashval(struct xfs_mount *mp, const uint8_t *name,
+ int namelen, xfs_ino_t parent_ino);
+xfs_dahash_t xfs_parent_hashattr(struct xfs_mount *mp, const uint8_t *name,
+ int namelen, const void *value, int valuelen);
+
+/* Initializes a xfs_parent_rec to be stored as an attribute name. */
+static inline void
+xfs_parent_rec_init(
+ struct xfs_parent_rec *rec,
+ xfs_ino_t ino,
+ uint32_t gen)
+{
+ rec->p_ino = cpu_to_be64(ino);
+ rec->p_gen = cpu_to_be32(gen);
+}
+
+/* Initializes a xfs_parent_rec to be stored as an attribute name. */
+static inline void
+xfs_inode_to_parent_rec(
+ struct xfs_parent_rec *rec,
+ const struct xfs_inode *dp)
+{
+ xfs_parent_rec_init(rec, dp->i_ino, VFS_IC(dp)->i_generation);
+}
+
+extern struct kmem_cache *xfs_parent_args_cache;
+
+/*
+ * Parent pointer information needed to pass around the deferred xattr update
+ * machinery.
+ */
+struct xfs_parent_args {
+ struct xfs_parent_rec rec;
+ struct xfs_parent_rec new_rec;
+ struct xfs_da_args args;
+};
+
+/*
+ * Start a parent pointer update by allocating the context object we need to
+ * perform a parent pointer update.
+ */
+static inline int
+xfs_parent_start(
+ struct xfs_mount *mp,
+ struct xfs_parent_args **ppargsp)
+{
+ if (!xfs_has_parent(mp)) {
+ *ppargsp = NULL;
+ return 0;
+ }
+
+ *ppargsp = kmem_cache_zalloc(xfs_parent_args_cache, GFP_KERNEL);
+ if (!*ppargsp)
+ return -ENOMEM;
+ return 0;
+}
+
+/* Finish a parent pointer update by freeing the context object. */
+static inline void
+xfs_parent_finish(
+ struct xfs_mount *mp,
+ struct xfs_parent_args *ppargs)
+{
+ if (ppargs)
+ kmem_cache_free(xfs_parent_args_cache, ppargs);
+}
+
+int xfs_parent_addname(struct xfs_trans *tp, struct xfs_parent_args *ppargs,
+ struct xfs_inode *dp, const struct xfs_name *parent_name,
+ struct xfs_inode *child);
+int xfs_parent_removename(struct xfs_trans *tp, struct xfs_parent_args *ppargs,
+ struct xfs_inode *dp, const struct xfs_name *parent_name,
+ struct xfs_inode *child);
+int xfs_parent_replacename(struct xfs_trans *tp,
+ struct xfs_parent_args *ppargs,
+ struct xfs_inode *old_dp, const struct xfs_name *old_name,
+ struct xfs_inode *new_dp, const struct xfs_name *new_name,
+ struct xfs_inode *child);
+
+int xfs_parent_from_attr(struct xfs_mount *mp, unsigned int attr_flags,
+ const unsigned char *name, unsigned int namelen,
+ const void *value, unsigned int valuelen,
+ xfs_ino_t *parent_ino, uint32_t *parent_gen);
+
+/* Repair functions */
+int xfs_parent_lookup(struct xfs_trans *tp, struct xfs_inode *ip,
+ const struct xfs_name *name, struct xfs_parent_rec *pptr,
+ struct xfs_da_args *scratch);
+int xfs_parent_set(struct xfs_inode *ip, xfs_ino_t owner,
+ const struct xfs_name *name, struct xfs_parent_rec *pptr,
+ struct xfs_da_args *scratch);
+int xfs_parent_unset(struct xfs_inode *ip, xfs_ino_t owner,
+ const struct xfs_name *name, struct xfs_parent_rec *pptr,
+ struct xfs_da_args *scratch);
+
+#endif /* __XFS_PARENT_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index f246d6dbf4ec..386b672c5058 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1168,3 +1168,60 @@ xfs_rtsummary_wordcount(
blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks);
return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG;
}
+
+/*
+ * Lock both realtime free space metadata inodes for a freespace update. If a
+ * transaction is given, the inodes will be joined to the transaction and the
+ * ILOCKs will be released on transaction commit.
+ */
+void
+xfs_rtbitmap_lock(
+ struct xfs_trans *tp,
+ struct xfs_mount *mp)
+{
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+ if (tp)
+ xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+ if (tp)
+ xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+}
+
+/* Unlock both realtime free space metadata inodes after a freespace update. */
+void
+xfs_rtbitmap_unlock(
+ struct xfs_mount *mp)
+{
+ xfs_iunlock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+}
+
+/*
+ * Lock the realtime free space metadata inodes for a freespace scan. Callers
+ * must walk metadata blocks in order of increasing file offset.
+ */
+void
+xfs_rtbitmap_lock_shared(
+ struct xfs_mount *mp,
+ unsigned int rbmlock_flags)
+{
+ if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+
+ if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+}
+
+/* Unlock the realtime free space metadata inodes after a freespace scan. */
+void
+xfs_rtbitmap_unlock_shared(
+ struct xfs_mount *mp,
+ unsigned int rbmlock_flags)
+{
+ if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
+ xfs_iunlock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+
+ if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+}
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index 152a66750af5..6186585f2c37 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -360,6 +360,19 @@ xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp,
unsigned int rsumlevels, xfs_extlen_t rbmblocks);
unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp,
unsigned int rsumlevels, xfs_extlen_t rbmblocks);
+
+void xfs_rtbitmap_lock(struct xfs_trans *tp, struct xfs_mount *mp);
+void xfs_rtbitmap_unlock(struct xfs_mount *mp);
+
+/* Lock the rt bitmap inode in shared mode */
+#define XFS_RBMLOCK_BITMAP (1U << 0)
+/* Lock the rt summary inode in shared mode */
+#define XFS_RBMLOCK_SUMMARY (1U << 1)
+
+void xfs_rtbitmap_lock_shared(struct xfs_mount *mp,
+ unsigned int rbmlock_flags);
+void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp,
+ unsigned int rbmlock_flags);
#else /* CONFIG_XFS_RT */
# define xfs_rtfree_extent(t,b,l) (-ENOSYS)
# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS)
@@ -378,6 +391,10 @@ xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
# define xfs_rtbitmap_wordcount(mp, r) (0)
# define xfs_rtsummary_blockcount(mp, l, b) (0)
# define xfs_rtsummary_wordcount(mp, l, b) (0)
+# define xfs_rtbitmap_lock(tp, mp) do { } while (0)
+# define xfs_rtbitmap_unlock(mp) do { } while (0)
+# define xfs_rtbitmap_lock_shared(mp, lf) do { } while (0)
+# define xfs_rtbitmap_unlock_shared(mp, lf) do { } while (0)
#endif /* CONFIG_XFS_RT */
#endif /* __XFS_RTBITMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 73a4b895de67..09e4bf949bf8 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -26,6 +26,7 @@
#include "xfs_health.h"
#include "xfs_ag.h"
#include "xfs_rtbitmap.h"
+#include "xfs_exchrange.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -175,6 +176,10 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_NEEDSREPAIR;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64)
features |= XFS_FEAT_NREXT64;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE)
+ features |= XFS_FEAT_EXCHANGE_RANGE;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT)
+ features |= XFS_FEAT_PARENT;
return features;
}
@@ -1251,6 +1256,8 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME;
if (xfs_has_inobtcounts(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT;
+ if (xfs_has_parent(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_PARENT;
if (xfs_has_sector(mp)) {
geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR;
geo->logsectsize = sbp->sb_logsectsize;
@@ -1259,6 +1266,8 @@ xfs_fs_geometry(
}
if (xfs_has_large_extent_counts(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
+ if (xfs_has_exchange_range(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index dfd61fa8332e..34f104ed372c 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -124,7 +124,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
#define XFS_TRANS_RES_FDBLKS (1u << 6)
/* Transaction contains an intent done log item */
#define XFS_TRANS_HAS_INTENT_DONE (1u << 7)
-
/*
* LOWMODE is used by the allocator to activate the lowspace algorithm - when
* free space is running low the extent allocator may choose to allocate an
@@ -136,7 +135,10 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
* for free space from AG 0. If the correct transaction reservations have been
* made then this algorithm will eventually find all the space it needs.
*/
-#define XFS_TRANS_LOWMODE 0x100 /* allocate in low space mode */
+#define XFS_TRANS_LOWMODE (1u << 8)
+
+/* Transaction has locked the rtbitmap and rtsum inodes */
+#define XFS_TRANS_RTBITMAP_LOCKED (1u << 9)
/*
* Field values for xfs_trans_mod_sb.
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index ffb1317a9212..f228127a88ff 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -169,7 +169,8 @@ xfs_symlink_local_to_remote(
struct xfs_trans *tp,
struct xfs_buf *bp,
struct xfs_inode *ip,
- struct xfs_ifork *ifp)
+ struct xfs_ifork *ifp,
+ void *priv)
{
struct xfs_mount *mp = ip->i_mount;
char *buf;
@@ -310,6 +311,7 @@ int
xfs_symlink_write_target(
struct xfs_trans *tp,
struct xfs_inode *ip,
+ xfs_ino_t owner,
const char *target_path,
int pathlen,
xfs_fsblock_t fs_blocks,
@@ -364,8 +366,7 @@ xfs_symlink_write_target(
byte_cnt = min(byte_cnt, pathlen);
buf = bp->b_addr;
- buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, byte_cnt,
- bp);
+ buf += xfs_symlink_hdr_set(mp, owner, offset, byte_cnt, bp);
memcpy(buf, cur_chunk, byte_cnt);
@@ -380,3 +381,50 @@ xfs_symlink_write_target(
ASSERT(pathlen == 0);
return 0;
}
+
+/* Remove all the blocks from a symlink and invalidate buffers. */
+int
+xfs_symlink_remote_truncate(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *bp;
+ int nmaps = XFS_SYMLINK_MAPS;
+ int done = 0;
+ int i;
+ int error;
+
+ /* Read mappings and invalidate buffers. */
+ error = xfs_bmapi_read(ip, 0, XFS_MAX_FILEOFF, mval, &nmaps, 0);
+ if (error)
+ return error;
+
+ for (i = 0; i < nmaps; i++) {
+ if (!xfs_bmap_is_real_extent(&mval[i]))
+ break;
+
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
+ XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0,
+ &bp);
+ if (error)
+ return error;
+
+ xfs_trans_binval(tp, bp);
+ }
+
+ /* Unmap the remote blocks. */
+ error = xfs_bunmapi(tp, ip, 0, XFS_MAX_FILEOFF, 0, nmaps, &done);
+ if (error)
+ return error;
+ if (!done) {
+ ASSERT(done);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.h b/fs/xfs/libxfs/xfs_symlink_remote.h
index a63bd38ae4fa..c1672fe1f17b 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.h
+++ b/fs/xfs/libxfs/xfs_symlink_remote.h
@@ -16,11 +16,13 @@ int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
uint32_t size, struct xfs_buf *bp);
void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_inode *ip, struct xfs_ifork *ifp);
+ struct xfs_inode *ip, struct xfs_ifork *ifp,
+ void *priv);
xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size);
int xfs_symlink_remote_read(struct xfs_inode *ip, char *link);
int xfs_symlink_write_target(struct xfs_trans *tp, struct xfs_inode *ip,
- const char *target_path, int pathlen, xfs_fsblock_t fs_blocks,
- uint resblks);
+ xfs_ino_t owner, const char *target_path, int pathlen,
+ xfs_fsblock_t fs_blocks, uint resblks);
+int xfs_symlink_remote_truncate(struct xfs_trans *tp, struct xfs_inode *ip);
#endif /* __XFS_SYMLINK_REMOTE_H */
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 6cd45e8c118d..6dbe6e7251e7 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -20,6 +20,9 @@
#include "xfs_qm.h"
#include "xfs_trans_space.h"
#include "xfs_rtbitmap.h"
+#include "xfs_attr_item.h"
+#include "xfs_log.h"
+#include "xfs_da_format.h"
#define _ALLOC true
#define _FREE false
@@ -422,29 +425,110 @@ xfs_calc_itruncate_reservation_minlogsize(
return xfs_calc_itruncate_reservation(mp, true);
}
+static inline unsigned int xfs_calc_pptr_link_overhead(void)
+{
+ return sizeof(struct xfs_attri_log_format) +
+ xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) +
+ xlog_calc_iovec_len(MAXNAMELEN - 1);
+}
+static inline unsigned int xfs_calc_pptr_unlink_overhead(void)
+{
+ return sizeof(struct xfs_attri_log_format) +
+ xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) +
+ xlog_calc_iovec_len(MAXNAMELEN - 1);
+}
+static inline unsigned int xfs_calc_pptr_replace_overhead(void)
+{
+ return sizeof(struct xfs_attri_log_format) +
+ xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) +
+ xlog_calc_iovec_len(MAXNAMELEN - 1) +
+ xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) +
+ xlog_calc_iovec_len(MAXNAMELEN - 1);
+}
+
/*
* In renaming a files we can modify:
* the five inodes involved: 5 * inode size
* the two directory btrees: 2 * (max depth + v2) * dir block size
* the two directory bmap btrees: 2 * max depth * block size
* And the bmap_finish transaction can free dir and bmap blocks (two sets
- * of bmap blocks) giving:
+ * of bmap blocks) giving (t2):
* the agf for the ags in which the blocks live: 3 * sector size
* the agfl for the ags in which the blocks live: 3 * sector size
* the superblock for the free block count: sector size
* the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ * If parent pointers are enabled (t3), then each transaction in the chain
+ * must be capable of setting or removing the extended attribute
+ * containing the parent information. It must also be able to handle
+ * the three xattr intent items that track the progress of the parent
+ * pointer update.
*/
STATIC uint
xfs_calc_rename_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
- max((xfs_calc_inode_res(mp, 5) +
- xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
- XFS_FSB_TO_B(mp, 1))),
- (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
- XFS_FSB_TO_B(mp, 1))));
+ unsigned int overhead = XFS_DQUOT_LOGRES(mp);
+ struct xfs_trans_resv *resp = M_RES(mp);
+ unsigned int t1, t2, t3 = 0;
+
+ t1 = xfs_calc_inode_res(mp, 5) +
+ xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1));
+
+ t2 = xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
+ XFS_FSB_TO_B(mp, 1));
+
+ if (xfs_has_parent(mp)) {
+ unsigned int rename_overhead, exchange_overhead;
+
+ t3 = max(resp->tr_attrsetm.tr_logres,
+ resp->tr_attrrm.tr_logres);
+
+ /*
+ * For a standard rename, the three xattr intent log items
+ * are (1) replacing the pptr for the source file; (2)
+ * removing the pptr on the dest file; and (3) adding a
+ * pptr for the whiteout file in the src dir.
+ *
+ * For an RENAME_EXCHANGE, there are two xattr intent
+ * items to replace the pptr for both src and dest
+ * files. Link counts don't change and there is no
+ * whiteout.
+ *
+ * In the worst case we can end up relogging all log
+ * intent items to allow the log tail to move ahead, so
+ * they become overhead added to each transaction in a
+ * processing chain.
+ */
+ rename_overhead = xfs_calc_pptr_replace_overhead() +
+ xfs_calc_pptr_unlink_overhead() +
+ xfs_calc_pptr_link_overhead();
+ exchange_overhead = 2 * xfs_calc_pptr_replace_overhead();
+
+ overhead += max(rename_overhead, exchange_overhead);
+ }
+
+ return overhead + max3(t1, t2, t3);
+}
+
+static inline unsigned int
+xfs_rename_log_count(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ /* One for the rename, one more for freeing blocks */
+ unsigned int ret = XFS_RENAME_LOG_COUNT;
+
+ /*
+ * Pre-reserve enough log reservation to handle the transaction
+ * rolling needed to remove or add one parent pointer.
+ */
+ if (xfs_has_parent(mp))
+ ret += max(resp->tr_attrsetm.tr_logcount,
+ resp->tr_attrrm.tr_logcount);
+
+ return ret;
}
/*
@@ -461,6 +545,23 @@ xfs_calc_iunlink_remove_reservation(
2 * M_IGEO(mp)->inode_cluster_size;
}
+static inline unsigned int
+xfs_link_log_count(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ unsigned int ret = XFS_LINK_LOG_COUNT;
+
+ /*
+ * Pre-reserve enough log reservation to handle the transaction
+ * rolling needed to add one parent pointer.
+ */
+ if (xfs_has_parent(mp))
+ ret += resp->tr_attrsetm.tr_logcount;
+
+ return ret;
+}
+
/*
* For creating a link to an inode:
* the parent directory inode: inode size
@@ -477,14 +578,23 @@ STATIC uint
xfs_calc_link_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
- xfs_calc_iunlink_remove_reservation(mp) +
- max((xfs_calc_inode_res(mp, 2) +
- xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
- XFS_FSB_TO_B(mp, 1))),
- (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
- XFS_FSB_TO_B(mp, 1))));
+ unsigned int overhead = XFS_DQUOT_LOGRES(mp);
+ struct xfs_trans_resv *resp = M_RES(mp);
+ unsigned int t1, t2, t3 = 0;
+
+ overhead += xfs_calc_iunlink_remove_reservation(mp);
+ t1 = xfs_calc_inode_res(mp, 2) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+
+ if (xfs_has_parent(mp)) {
+ t3 = resp->tr_attrsetm.tr_logres;
+ overhead += xfs_calc_pptr_link_overhead();
+ }
+
+ return overhead + max3(t1, t2, t3);
}
/*
@@ -499,6 +609,23 @@ xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
M_IGEO(mp)->inode_cluster_size;
}
+static inline unsigned int
+xfs_remove_log_count(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ unsigned int ret = XFS_REMOVE_LOG_COUNT;
+
+ /*
+ * Pre-reserve enough log reservation to handle the transaction
+ * rolling needed to add one parent pointer.
+ */
+ if (xfs_has_parent(mp))
+ ret += resp->tr_attrrm.tr_logcount;
+
+ return ret;
+}
+
/*
* For removing a directory entry we can modify:
* the parent directory inode: inode size
@@ -515,14 +642,24 @@ STATIC uint
xfs_calc_remove_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
- xfs_calc_iunlink_add_reservation(mp) +
- max((xfs_calc_inode_res(mp, 2) +
- xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
- XFS_FSB_TO_B(mp, 1))),
- (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
- XFS_FSB_TO_B(mp, 1))));
+ unsigned int overhead = XFS_DQUOT_LOGRES(mp);
+ struct xfs_trans_resv *resp = M_RES(mp);
+ unsigned int t1, t2, t3 = 0;
+
+ overhead += xfs_calc_iunlink_add_reservation(mp);
+
+ t1 = xfs_calc_inode_res(mp, 2) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
+ XFS_FSB_TO_B(mp, 1));
+
+ if (xfs_has_parent(mp)) {
+ t3 = resp->tr_attrrm.tr_logres;
+ overhead += xfs_calc_pptr_unlink_overhead();
+ }
+
+ return overhead + max3(t1, t2, t3);
}
/*
@@ -571,12 +708,40 @@ xfs_calc_icreate_resv_alloc(
xfs_calc_finobt_res(mp);
}
+static inline unsigned int
+xfs_icreate_log_count(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ unsigned int ret = XFS_CREATE_LOG_COUNT;
+
+ /*
+ * Pre-reserve enough log reservation to handle the transaction
+ * rolling needed to add one parent pointer.
+ */
+ if (xfs_has_parent(mp))
+ ret += resp->tr_attrsetm.tr_logcount;
+
+ return ret;
+}
+
STATIC uint
-xfs_calc_icreate_reservation(xfs_mount_t *mp)
+xfs_calc_icreate_reservation(
+ struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
- max(xfs_calc_icreate_resv_alloc(mp),
- xfs_calc_create_resv_modify(mp));
+ struct xfs_trans_resv *resp = M_RES(mp);
+ unsigned int overhead = XFS_DQUOT_LOGRES(mp);
+ unsigned int t1, t2, t3 = 0;
+
+ t1 = xfs_calc_icreate_resv_alloc(mp);
+ t2 = xfs_calc_create_resv_modify(mp);
+
+ if (xfs_has_parent(mp)) {
+ t3 = resp->tr_attrsetm.tr_logres;
+ overhead += xfs_calc_pptr_link_overhead();
+ }
+
+ return overhead + max3(t1, t2, t3);
}
STATIC uint
@@ -589,6 +754,23 @@ xfs_calc_create_tmpfile_reservation(
return res + xfs_calc_iunlink_add_reservation(mp);
}
+static inline unsigned int
+xfs_mkdir_log_count(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ unsigned int ret = XFS_MKDIR_LOG_COUNT;
+
+ /*
+ * Pre-reserve enough log reservation to handle the transaction
+ * rolling needed to add one parent pointer.
+ */
+ if (xfs_has_parent(mp))
+ ret += resp->tr_attrsetm.tr_logcount;
+
+ return ret;
+}
+
/*
* Making a new directory is the same as creating a new file.
*/
@@ -599,6 +781,22 @@ xfs_calc_mkdir_reservation(
return xfs_calc_icreate_reservation(mp);
}
+static inline unsigned int
+xfs_symlink_log_count(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ unsigned int ret = XFS_SYMLINK_LOG_COUNT;
+
+ /*
+ * Pre-reserve enough log reservation to handle the transaction
+ * rolling needed to add one parent pointer.
+ */
+ if (xfs_has_parent(mp))
+ ret += resp->tr_attrsetm.tr_logcount;
+
+ return ret;
+}
/*
* Making a new symplink is the same as creating a new file, but
@@ -911,54 +1109,76 @@ xfs_calc_sb_reservation(
return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
}
-void
-xfs_trans_resv_calc(
+/*
+ * Namespace reservations.
+ *
+ * These get tricky when parent pointers are enabled as we have attribute
+ * modifications occurring from within these transactions. Rather than confuse
+ * each of these reservation calculations with the conditional attribute
+ * reservations, add them here in a clear and concise manner. This requires that
+ * the attribute reservations have already been calculated.
+ *
+ * Note that we only include the static attribute reservation here; the runtime
+ * reservation will have to be modified by the size of the attributes being
+ * added/removed/modified. See the comments on the attribute reservation
+ * calculations for more details.
+ */
+STATIC void
+xfs_calc_namespace_reservations(
struct xfs_mount *mp,
struct xfs_trans_resv *resp)
{
- int logcount_adj = 0;
-
- /*
- * The following transactions are logged in physical format and
- * require a permanent reservation on space.
- */
- resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false);
- resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
- resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
- resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false);
- resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
- resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+ ASSERT(resp->tr_attrsetm.tr_logres > 0);
resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
- resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT;
+ resp->tr_rename.tr_logcount = xfs_rename_log_count(mp, resp);
resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_link.tr_logres = xfs_calc_link_reservation(mp);
- resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT;
+ resp->tr_link.tr_logcount = xfs_link_log_count(mp, resp);
resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp);
- resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT;
+ resp->tr_remove.tr_logcount = xfs_remove_log_count(mp, resp);
resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp);
- resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT;
+ resp->tr_symlink.tr_logcount = xfs_symlink_log_count(mp, resp);
resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_create.tr_logres = xfs_calc_icreate_reservation(mp);
- resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
+ resp->tr_create.tr_logcount = xfs_icreate_log_count(mp, resp);
resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+ resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
+ resp->tr_mkdir.tr_logcount = xfs_mkdir_log_count(mp, resp);
+ resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+}
+
+void
+xfs_trans_resv_calc(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ int logcount_adj = 0;
+
+ /*
+ * The following transactions are logged in physical format and
+ * require a permanent reservation on space.
+ */
+ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false);
+ resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false);
+ resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
resp->tr_create_tmpfile.tr_logres =
xfs_calc_create_tmpfile_reservation(mp);
resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
- resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
- resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
- resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp);
resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT;
resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
@@ -988,6 +1208,8 @@ xfs_trans_resv_calc(
resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+ xfs_calc_namespace_reservations(mp, resp);
+
/*
* The following transactions are logged in logical format with
* a default log count.
diff --git a/fs/xfs/libxfs/xfs_trans_space.c b/fs/xfs/libxfs/xfs_trans_space.c
new file mode 100644
index 000000000000..b9dc3752f702
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_space.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_da_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+
+/* Calculate the disk space required to add a parent pointer. */
+unsigned int
+xfs_parent_calc_space_res(
+ struct xfs_mount *mp,
+ unsigned int namelen)
+{
+ /*
+ * Parent pointers are always the first attr in an attr tree, and never
+ * larger than a block
+ */
+ return XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) +
+ XFS_NEXTENTADD_SPACE_RES(mp, namelen, XFS_ATTR_FORK);
+}
+
+unsigned int
+xfs_create_space_res(
+ struct xfs_mount *mp,
+ unsigned int namelen)
+{
+ unsigned int ret;
+
+ ret = XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp, namelen);
+ if (xfs_has_parent(mp))
+ ret += xfs_parent_calc_space_res(mp, namelen);
+
+ return ret;
+}
+
+unsigned int
+xfs_mkdir_space_res(
+ struct xfs_mount *mp,
+ unsigned int namelen)
+{
+ return xfs_create_space_res(mp, namelen);
+}
+
+unsigned int
+xfs_link_space_res(
+ struct xfs_mount *mp,
+ unsigned int namelen)
+{
+ unsigned int ret;
+
+ ret = XFS_DIRENTER_SPACE_RES(mp, namelen);
+ if (xfs_has_parent(mp))
+ ret += xfs_parent_calc_space_res(mp, namelen);
+
+ return ret;
+}
+
+unsigned int
+xfs_symlink_space_res(
+ struct xfs_mount *mp,
+ unsigned int namelen,
+ unsigned int fsblocks)
+{
+ unsigned int ret;
+
+ ret = XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp, namelen) +
+ fsblocks;
+
+ if (xfs_has_parent(mp))
+ ret += xfs_parent_calc_space_res(mp, namelen);
+
+ return ret;
+}
+
+unsigned int
+xfs_remove_space_res(
+ struct xfs_mount *mp,
+ unsigned int namelen)
+{
+ unsigned int ret = XFS_DIRREMOVE_SPACE_RES(mp);
+
+ if (xfs_has_parent(mp))
+ ret += xfs_parent_calc_space_res(mp, namelen);
+
+ return ret;
+}
+
+unsigned int
+xfs_rename_space_res(
+ struct xfs_mount *mp,
+ unsigned int src_namelen,
+ bool target_exists,
+ unsigned int target_namelen,
+ bool has_whiteout)
+{
+ unsigned int ret;
+
+ ret = XFS_DIRREMOVE_SPACE_RES(mp) +
+ XFS_DIRENTER_SPACE_RES(mp, target_namelen);
+
+ if (xfs_has_parent(mp)) {
+ if (has_whiteout)
+ ret += xfs_parent_calc_space_res(mp, src_namelen);
+ ret += 2 * xfs_parent_calc_space_res(mp, target_namelen);
+ }
+
+ if (target_exists)
+ ret += xfs_parent_calc_space_res(mp, target_namelen);
+
+ return ret;
+}
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 87b31c69a773..1155ff2d37e2 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -10,6 +10,10 @@
* Components of space reservations.
*/
+/* Worst case number of bmaps that can be held in a block. */
+#define XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp) \
+ (((mp)->m_bmap_dmxr[0]) - ((mp)->m_bmap_dmnr[0]))
+
/* Worst case number of rmaps that can be held in a block. */
#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \
(((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
@@ -76,31 +80,32 @@
/* This macro is not used - see inline code in xfs_attr_set */
#define XFS_ATTRSET_SPACE_RES(mp, v) \
(XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v))
-#define XFS_CREATE_SPACE_RES(mp,nl) \
- (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
#define XFS_DIOSTRAT_SPACE_RES(mp, v) \
(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
#define XFS_GROWFS_SPACE_RES(mp) \
(2 * (mp)->m_alloc_maxlevels)
#define XFS_GROWFSRT_SPACE_RES(mp,b) \
((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
-#define XFS_LINK_SPACE_RES(mp,nl) \
- XFS_DIRENTER_SPACE_RES(mp,nl)
-#define XFS_MKDIR_SPACE_RES(mp,nl) \
- (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
#define XFS_QM_DQALLOC_SPACE_RES(mp) \
(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \
XFS_DQUOT_CLUSTER_SIZE_FSB)
#define XFS_QM_QINOCREATE_SPACE_RES(mp) \
XFS_IALLOC_SPACE_RES(mp)
-#define XFS_REMOVE_SPACE_RES(mp) \
- XFS_DIRREMOVE_SPACE_RES(mp)
-#define XFS_RENAME_SPACE_RES(mp,nl) \
- (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
-#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \
- (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
#define XFS_IFREE_SPACE_RES(mp) \
(xfs_has_finobt(mp) ? M_IGEO(mp)->inobt_maxlevels : 0)
+unsigned int xfs_parent_calc_space_res(struct xfs_mount *mp,
+ unsigned int namelen);
+
+unsigned int xfs_create_space_res(struct xfs_mount *mp, unsigned int namelen);
+unsigned int xfs_mkdir_space_res(struct xfs_mount *mp, unsigned int namelen);
+unsigned int xfs_link_space_res(struct xfs_mount *mp, unsigned int namelen);
+unsigned int xfs_symlink_space_res(struct xfs_mount *mp, unsigned int namelen,
+ unsigned int fsblocks);
+unsigned int xfs_remove_space_res(struct xfs_mount *mp, unsigned int namelen);
+
+unsigned int xfs_rename_space_res(struct xfs_mount *mp,
+ unsigned int src_namelen, bool target_exists,
+ unsigned int target_namelen, bool has_whiteout);
#endif /* __XFS_TRANS_SPACE_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index e954f07679dd..f8e5b67128d2 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -15,6 +15,7 @@
#include "xfs_ialloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
+#include "xfs_inode.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -165,8 +166,7 @@ xchk_superblock(
xchk_block_set_corrupt(sc, bp);
/* Check sb_versionnum bits that are set at mkfs time. */
- vernum_mask = cpu_to_be16(~XFS_SB_VERSION_OKBITS |
- XFS_SB_VERSION_NUMBITS |
+ vernum_mask = cpu_to_be16(XFS_SB_VERSION_NUMBITS |
XFS_SB_VERSION_ALIGNBIT |
XFS_SB_VERSION_DALIGNBIT |
XFS_SB_VERSION_SHAREDBIT |
@@ -865,6 +865,43 @@ xchk_agi_xref(
/* scrub teardown will take care of sc->sa for us */
}
+/*
+ * Check the unlinked buckets for links to bad inodes. We hold the AGI, so
+ * there cannot be any threads updating unlinked list pointers in this AG.
+ */
+STATIC void
+xchk_iunlink(
+ struct xfs_scrub *sc,
+ struct xfs_agi *agi)
+{
+ unsigned int i;
+ struct xfs_inode *ip;
+
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ xfs_agino_t agino = be32_to_cpu(agi->agi_unlinked[i]);
+
+ while (agino != NULLAGINO) {
+ if (agino % XFS_AGI_UNLINKED_BUCKETS != i) {
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ return;
+ }
+
+ ip = xfs_iunlink_lookup(sc->sa.pag, agino);
+ if (!ip) {
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ return;
+ }
+
+ if (!xfs_inode_on_unlinked_list(ip)) {
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ return;
+ }
+
+ agino = ip->i_next_unlinked;
+ }
+ }
+}
+
/* Scrub the AGI. */
int
xchk_agi(
@@ -949,6 +986,8 @@ xchk_agi(
if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ xchk_iunlink(sc, agi);
+
xchk_agi_xref(sc);
out:
return error;
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 427054b65b23..0dbc484b182f 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -21,13 +21,18 @@
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_ag.h"
+#include "xfs_inode.h"
+#include "xfs_iunlink_item.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
+#include "scrub/agino_bitmap.h"
#include "scrub/reap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
/* Superblock */
@@ -796,15 +801,57 @@ enum {
XREP_AGI_MAX
};
+#define XREP_AGI_LOOKUP_BATCH 32
+
+struct xrep_agi {
+ struct xfs_scrub *sc;
+
+ /* AGI buffer, tracked separately */
+ struct xfs_buf *agi_bp;
+
+ /* context for finding btree roots */
+ struct xrep_find_ag_btree fab[XREP_AGI_MAX];
+
+ /* old AGI contents in case we have to revert */
+ struct xfs_agi old_agi;
+
+ /* bitmap of which inodes are unlinked */
+ struct xagino_bitmap iunlink_bmp;
+
+ /* heads of the unlinked inode bucket lists */
+ xfs_agino_t iunlink_heads[XFS_AGI_UNLINKED_BUCKETS];
+
+ /* scratchpad for batched lookups of the radix tree */
+ struct xfs_inode *lookup_batch[XREP_AGI_LOOKUP_BATCH];
+
+ /* Map of ino -> next_ino for unlinked inode processing. */
+ struct xfarray *iunlink_next;
+
+ /* Map of ino -> prev_ino for unlinked inode processing. */
+ struct xfarray *iunlink_prev;
+};
+
+static void
+xrep_agi_buf_cleanup(
+ void *buf)
+{
+ struct xrep_agi *ragi = buf;
+
+ xfarray_destroy(ragi->iunlink_prev);
+ xfarray_destroy(ragi->iunlink_next);
+ xagino_bitmap_destroy(&ragi->iunlink_bmp);
+}
+
/*
* Given the inode btree roots described by *fab, find the roots, check them
* for sanity, and pass the root data back out via *fab.
*/
STATIC int
xrep_agi_find_btrees(
- struct xfs_scrub *sc,
- struct xrep_find_ag_btree *fab)
+ struct xrep_agi *ragi)
{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xrep_find_ag_btree *fab = ragi->fab;
struct xfs_buf *agf_bp;
struct xfs_mount *mp = sc->mp;
int error;
@@ -837,10 +884,11 @@ xrep_agi_find_btrees(
*/
STATIC void
xrep_agi_init_header(
- struct xfs_scrub *sc,
- struct xfs_buf *agi_bp,
- struct xfs_agi *old_agi)
+ struct xrep_agi *ragi)
{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_buf *agi_bp = ragi->agi_bp;
+ struct xfs_agi *old_agi = &ragi->old_agi;
struct xfs_agi *agi = agi_bp->b_addr;
struct xfs_perag *pag = sc->sa.pag;
struct xfs_mount *mp = sc->mp;
@@ -856,10 +904,6 @@ xrep_agi_init_header(
if (xfs_has_crc(mp))
uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
- /* We don't know how to fix the unlinked list yet. */
- memcpy(&agi->agi_unlinked, &old_agi->agi_unlinked,
- sizeof(agi->agi_unlinked));
-
/* Mark the incore AGF data stale until we're done fixing things. */
ASSERT(xfs_perag_initialised_agi(pag));
clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
@@ -868,10 +912,12 @@ xrep_agi_init_header(
/* Set btree root information in an AGI. */
STATIC void
xrep_agi_set_roots(
- struct xfs_scrub *sc,
- struct xfs_agi *agi,
- struct xrep_find_ag_btree *fab)
+ struct xrep_agi *ragi)
{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_agi *agi = ragi->agi_bp->b_addr;
+ struct xrep_find_ag_btree *fab = ragi->fab;
+
agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root);
agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height);
@@ -884,9 +930,10 @@ xrep_agi_set_roots(
/* Update the AGI counters. */
STATIC int
xrep_agi_calc_from_btrees(
- struct xfs_scrub *sc,
- struct xfs_buf *agi_bp)
+ struct xrep_agi *ragi)
{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_buf *agi_bp = ragi->agi_bp;
struct xfs_btree_cur *cur;
struct xfs_agi *agi = agi_bp->b_addr;
struct xfs_mount *mp = sc->mp;
@@ -928,12 +975,721 @@ err:
return error;
}
+/*
+ * Record a forwards unlinked chain pointer from agino -> next_agino in our
+ * staging information.
+ */
+static inline int
+xrep_iunlink_store_next(
+ struct xrep_agi *ragi,
+ xfs_agino_t agino,
+ xfs_agino_t next_agino)
+{
+ ASSERT(next_agino != 0);
+
+ return xfarray_store(ragi->iunlink_next, agino, &next_agino);
+}
+
+/*
+ * Record a backwards unlinked chain pointer from prev_ino <- agino in our
+ * staging information.
+ */
+static inline int
+xrep_iunlink_store_prev(
+ struct xrep_agi *ragi,
+ xfs_agino_t agino,
+ xfs_agino_t prev_agino)
+{
+ ASSERT(prev_agino != 0);
+
+ return xfarray_store(ragi->iunlink_prev, agino, &prev_agino);
+}
+
+/*
+ * Given an @agino, look up the next inode in the iunlink bucket. Returns
+ * NULLAGINO if we're at the end of the chain, 0 if @agino is not in memory
+ * like it should be, or a per-AG inode number.
+ */
+static inline xfs_agino_t
+xrep_iunlink_next(
+ struct xfs_scrub *sc,
+ xfs_agino_t agino)
+{
+ struct xfs_inode *ip;
+
+ ip = xfs_iunlink_lookup(sc->sa.pag, agino);
+ if (!ip)
+ return 0;
+
+ return ip->i_next_unlinked;
+}
+
+/*
+ * Load the inode @agino into memory, set its i_prev_unlinked, and drop the
+ * inode so it can be inactivated. Returns NULLAGINO if we're at the end of
+ * the chain or if we should stop walking the chain due to corruption; or a
+ * per-AG inode number.
+ */
+STATIC xfs_agino_t
+xrep_iunlink_reload_next(
+ struct xrep_agi *ragi,
+ xfs_agino_t prev_agino,
+ xfs_agino_t agino)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_inode *ip;
+ xfs_ino_t ino;
+ xfs_agino_t ret = NULLAGINO;
+ int error;
+
+ ino = XFS_AGINO_TO_INO(sc->mp, sc->sa.pag->pag_agno, agino);
+ error = xchk_iget(ragi->sc, ino, &ip);
+ if (error)
+ return ret;
+
+ trace_xrep_iunlink_reload_next(ip, prev_agino);
+
+ /* If this is a linked inode, stop processing the chain. */
+ if (VFS_I(ip)->i_nlink != 0) {
+ xrep_iunlink_store_next(ragi, agino, NULLAGINO);
+ goto rele;
+ }
+
+ ip->i_prev_unlinked = prev_agino;
+ ret = ip->i_next_unlinked;
+
+ /*
+ * Drop the inode reference that we just took. We hold the AGI, so
+ * this inode cannot move off the unlinked list and hence cannot be
+ * reclaimed.
+ */
+rele:
+ xchk_irele(sc, ip);
+ return ret;
+}
+
+/*
+ * Walk an AGI unlinked bucket's list to load incore any unlinked inodes that
+ * still existed at mount time. This can happen if iunlink processing fails
+ * during log recovery.
+ */
+STATIC int
+xrep_iunlink_walk_ondisk_bucket(
+ struct xrep_agi *ragi,
+ unsigned int bucket)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+ xfs_agino_t prev_agino = NULLAGINO;
+ xfs_agino_t next_agino;
+ int error = 0;
+
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
+ while (next_agino != NULLAGINO) {
+ xfs_agino_t agino = next_agino;
+
+ if (xchk_should_terminate(ragi->sc, &error))
+ return error;
+
+ trace_xrep_iunlink_walk_ondisk_bucket(sc->sa.pag, bucket,
+ prev_agino, agino);
+
+ if (bucket != agino % XFS_AGI_UNLINKED_BUCKETS)
+ break;
+
+ next_agino = xrep_iunlink_next(sc, agino);
+ if (!next_agino)
+ next_agino = xrep_iunlink_reload_next(ragi, prev_agino,
+ agino);
+
+ prev_agino = agino;
+ }
+
+ return 0;
+}
+
+/* Decide if this is an unlinked inode in this AG. */
+STATIC bool
+xrep_iunlink_igrab(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+ return false;
+
+ if (!xfs_inode_on_unlinked_list(ip))
+ return false;
+
+ return true;
+}
+
+/*
+ * Mark the given inode in the lookup batch in our unlinked inode bitmap, and
+ * remember if this inode is the start of the unlinked chain.
+ */
+STATIC int
+xrep_iunlink_visit(
+ struct xrep_agi *ragi,
+ unsigned int batch_idx)
+{
+ struct xfs_mount *mp = ragi->sc->mp;
+ struct xfs_inode *ip = ragi->lookup_batch[batch_idx];
+ xfs_agino_t agino;
+ unsigned int bucket;
+ int error;
+
+ ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == ragi->sc->sa.pag->pag_agno);
+ ASSERT(xfs_inode_on_unlinked_list(ip));
+
+ agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
+
+ trace_xrep_iunlink_visit(ragi->sc->sa.pag, bucket,
+ ragi->iunlink_heads[bucket], ip);
+
+ error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1);
+ if (error)
+ return error;
+
+ if (ip->i_prev_unlinked == NULLAGINO) {
+ if (ragi->iunlink_heads[bucket] == NULLAGINO)
+ ragi->iunlink_heads[bucket] = agino;
+ }
+
+ return 0;
+}
+
+/*
+ * Find all incore unlinked inodes so that we can rebuild the unlinked buckets.
+ * We hold the AGI so there should not be any modifications to the unlinked
+ * list.
+ */
+STATIC int
+xrep_iunlink_mark_incore(
+ struct xrep_agi *ragi)
+{
+ struct xfs_perag *pag = ragi->sc->sa.pag;
+ struct xfs_mount *mp = pag->pag_mount;
+ uint32_t first_index = 0;
+ bool done = false;
+ unsigned int nr_found = 0;
+
+ do {
+ unsigned int i;
+ int error = 0;
+
+ if (xchk_should_terminate(ragi->sc, &error))
+ return error;
+
+ rcu_read_lock();
+
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+ (void **)&ragi->lookup_batch, first_index,
+ XREP_AGI_LOOKUP_BATCH);
+ if (!nr_found) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for (i = 0; i < nr_found; i++) {
+ struct xfs_inode *ip = ragi->lookup_batch[i];
+
+ if (done || !xrep_iunlink_igrab(pag, ip))
+ ragi->lookup_batch[i] = NULL;
+
+ /*
+ * Update the index for the next lookup. Catch
+ * overflows into the next AG range which can occur if
+ * we have inodes in the last block of the AG and we
+ * are currently pointing to the last inode.
+ *
+ * Because we may see inodes that are from the wrong AG
+ * due to RCU freeing and reallocation, only update the
+ * index if it lies in this AG. It was a race that lead
+ * us to see this inode, so another lookup from the
+ * same index will not find it again.
+ */
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+ continue;
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+ done = true;
+ }
+
+ /* unlock now we've grabbed the inodes. */
+ rcu_read_unlock();
+
+ for (i = 0; i < nr_found; i++) {
+ if (!ragi->lookup_batch[i])
+ continue;
+ error = xrep_iunlink_visit(ragi, i);
+ if (error)
+ return error;
+ }
+ } while (!done);
+
+ return 0;
+}
+
+/* Mark all the unlinked ondisk inodes in this inobt record in iunlink_bmp. */
+STATIC int
+xrep_iunlink_mark_ondisk_rec(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore irec;
+ struct xrep_agi *ragi = priv;
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_agino_t agino;
+ unsigned int i;
+ int error = 0;
+
+ xfs_inobt_btrec_to_irec(mp, rec, &irec);
+
+ for (i = 0, agino = irec.ir_startino;
+ i < XFS_INODES_PER_CHUNK;
+ i++, agino++) {
+ struct xfs_inode *ip;
+ unsigned int len = 1;
+
+ /* Skip free inodes */
+ if (XFS_INOBT_MASK(i) & irec.ir_free)
+ continue;
+ /* Skip inodes we've seen before */
+ if (xagino_bitmap_test(&ragi->iunlink_bmp, agino, &len))
+ continue;
+
+ /*
+ * Skip incore inodes; these were already picked up by
+ * the _mark_incore step.
+ */
+ rcu_read_lock();
+ ip = radix_tree_lookup(&sc->sa.pag->pag_ici_root, agino);
+ rcu_read_unlock();
+ if (ip)
+ continue;
+
+ /*
+ * Try to look up this inode. If we can't get it, just move
+ * on because we haven't actually scrubbed the inobt or the
+ * inodes yet.
+ */
+ error = xchk_iget(ragi->sc,
+ XFS_AGINO_TO_INO(mp, sc->sa.pag->pag_agno,
+ agino),
+ &ip);
+ if (error)
+ continue;
+
+ trace_xrep_iunlink_reload_ondisk(ip);
+
+ if (VFS_I(ip)->i_nlink == 0)
+ error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * Find ondisk inodes that are unlinked and not in cache, and mark them in
+ * iunlink_bmp. We haven't checked the inobt yet, so we don't error out if
+ * the btree is corrupt.
+ */
+STATIC void
+xrep_iunlink_mark_ondisk(
+ struct xrep_agi *ragi)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_buf *agi_bp = ragi->agi_bp;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
+ error = xfs_btree_query_all(cur, xrep_iunlink_mark_ondisk_rec, ragi);
+ xfs_btree_del_cursor(cur, error);
+}
+
+/*
+ * Walk an iunlink bucket's inode list. For each inode that should be on this
+ * chain, clear its entry in in iunlink_bmp because it's ok and we don't need
+ * to touch it further.
+ */
+STATIC int
+xrep_iunlink_resolve_bucket(
+ struct xrep_agi *ragi,
+ unsigned int bucket)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_inode *ip;
+ xfs_agino_t prev_agino = NULLAGINO;
+ xfs_agino_t next_agino = ragi->iunlink_heads[bucket];
+ int error = 0;
+
+ while (next_agino != NULLAGINO) {
+ if (xchk_should_terminate(ragi->sc, &error))
+ return error;
+
+ /* Find the next inode in the chain. */
+ ip = xfs_iunlink_lookup(sc->sa.pag, next_agino);
+ if (!ip) {
+ /* Inode not incore? Terminate the chain. */
+ trace_xrep_iunlink_resolve_uncached(sc->sa.pag,
+ bucket, prev_agino, next_agino);
+
+ next_agino = NULLAGINO;
+ break;
+ }
+
+ if (next_agino % XFS_AGI_UNLINKED_BUCKETS != bucket) {
+ /*
+ * Inode is in the wrong bucket. Advance the list,
+ * but pretend we didn't see this inode.
+ */
+ trace_xrep_iunlink_resolve_wronglist(sc->sa.pag,
+ bucket, prev_agino, next_agino);
+
+ next_agino = ip->i_next_unlinked;
+ continue;
+ }
+
+ if (!xfs_inode_on_unlinked_list(ip)) {
+ /*
+ * Incore inode doesn't think this inode is on an
+ * unlinked list. This is probably because we reloaded
+ * it from disk. Advance the list, but pretend we
+ * didn't see this inode; we'll fix that later.
+ */
+ trace_xrep_iunlink_resolve_nolist(sc->sa.pag,
+ bucket, prev_agino, next_agino);
+ next_agino = ip->i_next_unlinked;
+ continue;
+ }
+
+ trace_xrep_iunlink_resolve_ok(sc->sa.pag, bucket, prev_agino,
+ next_agino);
+
+ /*
+ * Otherwise, this inode's unlinked pointers are ok. Clear it
+ * from the unlinked bitmap since we're done with it, and make
+ * sure the chain is still correct.
+ */
+ error = xagino_bitmap_clear(&ragi->iunlink_bmp, next_agino, 1);
+ if (error)
+ return error;
+
+ /* Remember the previous inode's next pointer. */
+ if (prev_agino != NULLAGINO) {
+ error = xrep_iunlink_store_next(ragi, prev_agino,
+ next_agino);
+ if (error)
+ return error;
+ }
+
+ /* Remember this inode's previous pointer. */
+ error = xrep_iunlink_store_prev(ragi, next_agino, prev_agino);
+ if (error)
+ return error;
+
+ /* Advance the list and remember this inode. */
+ prev_agino = next_agino;
+ next_agino = ip->i_next_unlinked;
+ }
+
+ /* Update the previous inode's next pointer. */
+ if (prev_agino != NULLAGINO) {
+ error = xrep_iunlink_store_next(ragi, prev_agino, next_agino);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Reinsert this unlinked inode into the head of the staged bucket list. */
+STATIC int
+xrep_iunlink_add_to_bucket(
+ struct xrep_agi *ragi,
+ xfs_agino_t agino)
+{
+ xfs_agino_t current_head;
+ unsigned int bucket;
+ int error;
+
+ bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
+
+ /* Point this inode at the current head of the bucket list. */
+ current_head = ragi->iunlink_heads[bucket];
+
+ trace_xrep_iunlink_add_to_bucket(ragi->sc->sa.pag, bucket, agino,
+ current_head);
+
+ error = xrep_iunlink_store_next(ragi, agino, current_head);
+ if (error)
+ return error;
+
+ /* Remember the head inode's previous pointer. */
+ if (current_head != NULLAGINO) {
+ error = xrep_iunlink_store_prev(ragi, current_head, agino);
+ if (error)
+ return error;
+ }
+
+ ragi->iunlink_heads[bucket] = agino;
+ return 0;
+}
+
+/* Reinsert unlinked inodes into the staged iunlink buckets. */
+STATIC int
+xrep_iunlink_add_lost_inodes(
+ uint32_t start,
+ uint32_t len,
+ void *priv)
+{
+ struct xrep_agi *ragi = priv;
+ int error;
+
+ for (; len > 0; start++, len--) {
+ error = xrep_iunlink_add_to_bucket(ragi, start);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Figure out the iunlink bucket values and find inodes that need to be
+ * reinserted into the list.
+ */
+STATIC int
+xrep_iunlink_rebuild_buckets(
+ struct xrep_agi *ragi)
+{
+ unsigned int i;
+ int error;
+
+ /*
+ * Walk the ondisk AGI unlinked list to find inodes that are on the
+ * list but aren't in memory. This can happen if a past log recovery
+ * tried to clear the iunlinked list but failed. Our scan rebuilds the
+ * unlinked list using incore inodes, so we must load and link them
+ * properly.
+ */
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ error = xrep_iunlink_walk_ondisk_bucket(ragi, i);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Record all the incore unlinked inodes in iunlink_bmp that we didn't
+ * find by walking the ondisk iunlink buckets. This shouldn't happen,
+ * but we can't risk forgetting an inode somewhere.
+ */
+ error = xrep_iunlink_mark_incore(ragi);
+ if (error)
+ return error;
+
+ /*
+ * If there are ondisk inodes that are unlinked and are not been loaded
+ * into cache, record them in iunlink_bmp.
+ */
+ xrep_iunlink_mark_ondisk(ragi);
+
+ /*
+ * Walk each iunlink bucket to (re)construct as much of the incore list
+ * as would be correct. For each inode that survives this step, mark
+ * it clear in iunlink_bmp; we're done with those inodes.
+ */
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ error = xrep_iunlink_resolve_bucket(ragi, i);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Any unlinked inodes that we didn't find through the bucket list
+ * walk (or was ignored by the walk) must be inserted into the bucket
+ * list. Stage this in memory for now.
+ */
+ return xagino_bitmap_walk(&ragi->iunlink_bmp,
+ xrep_iunlink_add_lost_inodes, ragi);
+}
+
+/* Update i_next_iunlinked for the inode @agino. */
+STATIC int
+xrep_iunlink_relink_next(
+ struct xrep_agi *ragi,
+ xfarray_idx_t idx,
+ xfs_agino_t next_agino)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_inode *ip;
+ xfarray_idx_t agino = idx - 1;
+ bool want_rele = false;
+ int error = 0;
+
+ ip = xfs_iunlink_lookup(pag, agino);
+ if (!ip) {
+ xfs_ino_t ino;
+ xfs_agino_t prev_agino;
+
+ /*
+ * No inode exists in cache. Load it off the disk so that we
+ * can reinsert it into the incore unlinked list.
+ */
+ ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
+ error = xchk_iget(sc, ino, &ip);
+ if (error)
+ return -EFSCORRUPTED;
+
+ want_rele = true;
+
+ /* Set the backward pointer since this just came off disk. */
+ error = xfarray_load(ragi->iunlink_prev, agino, &prev_agino);
+ if (error)
+ goto out_rele;
+
+ trace_xrep_iunlink_relink_prev(ip, prev_agino);
+ ip->i_prev_unlinked = prev_agino;
+ }
+
+ /* Update the forward pointer. */
+ if (ip->i_next_unlinked != next_agino) {
+ error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
+ if (error)
+ goto out_rele;
+
+ trace_xrep_iunlink_relink_next(ip, next_agino);
+ ip->i_next_unlinked = next_agino;
+ }
+
+out_rele:
+ /*
+ * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
+ * and the inode cannot be reclaimed. However, if we used iget to load
+ * a missing inode, we must irele it here.
+ */
+ if (want_rele)
+ xchk_irele(sc, ip);
+ return error;
+}
+
+/* Update i_prev_iunlinked for the inode @agino. */
+STATIC int
+xrep_iunlink_relink_prev(
+ struct xrep_agi *ragi,
+ xfarray_idx_t idx,
+ xfs_agino_t prev_agino)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_inode *ip;
+ xfarray_idx_t agino = idx - 1;
+ bool want_rele = false;
+ int error = 0;
+
+ ASSERT(prev_agino != 0);
+
+ ip = xfs_iunlink_lookup(pag, agino);
+ if (!ip) {
+ xfs_ino_t ino;
+ xfs_agino_t next_agino;
+
+ /*
+ * No inode exists in cache. Load it off the disk so that we
+ * can reinsert it into the incore unlinked list.
+ */
+ ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
+ error = xchk_iget(sc, ino, &ip);
+ if (error)
+ return -EFSCORRUPTED;
+
+ want_rele = true;
+
+ /* Set the forward pointer since this just came off disk. */
+ error = xfarray_load(ragi->iunlink_prev, agino, &next_agino);
+ if (error)
+ goto out_rele;
+
+ error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
+ if (error)
+ goto out_rele;
+
+ trace_xrep_iunlink_relink_next(ip, next_agino);
+ ip->i_next_unlinked = next_agino;
+ }
+
+ /* Update the backward pointer. */
+ if (ip->i_prev_unlinked != prev_agino) {
+ trace_xrep_iunlink_relink_prev(ip, prev_agino);
+ ip->i_prev_unlinked = prev_agino;
+ }
+
+out_rele:
+ /*
+ * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
+ * and the inode cannot be reclaimed. However, if we used iget to load
+ * a missing inode, we must irele it here.
+ */
+ if (want_rele)
+ xchk_irele(sc, ip);
+ return error;
+}
+
+/* Log all the iunlink updates we need to finish regenerating the AGI. */
+STATIC int
+xrep_iunlink_commit(
+ struct xrep_agi *ragi)
+{
+ struct xfs_agi *agi = ragi->agi_bp->b_addr;
+ xfarray_idx_t idx = XFARRAY_CURSOR_INIT;
+ xfs_agino_t agino;
+ unsigned int i;
+ int error;
+
+ /* Fix all the forward links */
+ while ((error = xfarray_iter(ragi->iunlink_next, &idx, &agino)) == 1) {
+ error = xrep_iunlink_relink_next(ragi, idx, agino);
+ if (error)
+ return error;
+ }
+
+ /* Fix all the back links */
+ idx = XFARRAY_CURSOR_INIT;
+ while ((error = xfarray_iter(ragi->iunlink_prev, &idx, &agino)) == 1) {
+ error = xrep_iunlink_relink_prev(ragi, idx, agino);
+ if (error)
+ return error;
+ }
+
+ /* Copy the staged iunlink buckets to the new AGI. */
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ trace_xrep_iunlink_commit_bucket(ragi->sc->sa.pag, i,
+ be32_to_cpu(ragi->old_agi.agi_unlinked[i]),
+ ragi->iunlink_heads[i]);
+
+ agi->agi_unlinked[i] = cpu_to_be32(ragi->iunlink_heads[i]);
+ }
+
+ return 0;
+}
+
/* Trigger reinitialization of the in-core data. */
STATIC int
xrep_agi_commit_new(
- struct xfs_scrub *sc,
- struct xfs_buf *agi_bp)
+ struct xrep_agi *ragi)
{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_buf *agi_bp = ragi->agi_bp;
struct xfs_perag *pag;
struct xfs_agi *agi = agi_bp->b_addr;
@@ -956,33 +1712,58 @@ xrep_agi_commit_new(
/* Repair the AGI. */
int
xrep_agi(
- struct xfs_scrub *sc)
+ struct xfs_scrub *sc)
{
- struct xrep_find_ag_btree fab[XREP_AGI_MAX] = {
- [XREP_AGI_INOBT] = {
- .rmap_owner = XFS_RMAP_OWN_INOBT,
- .buf_ops = &xfs_inobt_buf_ops,
- .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
- },
- [XREP_AGI_FINOBT] = {
- .rmap_owner = XFS_RMAP_OWN_INOBT,
- .buf_ops = &xfs_finobt_buf_ops,
- .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
- },
- [XREP_AGI_END] = {
- .buf_ops = NULL
- },
- };
- struct xfs_agi old_agi;
- struct xfs_mount *mp = sc->mp;
- struct xfs_buf *agi_bp;
- struct xfs_agi *agi;
- int error;
+ struct xrep_agi *ragi;
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ unsigned int i;
+ int error;
/* We require the rmapbt to rebuild anything. */
if (!xfs_has_rmapbt(mp))
return -EOPNOTSUPP;
+ sc->buf = kzalloc(sizeof(struct xrep_agi), XCHK_GFP_FLAGS);
+ if (!sc->buf)
+ return -ENOMEM;
+ ragi = sc->buf;
+ ragi->sc = sc;
+
+ ragi->fab[XREP_AGI_INOBT] = (struct xrep_find_ag_btree){
+ .rmap_owner = XFS_RMAP_OWN_INOBT,
+ .buf_ops = &xfs_inobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
+ };
+ ragi->fab[XREP_AGI_FINOBT] = (struct xrep_find_ag_btree){
+ .rmap_owner = XFS_RMAP_OWN_INOBT,
+ .buf_ops = &xfs_finobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
+ };
+ ragi->fab[XREP_AGI_END] = (struct xrep_find_ag_btree){
+ .buf_ops = NULL,
+ };
+
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
+ ragi->iunlink_heads[i] = NULLAGINO;
+
+ xagino_bitmap_init(&ragi->iunlink_bmp);
+ sc->buf_cleanup = xrep_agi_buf_cleanup;
+
+ descr = xchk_xfile_ag_descr(sc, "iunlinked next pointers");
+ error = xfarray_create(descr, 0, sizeof(xfs_agino_t),
+ &ragi->iunlink_next);
+ kfree(descr);
+ if (error)
+ return error;
+
+ descr = xchk_xfile_ag_descr(sc, "iunlinked prev pointers");
+ error = xfarray_create(descr, 0, sizeof(xfs_agino_t),
+ &ragi->iunlink_prev);
+ kfree(descr);
+ if (error)
+ return error;
+
/*
* Make sure we have the AGI buffer, as scrub might have decided it
* was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED.
@@ -990,14 +1771,17 @@ xrep_agi(
error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &agi_bp, NULL);
+ XFS_FSS_TO_BB(mp, 1), 0, &ragi->agi_bp, NULL);
if (error)
return error;
- agi_bp->b_ops = &xfs_agi_buf_ops;
- agi = agi_bp->b_addr;
+ ragi->agi_bp->b_ops = &xfs_agi_buf_ops;
/* Find the AGI btree roots. */
- error = xrep_agi_find_btrees(sc, fab);
+ error = xrep_agi_find_btrees(ragi);
+ if (error)
+ return error;
+
+ error = xrep_iunlink_rebuild_buckets(ragi);
if (error)
return error;
@@ -1006,18 +1790,21 @@ xrep_agi(
return error;
/* Start rewriting the header and implant the btrees we found. */
- xrep_agi_init_header(sc, agi_bp, &old_agi);
- xrep_agi_set_roots(sc, agi, fab);
- error = xrep_agi_calc_from_btrees(sc, agi_bp);
+ xrep_agi_init_header(ragi);
+ xrep_agi_set_roots(ragi);
+ error = xrep_agi_calc_from_btrees(ragi);
+ if (error)
+ goto out_revert;
+ error = xrep_iunlink_commit(ragi);
if (error)
goto out_revert;
/* Reinitialize in-core state. */
- return xrep_agi_commit_new(sc, agi_bp);
+ return xrep_agi_commit_new(ragi);
out_revert:
/* Mark the incore AGI state stale and revert the AGI. */
clear_bit(XFS_AGSTATE_AGI_INIT, &sc->sa.pag->pag_opstate);
- memcpy(agi, &old_agi, sizeof(old_agi));
+ memcpy(ragi->agi_bp->b_addr, &ragi->old_agi, sizeof(struct xfs_agi));
return error;
}
diff --git a/fs/xfs/scrub/agino_bitmap.h b/fs/xfs/scrub/agino_bitmap.h
new file mode 100644
index 000000000000..56d7db5f1699
--- /dev/null
+++ b/fs/xfs/scrub/agino_bitmap.h
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_AGINO_BITMAP_H__
+#define __XFS_SCRUB_AGINO_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_agino_t */
+
+struct xagino_bitmap {
+ struct xbitmap32 aginobitmap;
+};
+
+static inline void xagino_bitmap_init(struct xagino_bitmap *bitmap)
+{
+ xbitmap32_init(&bitmap->aginobitmap);
+}
+
+static inline void xagino_bitmap_destroy(struct xagino_bitmap *bitmap)
+{
+ xbitmap32_destroy(&bitmap->aginobitmap);
+}
+
+static inline int xagino_bitmap_clear(struct xagino_bitmap *bitmap,
+ xfs_agino_t agino, unsigned int len)
+{
+ return xbitmap32_clear(&bitmap->aginobitmap, agino, len);
+}
+
+static inline int xagino_bitmap_set(struct xagino_bitmap *bitmap,
+ xfs_agino_t agino, unsigned int len)
+{
+ return xbitmap32_set(&bitmap->aginobitmap, agino, len);
+}
+
+static inline bool xagino_bitmap_test(struct xagino_bitmap *bitmap,
+ xfs_agino_t agino, unsigned int *len)
+{
+ return xbitmap32_test(&bitmap->aginobitmap, agino, len);
+}
+
+static inline int xagino_bitmap_walk(struct xagino_bitmap *bitmap,
+ xbitmap32_walk_fn fn, void *priv)
+{
+ return xbitmap32_walk(&bitmap->aginobitmap, fn, priv);
+}
+
+#endif /* __XFS_SCRUB_AGINO_BITMAP_H__ */
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
index d421b253923e..30295898cc8a 100644
--- a/fs/xfs/scrub/alloc_repair.c
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -778,7 +778,7 @@ xrep_abt_build_new_trees(
error = xrep_bnobt_sort_records(ra);
if (error)
- return error;
+ goto err_levels;
/* Load the free space by block number tree. */
ra->array_cur = XFARRAY_CURSOR_INIT;
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 83c7feb38714..708334f9b2bd 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -10,16 +10,20 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
#include "xfs_attr_sf.h"
+#include "xfs_parent.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/dabtree.h"
#include "scrub/attr.h"
+#include "scrub/listxattr.h"
+#include "scrub/repair.h"
/* Free the buffers linked from the xattr buffer. */
static void
@@ -35,6 +39,8 @@ xchk_xattr_buf_cleanup(
kvfree(ab->value);
ab->value = NULL;
ab->value_sz = 0;
+ kvfree(ab->name);
+ ab->name = NULL;
}
/*
@@ -65,7 +71,7 @@ xchk_xattr_want_freemap(
* reallocating the buffer if necessary. Buffer contents are not preserved
* across a reallocation.
*/
-static int
+int
xchk_setup_xattr_buf(
struct xfs_scrub *sc,
size_t value_size)
@@ -95,6 +101,12 @@ xchk_setup_xattr_buf(
return -ENOMEM;
}
+ if (xchk_could_repair(sc)) {
+ ab->name = kvmalloc(XATTR_NAME_MAX + 1, XCHK_GFP_FLAGS);
+ if (!ab->name)
+ return -ENOMEM;
+ }
+
resize_value:
if (ab->value_sz >= value_size)
return 0;
@@ -121,6 +133,12 @@ xchk_setup_xattr(
{
int error;
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_xattr(sc);
+ if (error)
+ return error;
+ }
+
/*
* We failed to get memory while checking attrs, so this time try to
* get all the memory we're ever going to need. Allocate the buffer
@@ -137,106 +155,105 @@ xchk_setup_xattr(
/* Extended Attributes */
-struct xchk_xattr {
- struct xfs_attr_list_context context;
- struct xfs_scrub *sc;
-};
-
/*
* Check that an extended attribute key can be looked up by hash.
*
- * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked)
- * to call this function for every attribute key in an inode. Once
- * we're here, we load the attribute value to see if any errors happen,
- * or if we get more or less data than we expected.
+ * We use the extended attribute walk helper to call this function for every
+ * attribute key in an inode. Once we're here, we load the attribute value to
+ * see if any errors happen, or if we get more or less data than we expected.
*/
-static void
-xchk_xattr_listent(
- struct xfs_attr_list_context *context,
- int flags,
- unsigned char *name,
- int namelen,
- int valuelen)
+static int
+xchk_xattr_actor(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
{
struct xfs_da_args args = {
- .op_flags = XFS_DA_OP_NOTIME,
- .attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK,
- .geo = context->dp->i_mount->m_attr_geo,
+ .attr_filter = attr_flags & XFS_ATTR_NSP_ONDISK_MASK,
+ .geo = sc->mp->m_attr_geo,
.whichfork = XFS_ATTR_FORK,
- .dp = context->dp,
+ .dp = ip,
.name = name,
.namelen = namelen,
- .hashval = xfs_da_hashname(name, namelen),
- .trans = context->tp,
+ .trans = sc->tp,
.valuelen = valuelen,
+ .owner = ip->i_ino,
};
struct xchk_xattr_buf *ab;
- struct xchk_xattr *sx;
int error = 0;
- sx = container_of(context, struct xchk_xattr, context);
- ab = sx->sc->buf;
+ ab = sc->buf;
- if (xchk_should_terminate(sx->sc, &error)) {
- context->seen_enough = error;
- return;
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ if (attr_flags & ~XFS_ATTR_ONDISK_MASK) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno);
+ return -ECANCELED;
}
- if (flags & XFS_ATTR_INCOMPLETE) {
+ if (attr_flags & XFS_ATTR_INCOMPLETE) {
/* Incomplete attr key, just mark the inode for preening. */
- xchk_ino_set_preen(sx->sc, context->dp->i_ino);
- return;
+ xchk_ino_set_preen(sc, ip->i_ino);
+ return 0;
}
- /* Only one namespace bit allowed. */
- if (hweight32(flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) {
- xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
- goto fail_xref;
+ /* Does this name make sense? */
+ if (!xfs_attr_namecheck(attr_flags, name, namelen)) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno);
+ return -ECANCELED;
}
- /* Does this name make sense? */
- if (!xfs_attr_namecheck(name, namelen)) {
- xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
- goto fail_xref;
+ /* Check parent pointer record. */
+ if ((attr_flags & XFS_ATTR_PARENT) &&
+ !xfs_parent_valuecheck(sc->mp, value, valuelen)) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno);
+ return -ECANCELED;
}
/*
- * Local xattr values are stored in the attr leaf block, so we don't
- * need to retrieve the value from a remote block to detect corruption
- * problems.
+ * Try to allocate enough memory to extract the attr value. If that
+ * doesn't work, return -EDEADLOCK as a signal to try again with a
+ * maximally sized buffer.
*/
- if (flags & XFS_ATTR_LOCAL)
- goto fail_xref;
+ error = xchk_setup_xattr_buf(sc, valuelen);
+ if (error == -ENOMEM)
+ error = -EDEADLOCK;
+ if (error)
+ return error;
/*
- * Try to allocate enough memory to extrat the attr value. If that
- * doesn't work, we overload the seen_enough variable to convey
- * the error message back to the main scrub function.
+ * Parent pointers are matched on attr name and value, so we must
+ * supply the xfs_parent_rec here when confirming that the dabtree
+ * indexing works correctly.
*/
- error = xchk_setup_xattr_buf(sx->sc, valuelen);
- if (error == -ENOMEM)
- error = -EDEADLOCK;
- if (error) {
- context->seen_enough = error;
- return;
- }
+ if (attr_flags & XFS_ATTR_PARENT)
+ memcpy(ab->value, value, valuelen);
args.value = ab->value;
+ /*
+ * Get the attr value to ensure that lookup can find this attribute
+ * through the dabtree indexing and that remote value retrieval also
+ * works correctly.
+ */
+ xfs_attr_sethash(&args);
error = xfs_attr_get_ilocked(&args);
/* ENODATA means the hash lookup failed and the attr is bad */
if (error == -ENODATA)
error = -EFSCORRUPTED;
- if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
+ if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, args.blkno,
&error))
- goto fail_xref;
+ return error;
if (args.valuelen != valuelen)
- xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK,
- args.blkno);
-fail_xref:
- if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- context->seen_enough = 1;
- return;
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno);
+
+ return 0;
}
/*
@@ -246,7 +263,7 @@ fail_xref:
* Within a char, the lowest bit of the char represents the byte with
* the smallest address
*/
-STATIC bool
+bool
xchk_xattr_set_map(
struct xfs_scrub *sc,
unsigned long *map,
@@ -403,6 +420,17 @@ xchk_xattr_block(
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
hdrsize = xfs_attr3_leaf_hdr_size(leaf);
+ /*
+ * Empty xattr leaf blocks mapped at block 0 are probably a byproduct
+ * of a race between setxattr and a log shutdown. Anywhere else in the
+ * attr fork is a corruption.
+ */
+ if (leafhdr.count == 0) {
+ if (blk->blkno == 0)
+ xchk_da_set_preen(ds, level);
+ else
+ xchk_da_set_corrupt(ds, level);
+ }
if (leafhdr.usedbytes > mp->m_attr_geo->blksize)
xchk_da_set_corrupt(ds, level);
if (leafhdr.firstused > mp->m_attr_geo->blksize)
@@ -411,6 +439,8 @@ xchk_xattr_block(
xchk_da_set_corrupt(ds, level);
if (!xchk_xattr_set_map(ds->sc, ab->usedmap, 0, hdrsize))
xchk_da_set_corrupt(ds, level);
+ if (leafhdr.holes)
+ xchk_da_set_preen(ds, level);
if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
goto out;
@@ -463,7 +493,6 @@ xchk_xattr_rec(
xfs_dahash_t hash;
int nameidx;
int hdrsize;
- unsigned int badflags;
int error;
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
@@ -493,10 +522,15 @@ xchk_xattr_rec(
/* Retrieve the entry and check it. */
hash = be32_to_cpu(ent->hashval);
- badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE |
- XFS_ATTR_INCOMPLETE);
- if ((ent->flags & badflags) != 0)
+ if (ent->flags & ~XFS_ATTR_ONDISK_MASK) {
+ xchk_da_set_corrupt(ds, level);
+ return 0;
+ }
+ if (!xfs_attr_check_namespace(ent->flags)) {
xchk_da_set_corrupt(ds, level);
+ return 0;
+ }
+
if (ent->flags & XFS_ATTR_LOCAL) {
lentry = (struct xfs_attr_leaf_name_local *)
(((char *)bp->b_addr) + nameidx);
@@ -504,7 +538,10 @@ xchk_xattr_rec(
xchk_da_set_corrupt(ds, level);
goto out;
}
- calc_hash = xfs_da_hashname(lentry->nameval, lentry->namelen);
+ calc_hash = xfs_attr_hashval(mp, ent->flags, lentry->nameval,
+ lentry->namelen,
+ lentry->nameval + lentry->namelen,
+ be16_to_cpu(lentry->valuelen));
} else {
rentry = (struct xfs_attr_leaf_name_remote *)
(((char *)bp->b_addr) + nameidx);
@@ -512,7 +549,13 @@ xchk_xattr_rec(
xchk_da_set_corrupt(ds, level);
goto out;
}
- calc_hash = xfs_da_hashname(rentry->name, rentry->namelen);
+ if (ent->flags & XFS_ATTR_PARENT) {
+ xchk_da_set_corrupt(ds, level);
+ goto out;
+ }
+ calc_hash = xfs_attr_hashval(mp, ent->flags, rentry->name,
+ rentry->namelen, NULL,
+ be32_to_cpu(rentry->valuelen));
}
if (calc_hash != hash)
xchk_da_set_corrupt(ds, level);
@@ -556,6 +599,15 @@ xchk_xattr_check_sf(
break;
}
+ /*
+ * Shortform entries do not set LOCAL or INCOMPLETE, so the
+ * only valid flag bits here are for namespaces.
+ */
+ if (sfe->flags & ~XFS_ATTR_NSP_ONDISK_MASK) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ break;
+ }
+
if (!xchk_xattr_set_map(sc, ab->usedmap,
(char *)sfe - (char *)sf,
sizeof(struct xfs_attr_sf_entry))) {
@@ -588,16 +640,6 @@ int
xchk_xattr(
struct xfs_scrub *sc)
{
- struct xchk_xattr sx = {
- .sc = sc,
- .context = {
- .dp = sc->ip,
- .tp = sc->tp,
- .resynch = 1,
- .put_listent = xchk_xattr_listent,
- .allow_incomplete = true,
- },
- };
xfs_dablk_t last_checked = -1U;
int error = 0;
@@ -626,12 +668,6 @@ xchk_xattr(
/*
* Look up every xattr in this file by name and hash.
*
- * Use the backend implementation of xfs_attr_list to call
- * xchk_xattr_listent on every attribute key in this inode.
- * In other words, we use the same iterator/callback mechanism
- * that listattr uses to scrub extended attributes, though in our
- * _listent function, we check the value of the attribute.
- *
* The VFS only locks i_rwsem when modifying attrs, so keep all
* three locks held because that's the only way to ensure we're
* the only thread poking into the da btree. We traverse the da
@@ -639,13 +675,9 @@ xchk_xattr(
* iteration, which doesn't really follow the usual buffer
* locking order.
*/
- error = xfs_attr_list_ilocked(&sx.context);
+ error = xchk_xattr_walk(sc, sc->ip, xchk_xattr_actor, NULL, NULL);
if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error))
return error;
- /* Did our listent function try to return any errors? */
- if (sx.context.seen_enough < 0)
- return sx.context.seen_enough;
-
return 0;
}
diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h
index 48fd9402c432..7db58af56646 100644
--- a/fs/xfs/scrub/attr.h
+++ b/fs/xfs/scrub/attr.h
@@ -16,9 +16,16 @@ struct xchk_xattr_buf {
/* Bitmap of free space in xattr leaf blocks. */
unsigned long *freemap;
+ /* Memory buffer used to hold salvaged xattr names. */
+ unsigned char *name;
+
/* Memory buffer used to extract xattr values. */
void *value;
size_t value_sz;
};
+bool xchk_xattr_set_map(struct xfs_scrub *sc, unsigned long *map,
+ unsigned int start, unsigned int len);
+int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size);
+
#endif /* __XFS_SCRUB_ATTR_H__ */
diff --git a/fs/xfs/scrub/attr_repair.c b/fs/xfs/scrub/attr_repair.c
new file mode 100644
index 000000000000..c7eb94069caf
--- /dev/null
+++ b/fs/xfs/scrub/attr_repair.c
@@ -0,0 +1,1663 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_exchmaps.h"
+#include "xfs_exchrange.h"
+#include "xfs_acl.h"
+#include "xfs_parent.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/attr.h"
+#include "scrub/reap.h"
+#include "scrub/attr_repair.h"
+
+/*
+ * Extended Attribute Repair
+ * =========================
+ *
+ * We repair extended attributes by reading the attr leaf blocks looking for
+ * attributes entries that look salvageable (name passes verifiers, value can
+ * be retrieved, etc). Each extended attribute worth salvaging is stashed in
+ * memory, and the stashed entries are periodically replayed into a temporary
+ * file to constrain memory use. Batching the construction of the temporary
+ * extended attribute structure in this fashion reduces lock cycling of the
+ * file being repaired and the temporary file.
+ *
+ * When salvaging completes, the remaining stashed attributes are replayed to
+ * the temporary file. An atomic file contents exchange is used to commit the
+ * new xattr blocks to the file being repaired. This will disrupt attrmulti
+ * cursors.
+ */
+
+struct xrep_xattr_key {
+ /* Cookie for retrieval of the xattr name. */
+ xfblob_cookie name_cookie;
+
+ /* Cookie for retrieval of the xattr value. */
+ xfblob_cookie value_cookie;
+
+ /* XFS_ATTR_* flags */
+ int flags;
+
+ /* Length of the value and name. */
+ uint32_t valuelen;
+ uint16_t namelen;
+};
+
+/*
+ * Stash up to 8 pages of attrs in xattr_records/xattr_blobs before we write
+ * them to the temp file.
+ */
+#define XREP_XATTR_MAX_STASH_BYTES (PAGE_SIZE * 8)
+
+struct xrep_xattr {
+ struct xfs_scrub *sc;
+
+ /* Information for exchanging attr fork mappings at the end. */
+ struct xrep_tempexch tx;
+
+ /* xattr keys */
+ struct xfarray *xattr_records;
+
+ /* xattr values */
+ struct xfblob *xattr_blobs;
+
+ /* Number of attributes that we are salvaging. */
+ unsigned long long attrs_found;
+
+ /* Can we flush stashed attrs to the tempfile? */
+ bool can_flush;
+
+ /* Did the live update fail, and hence the repair is now out of date? */
+ bool live_update_aborted;
+
+ /* Lock protecting parent pointer updates */
+ struct mutex lock;
+
+ /* Fixed-size array of xrep_xattr_pptr structures. */
+ struct xfarray *pptr_recs;
+
+ /* Blobs containing parent pointer names. */
+ struct xfblob *pptr_names;
+
+ /* Hook to capture parent pointer updates. */
+ struct xfs_dir_hook dhook;
+
+ /* Scratch buffer for capturing parent pointers. */
+ struct xfs_da_args pptr_args;
+
+ /* Name buffer */
+ struct xfs_name xname;
+ char namebuf[MAXNAMELEN];
+};
+
+/* Create a parent pointer in the tempfile. */
+#define XREP_XATTR_PPTR_ADD (1)
+
+/* Remove a parent pointer from the tempfile. */
+#define XREP_XATTR_PPTR_REMOVE (2)
+
+/* A stashed parent pointer update. */
+struct xrep_xattr_pptr {
+ /* Cookie for retrieval of the pptr name. */
+ xfblob_cookie name_cookie;
+
+ /* Parent pointer record. */
+ struct xfs_parent_rec pptr_rec;
+
+ /* Length of the pptr name. */
+ uint8_t namelen;
+
+ /* XREP_XATTR_PPTR_{ADD,REMOVE} */
+ uint8_t action;
+};
+
+/* Set up to recreate the extended attributes. */
+int
+xrep_setup_xattr(
+ struct xfs_scrub *sc)
+{
+ if (xfs_has_parent(sc->mp))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+ return xrep_tempfile_create(sc, S_IFREG);
+}
+
+/*
+ * Decide if we want to salvage this attribute. We don't bother with
+ * incomplete or oversized keys or values. The @value parameter can be null
+ * for remote attrs.
+ */
+STATIC int
+xrep_xattr_want_salvage(
+ struct xrep_xattr *rx,
+ unsigned int attr_flags,
+ const void *name,
+ int namelen,
+ const void *value,
+ int valuelen)
+{
+ if (attr_flags & XFS_ATTR_INCOMPLETE)
+ return false;
+ if (namelen > XATTR_NAME_MAX || namelen <= 0)
+ return false;
+ if (!xfs_attr_namecheck(attr_flags, name, namelen))
+ return false;
+ if (valuelen > XATTR_SIZE_MAX || valuelen < 0)
+ return false;
+ if (attr_flags & XFS_ATTR_PARENT)
+ return xfs_parent_valuecheck(rx->sc->mp, value, valuelen);
+
+ return true;
+}
+
+/* Allocate an in-core record to hold xattrs while we rebuild the xattr data. */
+STATIC int
+xrep_xattr_salvage_key(
+ struct xrep_xattr *rx,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ unsigned char *value,
+ int valuelen)
+{
+ struct xrep_xattr_key key = {
+ .valuelen = valuelen,
+ .flags = flags & XFS_ATTR_NSP_ONDISK_MASK,
+ };
+ unsigned int i = 0;
+ int error = 0;
+
+ if (xchk_should_terminate(rx->sc, &error))
+ return error;
+
+ /*
+ * Truncate the name to the first character that would trip namecheck.
+ * If we no longer have a name after that, ignore this attribute.
+ */
+ if (flags & XFS_ATTR_PARENT) {
+ key.namelen = namelen;
+
+ trace_xrep_xattr_salvage_pptr(rx->sc->ip, flags, name,
+ key.namelen, value, valuelen);
+ } else {
+ while (i < namelen && name[i] != 0)
+ i++;
+ if (i == 0)
+ return 0;
+ key.namelen = i;
+
+ trace_xrep_xattr_salvage_rec(rx->sc->ip, flags, name,
+ key.namelen, valuelen);
+ }
+
+ error = xfblob_store(rx->xattr_blobs, &key.name_cookie, name,
+ key.namelen);
+ if (error)
+ return error;
+
+ error = xfblob_store(rx->xattr_blobs, &key.value_cookie, value,
+ key.valuelen);
+ if (error)
+ return error;
+
+ error = xfarray_append(rx->xattr_records, &key);
+ if (error)
+ return error;
+
+ rx->attrs_found++;
+ return 0;
+}
+
+/*
+ * Record a shortform extended attribute key & value for later reinsertion
+ * into the inode.
+ */
+STATIC int
+xrep_xattr_salvage_sf_attr(
+ struct xrep_xattr *rx,
+ struct xfs_attr_sf_hdr *hdr,
+ struct xfs_attr_sf_entry *sfe)
+{
+ struct xfs_scrub *sc = rx->sc;
+ struct xchk_xattr_buf *ab = sc->buf;
+ unsigned char *name = sfe->nameval;
+ unsigned char *value = &sfe->nameval[sfe->namelen];
+
+ if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)name - (char *)hdr,
+ sfe->namelen))
+ return 0;
+
+ if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)value - (char *)hdr,
+ sfe->valuelen))
+ return 0;
+
+ if (!xrep_xattr_want_salvage(rx, sfe->flags, sfe->nameval,
+ sfe->namelen, value, sfe->valuelen))
+ return 0;
+
+ return xrep_xattr_salvage_key(rx, sfe->flags, sfe->nameval,
+ sfe->namelen, value, sfe->valuelen);
+}
+
+/*
+ * Record a local format extended attribute key & value for later reinsertion
+ * into the inode.
+ */
+STATIC int
+xrep_xattr_salvage_local_attr(
+ struct xrep_xattr *rx,
+ struct xfs_attr_leaf_entry *ent,
+ unsigned int nameidx,
+ const char *buf_end,
+ struct xfs_attr_leaf_name_local *lentry)
+{
+ struct xchk_xattr_buf *ab = rx->sc->buf;
+ unsigned char *value;
+ unsigned int valuelen;
+ unsigned int namesize;
+
+ /*
+ * Decode the leaf local entry format. If something seems wrong, we
+ * junk the attribute.
+ */
+ value = &lentry->nameval[lentry->namelen];
+ valuelen = be16_to_cpu(lentry->valuelen);
+ namesize = xfs_attr_leaf_entsize_local(lentry->namelen, valuelen);
+ if ((char *)lentry + namesize > buf_end)
+ return 0;
+ if (!xrep_xattr_want_salvage(rx, ent->flags, lentry->nameval,
+ lentry->namelen, value, valuelen))
+ return 0;
+ if (!xchk_xattr_set_map(rx->sc, ab->usedmap, nameidx, namesize))
+ return 0;
+
+ /* Try to save this attribute. */
+ return xrep_xattr_salvage_key(rx, ent->flags, lentry->nameval,
+ lentry->namelen, value, valuelen);
+}
+
+/*
+ * Record a remote format extended attribute key & value for later reinsertion
+ * into the inode.
+ */
+STATIC int
+xrep_xattr_salvage_remote_attr(
+ struct xrep_xattr *rx,
+ struct xfs_attr_leaf_entry *ent,
+ unsigned int nameidx,
+ const char *buf_end,
+ struct xfs_attr_leaf_name_remote *rentry,
+ unsigned int ent_idx,
+ struct xfs_buf *leaf_bp)
+{
+ struct xchk_xattr_buf *ab = rx->sc->buf;
+ struct xfs_da_args args = {
+ .trans = rx->sc->tp,
+ .dp = rx->sc->ip,
+ .index = ent_idx,
+ .geo = rx->sc->mp->m_attr_geo,
+ .owner = rx->sc->ip->i_ino,
+ .attr_filter = ent->flags & XFS_ATTR_NSP_ONDISK_MASK,
+ .namelen = rentry->namelen,
+ .name = rentry->name,
+ .value = ab->value,
+ .valuelen = be32_to_cpu(rentry->valuelen),
+ };
+ unsigned int namesize;
+ int error;
+
+ /*
+ * Decode the leaf remote entry format. If something seems wrong, we
+ * junk the attribute. Note that we should never find a zero-length
+ * remote attribute value.
+ */
+ namesize = xfs_attr_leaf_entsize_remote(rentry->namelen);
+ if ((char *)rentry + namesize > buf_end)
+ return 0;
+ if (args.valuelen == 0 ||
+ !xrep_xattr_want_salvage(rx, ent->flags, rentry->name,
+ rentry->namelen, NULL, args.valuelen))
+ return 0;
+ if (!xchk_xattr_set_map(rx->sc, ab->usedmap, nameidx, namesize))
+ return 0;
+
+ /*
+ * Enlarge the buffer (if needed) to hold the value that we're trying
+ * to salvage from the old extended attribute data.
+ */
+ error = xchk_setup_xattr_buf(rx->sc, args.valuelen);
+ if (error == -ENOMEM)
+ error = -EDEADLOCK;
+ if (error)
+ return error;
+
+ /* Look up the remote value and stash it for reconstruction. */
+ error = xfs_attr3_leaf_getvalue(leaf_bp, &args);
+ if (error || args.rmtblkno == 0)
+ goto err_free;
+
+ error = xfs_attr_rmtval_get(&args);
+ if (error)
+ goto err_free;
+
+ /* Try to save this attribute. */
+ error = xrep_xattr_salvage_key(rx, ent->flags, rentry->name,
+ rentry->namelen, ab->value, args.valuelen);
+err_free:
+ /* remote value was garbage, junk it */
+ if (error == -EFSBADCRC || error == -EFSCORRUPTED)
+ error = 0;
+ return error;
+}
+
+/* Extract every xattr key that we can from this attr fork block. */
+STATIC int
+xrep_xattr_recover_leaf(
+ struct xrep_xattr *rx,
+ struct xfs_buf *bp)
+{
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct xfs_scrub *sc = rx->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_name_local *lentry;
+ struct xfs_attr_leaf_name_remote *rentry;
+ struct xfs_attr_leaf_entry *ent;
+ struct xfs_attr_leaf_entry *entries;
+ struct xchk_xattr_buf *ab = rx->sc->buf;
+ char *buf_end;
+ size_t off;
+ unsigned int nameidx;
+ unsigned int hdrsize;
+ int i;
+ int error = 0;
+
+ bitmap_zero(ab->usedmap, mp->m_attr_geo->blksize);
+
+ /* Check the leaf header */
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+ hdrsize = xfs_attr3_leaf_hdr_size(leaf);
+ xchk_xattr_set_map(sc, ab->usedmap, 0, hdrsize);
+ entries = xfs_attr3_leaf_entryp(leaf);
+
+ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize;
+ for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /* Skip key if it conflicts with something else? */
+ off = (char *)ent - (char *)leaf;
+ if (!xchk_xattr_set_map(sc, ab->usedmap, off,
+ sizeof(xfs_attr_leaf_entry_t)))
+ continue;
+
+ /* Check the name information. */
+ nameidx = be16_to_cpu(ent->nameidx);
+ if (nameidx < leafhdr.firstused ||
+ nameidx >= mp->m_attr_geo->blksize)
+ continue;
+
+ if (ent->flags & XFS_ATTR_LOCAL) {
+ lentry = xfs_attr3_leaf_name_local(leaf, i);
+ error = xrep_xattr_salvage_local_attr(rx, ent, nameidx,
+ buf_end, lentry);
+ } else {
+ rentry = xfs_attr3_leaf_name_remote(leaf, i);
+ error = xrep_xattr_salvage_remote_attr(rx, ent, nameidx,
+ buf_end, rentry, i, bp);
+ }
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Try to recover shortform attrs. */
+STATIC int
+xrep_xattr_recover_sf(
+ struct xrep_xattr *rx)
+{
+ struct xfs_scrub *sc = rx->sc;
+ struct xchk_xattr_buf *ab = sc->buf;
+ struct xfs_attr_sf_hdr *hdr;
+ struct xfs_attr_sf_entry *sfe;
+ struct xfs_attr_sf_entry *next;
+ struct xfs_ifork *ifp;
+ unsigned char *end;
+ int i;
+ int error = 0;
+
+ ifp = xfs_ifork_ptr(rx->sc->ip, XFS_ATTR_FORK);
+ hdr = ifp->if_data;
+
+ bitmap_zero(ab->usedmap, ifp->if_bytes);
+ end = (unsigned char *)ifp->if_data + ifp->if_bytes;
+ xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*hdr));
+
+ sfe = xfs_attr_sf_firstentry(hdr);
+ if ((unsigned char *)sfe > end)
+ return 0;
+
+ for (i = 0; i < hdr->count; i++) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ next = xfs_attr_sf_nextentry(sfe);
+ if ((unsigned char *)next > end)
+ break;
+
+ if (xchk_xattr_set_map(sc, ab->usedmap,
+ (char *)sfe - (char *)hdr,
+ sizeof(struct xfs_attr_sf_entry))) {
+ /*
+ * No conflicts with the sf entry; let's save this
+ * attribute.
+ */
+ error = xrep_xattr_salvage_sf_attr(rx, hdr, sfe);
+ if (error)
+ return error;
+ }
+
+ sfe = next;
+ }
+
+ return 0;
+}
+
+/*
+ * Try to return a buffer of xattr data for a given physical extent.
+ *
+ * Because the buffer cache get function complains if it finds a buffer
+ * matching the block number but not matching the length, we must be careful to
+ * look for incore buffers (up to the maximum length of a remote value) that
+ * could be hiding anywhere in the physical range. If we find an incore
+ * buffer, we can pass that to the caller. Optionally, read a single block and
+ * pass that back.
+ *
+ * Note the subtlety that remote attr value blocks for which there is no incore
+ * buffer will be passed to the callback one block at a time. These buffers
+ * will not have any ops attached and must be staled to prevent aliasing with
+ * multiblock buffers once we drop the ILOCK.
+ */
+STATIC int
+xrep_xattr_find_buf(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno,
+ xfs_extlen_t max_len,
+ bool can_read,
+ struct xfs_buf **bpp)
+{
+ struct xrep_bufscan scan = {
+ .daddr = XFS_FSB_TO_DADDR(mp, fsbno),
+ .max_sectors = xrep_bufscan_max_sectors(mp, max_len),
+ .daddr_step = XFS_FSB_TO_BB(mp, 1),
+ };
+ struct xfs_buf *bp;
+
+ while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
+ *bpp = bp;
+ return 0;
+ }
+
+ if (!can_read) {
+ *bpp = NULL;
+ return 0;
+ }
+
+ return xfs_buf_read(mp->m_ddev_targp, scan.daddr, XFS_FSB_TO_BB(mp, 1),
+ XBF_TRYLOCK, bpp, NULL);
+}
+
+/*
+ * Deal with a buffer that we found during our walk of the attr fork.
+ *
+ * Attribute leaf and node blocks are simple -- they're a single block, so we
+ * can walk them one at a time and we never have to worry about discontiguous
+ * multiblock buffers like we do for directories.
+ *
+ * Unfortunately, remote attr blocks add a lot of complexity here. Each disk
+ * block is totally self contained, in the sense that the v5 header provides no
+ * indication that there could be more data in the next block. The incore
+ * buffers can span multiple blocks, though they never cross extent records.
+ * However, they don't necessarily start or end on an extent record boundary.
+ * Therefore, we need a special buffer find function to walk the buffer cache
+ * for us.
+ *
+ * The caller must hold the ILOCK on the file being repaired. We use
+ * XBF_TRYLOCK here to skip any locked buffer on the assumption that we don't
+ * own the block and don't want to hang the system on a potentially garbage
+ * buffer.
+ */
+STATIC int
+xrep_xattr_recover_block(
+ struct xrep_xattr *rx,
+ xfs_dablk_t dabno,
+ xfs_fsblock_t fsbno,
+ xfs_extlen_t max_len,
+ xfs_extlen_t *actual_len)
+{
+ struct xfs_da_blkinfo *info;
+ struct xfs_buf *bp;
+ int error;
+
+ error = xrep_xattr_find_buf(rx->sc->mp, fsbno, max_len, true, &bp);
+ if (error)
+ return error;
+ info = bp->b_addr;
+ *actual_len = XFS_BB_TO_FSB(rx->sc->mp, bp->b_length);
+
+ trace_xrep_xattr_recover_leafblock(rx->sc->ip, dabno,
+ be16_to_cpu(info->magic));
+
+ /*
+ * If the buffer has the right magic number for an attr leaf block and
+ * passes a structure check (we don't care about checksums), salvage
+ * as much as we can from the block. */
+ if (info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) &&
+ xrep_buf_verify_struct(bp, &xfs_attr3_leaf_buf_ops) &&
+ xfs_attr3_leaf_header_check(bp, rx->sc->ip->i_ino) == NULL)
+ error = xrep_xattr_recover_leaf(rx, bp);
+
+ /*
+ * If the buffer didn't already have buffer ops set, it was read in by
+ * the _find_buf function and could very well be /part/ of a multiblock
+ * remote block. Mark it stale so that it doesn't hang around in
+ * memory to cause problems.
+ */
+ if (bp->b_ops == NULL)
+ xfs_buf_stale(bp);
+
+ xfs_buf_relse(bp);
+ return error;
+}
+
+/* Insert one xattr key/value. */
+STATIC int
+xrep_xattr_insert_rec(
+ struct xrep_xattr *rx,
+ const struct xrep_xattr_key *key)
+{
+ struct xfs_da_args args = {
+ .dp = rx->sc->tempip,
+ .attr_filter = key->flags,
+ .namelen = key->namelen,
+ .valuelen = key->valuelen,
+ .owner = rx->sc->ip->i_ino,
+ .geo = rx->sc->mp->m_attr_geo,
+ .whichfork = XFS_ATTR_FORK,
+ .op_flags = XFS_DA_OP_OKNOENT,
+ };
+ struct xchk_xattr_buf *ab = rx->sc->buf;
+ int error;
+
+ /*
+ * Grab pointers to the scrub buffer so that we can use them to insert
+ * attrs into the temp file.
+ */
+ args.name = ab->name;
+ args.value = ab->value;
+
+ /*
+ * The attribute name is stored near the end of the in-core buffer,
+ * though we reserve one more byte to ensure null termination.
+ */
+ ab->name[XATTR_NAME_MAX] = 0;
+
+ error = xfblob_load(rx->xattr_blobs, key->name_cookie, ab->name,
+ key->namelen);
+ if (error)
+ return error;
+
+ error = xfblob_free(rx->xattr_blobs, key->name_cookie);
+ if (error)
+ return error;
+
+ error = xfblob_load(rx->xattr_blobs, key->value_cookie, args.value,
+ key->valuelen);
+ if (error)
+ return error;
+
+ error = xfblob_free(rx->xattr_blobs, key->value_cookie);
+ if (error)
+ return error;
+
+ ab->name[key->namelen] = 0;
+
+ if (key->flags & XFS_ATTR_PARENT) {
+ trace_xrep_xattr_insert_pptr(rx->sc->tempip, key->flags,
+ ab->name, key->namelen, ab->value,
+ key->valuelen);
+ args.op_flags |= XFS_DA_OP_LOGGED;
+ } else {
+ trace_xrep_xattr_insert_rec(rx->sc->tempip, key->flags,
+ ab->name, key->namelen, key->valuelen);
+ }
+
+ /*
+ * xfs_attr_set creates and commits its own transaction. If the attr
+ * already exists, we'll just drop it during the rebuild.
+ */
+ xfs_attr_sethash(&args);
+ error = xfs_attr_set(&args, XFS_ATTRUPDATE_CREATE, false);
+ if (error == -EEXIST)
+ error = 0;
+
+ return error;
+}
+
+/*
+ * Periodically flush salvaged attributes to the temporary file. This is done
+ * to reduce the memory requirements of the xattr rebuild because files can
+ * contain millions of attributes.
+ */
+STATIC int
+xrep_xattr_flush_stashed(
+ struct xrep_xattr *rx)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ /*
+ * Entering this function, the scrub context has a reference to the
+ * inode being repaired, the temporary file, and a scrub transaction
+ * that we use during xattr salvaging to avoid livelocking if there
+ * are cycles in the xattr structures. We hold ILOCK_EXCL on both
+ * the inode being repaired, though it is not ijoined to the scrub
+ * transaction.
+ *
+ * To constrain kernel memory use, we occasionally flush salvaged
+ * xattrs from the xfarray and xfblob structures into the temporary
+ * file in preparation for exchanging the xattr structures at the end.
+ * Updating the temporary file requires a transaction, so we commit the
+ * scrub transaction and drop the two ILOCKs so that xfs_attr_set can
+ * allocate whatever transaction it wants.
+ *
+ * We still hold IOLOCK_EXCL on the inode being repaired, which
+ * prevents anyone from modifying the damaged xattr data while we
+ * repair it.
+ */
+ error = xrep_trans_commit(rx->sc);
+ if (error)
+ return error;
+ xchk_iunlock(rx->sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the IOLOCK of the temporary file while we modify xattrs. This
+ * isn't strictly required because the temporary file is never revealed
+ * to userspace, but we follow the same locking rules. We still hold
+ * sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rx->sc);
+ if (error)
+ return error;
+
+ /* Add all the salvaged attrs to the temporary file. */
+ foreach_xfarray_idx(rx->xattr_records, array_cur) {
+ struct xrep_xattr_key key;
+
+ error = xfarray_load(rx->xattr_records, array_cur, &key);
+ if (error)
+ return error;
+
+ error = xrep_xattr_insert_rec(rx, &key);
+ if (error)
+ return error;
+ }
+
+ /* Empty out both arrays now that we've added the entries. */
+ xfarray_truncate(rx->xattr_records);
+ xfblob_truncate(rx->xattr_blobs);
+
+ xrep_tempfile_iounlock(rx->sc);
+
+ /* Recreate the salvage transaction and relock the inode. */
+ error = xchk_trans_alloc(rx->sc, 0);
+ if (error)
+ return error;
+ xchk_ilock(rx->sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/* Decide if we've stashed too much xattr data in memory. */
+static inline bool
+xrep_xattr_want_flush_stashed(
+ struct xrep_xattr *rx)
+{
+ unsigned long long bytes;
+
+ if (!rx->can_flush)
+ return false;
+
+ bytes = xfarray_bytes(rx->xattr_records) +
+ xfblob_bytes(rx->xattr_blobs);
+ return bytes > XREP_XATTR_MAX_STASH_BYTES;
+}
+
+/*
+ * Did we observe rename changing parent pointer xattrs while we were flushing
+ * salvaged attrs?
+ */
+static inline bool
+xrep_xattr_saw_pptr_conflict(
+ struct xrep_xattr *rx)
+{
+ bool ret;
+
+ ASSERT(rx->can_flush);
+
+ if (!xfs_has_parent(rx->sc->mp))
+ return false;
+
+ xfs_assert_ilocked(rx->sc->ip, XFS_ILOCK_EXCL);
+
+ mutex_lock(&rx->lock);
+ ret = xfarray_bytes(rx->pptr_recs) > 0;
+ mutex_unlock(&rx->lock);
+
+ return ret;
+}
+
+/*
+ * Reset the entire repair state back to initial conditions, now that we've
+ * detected a parent pointer update to the attr structure while we were
+ * flushing salvaged attrs. See the locking notes in dir_repair.c for more
+ * information on why this is all necessary.
+ */
+STATIC int
+xrep_xattr_full_reset(
+ struct xrep_xattr *rx)
+{
+ struct xfs_scrub *sc = rx->sc;
+ struct xfs_attr_sf_hdr *hdr;
+ struct xfs_ifork *ifp = &sc->tempip->i_af;
+ int error;
+
+ trace_xrep_xattr_full_reset(sc->ip, sc->tempip);
+
+ /* The temporary file's data fork had better not be in btree format. */
+ if (sc->tempip->i_df.if_format == XFS_DINODE_FMT_BTREE) {
+ ASSERT(0);
+ return -EIO;
+ }
+
+ /*
+ * We begin in transaction context with sc->ip ILOCKed but not joined
+ * to the transaction. To reset to the initial state, we must hold
+ * sc->ip's ILOCK to prevent rename from updating parent pointer
+ * information and the tempfile's ILOCK to clear its contents.
+ */
+ xchk_iunlock(rx->sc, XFS_ILOCK_EXCL);
+ xrep_tempfile_ilock_both(sc);
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+
+ /*
+ * Free all the blocks of the attr fork of the temp file, and reset
+ * it back to local format.
+ */
+ if (xfs_ifork_has_extents(&sc->tempip->i_af)) {
+ error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ ASSERT(ifp->if_bytes == 0);
+ ifp->if_format = XFS_DINODE_FMT_LOCAL;
+ xfs_idata_realloc(sc->tempip, sizeof(*hdr), XFS_ATTR_FORK);
+ }
+
+ /* Reinitialize the attr fork to an empty shortform structure. */
+ hdr = ifp->if_data;
+ memset(hdr, 0, sizeof(*hdr));
+ hdr->totsize = cpu_to_be16(sizeof(*hdr));
+ xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+
+ /*
+ * Roll this transaction to commit our reset ondisk. The tempfile
+ * should no longer be joined to the transaction, so we drop its ILOCK.
+ * This should leave us in transaction context with sc->ip ILOCKed but
+ * not joined to the transaction.
+ */
+ error = xrep_roll_trans(sc);
+ if (error)
+ return error;
+ xrep_tempfile_iunlock(sc);
+
+ /*
+ * Erase any accumulated parent pointer updates now that we've erased
+ * the tempfile's attr fork. We're resetting the entire repair state
+ * back to where we were initially, except now we won't flush salvaged
+ * xattrs until the very end.
+ */
+ mutex_lock(&rx->lock);
+ xfarray_truncate(rx->pptr_recs);
+ xfblob_truncate(rx->pptr_names);
+ mutex_unlock(&rx->lock);
+
+ rx->can_flush = false;
+ rx->attrs_found = 0;
+
+ ASSERT(xfarray_bytes(rx->xattr_records) == 0);
+ ASSERT(xfblob_bytes(rx->xattr_blobs) == 0);
+ return 0;
+}
+
+/* Extract as many attribute keys and values as we can. */
+STATIC int
+xrep_xattr_recover(
+ struct xrep_xattr *rx)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_scrub *sc = rx->sc;
+ struct xfs_da_geometry *geo = sc->mp->m_attr_geo;
+ xfs_fileoff_t offset;
+ xfs_extlen_t len;
+ xfs_dablk_t dabno;
+ int nmap;
+ int error;
+
+restart:
+ /*
+ * Iterate each xattr leaf block in the attr fork to scan them for any
+ * attributes that we might salvage.
+ */
+ for (offset = 0;
+ offset < XFS_MAX_FILEOFF;
+ offset = got.br_startoff + got.br_blockcount) {
+ nmap = 1;
+ error = xfs_bmapi_read(sc->ip, offset, XFS_MAX_FILEOFF - offset,
+ &got, &nmap, XFS_BMAPI_ATTRFORK);
+ if (error)
+ return error;
+ if (nmap != 1)
+ return -EFSCORRUPTED;
+ if (!xfs_bmap_is_written_extent(&got))
+ continue;
+
+ for (dabno = round_up(got.br_startoff, geo->fsbcount);
+ dabno < got.br_startoff + got.br_blockcount;
+ dabno += len) {
+ xfs_fileoff_t curr_offset = dabno - got.br_startoff;
+ xfs_extlen_t maxlen;
+
+ if (xchk_should_terminate(rx->sc, &error))
+ return error;
+
+ maxlen = min_t(xfs_filblks_t, INT_MAX,
+ got.br_blockcount - curr_offset);
+ error = xrep_xattr_recover_block(rx, dabno,
+ curr_offset + got.br_startblock,
+ maxlen, &len);
+ if (error)
+ return error;
+
+ if (xrep_xattr_want_flush_stashed(rx)) {
+ error = xrep_xattr_flush_stashed(rx);
+ if (error)
+ return error;
+
+ if (xrep_xattr_saw_pptr_conflict(rx)) {
+ error = xrep_xattr_full_reset(rx);
+ if (error)
+ return error;
+
+ goto restart;
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Reset the extended attribute fork to a state where we can start re-adding
+ * the salvaged attributes.
+ */
+STATIC int
+xrep_xattr_fork_remove(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ struct xfs_attr_sf_hdr *hdr;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK);
+
+ /*
+ * If the data fork is in btree format, we can't change di_forkoff
+ * because we could run afoul of the rule that the data fork isn't
+ * supposed to be in btree format if there's enough space in the fork
+ * that it could have used extents format. Instead, reinitialize the
+ * attr fork to have a shortform structure with zero attributes.
+ */
+ if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE) {
+ ifp->if_format = XFS_DINODE_FMT_LOCAL;
+ hdr = xfs_idata_realloc(ip, (int)sizeof(*hdr) - ifp->if_bytes,
+ XFS_ATTR_FORK);
+ hdr->count = 0;
+ hdr->totsize = cpu_to_be16(sizeof(*hdr));
+ xfs_trans_log_inode(sc->tp, ip,
+ XFS_ILOG_CORE | XFS_ILOG_ADATA);
+ return 0;
+ }
+
+ /* If we still have attr fork extents, something's wrong. */
+ if (ifp->if_nextents != 0) {
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec irec;
+ unsigned int i = 0;
+
+ xfs_emerg(sc->mp,
+ "inode 0x%llx attr fork still has %llu attr extents, format %d?!",
+ ip->i_ino, ifp->if_nextents, ifp->if_format);
+ for_each_xfs_iext(ifp, &icur, &irec) {
+ xfs_err(sc->mp,
+ "[%u]: startoff %llu startblock %llu blockcount %llu state %u",
+ i++, irec.br_startoff,
+ irec.br_startblock, irec.br_blockcount,
+ irec.br_state);
+ }
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ xfs_attr_fork_remove(ip, sc->tp);
+ return 0;
+}
+
+/*
+ * Free all the attribute fork blocks of the file being repaired and delete the
+ * fork. The caller must ILOCK the scrub file and join it to the transaction.
+ * This function returns with the inode joined to a clean transaction.
+ */
+int
+xrep_xattr_reset_fork(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ trace_xrep_xattr_reset_fork(sc->ip, sc->ip);
+
+ /* Unmap all the attr blocks. */
+ if (xfs_ifork_has_extents(&sc->ip->i_af)) {
+ error = xrep_reap_ifork(sc, sc->ip, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ }
+
+ error = xrep_xattr_fork_remove(sc, sc->ip);
+ if (error)
+ return error;
+
+ return xfs_trans_roll_inode(&sc->tp, sc->ip);
+}
+
+/*
+ * Free all the attribute fork blocks of the temporary file and delete the attr
+ * fork. The caller must ILOCK the tempfile and join it to the transaction.
+ * This function returns with the inode joined to a clean scrub transaction.
+ */
+int
+xrep_xattr_reset_tempfile_fork(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ trace_xrep_xattr_reset_fork(sc->ip, sc->tempip);
+
+ /*
+ * Wipe out the attr fork of the temp file so that regular inode
+ * inactivation won't trip over the corrupt attr fork.
+ */
+ if (xfs_ifork_has_extents(&sc->tempip->i_af)) {
+ error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ }
+
+ return xrep_xattr_fork_remove(sc, sc->tempip);
+}
+
+/*
+ * Find all the extended attributes for this inode by scraping them out of the
+ * attribute key blocks by hand, and flushing them into the temp file.
+ * When we're done, free the staging memory before exchanging the xattr
+ * structures to reduce memory usage.
+ */
+STATIC int
+xrep_xattr_salvage_attributes(
+ struct xrep_xattr *rx)
+{
+ struct xfs_inode *ip = rx->sc->ip;
+ int error;
+
+ /* Short format xattrs are easy! */
+ if (rx->sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL) {
+ error = xrep_xattr_recover_sf(rx);
+ if (error)
+ return error;
+
+ return xrep_xattr_flush_stashed(rx);
+ }
+
+ /*
+ * For non-inline xattr structures, the salvage function scans the
+ * buffer cache looking for potential attr leaf blocks. The scan
+ * requires the ability to lock any buffer found and runs independently
+ * of any transaction <-> buffer item <-> buffer linkage. Therefore,
+ * roll the transaction to ensure there are no buffers joined. We hold
+ * the ILOCK independently of the transaction.
+ */
+ error = xfs_trans_roll(&rx->sc->tp);
+ if (error)
+ return error;
+
+ error = xfs_iread_extents(rx->sc->tp, ip, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ error = xrep_xattr_recover(rx);
+ if (error)
+ return error;
+
+ return xrep_xattr_flush_stashed(rx);
+}
+
+/*
+ * Add this stashed incore parent pointer to the temporary file. The caller
+ * must hold the tempdir's IOLOCK, must not hold any ILOCKs, and must not be in
+ * transaction context.
+ */
+STATIC int
+xrep_xattr_replay_pptr_update(
+ struct xrep_xattr *rx,
+ const struct xfs_name *xname,
+ struct xrep_xattr_pptr *pptr)
+{
+ struct xfs_scrub *sc = rx->sc;
+ int error;
+
+ switch (pptr->action) {
+ case XREP_XATTR_PPTR_ADD:
+ /* Create parent pointer. */
+ trace_xrep_xattr_replay_parentadd(sc->tempip, xname,
+ &pptr->pptr_rec);
+
+ error = xfs_parent_set(sc->tempip, sc->ip->i_ino, xname,
+ &pptr->pptr_rec, &rx->pptr_args);
+ ASSERT(error != -EEXIST);
+ return error;
+ case XREP_XATTR_PPTR_REMOVE:
+ /* Remove parent pointer. */
+ trace_xrep_xattr_replay_parentremove(sc->tempip, xname,
+ &pptr->pptr_rec);
+
+ error = xfs_parent_unset(sc->tempip, sc->ip->i_ino, xname,
+ &pptr->pptr_rec, &rx->pptr_args);
+ ASSERT(error != -ENOATTR);
+ return error;
+ }
+
+ ASSERT(0);
+ return -EIO;
+}
+
+/*
+ * Flush stashed parent pointer updates that have been recorded by the scanner.
+ * This is done to reduce the memory requirements of the xattr rebuild, since
+ * files can have a lot of hardlinks and the fs can be busy.
+ *
+ * Caller must not hold transactions or ILOCKs. Caller must hold the tempfile
+ * IOLOCK.
+ */
+STATIC int
+xrep_xattr_replay_pptr_updates(
+ struct xrep_xattr *rx)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ mutex_lock(&rx->lock);
+ foreach_xfarray_idx(rx->pptr_recs, array_cur) {
+ struct xrep_xattr_pptr pptr;
+
+ error = xfarray_load(rx->pptr_recs, array_cur, &pptr);
+ if (error)
+ goto out_unlock;
+
+ error = xfblob_loadname(rx->pptr_names, pptr.name_cookie,
+ &rx->xname, pptr.namelen);
+ if (error)
+ goto out_unlock;
+ mutex_unlock(&rx->lock);
+
+ error = xrep_xattr_replay_pptr_update(rx, &rx->xname, &pptr);
+ if (error)
+ return error;
+
+ mutex_lock(&rx->lock);
+ }
+
+ /* Empty out both arrays now that we've added the entries. */
+ xfarray_truncate(rx->pptr_recs);
+ xfblob_truncate(rx->pptr_names);
+ mutex_unlock(&rx->lock);
+ return 0;
+out_unlock:
+ mutex_unlock(&rx->lock);
+ return error;
+}
+
+/*
+ * Remember that we want to create a parent pointer in the tempfile. These
+ * stashed actions will be replayed later.
+ */
+STATIC int
+xrep_xattr_stash_parentadd(
+ struct xrep_xattr *rx,
+ const struct xfs_name *name,
+ const struct xfs_inode *dp)
+{
+ struct xrep_xattr_pptr pptr = {
+ .action = XREP_XATTR_PPTR_ADD,
+ .namelen = name->len,
+ };
+ int error;
+
+ trace_xrep_xattr_stash_parentadd(rx->sc->tempip, dp, name);
+
+ xfs_inode_to_parent_rec(&pptr.pptr_rec, dp);
+ error = xfblob_storename(rx->pptr_names, &pptr.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rx->pptr_recs, &pptr);
+}
+
+/*
+ * Remember that we want to remove a parent pointer from the tempfile. These
+ * stashed actions will be replayed later.
+ */
+STATIC int
+xrep_xattr_stash_parentremove(
+ struct xrep_xattr *rx,
+ const struct xfs_name *name,
+ const struct xfs_inode *dp)
+{
+ struct xrep_xattr_pptr pptr = {
+ .action = XREP_XATTR_PPTR_REMOVE,
+ .namelen = name->len,
+ };
+ int error;
+
+ trace_xrep_xattr_stash_parentremove(rx->sc->tempip, dp, name);
+
+ xfs_inode_to_parent_rec(&pptr.pptr_rec, dp);
+ error = xfblob_storename(rx->pptr_names, &pptr.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rx->pptr_recs, &pptr);
+}
+
+/*
+ * Capture dirent updates being made by other threads. We will have to replay
+ * the parent pointer updates before exchanging attr forks.
+ */
+STATIC int
+xrep_xattr_live_dirent_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xrep_xattr *rx;
+ struct xfs_scrub *sc;
+ int error;
+
+ rx = container_of(nb, struct xrep_xattr, dhook.dirent_hook.nb);
+ sc = rx->sc;
+
+ /*
+ * This thread updated a dirent that points to the file that we're
+ * repairing, so stash the update for replay against the temporary
+ * file.
+ */
+ if (p->ip->i_ino != sc->ip->i_ino)
+ return NOTIFY_DONE;
+
+ mutex_lock(&rx->lock);
+ if (p->delta > 0)
+ error = xrep_xattr_stash_parentadd(rx, p->name, p->dp);
+ else
+ error = xrep_xattr_stash_parentremove(rx, p->name, p->dp);
+ if (error)
+ rx->live_update_aborted = true;
+ mutex_unlock(&rx->lock);
+ return NOTIFY_DONE;
+}
+
+/*
+ * Prepare both inodes' attribute forks for an exchange. Promote the tempfile
+ * from short format to leaf format, and if the file being repaired has a short
+ * format attr fork, turn it into an empty extent list.
+ */
+STATIC int
+xrep_xattr_swap_prep(
+ struct xfs_scrub *sc,
+ bool temp_local,
+ bool ip_local)
+{
+ int error;
+
+ /*
+ * If the tempfile's attributes are in shortform format, convert that
+ * to a single leaf extent so that we can use the atomic mapping
+ * exchange.
+ */
+ if (temp_local) {
+ struct xfs_da_args args = {
+ .dp = sc->tempip,
+ .geo = sc->mp->m_attr_geo,
+ .whichfork = XFS_ATTR_FORK,
+ .trans = sc->tp,
+ .total = 1,
+ .owner = sc->ip->i_ino,
+ };
+
+ error = xfs_attr_shortform_to_leaf(&args);
+ if (error)
+ return error;
+
+ /*
+ * Roll the deferred log items to get us back to a clean
+ * transaction.
+ */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If the file being repaired had a shortform attribute fork, convert
+ * that to an empty extent list in preparation for the atomic mapping
+ * exchange.
+ */
+ if (ip_local) {
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
+
+ xfs_idestroy_fork(ifp);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ ifp->if_nextents = 0;
+ ifp->if_bytes = 0;
+ ifp->if_data = NULL;
+ ifp->if_height = 0;
+
+ xfs_trans_log_inode(sc->tp, sc->ip,
+ XFS_ILOG_CORE | XFS_ILOG_ADATA);
+ }
+
+ return 0;
+}
+
+/* Exchange the temporary file's attribute fork with the one being repaired. */
+int
+xrep_xattr_swap(
+ struct xfs_scrub *sc,
+ struct xrep_tempexch *tx)
+{
+ bool ip_local, temp_local;
+ int error = 0;
+
+ ip_local = sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL;
+ temp_local = sc->tempip->i_af.if_format == XFS_DINODE_FMT_LOCAL;
+
+ /*
+ * If the both files have a local format attr fork and the rebuilt
+ * xattr data would fit in the repaired file's attr fork, just copy
+ * the contents from the tempfile and declare ourselves done.
+ */
+ if (ip_local && temp_local) {
+ int forkoff;
+ int newsize;
+
+ newsize = xfs_attr_sf_totsize(sc->tempip);
+ forkoff = xfs_attr_shortform_bytesfit(sc->ip, newsize);
+ if (forkoff > 0) {
+ sc->ip->i_forkoff = forkoff;
+ xrep_tempfile_copyout_local(sc, XFS_ATTR_FORK);
+ return 0;
+ }
+ }
+
+ /* Otherwise, make sure both attr forks are in block-mapping mode. */
+ error = xrep_xattr_swap_prep(sc, temp_local, ip_local);
+ if (error)
+ return error;
+
+ return xrep_tempexch_contents(sc, tx);
+}
+
+/*
+ * Finish replaying stashed parent pointer updates, allocate a transaction for
+ * exchanging extent mappings, and take the ILOCKs of both files before we
+ * commit the new extended attribute structure.
+ */
+STATIC int
+xrep_xattr_finalize_tempfile(
+ struct xrep_xattr *rx)
+{
+ struct xfs_scrub *sc = rx->sc;
+ int error;
+
+ if (!xfs_has_parent(sc->mp))
+ return xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rx->tx);
+
+ /*
+ * Repair relies on the ILOCK to quiesce all possible xattr updates.
+ * Replay all queued parent pointer updates into the tempfile before
+ * exchanging the contents, even if that means dropping the ILOCKs and
+ * the transaction.
+ */
+ do {
+ error = xrep_xattr_replay_pptr_updates(rx);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rx->tx);
+ if (error)
+ return error;
+
+ if (xfarray_length(rx->pptr_recs) == 0)
+ break;
+
+ xchk_trans_cancel(sc);
+ xrep_tempfile_iunlock_both(sc);
+ } while (!xchk_should_terminate(sc, &error));
+ return error;
+}
+
+/*
+ * Exchange the new extended attribute data (which we created in the tempfile)
+ * with the file being repaired.
+ */
+STATIC int
+xrep_xattr_rebuild_tree(
+ struct xrep_xattr *rx)
+{
+ struct xfs_scrub *sc = rx->sc;
+ int error;
+
+ /*
+ * If we didn't find any attributes to salvage, repair the file by
+ * zapping its attr fork.
+ */
+ if (rx->attrs_found == 0) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ error = xrep_xattr_reset_fork(sc);
+ if (error)
+ return error;
+
+ goto forget_acls;
+ }
+
+ trace_xrep_xattr_rebuild_tree(sc->ip, sc->tempip);
+
+ /*
+ * Commit the repair transaction and drop the ILOCKs so that we can use
+ * the atomic file content exchange helper functions to compute the
+ * correct resource reservations.
+ *
+ * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent xattr
+ * modifications, but there's nothing to prevent userspace from reading
+ * the attributes until we're ready for the exchange operation. Reads
+ * will return -EIO without shutting down the fs, so we're ok with
+ * that.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the IOLOCK on the temporary file so that we can run xattr
+ * operations with the same locks held as we would for a normal file.
+ * We still hold sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rx->sc);
+ if (error)
+ return error;
+
+ /*
+ * Allocate transaction, lock inodes, and make sure that we've replayed
+ * all the stashed parent pointer updates to the temp file. After this
+ * point, we're ready to exchange attr fork mappings.
+ */
+ error = xrep_xattr_finalize_tempfile(rx);
+ if (error)
+ return error;
+
+ /*
+ * Exchange the blocks mapped by the tempfile's attr fork with the file
+ * being repaired. The old attr blocks will then be attached to the
+ * tempfile, so reap its attr fork.
+ */
+ error = xrep_xattr_swap(sc, &rx->tx);
+ if (error)
+ return error;
+
+ error = xrep_xattr_reset_tempfile_fork(sc);
+ if (error)
+ return error;
+
+ /*
+ * Roll to get a transaction without any inodes joined to it. Then we
+ * can drop the tempfile's ILOCK and IOLOCK before doing more work on
+ * the scrub target file.
+ */
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+
+ xrep_tempfile_iunlock(sc);
+ xrep_tempfile_iounlock(sc);
+
+forget_acls:
+ /* Invalidate cached ACLs now that we've reloaded all the xattrs. */
+ xfs_forget_acl(VFS_I(sc->ip), SGI_ACL_FILE);
+ xfs_forget_acl(VFS_I(sc->ip), SGI_ACL_DEFAULT);
+ return 0;
+}
+
+/* Tear down all the incore scan stuff we created. */
+STATIC void
+xrep_xattr_teardown(
+ struct xrep_xattr *rx)
+{
+ if (xfs_has_parent(rx->sc->mp))
+ xfs_dir_hook_del(rx->sc->mp, &rx->dhook);
+ if (rx->pptr_names)
+ xfblob_destroy(rx->pptr_names);
+ if (rx->pptr_recs)
+ xfarray_destroy(rx->pptr_recs);
+ xfblob_destroy(rx->xattr_blobs);
+ xfarray_destroy(rx->xattr_records);
+ mutex_destroy(&rx->lock);
+ kfree(rx);
+}
+
+/* Set up the filesystem scan so we can regenerate extended attributes. */
+STATIC int
+xrep_xattr_setup_scan(
+ struct xfs_scrub *sc,
+ struct xrep_xattr **rxp)
+{
+ struct xrep_xattr *rx;
+ char *descr;
+ int max_len;
+ int error;
+
+ rx = kzalloc(sizeof(struct xrep_xattr), XCHK_GFP_FLAGS);
+ if (!rx)
+ return -ENOMEM;
+ rx->sc = sc;
+ rx->can_flush = true;
+ rx->xname.name = rx->namebuf;
+
+ mutex_init(&rx->lock);
+
+ /*
+ * Allocate enough memory to handle loading local attr values from the
+ * xfblob data while flushing stashed attrs to the temporary file.
+ * We only realloc the buffer when salvaging remote attr values.
+ */
+ max_len = xfs_attr_leaf_entsize_local_max(sc->mp->m_attr_geo->blksize);
+ error = xchk_setup_xattr_buf(rx->sc, max_len);
+ if (error == -ENOMEM)
+ error = -EDEADLOCK;
+ if (error)
+ goto out_rx;
+
+ /* Set up some staging for salvaged attribute keys and values */
+ descr = xchk_xfile_ino_descr(sc, "xattr keys");
+ error = xfarray_create(descr, 0, sizeof(struct xrep_xattr_key),
+ &rx->xattr_records);
+ kfree(descr);
+ if (error)
+ goto out_rx;
+
+ descr = xchk_xfile_ino_descr(sc, "xattr names");
+ error = xfblob_create(descr, &rx->xattr_blobs);
+ kfree(descr);
+ if (error)
+ goto out_keys;
+
+ if (xfs_has_parent(sc->mp)) {
+ ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+
+ descr = xchk_xfile_ino_descr(sc,
+ "xattr retained parent pointer entries");
+ error = xfarray_create(descr, 0,
+ sizeof(struct xrep_xattr_pptr),
+ &rx->pptr_recs);
+ kfree(descr);
+ if (error)
+ goto out_values;
+
+ descr = xchk_xfile_ino_descr(sc,
+ "xattr retained parent pointer names");
+ error = xfblob_create(descr, &rx->pptr_names);
+ kfree(descr);
+ if (error)
+ goto out_pprecs;
+
+ xfs_dir_hook_setup(&rx->dhook, xrep_xattr_live_dirent_update);
+ error = xfs_dir_hook_add(sc->mp, &rx->dhook);
+ if (error)
+ goto out_ppnames;
+ }
+
+ *rxp = rx;
+ return 0;
+out_ppnames:
+ xfblob_destroy(rx->pptr_names);
+out_pprecs:
+ xfarray_destroy(rx->pptr_recs);
+out_values:
+ xfblob_destroy(rx->xattr_blobs);
+out_keys:
+ xfarray_destroy(rx->xattr_records);
+out_rx:
+ mutex_destroy(&rx->lock);
+ kfree(rx);
+ return error;
+}
+
+/*
+ * Repair the extended attribute metadata.
+ *
+ * XXX: Remote attribute value buffers encompass the entire (up to 64k) buffer.
+ * The buffer cache in XFS can't handle aliased multiblock buffers, so this
+ * might misbehave if the attr fork is crosslinked with other filesystem
+ * metadata.
+ */
+int
+xrep_xattr(
+ struct xfs_scrub *sc)
+{
+ struct xrep_xattr *rx = NULL;
+ int error;
+
+ if (!xfs_inode_hasattr(sc->ip))
+ return -ENOENT;
+
+ /* The rmapbt is required to reap the old attr fork. */
+ if (!xfs_has_rmapbt(sc->mp))
+ return -EOPNOTSUPP;
+ /* We require atomic file exchange range to rebuild anything. */
+ if (!xfs_has_exchange_range(sc->mp))
+ return -EOPNOTSUPP;
+
+ error = xrep_xattr_setup_scan(sc, &rx);
+ if (error)
+ return error;
+
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ error = xrep_xattr_salvage_attributes(rx);
+ if (error)
+ goto out_scan;
+
+ if (rx->live_update_aborted) {
+ error = -EIO;
+ goto out_scan;
+ }
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto out_scan;
+
+ error = xrep_xattr_rebuild_tree(rx);
+ if (error)
+ goto out_scan;
+
+out_scan:
+ xrep_xattr_teardown(rx);
+ return error;
+}
diff --git a/fs/xfs/scrub/attr_repair.h b/fs/xfs/scrub/attr_repair.h
new file mode 100644
index 000000000000..979729bd4a5f
--- /dev/null
+++ b/fs/xfs/scrub/attr_repair.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_ATTR_REPAIR_H__
+#define __XFS_SCRUB_ATTR_REPAIR_H__
+
+struct xrep_tempexch;
+
+int xrep_xattr_swap(struct xfs_scrub *sc, struct xrep_tempexch *tx);
+int xrep_xattr_reset_fork(struct xfs_scrub *sc);
+int xrep_xattr_reset_tempfile_fork(struct xfs_scrub *sc);
+
+#endif /* __XFS_SCRUB_ATTR_REPAIR_H__ */
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index 0cb8d43912a8..7ba35a7a7920 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -40,22 +40,23 @@ struct xbitmap64_node {
* These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll
* forward-declare them anyway for clarity.
*/
-static inline void
+static inline __maybe_unused void
xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root);
-static inline void
+static inline __maybe_unused void
xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root);
-static inline struct xbitmap64_node *
+static inline __maybe_unused struct xbitmap64_node *
xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start,
uint64_t last);
-static inline struct xbitmap64_node *
+static inline __maybe_unused struct xbitmap64_node *
xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start,
uint64_t last);
INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t,
- __bn_subtree_last, START, LAST, static inline, xbitmap64_tree)
+ __bn_subtree_last, START, LAST, static inline __maybe_unused,
+ xbitmap64_tree)
/* Iterate each interval of a bitmap. Do not change the bitmap. */
#define for_each_xbitmap64_extent(bn, bitmap) \
@@ -314,22 +315,23 @@ struct xbitmap32_node {
* These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll
* forward-declare them anyway for clarity.
*/
-static inline void
+static inline __maybe_unused void
xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root);
-static inline void
+static inline __maybe_unused void
xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root);
-static inline struct xbitmap32_node *
+static inline __maybe_unused struct xbitmap32_node *
xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start,
uint32_t last);
-static inline struct xbitmap32_node *
+static inline __maybe_unused struct xbitmap32_node *
xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start,
uint32_t last);
INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t,
- __bn_subtree_last, START, LAST, static inline, xbitmap32_tree)
+ __bn_subtree_last, START, LAST, static inline __maybe_unused,
+ xbitmap32_tree)
/* Iterate each interval of a bitmap. Do not change the bitmap. */
#define for_each_xbitmap32_extent(bn, bitmap) \
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 47a20cf5205f..1ad8ec63a7f4 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -31,6 +31,8 @@
#include "xfs_ag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
+#include "xfs_exchmaps.h"
+#include "xfs_rtbitmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -445,7 +447,7 @@ xchk_perag_read_headers(
{
int error;
- error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
+ error = xfs_ialloc_read_agi(sa->pag, sc->tp, 0, &sa->agi_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
return error;
@@ -781,7 +783,7 @@ xchk_iget(
{
ASSERT(sc->tp != NULL);
- return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
+ return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp);
}
/*
@@ -827,13 +829,13 @@ again:
* in the iget cache miss path.
*/
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
- error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp);
xfs_perag_put(pag);
if (error)
return error;
- error = xfs_iget(mp, tp, inum,
- XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp);
+ error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0,
+ ipp);
if (error == -EAGAIN) {
/*
* The inode may be in core but temporarily unavailable and may
@@ -1060,12 +1062,6 @@ xchk_irele(
spin_lock(&VFS_I(ip)->i_lock);
VFS_I(ip)->i_state &= ~I_DONTCACHE;
spin_unlock(&VFS_I(ip)->i_lock);
- } else if (atomic_read(&VFS_I(ip)->i_count) == 1) {
- /*
- * If this is the last reference to the inode and the caller
- * permits it, set DONTCACHE to avoid thrashing.
- */
- d_mark_dontcache(VFS_I(ip));
}
xfs_irele(ip);
@@ -1202,27 +1198,12 @@ xchk_metadata_inode_subtype(
struct xfs_scrub *sc,
unsigned int scrub_type)
{
- __u32 smtype = sc->sm->sm_type;
- unsigned int sick_mask = sc->sick_mask;
+ struct xfs_scrub_subord *sub;
int error;
- sc->sm->sm_type = scrub_type;
-
- switch (scrub_type) {
- case XFS_SCRUB_TYPE_INODE:
- error = xchk_inode(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTD:
- error = xchk_bmap_data(sc);
- break;
- default:
- ASSERT(0);
- error = -EFSCORRUPTED;
- break;
- }
-
- sc->sick_mask = sick_mask;
- sc->sm->sm_type = smtype;
+ sub = xchk_scrub_create_subord(sc, scrub_type);
+ error = sub->sc.ops->scrub(&sub->sc);
+ xchk_scrub_free_subord(sub);
return error;
}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 89f7bbec887e..3d5f1f6b4b7b 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -6,31 +6,6 @@
#ifndef __XFS_SCRUB_COMMON_H__
#define __XFS_SCRUB_COMMON_H__
-/*
- * We /could/ terminate a scrub/repair operation early. If we're not
- * in a good place to continue (fatal signal, etc.) then bail out.
- * Note that we're careful not to make any judgements about *error.
- */
-static inline bool
-xchk_should_terminate(
- struct xfs_scrub *sc,
- int *error)
-{
- /*
- * If preemption is disabled, we need to yield to the scheduler every
- * few seconds so that we don't run afoul of the soft lockup watchdog
- * or RCU stall detector.
- */
- cond_resched();
-
- if (fatal_signal_pending(current)) {
- if (*error == 0)
- *error = -EINTR;
- return true;
- }
- return false;
-}
-
int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
int xchk_trans_alloc_empty(struct xfs_scrub *sc);
void xchk_trans_cancel(struct xfs_scrub *sc);
@@ -92,6 +67,7 @@ int xchk_setup_directory(struct xfs_scrub *sc);
int xchk_setup_xattr(struct xfs_scrub *sc);
int xchk_setup_symlink(struct xfs_scrub *sc);
int xchk_setup_parent(struct xfs_scrub *sc);
+int xchk_setup_dirtree(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xchk_setup_rtbitmap(struct xfs_scrub *sc);
int xchk_setup_rtsummary(struct xfs_scrub *sc);
@@ -212,6 +188,7 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
}
bool xchk_dir_looks_zapped(struct xfs_inode *dp);
+bool xchk_pptr_looks_zapped(struct xfs_inode *ip);
#ifdef CONFIG_XFS_ONLINE_REPAIR
/* Decide if a repair is required. */
diff --git a/fs/xfs/scrub/dab_bitmap.h b/fs/xfs/scrub/dab_bitmap.h
new file mode 100644
index 000000000000..0c6e3aad4395
--- /dev/null
+++ b/fs/xfs/scrub/dab_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_DAB_BITMAP_H__
+#define __XFS_SCRUB_DAB_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_dablk_t */
+
+struct xdab_bitmap {
+ struct xbitmap32 dabitmap;
+};
+
+static inline void xdab_bitmap_init(struct xdab_bitmap *bitmap)
+{
+ xbitmap32_init(&bitmap->dabitmap);
+}
+
+static inline void xdab_bitmap_destroy(struct xdab_bitmap *bitmap)
+{
+ xbitmap32_destroy(&bitmap->dabitmap);
+}
+
+static inline int xdab_bitmap_set(struct xdab_bitmap *bitmap,
+ xfs_dablk_t dabno, xfs_extlen_t len)
+{
+ return xbitmap32_set(&bitmap->dabitmap, dabno, len);
+}
+
+static inline bool xdab_bitmap_test(struct xdab_bitmap *bitmap,
+ xfs_dablk_t dabno, xfs_extlen_t *len)
+{
+ return xbitmap32_test(&bitmap->dabitmap, dabno, len);
+}
+
+#endif /* __XFS_SCRUB_DAB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 82b150d3b8b7..056de4819f86 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -78,6 +78,22 @@ xchk_da_set_corrupt(
__return_address);
}
+/* Flag a da btree node in need of optimization. */
+void
+xchk_da_set_preen(
+ struct xchk_da_btree *ds,
+ int level)
+{
+ struct xfs_scrub *sc = ds->sc;
+
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
+ trace_xchk_fblock_preen(sc, ds->dargs.whichfork,
+ xfs_dir2_da_to_db(ds->dargs.geo,
+ ds->state->path.blk[level].blkno),
+ __return_address);
+}
+
+/* Find an entry at a certain level in a da btree. */
static struct xfs_da_node_entry *
xchk_da_btree_node_entry(
struct xchk_da_btree *ds,
@@ -320,6 +336,7 @@ xchk_da_btree_block(
struct xfs_da3_blkinfo *hdr3;
struct xfs_da_args *dargs = &ds->dargs;
struct xfs_inode *ip = ds->dargs.dp;
+ xfs_failaddr_t fa;
xfs_ino_t owner;
int *pmaxrecs;
struct xfs_da3_icnode_hdr nodehdr;
@@ -442,6 +459,12 @@ xchk_da_btree_block(
goto out_freebp;
}
+ fa = xfs_da3_header_check(blk->bp, dargs->owner);
+ if (fa) {
+ xchk_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+
/*
* If we've been handed a block that is below the dabtree root, does
* its hashval match what the parent block expected to see?
@@ -494,6 +517,7 @@ xchk_da_btree(
ds->dargs.whichfork = whichfork;
ds->dargs.trans = sc->tp;
ds->dargs.op_flags = XFS_DA_OP_OKNOENT;
+ ds->dargs.owner = sc->ip->i_ino;
ds->state = xfs_da_state_alloc(&ds->dargs);
ds->sc = sc;
ds->private = private;
diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h
index 4f8c2138a1ec..de291e3b77dd 100644
--- a/fs/xfs/scrub/dabtree.h
+++ b/fs/xfs/scrub/dabtree.h
@@ -35,6 +35,9 @@ bool xchk_da_process_error(struct xchk_da_btree *ds, int level, int *error);
/* Check for da btree corruption. */
void xchk_da_set_corrupt(struct xchk_da_btree *ds, int level);
+void xchk_da_set_preen(struct xchk_da_btree *ds, int level);
+
+void xchk_da_set_preen(struct xchk_da_btree *ds, int level);
int xchk_da_btree_hash(struct xchk_da_btree *ds, int level, __be32 *hashp);
int xchk_da_btree(struct xfs_scrub *sc, int whichfork,
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 076a310b8eb0..bf9199e8df63 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -16,22 +16,70 @@
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_health.h"
+#include "xfs_attr.h"
+#include "xfs_parent.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/dabtree.h"
#include "scrub/readdir.h"
#include "scrub/health.h"
+#include "scrub/repair.h"
+#include "scrub/trace.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
/* Set us up to scrub directories. */
int
xchk_setup_directory(
struct xfs_scrub *sc)
{
+ int error;
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_directory(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_inode_contents(sc, 0);
}
/* Directories */
+/* Deferred directory entry that we saved for later. */
+struct xchk_dirent {
+ /* Cookie for retrieval of the dirent name. */
+ xfblob_cookie name_cookie;
+
+ /* Child inode number. */
+ xfs_ino_t ino;
+
+ /* Length of the pptr name. */
+ uint8_t namelen;
+};
+
+struct xchk_dir {
+ struct xfs_scrub *sc;
+
+ /* information for parent pointer validation. */
+ struct xfs_parent_rec pptr_rec;
+ struct xfs_da_args pptr_args;
+
+ /* Fixed-size array of xchk_dirent structures. */
+ struct xfarray *dir_entries;
+
+ /* Blobs containing dirent names. */
+ struct xfblob *dir_names;
+
+ /* If we've cycled the ILOCK, we must revalidate deferred dirents. */
+ bool need_revalidate;
+
+ /* Name buffer for dirent revalidation. */
+ struct xfs_name xname;
+ uint8_t namebuf[MAXNAMELEN];
+};
+
/* Scrub a directory entry. */
/* Check that an inode's mode matches a given XFS_DIR3_FT_* type. */
@@ -55,6 +103,108 @@ xchk_dir_check_ftype(
}
/*
+ * Try to lock a child file for checking parent pointers. Returns the inode
+ * flags for the locks we now hold, or zero if we failed.
+ */
+STATIC unsigned int
+xchk_dir_lock_child(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+ return 0;
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return 0;
+ }
+
+ if (!xfs_inode_has_attr_fork(ip) || !xfs_need_iread_extents(&ip->i_af))
+ return XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED;
+
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return 0;
+ }
+
+ return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL;
+}
+
+/* Check the backwards link (parent pointer) associated with this dirent. */
+STATIC int
+xchk_dir_parent_pointer(
+ struct xchk_dir *sd,
+ const struct xfs_name *name,
+ struct xfs_inode *ip)
+{
+ struct xfs_scrub *sc = sd->sc;
+ int error;
+
+ xfs_inode_to_parent_rec(&sd->pptr_rec, sc->ip);
+ error = xfs_parent_lookup(sc->tp, ip, name, &sd->pptr_rec,
+ &sd->pptr_args);
+ if (error == -ENOATTR)
+ xchk_fblock_xref_set_corrupt(sc, XFS_DATA_FORK, 0);
+
+ return 0;
+}
+
+/* Look for a parent pointer matching this dirent, if the child isn't busy. */
+STATIC int
+xchk_dir_check_pptr_fast(
+ struct xchk_dir *sd,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ struct xfs_inode *ip)
+{
+ struct xfs_scrub *sc = sd->sc;
+ unsigned int lockmode;
+ int error;
+
+ /* dot and dotdot entries do not have parent pointers */
+ if (xfs_dir2_samename(name, &xfs_name_dot) ||
+ xfs_dir2_samename(name, &xfs_name_dotdot))
+ return 0;
+
+ /* No self-referential non-dot or dotdot dirents. */
+ if (ip == sc->ip) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return -ECANCELED;
+ }
+
+ /* Try to lock the inode. */
+ lockmode = xchk_dir_lock_child(sc, ip);
+ if (!lockmode) {
+ struct xchk_dirent save_de = {
+ .namelen = name->len,
+ .ino = ip->i_ino,
+ };
+
+ /* Couldn't lock the inode, so save the dirent for later. */
+ trace_xchk_dir_defer(sc->ip, name, ip->i_ino);
+
+ error = xfblob_storename(sd->dir_names, &save_de.name_cookie,
+ name);
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ return error;
+
+ error = xfarray_append(sd->dir_entries, &save_de);
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ return error;
+
+ return 0;
+ }
+
+ error = xchk_dir_parent_pointer(sd, name, ip);
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
+
+/*
* Scrub a single directory entry.
*
* Check the inode number to make sure it's sane, then we check that we can
@@ -71,6 +221,7 @@ xchk_dir_actor(
{
struct xfs_mount *mp = dp->i_mount;
struct xfs_inode *ip;
+ struct xchk_dir *sd = priv;
xfs_ino_t lookup_ino;
xfs_dablk_t offset;
int error = 0;
@@ -137,6 +288,14 @@ xchk_dir_actor(
goto out;
xchk_dir_check_ftype(sc, offset, ip, name->type);
+
+ if (xfs_has_parent(mp)) {
+ error = xchk_dir_check_pptr_fast(sd, dapos, name, ip);
+ if (error)
+ goto out_rele;
+ }
+
+out_rele:
xchk_irele(sc, ip);
out:
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
@@ -196,8 +355,8 @@ xchk_dir_rec(
xchk_da_set_corrupt(ds, level);
goto out;
}
- error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno,
- XFS_DABUF_MAP_HOLE_OK, &bp);
+ error = xfs_dir3_data_read(ds->dargs.trans, dp, ds->dargs.owner,
+ rec_bno, XFS_DABUF_MAP_HOLE_OK, &bp);
if (!xchk_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno,
&error))
goto out;
@@ -315,10 +474,11 @@ xchk_directory_data_bestfree(
/* dir block format */
if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
- error = xfs_dir3_block_read(sc->tp, sc->ip, &bp);
+ error = xfs_dir3_block_read(sc->tp, sc->ip, sc->ip->i_ino, &bp);
} else {
/* dir data format */
- error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, 0, &bp);
+ error = xfs_dir3_data_read(sc->tp, sc->ip, sc->ip->i_ino, lblk,
+ 0, &bp);
}
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
goto out;
@@ -470,7 +630,7 @@ xchk_directory_leaf1_bestfree(
int error;
/* Read the free space block. */
- error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp);
+ error = xfs_dir3_leaf_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
return error;
xchk_buffer_recheck(sc, bp);
@@ -531,10 +691,9 @@ xchk_directory_leaf1_bestfree(
/* Check all the bestfree entries. */
for (i = 0; i < bestcount; i++, bestp++) {
best = be16_to_cpu(*bestp);
- error = xfs_dir3_data_read(sc->tp, sc->ip,
+ error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner,
xfs_dir2_db_to_da(args->geo, i),
- XFS_DABUF_MAP_HOLE_OK,
- &dbp);
+ XFS_DABUF_MAP_HOLE_OK, &dbp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
&error))
break;
@@ -577,7 +736,7 @@ xchk_directory_free_bestfree(
int error;
/* Read the free space block */
- error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp);
+ error = xfs_dir2_free_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
return error;
xchk_buffer_recheck(sc, bp);
@@ -597,7 +756,7 @@ xchk_directory_free_bestfree(
stale++;
continue;
}
- error = xfs_dir3_data_read(sc->tp, sc->ip,
+ error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner,
(freehdr.firstdb + i) * args->geo->fsbcount,
0, &dbp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
@@ -621,10 +780,11 @@ xchk_directory_blocks(
{
struct xfs_bmbt_irec got;
struct xfs_da_args args = {
- .dp = sc ->ip,
+ .dp = sc->ip,
.whichfork = XFS_DATA_FORK,
.geo = sc->mp->m_dir_geo,
.trans = sc->tp,
+ .owner = sc->ip->i_ino,
};
struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
struct xfs_mount *mp = sc->mp;
@@ -648,7 +808,8 @@ xchk_directory_blocks(
free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET);
/* Is this a block dir? */
- error = xfs_dir2_isblock(&args, &is_block);
+ if (xfs_dir2_format(&args, &error) == XFS_DIR2_FMT_BLOCK)
+ is_block = true;
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
goto out;
@@ -752,11 +913,148 @@ out:
return error;
}
+/*
+ * Revalidate a dirent that we collected in the past but couldn't check because
+ * of lock contention. Returns 0 if the dirent is still valid, -ENOENT if it
+ * has gone away on us, or a negative errno.
+ */
+STATIC int
+xchk_dir_revalidate_dirent(
+ struct xchk_dir *sd,
+ const struct xfs_name *xname,
+ xfs_ino_t ino)
+{
+ struct xfs_scrub *sc = sd->sc;
+ xfs_ino_t child_ino;
+ int error;
+
+ /*
+ * Look up the directory entry. If we get -ENOENT, the directory entry
+ * went away and there's nothing to revalidate. Return any other
+ * error.
+ */
+ error = xchk_dir_lookup(sc, sc->ip, xname, &child_ino);
+ if (error)
+ return error;
+
+ /* The inode number changed, nothing to revalidate. */
+ if (ino != child_ino)
+ return -ENOENT;
+
+ return 0;
+}
+
+/*
+ * Check a directory entry's parent pointers the slow way, which means we cycle
+ * locks a bunch and put up with revalidation until we get it done.
+ */
+STATIC int
+xchk_dir_slow_dirent(
+ struct xchk_dir *sd,
+ struct xchk_dirent *dirent,
+ const struct xfs_name *xname)
+{
+ struct xfs_scrub *sc = sd->sc;
+ struct xfs_inode *ip;
+ unsigned int lockmode;
+ int error;
+
+ /* Check that the deferred dirent still exists. */
+ if (sd->need_revalidate) {
+ error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino);
+ if (error == -ENOENT)
+ return 0;
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ return error;
+ }
+
+ error = xchk_iget(sc, dirent->ino, &ip);
+ if (error == -EINVAL || error == -ENOENT) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
+ return error;
+
+ /*
+ * If we can grab both IOLOCK and ILOCK of the alleged child, we can
+ * proceed with the validation.
+ */
+ lockmode = xchk_dir_lock_child(sc, ip);
+ if (lockmode) {
+ trace_xchk_dir_slowpath(sc->ip, xname, ip->i_ino);
+ goto check_pptr;
+ }
+
+ /*
+ * We couldn't lock the child file. Drop all the locks and try to
+ * get them again, one at a time.
+ */
+ xchk_iunlock(sc, sc->ilock_flags);
+ sd->need_revalidate = true;
+
+ trace_xchk_dir_ultraslowpath(sc->ip, xname, ip->i_ino);
+
+ error = xchk_dir_trylock_for_pptrs(sc, ip, &lockmode);
+ if (error)
+ goto out_rele;
+
+ /* Revalidate, since we just cycled the locks. */
+ error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino);
+ if (error == -ENOENT) {
+ error = 0;
+ goto out_unlock;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out_unlock;
+
+check_pptr:
+ error = xchk_dir_parent_pointer(sd, xname, ip);
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+out_rele:
+ xchk_irele(sc, ip);
+ return error;
+}
+
+/* Check all the dirents that we deferred the first time around. */
+STATIC int
+xchk_dir_finish_slow_dirents(
+ struct xchk_dir *sd)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ foreach_xfarray_idx(sd->dir_entries, array_cur) {
+ struct xchk_dirent dirent;
+
+ if (sd->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ error = xfarray_load(sd->dir_entries, array_cur, &dirent);
+ if (error)
+ return error;
+
+ error = xfblob_loadname(sd->dir_names, dirent.name_cookie,
+ &sd->xname, dirent.namelen);
+ if (error)
+ return error;
+
+ error = xchk_dir_slow_dirent(sd, &dirent, &sd->xname);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
/* Scrub a whole directory. */
int
xchk_directory(
struct xfs_scrub *sc)
{
+ struct xchk_dir *sd;
int error;
if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
@@ -789,9 +1087,60 @@ xchk_directory(
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return 0;
+ sd = kvzalloc(sizeof(struct xchk_dir), XCHK_GFP_FLAGS);
+ if (!sd)
+ return -ENOMEM;
+ sd->sc = sc;
+ sd->xname.name = sd->namebuf;
+
+ if (xfs_has_parent(sc->mp)) {
+ char *descr;
+
+ /*
+ * Set up some staging memory for dirents that we can't check
+ * due to locking contention.
+ */
+ descr = xchk_xfile_ino_descr(sc, "slow directory entries");
+ error = xfarray_create(descr, 0, sizeof(struct xchk_dirent),
+ &sd->dir_entries);
+ kfree(descr);
+ if (error)
+ goto out_sd;
+
+ descr = xchk_xfile_ino_descr(sc, "slow directory entry names");
+ error = xfblob_create(descr, &sd->dir_names);
+ kfree(descr);
+ if (error)
+ goto out_entries;
+ }
+
/* Look up every name in this directory by hash. */
- error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL);
- if (error && error != -ECANCELED)
+ error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, sd);
+ if (error == -ECANCELED)
+ error = 0;
+ if (error)
+ goto out_names;
+
+ if (xfs_has_parent(sc->mp)) {
+ error = xchk_dir_finish_slow_dirents(sd);
+ if (error == -ETIMEDOUT) {
+ /* Couldn't grab a lock, scrub was marked incomplete */
+ error = 0;
+ goto out_names;
+ }
+ if (error)
+ goto out_names;
+ }
+
+out_names:
+ if (sd->dir_names)
+ xfblob_destroy(sd->dir_names);
+out_entries:
+ if (sd->dir_entries)
+ xfarray_destroy(sd->dir_entries);
+out_sd:
+ kvfree(sd);
+ if (error)
return error;
/* If the dir is clean, it is clearly not zapped. */
diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c
new file mode 100644
index 000000000000..64679fe08446
--- /dev/null
+++ b/fs/xfs/scrub/dir_repair.c
@@ -0,0 +1,1958 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_bmap_util.h"
+#include "xfs_exchmaps.h"
+#include "xfs_exchrange.h"
+#include "xfs_ag.h"
+#include "xfs_parent.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/iscan.h"
+#include "scrub/readdir.h"
+#include "scrub/reap.h"
+#include "scrub/findparent.h"
+#include "scrub/orphanage.h"
+#include "scrub/listxattr.h"
+
+/*
+ * Directory Repair
+ * ================
+ *
+ * We repair directories by reading the directory data blocks looking for
+ * directory entries that look salvageable (name passes verifiers, entry points
+ * to a valid allocated inode, etc). Each entry worth salvaging is stashed in
+ * memory, and the stashed entries are periodically replayed into a temporary
+ * directory to constrain memory use. Batching the construction of the
+ * temporary directory in this fashion reduces lock cycling of the directory
+ * being repaired and the temporary directory, and will later become important
+ * for parent pointer scanning.
+ *
+ * If parent pointers are enabled on this filesystem, we instead reconstruct
+ * the directory by visiting each parent pointer of each file in the filesystem
+ * and translating the relevant parent pointer records into dirents. In this
+ * case, it is advantageous to stash all directory entries created from parent
+ * pointers for a single child file before replaying them into the temporary
+ * directory. To save memory, the live filesystem scan reuses the findparent
+ * fields. Directory repair chooses either parent pointer scanning or
+ * directory entry salvaging, but not both.
+ *
+ * Directory entries added to the temporary directory do not elevate the link
+ * counts of the inodes found. When salvaging completes, the remaining stashed
+ * entries are replayed to the temporary directory. An atomic mapping exchange
+ * is used to commit the new directory blocks to the directory being repaired.
+ * This will disrupt readdir cursors.
+ *
+ * Locking Issues
+ * --------------
+ *
+ * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on
+ * /a/b for a "mv /a/b /c/" operation. This means that only b's ILOCK protects
+ * b's dotdot update. This is in contrast to every other dotdot update (link,
+ * remove, mkdir). If the repair code drops the ILOCK, it must either
+ * revalidate the dotdot entry or use dirent hooks to capture updates from
+ * other threads.
+ */
+
+/* Create a dirent in the tempdir. */
+#define XREP_DIRENT_ADD (1)
+
+/* Remove a dirent from the tempdir. */
+#define XREP_DIRENT_REMOVE (2)
+
+/* Directory entry to be restored in the new directory. */
+struct xrep_dirent {
+ /* Cookie for retrieval of the dirent name. */
+ xfblob_cookie name_cookie;
+
+ /* Target inode number. */
+ xfs_ino_t ino;
+
+ /* Length of the dirent name. */
+ uint8_t namelen;
+
+ /* File type of the dirent. */
+ uint8_t ftype;
+
+ /* XREP_DIRENT_{ADD,REMOVE} */
+ uint8_t action;
+};
+
+/*
+ * Stash up to 8 pages of recovered dirent data in dir_entries and dir_names
+ * before we write them to the temp dir.
+ */
+#define XREP_DIR_MAX_STASH_BYTES (PAGE_SIZE * 8)
+
+struct xrep_dir {
+ struct xfs_scrub *sc;
+
+ /* Fixed-size array of xrep_dirent structures. */
+ struct xfarray *dir_entries;
+
+ /* Blobs containing directory entry names. */
+ struct xfblob *dir_names;
+
+ /* Information for exchanging data forks at the end. */
+ struct xrep_tempexch tx;
+
+ /* Preallocated args struct for performing dir operations */
+ struct xfs_da_args args;
+
+ /*
+ * Information used to scan the filesystem to find the inumber of the
+ * dotdot entry for this directory. For directory salvaging when
+ * parent pointers are not enabled, we use the findparent_* functions
+ * on this object and access only the parent_ino field directly.
+ *
+ * When parent pointers are enabled, however, the pptr scanner uses the
+ * iscan, hooks, lock, and parent_ino fields of this object directly.
+ * @pscan.lock coordinates access to dir_entries, dir_names,
+ * parent_ino, subdirs, dirents, and args. This reduces the memory
+ * requirements of this structure.
+ */
+ struct xrep_parent_scan_info pscan;
+
+ /*
+ * Context information for attaching this directory to the lost+found
+ * if this directory does not have a parent.
+ */
+ struct xrep_adoption adoption;
+
+ /* How many subdirectories did we find? */
+ uint64_t subdirs;
+
+ /* How many dirents did we find? */
+ unsigned int dirents;
+
+ /* Should we move this directory to the orphanage? */
+ bool needs_adoption;
+
+ /* Directory entry name, plus the trailing null. */
+ struct xfs_name xname;
+ unsigned char namebuf[MAXNAMELEN];
+};
+
+/* Tear down all the incore stuff we created. */
+static void
+xrep_dir_teardown(
+ struct xfs_scrub *sc)
+{
+ struct xrep_dir *rd = sc->buf;
+
+ xrep_findparent_scan_teardown(&rd->pscan);
+ xfblob_destroy(rd->dir_names);
+ xfarray_destroy(rd->dir_entries);
+}
+
+/* Set up for a directory repair. */
+int
+xrep_setup_directory(
+ struct xfs_scrub *sc)
+{
+ struct xrep_dir *rd;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+ error = xrep_orphanage_try_create(sc);
+ if (error)
+ return error;
+
+ error = xrep_tempfile_create(sc, S_IFDIR);
+ if (error)
+ return error;
+
+ rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS);
+ if (!rd)
+ return -ENOMEM;
+ rd->sc = sc;
+ rd->xname.name = rd->namebuf;
+ sc->buf = rd;
+
+ return 0;
+}
+
+/*
+ * Look up the dotdot entry and confirm that it's really the parent.
+ * Returns NULLFSINO if we don't know what to do.
+ */
+static inline xfs_ino_t
+xrep_dir_lookup_parent(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ xfs_ino_t ino;
+ int error;
+
+ error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL);
+ if (error)
+ return NULLFSINO;
+ if (!xfs_verify_dir_ino(sc->mp, ino))
+ return NULLFSINO;
+
+ error = xrep_findparent_confirm(sc, &ino);
+ if (error)
+ return NULLFSINO;
+
+ return ino;
+}
+
+/*
+ * Look up '..' in the dentry cache and confirm that it's really the parent.
+ * Returns NULLFSINO if the dcache misses or if the hit is implausible.
+ */
+static inline xfs_ino_t
+xrep_dir_dcache_parent(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ xfs_ino_t parent_ino;
+ int error;
+
+ parent_ino = xrep_findparent_from_dcache(sc);
+ if (parent_ino == NULLFSINO)
+ return parent_ino;
+
+ error = xrep_findparent_confirm(sc, &parent_ino);
+ if (error)
+ return NULLFSINO;
+
+ return parent_ino;
+}
+
+/* Try to find the parent of the directory being repaired. */
+STATIC int
+xrep_dir_find_parent(
+ struct xrep_dir *rd)
+{
+ xfs_ino_t ino;
+
+ ino = xrep_findparent_self_reference(rd->sc);
+ if (ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rd->pscan, ino);
+ return 0;
+ }
+
+ ino = xrep_dir_dcache_parent(rd);
+ if (ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rd->pscan, ino);
+ return 0;
+ }
+
+ ino = xrep_dir_lookup_parent(rd);
+ if (ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rd->pscan, ino);
+ return 0;
+ }
+
+ /*
+ * A full filesystem scan is the last resort. On a busy filesystem,
+ * the scan can fail with -EBUSY if we cannot grab IOLOCKs. That means
+ * that we don't know what who the parent is, so we should return to
+ * userspace.
+ */
+ return xrep_findparent_scan(&rd->pscan);
+}
+
+/*
+ * Decide if we want to salvage this entry. We don't bother with oversized
+ * names or the dot entry.
+ */
+STATIC int
+xrep_dir_want_salvage(
+ struct xrep_dir *rd,
+ const char *name,
+ int namelen,
+ xfs_ino_t ino)
+{
+ struct xfs_mount *mp = rd->sc->mp;
+
+ /* No pointers to ourselves or to garbage. */
+ if (ino == rd->sc->ip->i_ino)
+ return false;
+ if (!xfs_verify_dir_ino(mp, ino))
+ return false;
+
+ /* No weird looking names or dot entries. */
+ if (namelen >= MAXNAMELEN || namelen <= 0)
+ return false;
+ if (namelen == 1 && name[0] == '.')
+ return false;
+ if (!xfs_dir2_namecheck(name, namelen))
+ return false;
+
+ return true;
+}
+
+/*
+ * Remember that we want to create a dirent in the tempdir. These stashed
+ * actions will be replayed later.
+ */
+STATIC int
+xrep_dir_stash_createname(
+ struct xrep_dir *rd,
+ const struct xfs_name *name,
+ xfs_ino_t ino)
+{
+ struct xrep_dirent dirent = {
+ .action = XREP_DIRENT_ADD,
+ .ino = ino,
+ .namelen = name->len,
+ .ftype = name->type,
+ };
+ int error;
+
+ trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino);
+
+ error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rd->dir_entries, &dirent);
+}
+
+/*
+ * Remember that we want to remove a dirent from the tempdir. These stashed
+ * actions will be replayed later.
+ */
+STATIC int
+xrep_dir_stash_removename(
+ struct xrep_dir *rd,
+ const struct xfs_name *name,
+ xfs_ino_t ino)
+{
+ struct xrep_dirent dirent = {
+ .action = XREP_DIRENT_REMOVE,
+ .ino = ino,
+ .namelen = name->len,
+ .ftype = name->type,
+ };
+ int error;
+
+ trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino);
+
+ error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rd->dir_entries, &dirent);
+}
+
+/* Allocate an in-core record to hold entries while we rebuild the dir data. */
+STATIC int
+xrep_dir_salvage_entry(
+ struct xrep_dir *rd,
+ unsigned char *name,
+ unsigned int namelen,
+ xfs_ino_t ino)
+{
+ struct xfs_name xname = {
+ .name = name,
+ };
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_inode *ip;
+ unsigned int i = 0;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /*
+ * Truncate the name to the first character that would trip namecheck.
+ * If we no longer have a name after that, ignore this entry.
+ */
+ while (i < namelen && name[i] != 0 && name[i] != '/')
+ i++;
+ if (i == 0)
+ return 0;
+ xname.len = i;
+
+ /* Ignore '..' entries; we already picked the new parent. */
+ if (xname.len == 2 && name[0] == '.' && name[1] == '.') {
+ trace_xrep_dir_salvaged_parent(sc->ip, ino);
+ return 0;
+ }
+
+ trace_xrep_dir_salvage_entry(sc->ip, &xname, ino);
+
+ /*
+ * Compute the ftype or dump the entry if we can't. We don't lock the
+ * inode because inodes can't change type while we have a reference.
+ */
+ error = xchk_iget(sc, ino, &ip);
+ if (error)
+ return 0;
+
+ xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
+ xchk_irele(sc, ip);
+
+ return xrep_dir_stash_createname(rd, &xname, ino);
+}
+
+/* Record a shortform directory entry for later reinsertion. */
+STATIC int
+xrep_dir_salvage_sf_entry(
+ struct xrep_dir *rd,
+ struct xfs_dir2_sf_hdr *sfp,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ xfs_ino_t ino;
+
+ ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep);
+ if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino))
+ return 0;
+
+ return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino);
+}
+
+/* Record a regular directory entry for later reinsertion. */
+STATIC int
+xrep_dir_salvage_data_entry(
+ struct xrep_dir *rd,
+ struct xfs_dir2_data_entry *dep)
+{
+ xfs_ino_t ino;
+
+ ino = be64_to_cpu(dep->inumber);
+ if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino))
+ return 0;
+
+ return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino);
+}
+
+/* Try to recover block/data format directory entries. */
+STATIC int
+xrep_dir_recover_data(
+ struct xrep_dir *rd,
+ struct xfs_buf *bp)
+{
+ struct xfs_da_geometry *geo = rd->sc->mp->m_dir_geo;
+ unsigned int offset;
+ unsigned int end;
+ int error = 0;
+
+ /*
+ * Loop over the data portion of the block.
+ * Each object is a real entry (dep) or an unused one (dup).
+ */
+ offset = geo->data_entry_offset;
+ end = min_t(unsigned int, BBTOB(bp->b_length),
+ xfs_dir3_data_end_offset(geo, bp->b_addr));
+
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
+
+ if (xchk_should_terminate(rd->sc, &error))
+ return error;
+
+ /* Skip unused entries. */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ offset += be16_to_cpu(dup->length);
+ continue;
+ }
+
+ /* Don't walk off the end of the block. */
+ offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen);
+ if (offset > end)
+ break;
+
+ /* Ok, let's save this entry. */
+ error = xrep_dir_salvage_data_entry(rd, dep);
+ if (error)
+ return error;
+
+ }
+
+ return 0;
+}
+
+/* Try to recover shortform directory entries. */
+STATIC int
+xrep_dir_recover_sf(
+ struct xrep_dir *rd)
+{
+ struct xfs_dir2_sf_hdr *hdr;
+ struct xfs_dir2_sf_entry *sfep;
+ struct xfs_dir2_sf_entry *next;
+ struct xfs_ifork *ifp;
+ xfs_ino_t ino;
+ unsigned char *end;
+ int error = 0;
+
+ ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK);
+ hdr = ifp->if_data;
+ end = (unsigned char *)ifp->if_data + ifp->if_bytes;
+
+ ino = xfs_dir2_sf_get_parent_ino(hdr);
+ trace_xrep_dir_salvaged_parent(rd->sc->ip, ino);
+
+ sfep = xfs_dir2_sf_firstentry(hdr);
+ while ((unsigned char *)sfep < end) {
+ if (xchk_should_terminate(rd->sc, &error))
+ return error;
+
+ next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep);
+ if ((unsigned char *)next > end)
+ break;
+
+ /* Ok, let's save this entry. */
+ error = xrep_dir_salvage_sf_entry(rd, hdr, sfep);
+ if (error)
+ return error;
+
+ sfep = next;
+ }
+
+ return 0;
+}
+
+/*
+ * Try to figure out the format of this directory from the data fork mappings
+ * and the directory size. If we can be reasonably sure of format, we can be
+ * more aggressive in salvaging directory entries. On return, @magic_guess
+ * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format"
+ * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory,
+ * and 0 if we can't tell.
+ */
+STATIC void
+xrep_dir_guess_format(
+ struct xrep_dir *rd,
+ __be32 *magic_guess)
+{
+ struct xfs_inode *dp = rd->sc->ip;
+ struct xfs_mount *mp = rd->sc->mp;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ xfs_fileoff_t last;
+ int error;
+
+ ASSERT(xfs_has_crc(mp));
+
+ *magic_guess = 0;
+
+ /*
+ * If there's a single directory block and the directory size is
+ * exactly one block, this has to be a single block format directory.
+ */
+ error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK);
+ if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize &&
+ dp->i_disk_size == geo->blksize) {
+ *magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+ return;
+ }
+
+ /*
+ * If the last extent before the leaf offset matches the directory
+ * size and the directory size is larger than 1 block, this is a
+ * data format directory.
+ */
+ last = geo->leafblk;
+ error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK);
+ if (!error &&
+ XFS_FSB_TO_B(mp, last) > geo->blksize &&
+ XFS_FSB_TO_B(mp, last) == dp->i_disk_size) {
+ *magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+ return;
+ }
+}
+
+/* Recover directory entries from a specific directory block. */
+STATIC int
+xrep_dir_recover_dirblock(
+ struct xrep_dir *rd,
+ __be32 magic_guess,
+ xfs_dablk_t dabno)
+{
+ struct xfs_dir2_data_hdr *hdr;
+ struct xfs_buf *bp;
+ __be32 oldmagic;
+ int error;
+
+ /*
+ * Try to read buffer. We invalidate them in the next step so we don't
+ * bother to set a buffer type or ops.
+ */
+ error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno,
+ XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL);
+ if (error || !bp)
+ return error;
+
+ hdr = bp->b_addr;
+ oldmagic = hdr->magic;
+
+ trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno,
+ be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess));
+
+ /*
+ * If we're sure of the block's format, proceed with the salvage
+ * operation using the specified magic number.
+ */
+ if (magic_guess) {
+ hdr->magic = magic_guess;
+ goto recover;
+ }
+
+ /*
+ * If we couldn't guess what type of directory this is, then we will
+ * only salvage entries from directory blocks that match the magic
+ * number and pass verifiers.
+ */
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+ if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops))
+ goto out;
+ if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL)
+ goto out;
+ break;
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+ if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops))
+ goto out;
+ if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL)
+ goto out;
+ break;
+ default:
+ goto out;
+ }
+
+recover:
+ error = xrep_dir_recover_data(rd, bp);
+
+out:
+ hdr->magic = oldmagic;
+ xfs_trans_brelse(rd->sc->tp, bp);
+ return error;
+}
+
+static inline void
+xrep_dir_init_args(
+ struct xrep_dir *rd,
+ struct xfs_inode *dp,
+ const struct xfs_name *name)
+{
+ memset(&rd->args, 0, sizeof(struct xfs_da_args));
+ rd->args.geo = rd->sc->mp->m_dir_geo;
+ rd->args.whichfork = XFS_DATA_FORK;
+ rd->args.owner = rd->sc->ip->i_ino;
+ rd->args.trans = rd->sc->tp;
+ rd->args.dp = dp;
+ if (!name)
+ return;
+ rd->args.name = name->name;
+ rd->args.namelen = name->len;
+ rd->args.filetype = name->type;
+ rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name);
+}
+
+/* Replay a stashed createname into the temporary directory. */
+STATIC int
+xrep_dir_replay_createname(
+ struct xrep_dir *rd,
+ const struct xfs_name *name,
+ xfs_ino_t inum,
+ xfs_extlen_t total)
+{
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_inode *dp = rd->sc->tempip;
+ int error;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+ error = xfs_dir_ino_validate(sc->mp, inum);
+ if (error)
+ return error;
+
+ trace_xrep_dir_replay_createname(dp, name, inum);
+
+ xrep_dir_init_args(rd, dp, name);
+ rd->args.inumber = inum;
+ rd->args.total = total;
+ rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+ return xfs_dir_createname_args(&rd->args);
+}
+
+/* Replay a stashed removename onto the temporary directory. */
+STATIC int
+xrep_dir_replay_removename(
+ struct xrep_dir *rd,
+ const struct xfs_name *name,
+ xfs_extlen_t total)
+{
+ struct xfs_inode *dp = rd->args.dp;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+ xrep_dir_init_args(rd, dp, name);
+ rd->args.op_flags = 0;
+ rd->args.total = total;
+
+ trace_xrep_dir_replay_removename(dp, name, 0);
+ return xfs_dir_removename_args(&rd->args);
+}
+
+/*
+ * Add this stashed incore directory entry to the temporary directory.
+ * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and
+ * must not be in transaction context.
+ */
+STATIC int
+xrep_dir_replay_update(
+ struct xrep_dir *rd,
+ const struct xfs_name *xname,
+ const struct xrep_dirent *dirent)
+{
+ struct xfs_mount *mp = rd->sc->mp;
+#ifdef DEBUG
+ xfs_ino_t ino;
+#endif
+ uint resblks;
+ int error;
+
+ resblks = xfs_link_space_res(mp, xname->len);
+ error = xchk_trans_alloc(rd->sc, resblks);
+ if (error)
+ return error;
+
+ /* Lock the temporary directory and join it to the transaction */
+ xrep_tempfile_ilock(rd->sc);
+ xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0);
+
+ switch (dirent->action) {
+ case XREP_DIRENT_ADD:
+ /*
+ * Create a replacement dirent in the temporary directory.
+ * Note that _createname doesn't check for existing entries.
+ * There shouldn't be any in the temporary dir, but we'll
+ * verify this in debug mode.
+ */
+#ifdef DEBUG
+ error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
+ if (error != -ENOENT) {
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+#endif
+
+ error = xrep_dir_replay_createname(rd, xname, dirent->ino,
+ resblks);
+ if (error)
+ goto out_cancel;
+
+ if (xname->type == XFS_DIR3_FT_DIR)
+ rd->subdirs++;
+ rd->dirents++;
+ break;
+ case XREP_DIRENT_REMOVE:
+ /*
+ * Remove a dirent from the temporary directory. Note that
+ * _removename doesn't check the inode target of the exist
+ * entry. There should be a perfect match in the temporary
+ * dir, but we'll verify this in debug mode.
+ */
+#ifdef DEBUG
+ error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
+ if (error) {
+ ASSERT(error != 0);
+ goto out_cancel;
+ }
+ if (ino != dirent->ino) {
+ ASSERT(ino == dirent->ino);
+ error = -EIO;
+ goto out_cancel;
+ }
+#endif
+
+ error = xrep_dir_replay_removename(rd, xname, resblks);
+ if (error)
+ goto out_cancel;
+
+ if (xname->type == XFS_DIR3_FT_DIR)
+ rd->subdirs--;
+ rd->dirents--;
+ break;
+ default:
+ ASSERT(0);
+ error = -EIO;
+ goto out_cancel;
+ }
+
+ /* Commit and unlock. */
+ error = xrep_trans_commit(rd->sc);
+ if (error)
+ return error;
+
+ xrep_tempfile_iunlock(rd->sc);
+ return 0;
+out_cancel:
+ xchk_trans_cancel(rd->sc);
+ xrep_tempfile_iunlock(rd->sc);
+ return error;
+}
+
+/*
+ * Flush stashed incore dirent updates that have been recorded by the scanner.
+ * This is done to reduce the memory requirements of the directory rebuild,
+ * since directories can contain up to 32GB of directory data.
+ *
+ * Caller must not hold transactions or ILOCKs. Caller must hold the tempdir
+ * IOLOCK.
+ */
+STATIC int
+xrep_dir_replay_updates(
+ struct xrep_dir *rd)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ /* Add all the salvaged dirents to the temporary directory. */
+ mutex_lock(&rd->pscan.lock);
+ foreach_xfarray_idx(rd->dir_entries, array_cur) {
+ struct xrep_dirent dirent;
+
+ error = xfarray_load(rd->dir_entries, array_cur, &dirent);
+ if (error)
+ goto out_unlock;
+
+ error = xfblob_loadname(rd->dir_names, dirent.name_cookie,
+ &rd->xname, dirent.namelen);
+ if (error)
+ goto out_unlock;
+ rd->xname.type = dirent.ftype;
+ mutex_unlock(&rd->pscan.lock);
+
+ error = xrep_dir_replay_update(rd, &rd->xname, &dirent);
+ if (error)
+ return error;
+ mutex_lock(&rd->pscan.lock);
+ }
+
+ /* Empty out both arrays now that we've added the entries. */
+ xfarray_truncate(rd->dir_entries);
+ xfblob_truncate(rd->dir_names);
+ mutex_unlock(&rd->pscan.lock);
+ return 0;
+out_unlock:
+ mutex_unlock(&rd->pscan.lock);
+ return error;
+}
+
+/*
+ * Periodically flush stashed directory entries to the temporary dir. This
+ * is done to reduce the memory requirements of the directory rebuild, since
+ * directories can contain up to 32GB of directory data.
+ */
+STATIC int
+xrep_dir_flush_stashed(
+ struct xrep_dir *rd)
+{
+ int error;
+
+ /*
+ * Entering this function, the scrub context has a reference to the
+ * inode being repaired, the temporary file, and a scrub transaction
+ * that we use during dirent salvaging to avoid livelocking if there
+ * are cycles in the directory structures. We hold ILOCK_EXCL on both
+ * the inode being repaired and the temporary file, though they are
+ * not ijoined to the scrub transaction.
+ *
+ * To constrain kernel memory use, we occasionally write salvaged
+ * dirents from the xfarray and xfblob structures into the temporary
+ * directory in preparation for exchanging the directory structures at
+ * the end. Updating the temporary file requires a transaction, so we
+ * commit the scrub transaction and drop the two ILOCKs so that
+ * we can allocate whatever transaction we want.
+ *
+ * We still hold IOLOCK_EXCL on the inode being repaired, which
+ * prevents anyone from accessing the damaged directory data while we
+ * repair it.
+ */
+ error = xrep_trans_commit(rd->sc);
+ if (error)
+ return error;
+ xchk_iunlock(rd->sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the IOLOCK of the temporary file while we modify dirents. This
+ * isn't strictly required because the temporary file is never revealed
+ * to userspace, but we follow the same locking rules. We still hold
+ * sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rd->sc);
+ if (error)
+ return error;
+
+ /* Write to the tempdir all the updates that we've stashed. */
+ error = xrep_dir_replay_updates(rd);
+ xrep_tempfile_iounlock(rd->sc);
+ if (error)
+ return error;
+
+ /*
+ * Recreate the salvage transaction and relock the dir we're salvaging.
+ */
+ error = xchk_trans_alloc(rd->sc, 0);
+ if (error)
+ return error;
+ xchk_ilock(rd->sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/* Decide if we've stashed too much dirent data in memory. */
+static inline bool
+xrep_dir_want_flush_stashed(
+ struct xrep_dir *rd)
+{
+ unsigned long long bytes;
+
+ bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names);
+ return bytes > XREP_DIR_MAX_STASH_BYTES;
+}
+
+/* Extract as many directory entries as we can. */
+STATIC int
+xrep_dir_recover(
+ struct xrep_dir *rd)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_da_geometry *geo = sc->mp->m_dir_geo;
+ xfs_fileoff_t offset;
+ xfs_dablk_t dabno;
+ __be32 magic_guess;
+ int nmap;
+ int error;
+
+ xrep_dir_guess_format(rd, &magic_guess);
+
+ /* Iterate each directory data block in the data fork. */
+ for (offset = 0;
+ offset < geo->leafblk;
+ offset = got.br_startoff + got.br_blockcount) {
+ nmap = 1;
+ error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset,
+ &got, &nmap, 0);
+ if (error)
+ return error;
+ if (nmap != 1)
+ return -EFSCORRUPTED;
+ if (!xfs_bmap_is_written_extent(&got))
+ continue;
+
+ for (dabno = round_up(got.br_startoff, geo->fsbcount);
+ dabno < got.br_startoff + got.br_blockcount;
+ dabno += geo->fsbcount) {
+ if (xchk_should_terminate(rd->sc, &error))
+ return error;
+
+ error = xrep_dir_recover_dirblock(rd,
+ magic_guess, dabno);
+ if (error)
+ return error;
+
+ /* Flush dirents to constrain memory usage. */
+ if (xrep_dir_want_flush_stashed(rd)) {
+ error = xrep_dir_flush_stashed(rd);
+ if (error)
+ return error;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Find all the directory entries for this inode by scraping them out of the
+ * directory leaf blocks by hand, and flushing them into the temp dir.
+ */
+STATIC int
+xrep_dir_find_entries(
+ struct xrep_dir *rd)
+{
+ struct xfs_inode *dp = rd->sc->ip;
+ int error;
+
+ /*
+ * Salvage directory entries from the old directory, and write them to
+ * the temporary directory.
+ */
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ error = xrep_dir_recover_sf(rd);
+ } else {
+ error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ error = xrep_dir_recover(rd);
+ }
+ if (error)
+ return error;
+
+ return xrep_dir_flush_stashed(rd);
+}
+
+/* Scan all files in the filesystem for dirents. */
+STATIC int
+xrep_dir_salvage_entries(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ int error;
+
+ /*
+ * Drop the ILOCK on this directory so that we can scan for this
+ * directory's parent. Figure out who is going to be the parent of
+ * this directory, then retake the ILOCK so that we can salvage
+ * directory entries.
+ */
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ error = xrep_dir_find_parent(rd);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ /*
+ * Collect directory entries by parsing raw leaf blocks to salvage
+ * whatever we can. When we're done, free the staging memory before
+ * exchanging the directories to reduce memory usage.
+ */
+ error = xrep_dir_find_entries(rd);
+ if (error)
+ return error;
+
+ /*
+ * Cancel the repair transaction and drop the ILOCK so that we can
+ * (later) use the atomic mapping exchange functions to compute the
+ * correct block reservations and re-lock the inodes.
+ *
+ * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory
+ * modifications, but there's nothing to prevent userspace from reading
+ * the directory until we're ready for the exchange operation. Reads
+ * will return -EIO without shutting down the fs, so we're ok with
+ * that.
+ *
+ * The VFS can change dotdot on us, but the findparent scan will keep
+ * our incore parent inode up to date. See the note on locking issues
+ * for more details.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+
+/*
+ * Examine a parent pointer of a file. If it leads us back to the directory
+ * that we're rebuilding, create an incore dirent from the parent pointer and
+ * stash it.
+ */
+STATIC int
+xrep_dir_scan_pptr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xfs_name xname = {
+ .name = name,
+ .len = namelen,
+ .type = xfs_mode_to_ftype(VFS_I(ip)->i_mode),
+ };
+ xfs_ino_t parent_ino;
+ uint32_t parent_gen;
+ struct xrep_dir *rd = priv;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ /*
+ * Ignore parent pointers that point back to a different dir, list the
+ * wrong generation number, or are invalid.
+ */
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, &parent_ino, &parent_gen);
+ if (error)
+ return error;
+
+ if (parent_ino != sc->ip->i_ino ||
+ parent_gen != VFS_I(sc->ip)->i_generation)
+ return 0;
+
+ mutex_lock(&rd->pscan.lock);
+ error = xrep_dir_stash_createname(rd, &xname, ip->i_ino);
+ mutex_unlock(&rd->pscan.lock);
+ return error;
+}
+
+/*
+ * If this child dirent points to the directory being repaired, remember that
+ * fact so that we can reset the dotdot entry if necessary.
+ */
+STATIC int
+xrep_dir_scan_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xrep_dir *rd = priv;
+
+ /* Dirent doesn't point to this directory. */
+ if (ino != rd->sc->ip->i_ino)
+ return 0;
+
+ /* Ignore garbage inum. */
+ if (!xfs_verify_dir_ino(rd->sc->mp, ino))
+ return 0;
+
+ /* No weird looking names. */
+ if (name->len >= MAXNAMELEN || name->len <= 0)
+ return 0;
+
+ /* Don't pick up dot or dotdot entries; we only want child dirents. */
+ if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
+ xfs_dir2_samename(name, &xfs_name_dot))
+ return 0;
+
+ trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot,
+ dp->i_ino);
+
+ xrep_findparent_scan_found(&rd->pscan, dp->i_ino);
+ return 0;
+}
+
+/*
+ * Decide if we want to look for child dirents or parent pointers in this file.
+ * Skip the dir being repaired and any files being used to stage repairs.
+ */
+static inline bool
+xrep_dir_want_scan(
+ struct xrep_dir *rd,
+ const struct xfs_inode *ip)
+{
+ return ip != rd->sc->ip && !xrep_is_tempfile(ip);
+}
+
+/*
+ * Take ILOCK on a file that we want to scan.
+ *
+ * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or
+ * has an unloaded attr bmbt. Otherwise, take ILOCK_SHARED.
+ */
+static inline unsigned int
+xrep_dir_scan_ilock(
+ struct xrep_dir *rd,
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ /* Need to take the shared ILOCK to advance the iscan cursor. */
+ if (!xrep_dir_want_scan(rd, ip))
+ goto lock;
+
+ if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) {
+ lock_mode = XFS_ILOCK_EXCL;
+ goto lock;
+ }
+
+ if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
+ lock_mode = XFS_ILOCK_EXCL;
+
+lock:
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
+
+/*
+ * Scan this file for relevant child dirents or parent pointers that point to
+ * the directory we're rebuilding.
+ */
+STATIC int
+xrep_dir_scan_file(
+ struct xrep_dir *rd,
+ struct xfs_inode *ip)
+{
+ unsigned int lock_mode;
+ int error = 0;
+
+ lock_mode = xrep_dir_scan_ilock(rd, ip);
+
+ if (!xrep_dir_want_scan(rd, ip))
+ goto scan_done;
+
+ /*
+ * If the extended attributes look as though they has been zapped by
+ * the inode record repair code, we cannot scan for parent pointers.
+ */
+ if (xchk_pptr_looks_zapped(ip)) {
+ error = -EBUSY;
+ goto scan_done;
+ }
+
+ error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd);
+ if (error)
+ goto scan_done;
+
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ /*
+ * If the directory looks as though it has been zapped by the
+ * inode record repair code, we cannot scan for child dirents.
+ */
+ if (xchk_dir_looks_zapped(ip)) {
+ error = -EBUSY;
+ goto scan_done;
+ }
+
+ error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd);
+ if (error)
+ goto scan_done;
+ }
+
+scan_done:
+ xchk_iscan_mark_visited(&rd->pscan.iscan, ip);
+ xfs_iunlock(ip, lock_mode);
+ return error;
+}
+
+/*
+ * Scan all files in the filesystem for parent pointers that we can turn into
+ * replacement dirents, and a dirent that we can use to set the dotdot pointer.
+ */
+STATIC int
+xrep_dir_scan_dirtree(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_inode *ip;
+ int error;
+
+ /* Roots of directory trees are their own parents. */
+ if (sc->ip == sc->mp->m_rootip)
+ xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino);
+
+ /*
+ * Filesystem scans are time consuming. Drop the directory ILOCK and
+ * all other resources for the duration of the scan and hope for the
+ * best. The live update hooks will keep our scan information up to
+ * date even though we've dropped the locks.
+ */
+ xchk_trans_cancel(sc);
+ if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
+ xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
+ XFS_ILOCK_EXCL));
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) {
+ bool flush;
+
+ error = xrep_dir_scan_file(rd, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ /* Flush stashed dirent updates to constrain memory usage. */
+ mutex_lock(&rd->pscan.lock);
+ flush = xrep_dir_want_flush_stashed(rd);
+ mutex_unlock(&rd->pscan.lock);
+ if (flush) {
+ xchk_trans_cancel(sc);
+
+ error = xrep_tempfile_iolock_polled(sc);
+ if (error)
+ break;
+
+ error = xrep_dir_replay_updates(rd);
+ xrep_tempfile_iounlock(sc);
+ if (error)
+ break;
+
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ break;
+ }
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&rd->pscan.iscan);
+ if (error) {
+ /*
+ * If we couldn't grab an inode that was busy with a state
+ * change, change the error code so that we exit to userspace
+ * as quickly as possible.
+ */
+ if (error == -EBUSY)
+ return -ECANCELED;
+ return error;
+ }
+
+ /*
+ * Cancel the empty transaction so that we can (later) use the atomic
+ * file mapping exchange functions to lock files and commit the new
+ * directory.
+ */
+ xchk_trans_cancel(rd->sc);
+ return 0;
+}
+
+/*
+ * Capture dirent updates being made by other threads which are relevant to the
+ * directory being repaired.
+ */
+STATIC int
+xrep_dir_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xrep_dir *rd;
+ struct xfs_scrub *sc;
+ int error = 0;
+
+ rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb);
+ sc = rd->sc;
+
+ /*
+ * This thread updated a child dirent in the directory that we're
+ * rebuilding. Stash the update for replay against the temporary
+ * directory.
+ */
+ if (p->dp->i_ino == sc->ip->i_ino &&
+ xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) {
+ mutex_lock(&rd->pscan.lock);
+ if (p->delta > 0)
+ error = xrep_dir_stash_createname(rd, p->name,
+ p->ip->i_ino);
+ else
+ error = xrep_dir_stash_removename(rd, p->name,
+ p->ip->i_ino);
+ mutex_unlock(&rd->pscan.lock);
+ if (error)
+ goto out_abort;
+ }
+
+ /*
+ * This thread updated another directory's child dirent that points to
+ * the directory that we're rebuilding, so remember the new dotdot
+ * target.
+ */
+ if (p->ip->i_ino == sc->ip->i_ino &&
+ xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) {
+ if (p->delta > 0) {
+ trace_xrep_dir_stash_createname(sc->tempip,
+ &xfs_name_dotdot,
+ p->dp->i_ino);
+
+ xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino);
+ } else {
+ trace_xrep_dir_stash_removename(sc->tempip,
+ &xfs_name_dotdot,
+ rd->pscan.parent_ino);
+
+ xrep_findparent_scan_found(&rd->pscan, NULLFSINO);
+ }
+ }
+
+ return NOTIFY_DONE;
+out_abort:
+ xchk_iscan_abort(&rd->pscan.iscan);
+ return NOTIFY_DONE;
+}
+
+/*
+ * Free all the directory blocks and reset the data fork. The caller must
+ * join the inode to the transaction. This function returns with the inode
+ * joined to a clean scrub transaction.
+ */
+STATIC int
+xrep_dir_reset_fork(
+ struct xrep_dir *rd,
+ xfs_ino_t parent_ino)
+{
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK);
+ int error;
+
+ /* Unmap all the directory buffers. */
+ if (xfs_ifork_has_extents(ifp)) {
+ error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
+ if (error)
+ return error;
+ }
+
+ trace_xrep_dir_reset_fork(sc->tempip, parent_ino);
+
+ /* Reset the data fork to an empty data fork. */
+ xfs_idestroy_fork(ifp);
+ ifp->if_bytes = 0;
+ sc->tempip->i_disk_size = 0;
+
+ /* Reinitialize the short form directory. */
+ xrep_dir_init_args(rd, sc->tempip, NULL);
+ return xfs_dir2_sf_create(&rd->args, parent_ino);
+}
+
+/*
+ * Prepare both inodes' directory forks for exchanging mappings. Promote the
+ * tempfile from short format to leaf format, and if the file being repaired
+ * has a short format data fork, turn it into an empty extent list.
+ */
+STATIC int
+xrep_dir_swap_prep(
+ struct xfs_scrub *sc,
+ bool temp_local,
+ bool ip_local)
+{
+ int error;
+
+ /*
+ * If the tempfile's directory is in shortform format, convert that to
+ * a single leaf extent so that we can use the atomic mapping exchange.
+ */
+ if (temp_local) {
+ struct xfs_da_args args = {
+ .dp = sc->tempip,
+ .geo = sc->mp->m_dir_geo,
+ .whichfork = XFS_DATA_FORK,
+ .trans = sc->tp,
+ .total = 1,
+ .owner = sc->ip->i_ino,
+ };
+
+ error = xfs_dir2_sf_to_block(&args);
+ if (error)
+ return error;
+
+ /*
+ * Roll the deferred log items to get us back to a clean
+ * transaction.
+ */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If the file being repaired had a shortform data fork, convert that
+ * to an empty extent list in preparation for the atomic mapping
+ * exchange.
+ */
+ if (ip_local) {
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ xfs_idestroy_fork(ifp);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ ifp->if_nextents = 0;
+ ifp->if_bytes = 0;
+ ifp->if_data = NULL;
+ ifp->if_height = 0;
+
+ xfs_trans_log_inode(sc->tp, sc->ip,
+ XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ }
+
+ return 0;
+}
+
+/*
+ * Replace the inode number of a directory entry.
+ */
+static int
+xrep_dir_replace(
+ struct xrep_dir *rd,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ xfs_ino_t inum,
+ xfs_extlen_t total)
+{
+ struct xfs_scrub *sc = rd->sc;
+ int error;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+ error = xfs_dir_ino_validate(sc->mp, inum);
+ if (error)
+ return error;
+
+ xrep_dir_init_args(rd, dp, name);
+ rd->args.inumber = inum;
+ rd->args.total = total;
+ return xfs_dir_replace_args(&rd->args);
+}
+
+/*
+ * Reset the link count of this directory and adjust the unlinked list pointers
+ * as needed.
+ */
+STATIC int
+xrep_dir_set_nlink(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_inode *dp = sc->ip;
+ struct xfs_perag *pag;
+ unsigned int new_nlink = min_t(unsigned long long,
+ rd->subdirs + 2,
+ XFS_NLINK_PINNED);
+ int error;
+
+ /*
+ * The directory is not on the incore unlinked list, which means that
+ * it needs to be reachable via the directory tree. Update the nlink
+ * with our observed link count. If the directory has no parent, it
+ * will be moved to the orphanage.
+ */
+ if (!xfs_inode_on_unlinked_list(dp))
+ goto reset_nlink;
+
+ /*
+ * The directory is on the unlinked list and we did not find any
+ * dirents. Set the link count to zero and let the directory
+ * inactivate when the last reference drops.
+ */
+ if (rd->dirents == 0) {
+ rd->needs_adoption = false;
+ new_nlink = 0;
+ goto reset_nlink;
+ }
+
+ /*
+ * The directory is on the unlinked list and we found dirents. This
+ * directory needs to be reachable via the directory tree. Remove the
+ * dir from the unlinked list and update nlink with the observed link
+ * count. If the directory has no parent, it will be moved to the
+ * orphanage.
+ */
+ pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino));
+ if (!pag) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = xfs_iunlink_remove(sc->tp, pag, dp);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+reset_nlink:
+ if (VFS_I(dp)->i_nlink != new_nlink)
+ set_nlink(VFS_I(dp), new_nlink);
+ return 0;
+}
+
+/*
+ * Finish replaying stashed dirent updates, allocate a transaction for
+ * exchanging data fork mappings, and take the ILOCKs of both directories
+ * before we commit the new directory structure.
+ */
+STATIC int
+xrep_dir_finalize_tempdir(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ int error;
+
+ if (!xfs_has_parent(sc->mp))
+ return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
+
+ /*
+ * Repair relies on the ILOCK to quiesce all possible dirent updates.
+ * Replay all queued dirent updates into the tempdir before exchanging
+ * the contents, even if that means dropping the ILOCKs and the
+ * transaction.
+ */
+ do {
+ error = xrep_dir_replay_updates(rd);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
+ if (error)
+ return error;
+
+ if (xfarray_length(rd->dir_entries) == 0)
+ break;
+
+ xchk_trans_cancel(sc);
+ xrep_tempfile_iunlock_both(sc);
+ } while (!xchk_should_terminate(sc, &error));
+ return error;
+}
+
+/* Exchange the temporary directory's data fork with the one being repaired. */
+STATIC int
+xrep_dir_swap(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ bool ip_local, temp_local;
+ int error = 0;
+
+ /*
+ * If we never found the parent for this directory, temporarily assign
+ * the root dir as the parent; we'll move this to the orphanage after
+ * exchanging the dir contents. We hold the ILOCK of the dir being
+ * repaired, so we're not worried about racy updates of dotdot.
+ */
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+ if (rd->pscan.parent_ino == NULLFSINO) {
+ rd->needs_adoption = true;
+ rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino;
+ }
+
+ /*
+ * Reset the temporary directory's '..' entry to point to the parent
+ * that we found. The temporary directory was created with the root
+ * directory as the parent, so we can skip this if repairing a
+ * subdirectory of the root.
+ *
+ * It's also possible that this replacement could also expand a sf
+ * tempdir into block format.
+ */
+ if (rd->pscan.parent_ino != sc->mp->m_rootip->i_ino) {
+ error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot,
+ rd->pscan.parent_ino, rd->tx.req.resblks);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Changing the dot and dotdot entries could have changed the shape of
+ * the directory, so we recompute these.
+ */
+ ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+ temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+
+ /*
+ * If the both files have a local format data fork and the rebuilt
+ * directory data would fit in the repaired file's data fork, copy
+ * the contents from the tempfile and update the directory link count.
+ * We're done now.
+ */
+ if (ip_local && temp_local &&
+ sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
+ xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
+ return xrep_dir_set_nlink(rd);
+ }
+
+ /*
+ * Clean the transaction before we start working on exchanging
+ * directory contents.
+ */
+ error = xrep_tempfile_roll_trans(rd->sc);
+ if (error)
+ return error;
+
+ /* Otherwise, make sure both data forks are in block-mapping mode. */
+ error = xrep_dir_swap_prep(sc, temp_local, ip_local);
+ if (error)
+ return error;
+
+ /*
+ * Set nlink of the directory in the same transaction sequence that
+ * (atomically) commits the new directory data.
+ */
+ error = xrep_dir_set_nlink(rd);
+ if (error)
+ return error;
+
+ return xrep_tempexch_contents(sc, &rd->tx);
+}
+
+/*
+ * Exchange the new directory contents (which we created in the tempfile) with
+ * the directory being repaired.
+ */
+STATIC int
+xrep_dir_rebuild_tree(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ int error;
+
+ trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino);
+
+ /*
+ * Take the IOLOCK on the temporary file so that we can run dir
+ * operations with the same locks held as we would for a normal file.
+ * We still hold sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rd->sc);
+ if (error)
+ return error;
+
+ /*
+ * Allocate transaction, lock inodes, and make sure that we've replayed
+ * all the stashed dirent updates to the tempdir. After this point,
+ * we're ready to exchange data fork mappings.
+ */
+ error = xrep_dir_finalize_tempdir(rd);
+ if (error)
+ return error;
+
+ if (xchk_iscan_aborted(&rd->pscan.iscan))
+ return -ECANCELED;
+
+ /*
+ * Exchange the tempdir's data fork with the file being repaired. This
+ * recreates the transaction and re-takes the ILOCK in the scrub
+ * context.
+ */
+ error = xrep_dir_swap(rd);
+ if (error)
+ return error;
+
+ /*
+ * Release the old directory blocks and reset the data fork of the temp
+ * directory to an empty shortform directory because inactivation does
+ * nothing for directories.
+ */
+ error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino);
+ if (error)
+ return error;
+
+ /*
+ * Roll to get a transaction without any inodes joined to it. Then we
+ * can drop the tempfile's ILOCK and IOLOCK before doing more work on
+ * the scrub target directory.
+ */
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+
+ xrep_tempfile_iunlock(sc);
+ xrep_tempfile_iounlock(sc);
+ return 0;
+}
+
+/* Set up the filesystem scan so we can regenerate directory entries. */
+STATIC int
+xrep_dir_setup_scan(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ char *descr;
+ int error;
+
+ /* Set up some staging memory for salvaging dirents. */
+ descr = xchk_xfile_ino_descr(sc, "directory entries");
+ error = xfarray_create(descr, 0, sizeof(struct xrep_dirent),
+ &rd->dir_entries);
+ kfree(descr);
+ if (error)
+ return error;
+
+ descr = xchk_xfile_ino_descr(sc, "directory entry names");
+ error = xfblob_create(descr, &rd->dir_names);
+ kfree(descr);
+ if (error)
+ goto out_xfarray;
+
+ if (xfs_has_parent(sc->mp))
+ error = __xrep_findparent_scan_start(sc, &rd->pscan,
+ xrep_dir_live_update);
+ else
+ error = xrep_findparent_scan_start(sc, &rd->pscan);
+ if (error)
+ goto out_xfblob;
+
+ return 0;
+
+out_xfblob:
+ xfblob_destroy(rd->dir_names);
+ rd->dir_names = NULL;
+out_xfarray:
+ xfarray_destroy(rd->dir_entries);
+ rd->dir_entries = NULL;
+ return error;
+}
+
+/*
+ * Move the current file to the orphanage.
+ *
+ * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon
+ * successful return, the scrub transaction will have enough extra reservation
+ * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the
+ * orphanage; and both inodes will be ijoined.
+ */
+STATIC int
+xrep_dir_move_to_orphanage(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ xfs_ino_t orig_parent, new_parent;
+ int error;
+
+ /*
+ * We are about to drop the ILOCK on sc->ip to lock the orphanage and
+ * prepare for the adoption. Therefore, look up the old dotdot entry
+ * for sc->ip so that we can compare it after we re-lock sc->ip.
+ */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent);
+ if (error)
+ return error;
+
+ /*
+ * Drop the ILOCK on the scrub target and commit the transaction.
+ * Adoption computes its own resource requirements and gathers the
+ * necessary components.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /* If we can take the orphanage's iolock then we're ready to move. */
+ if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
+ xchk_iunlock(sc, sc->ilock_flags);
+ error = xrep_orphanage_iolock_two(sc);
+ if (error)
+ return error;
+ }
+
+ /* Grab transaction and ILOCK the two files. */
+ error = xrep_adoption_trans_alloc(sc, &rd->adoption);
+ if (error)
+ return error;
+
+ error = xrep_adoption_compute_name(&rd->adoption, &rd->xname);
+ if (error)
+ return error;
+
+ /*
+ * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
+ * entry again. If the parent changed or the child was unlinked while
+ * the child directory was unlocked, we don't need to move the child to
+ * the orphanage after all.
+ */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent);
+ if (error)
+ return error;
+
+ /*
+ * Attach to the orphanage if we still have a linked directory and it
+ * hasn't been moved.
+ */
+ if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) {
+ error = xrep_adoption_move(&rd->adoption);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Launder the scrub transaction so we can drop the orphanage ILOCK
+ * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK.
+ */
+ error = xrep_adoption_trans_roll(&rd->adoption);
+ if (error)
+ return error;
+
+ xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Repair the directory metadata.
+ *
+ * XXX: Directory entry buffers can be multiple fsblocks in size. The buffer
+ * cache in XFS can't handle aliased multiblock buffers, so this might
+ * misbehave if the directory blocks are crosslinked with other filesystem
+ * metadata.
+ *
+ * XXX: Is it necessary to check the dcache for this directory to make sure
+ * that we always recreate every cached entry?
+ */
+int
+xrep_directory(
+ struct xfs_scrub *sc)
+{
+ struct xrep_dir *rd = sc->buf;
+ int error;
+
+ /* The rmapbt is required to reap the old data fork. */
+ if (!xfs_has_rmapbt(sc->mp))
+ return -EOPNOTSUPP;
+ /* We require atomic file exchange range to rebuild anything. */
+ if (!xfs_has_exchange_range(sc->mp))
+ return -EOPNOTSUPP;
+
+ error = xrep_dir_setup_scan(rd);
+ if (error)
+ return error;
+
+ if (xfs_has_parent(sc->mp))
+ error = xrep_dir_scan_dirtree(rd);
+ else
+ error = xrep_dir_salvage_entries(rd);
+ if (error)
+ goto out_teardown;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto out_teardown;
+
+ error = xrep_dir_rebuild_tree(rd);
+ if (error)
+ goto out_teardown;
+
+ if (rd->needs_adoption) {
+ if (!xrep_orphanage_can_adopt(rd->sc))
+ error = -EFSCORRUPTED;
+ else
+ error = xrep_dir_move_to_orphanage(rd);
+ if (error)
+ goto out_teardown;
+ }
+
+out_teardown:
+ xrep_dir_teardown(sc);
+ return error;
+}
diff --git a/fs/xfs/scrub/dirtree.c b/fs/xfs/scrub/dirtree.c
new file mode 100644
index 000000000000..bde58fb561ea
--- /dev/null
+++ b/fs/xfs/scrub/dirtree.c
@@ -0,0 +1,985 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr.h"
+#include "xfs_parent.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/bitmap.h"
+#include "scrub/ino_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/listxattr.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/orphanage.h"
+#include "scrub/dirtree.h"
+
+/*
+ * Directory Tree Structure Validation
+ * ===================================
+ *
+ * Validating the tree qualities of the directory tree structure can be
+ * difficult. If the tree is frozen, running a depth (or breadth) first search
+ * and marking a bitmap suffices to determine if there is a cycle. XORing the
+ * mark bitmap with the inode bitmap afterwards tells us if there are
+ * disconnected cycles. If the tree is not frozen, directory updates can move
+ * subtrees across the scanner wavefront, which complicates the design greatly.
+ *
+ * Directory parent pointers change that by enabling an incremental approach to
+ * validation of the tree structure. Instead of using one thread to scan the
+ * entire filesystem, we instead can have multiple threads walking individual
+ * subdirectories upwards to the root. In a perfect world, the IOLOCK would
+ * suffice to stabilize two directories in a parent -> child relationship.
+ * Unfortunately, the VFS does not take the IOLOCK when moving a child
+ * subdirectory, so we instead synchronize on ILOCK and use dirent update hooks
+ * to detect a race. If a race occurs in a path, we restart the scan.
+ *
+ * If the walk terminates without reaching the root, we know the path is
+ * disconnected and ought to be attached to the lost and found. If on the walk
+ * we find the same subdir that we're scanning, we know this is a cycle and
+ * should delete an incoming edge. If we find multiple paths to the root, we
+ * know to delete an incoming edge.
+ *
+ * There are two big hitches with this approach: first, all file link counts
+ * must be correct to prevent other writers from doing the wrong thing with the
+ * directory tree structure. Second, because we're walking upwards in a tree
+ * of arbitrary depth, we cannot hold all the ILOCKs. Instead, we will use a
+ * directory update hook to invalidate the scan results if one of the paths
+ * we've scanned has changed.
+ */
+
+/* Clean up the dirtree checking resources. */
+STATIC void
+xchk_dirtree_buf_cleanup(
+ void *buf)
+{
+ struct xchk_dirtree *dl = buf;
+ struct xchk_dirpath *path, *n;
+
+ if (dl->scan_ino != NULLFSINO)
+ xfs_dir_hook_del(dl->sc->mp, &dl->dhook);
+
+ xchk_dirtree_for_each_path_safe(dl, path, n) {
+ list_del_init(&path->list);
+ xino_bitmap_destroy(&path->seen_inodes);
+ kfree(path);
+ }
+
+ xfblob_destroy(dl->path_names);
+ xfarray_destroy(dl->path_steps);
+ mutex_destroy(&dl->lock);
+}
+
+/* Set us up to look for directory loops. */
+int
+xchk_setup_dirtree(
+ struct xfs_scrub *sc)
+{
+ struct xchk_dirtree *dl;
+ char *descr;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_dirtree(sc);
+ if (error)
+ return error;
+ }
+
+ dl = kvzalloc(sizeof(struct xchk_dirtree), XCHK_GFP_FLAGS);
+ if (!dl)
+ return -ENOMEM;
+ dl->sc = sc;
+ dl->xname.name = dl->namebuf;
+ dl->hook_xname.name = dl->hook_namebuf;
+ INIT_LIST_HEAD(&dl->path_list);
+ dl->root_ino = NULLFSINO;
+ dl->scan_ino = NULLFSINO;
+ dl->parent_ino = NULLFSINO;
+
+ mutex_init(&dl->lock);
+
+ descr = xchk_xfile_ino_descr(sc, "dirtree path steps");
+ error = xfarray_create(descr, 0, sizeof(struct xchk_dirpath_step),
+ &dl->path_steps);
+ kfree(descr);
+ if (error)
+ goto out_dl;
+
+ descr = xchk_xfile_ino_descr(sc, "dirtree path names");
+ error = xfblob_create(descr, &dl->path_names);
+ kfree(descr);
+ if (error)
+ goto out_steps;
+
+ error = xchk_setup_inode_contents(sc, 0);
+ if (error)
+ goto out_names;
+
+ sc->buf = dl;
+ sc->buf_cleanup = xchk_dirtree_buf_cleanup;
+ return 0;
+
+out_names:
+ xfblob_destroy(dl->path_names);
+out_steps:
+ xfarray_destroy(dl->path_steps);
+out_dl:
+ mutex_destroy(&dl->lock);
+ kvfree(dl);
+ return error;
+}
+
+/*
+ * Add the parent pointer described by @dl->pptr to the given path as a new
+ * step. Returns -ELNRNG if the path is too deep.
+ */
+int
+xchk_dirpath_append(
+ struct xchk_dirtree *dl,
+ struct xfs_inode *ip,
+ struct xchk_dirpath *path,
+ const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr)
+{
+ struct xchk_dirpath_step step = {
+ .pptr_rec = *pptr, /* struct copy */
+ .name_len = name->len,
+ };
+ int error;
+
+ /*
+ * If this path is more than 2 billion steps long, this directory tree
+ * is too far gone to fix.
+ */
+ if (path->nr_steps >= XFS_MAXLINK)
+ return -ELNRNG;
+
+ error = xfblob_storename(dl->path_names, &step.name_cookie, name);
+ if (error)
+ return error;
+
+ error = xino_bitmap_set(&path->seen_inodes, ip->i_ino);
+ if (error)
+ return error;
+
+ error = xfarray_append(dl->path_steps, &step);
+ if (error)
+ return error;
+
+ path->nr_steps++;
+ return 0;
+}
+
+/*
+ * Create an xchk_path for each parent pointer of the directory that we're
+ * scanning. For each path created, we will eventually try to walk towards the
+ * root with the goal of deleting all parents except for one that leads to the
+ * root.
+ *
+ * Returns -EFSCORRUPTED to signal that the inode being scanned has a corrupt
+ * parent pointer and hence there's no point in continuing; or -ENOSR if there
+ * are too many parent pointers for this directory.
+ */
+STATIC int
+xchk_dirtree_create_path(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xfs_name xname = {
+ .name = name,
+ .len = namelen,
+ };
+ struct xchk_dirtree *dl = priv;
+ struct xchk_dirpath *path;
+ const struct xfs_parent_rec *rec = value;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, NULL, NULL);
+ if (error)
+ return error;
+
+ /*
+ * If there are more than 2 billion actual parent pointers for this
+ * subdirectory, this fs is too far gone to fix.
+ */
+ if (dl->nr_paths >= XFS_MAXLINK)
+ return -ENOSR;
+
+ trace_xchk_dirtree_create_path(sc, ip, dl->nr_paths, &xname, rec);
+
+ /*
+ * Create a new xchk_path structure to remember this parent pointer
+ * and record the first name step.
+ */
+ path = kmalloc(sizeof(struct xchk_dirpath), XCHK_GFP_FLAGS);
+ if (!path)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&path->list);
+ xino_bitmap_init(&path->seen_inodes);
+ path->nr_steps = 0;
+ path->outcome = XCHK_DIRPATH_SCANNING;
+
+ error = xchk_dirpath_append(dl, sc->ip, path, &xname, rec);
+ if (error)
+ goto out_path;
+
+ path->first_step = xfarray_length(dl->path_steps) - 1;
+ path->second_step = XFARRAY_NULLIDX;
+ path->path_nr = dl->nr_paths;
+
+ list_add_tail(&path->list, &dl->path_list);
+ dl->nr_paths++;
+ return 0;
+out_path:
+ kfree(path);
+ return error;
+}
+
+/*
+ * Validate that the first step of this path still has a corresponding
+ * parent pointer in @sc->ip. We probably dropped @sc->ip's ILOCK while
+ * walking towards the roots, which is why this is necessary.
+ *
+ * This function has a side effect of loading the first parent pointer of this
+ * path into the parent pointer scratch pad. This prepares us to walk up the
+ * directory tree towards the root. Returns -ESTALE if the scan data is now
+ * out of date.
+ */
+STATIC int
+xchk_dirpath_revalidate(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xfs_scrub *sc = dl->sc;
+ int error;
+
+ /*
+ * Look up the parent pointer that corresponds to the start of this
+ * path. If the parent pointer has disappeared on us, dump all the
+ * scan results and try again.
+ */
+ error = xfs_parent_lookup(sc->tp, sc->ip, &dl->xname, &dl->pptr_rec,
+ &dl->pptr_args);
+ if (error == -ENOATTR) {
+ trace_xchk_dirpath_disappeared(dl->sc, sc->ip, path->path_nr,
+ path->first_step, &dl->xname, &dl->pptr_rec);
+ dl->stale = true;
+ return -ESTALE;
+ }
+
+ return error;
+}
+
+/*
+ * Walk the parent pointers of a directory at the end of a path and record
+ * the parent that we find in @dl->xname/pptr_rec.
+ */
+STATIC int
+xchk_dirpath_find_next_step(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xchk_dirtree *dl = priv;
+ const struct xfs_parent_rec *rec = value;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, NULL, NULL);
+ if (error)
+ return error;
+
+ /*
+ * If we've already set @dl->pptr_rec, then this directory has multiple
+ * parents. Signal this back to the caller via -EMLINK.
+ */
+ if (dl->parents_found > 0)
+ return -EMLINK;
+
+ dl->parents_found++;
+ memcpy(dl->namebuf, name, namelen);
+ dl->xname.len = namelen;
+ dl->pptr_rec = *rec; /* struct copy */
+ return 0;
+}
+
+/* Set and log the outcome of a path walk. */
+static inline void
+xchk_dirpath_set_outcome(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ enum xchk_dirpath_outcome outcome)
+{
+ trace_xchk_dirpath_set_outcome(dl->sc, path->path_nr, path->nr_steps,
+ outcome);
+
+ path->outcome = outcome;
+}
+
+/*
+ * Scan the directory at the end of this path for its parent directory link.
+ * If we find one, extend the path. Returns -ESTALE if the scan data out of
+ * date. Returns -EFSCORRUPTED if the parent pointer is bad; or -ELNRNG if
+ * the path got too deep.
+ */
+STATIC int
+xchk_dirpath_step_up(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xfs_scrub *sc = dl->sc;
+ struct xfs_inode *dp;
+ xfs_ino_t parent_ino = be64_to_cpu(dl->pptr_rec.p_ino);
+ unsigned int lock_mode;
+ int error;
+
+ /* Grab and lock the parent directory. */
+ error = xchk_iget(sc, parent_ino, &dp);
+ if (error)
+ return error;
+
+ lock_mode = xfs_ilock_attr_map_shared(dp);
+ mutex_lock(&dl->lock);
+
+ if (dl->stale) {
+ error = -ESTALE;
+ goto out_scanlock;
+ }
+
+ /* We've reached the root directory; the path is ok. */
+ if (parent_ino == dl->root_ino) {
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_OK);
+ error = 0;
+ goto out_scanlock;
+ }
+
+ /*
+ * The inode being scanned is its own distant ancestor! Get rid of
+ * this path.
+ */
+ if (parent_ino == sc->ip->i_ino) {
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ error = 0;
+ goto out_scanlock;
+ }
+
+ /*
+ * We've seen this inode before during the path walk. There's a loop
+ * above us in the directory tree. This probably means that we cannot
+ * continue, but let's keep walking paths to get a full picture.
+ */
+ if (xino_bitmap_test(&path->seen_inodes, parent_ino)) {
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_LOOP);
+ error = 0;
+ goto out_scanlock;
+ }
+
+ /* The handle encoded in the parent pointer must match. */
+ if (VFS_I(dp)->i_generation != be32_to_cpu(dl->pptr_rec.p_gen)) {
+ trace_xchk_dirpath_badgen(dl->sc, dp, path->path_nr,
+ path->nr_steps, &dl->xname, &dl->pptr_rec);
+ error = -EFSCORRUPTED;
+ goto out_scanlock;
+ }
+
+ /* Parent pointer must point up to a directory. */
+ if (!S_ISDIR(VFS_I(dp)->i_mode)) {
+ trace_xchk_dirpath_nondir_parent(dl->sc, dp, path->path_nr,
+ path->nr_steps, &dl->xname, &dl->pptr_rec);
+ error = -EFSCORRUPTED;
+ goto out_scanlock;
+ }
+
+ /* Parent cannot be an unlinked directory. */
+ if (VFS_I(dp)->i_nlink == 0) {
+ trace_xchk_dirpath_unlinked_parent(dl->sc, dp, path->path_nr,
+ path->nr_steps, &dl->xname, &dl->pptr_rec);
+ error = -EFSCORRUPTED;
+ goto out_scanlock;
+ }
+
+ /*
+ * If the extended attributes look as though they has been zapped by
+ * the inode record repair code, we cannot scan for parent pointers.
+ */
+ if (xchk_pptr_looks_zapped(dp)) {
+ error = -EBUSY;
+ xchk_set_incomplete(sc);
+ goto out_scanlock;
+ }
+
+ /*
+ * Walk the parent pointers of @dp to find the parent of this directory
+ * to find the next step in our walk. If we find that @dp has exactly
+ * one parent, the parent pointer information will be stored in
+ * @dl->pptr_rec. This prepares us for the next step of the walk.
+ */
+ mutex_unlock(&dl->lock);
+ dl->parents_found = 0;
+ error = xchk_xattr_walk(sc, dp, xchk_dirpath_find_next_step, NULL, dl);
+ mutex_lock(&dl->lock);
+ if (error == -EFSCORRUPTED || error == -EMLINK ||
+ (!error && dl->parents_found == 0)) {
+ /*
+ * Further up the directory tree from @sc->ip, we found a
+ * corrupt parent pointer, multiple parent pointers while
+ * finding this directory's parent, or zero parents despite
+ * having a nonzero link count. Keep looking for other paths.
+ */
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_CORRUPT);
+ error = 0;
+ goto out_scanlock;
+ }
+ if (error)
+ goto out_scanlock;
+
+ if (dl->stale) {
+ error = -ESTALE;
+ goto out_scanlock;
+ }
+
+ trace_xchk_dirpath_found_next_step(sc, dp, path->path_nr,
+ path->nr_steps, &dl->xname, &dl->pptr_rec);
+
+ /* Append to the path steps */
+ error = xchk_dirpath_append(dl, dp, path, &dl->xname, &dl->pptr_rec);
+ if (error)
+ goto out_scanlock;
+
+ if (path->second_step == XFARRAY_NULLIDX)
+ path->second_step = xfarray_length(dl->path_steps) - 1;
+
+out_scanlock:
+ mutex_unlock(&dl->lock);
+ xfs_iunlock(dp, lock_mode);
+ xchk_irele(sc, dp);
+ return error;
+}
+
+/*
+ * Walk the directory tree upwards towards what is hopefully the root
+ * directory, recording path steps as we go. The current path components are
+ * stored in dl->pptr_rec and dl->xname.
+ *
+ * Returns -ESTALE if the scan data are out of date. Returns -EFSCORRUPTED
+ * only if the direct parent pointer of @sc->ip associated with this path is
+ * corrupt.
+ */
+STATIC int
+xchk_dirpath_walk_upwards(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xfs_scrub *sc = dl->sc;
+ int error;
+
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ /* Reload the start of this path and make sure it's still there. */
+ error = xchk_dirpath_revalidate(dl, path);
+ if (error)
+ return error;
+
+ trace_xchk_dirpath_walk_upwards(sc, sc->ip, path->path_nr, &dl->xname,
+ &dl->pptr_rec);
+
+ /*
+ * The inode being scanned is its own direct ancestor!
+ * Get rid of this path.
+ */
+ if (be64_to_cpu(dl->pptr_rec.p_ino) == sc->ip->i_ino) {
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ return 0;
+ }
+
+ /*
+ * Drop ILOCK_EXCL on the inode being scanned. We still hold
+ * IOLOCK_EXCL on it, so it cannot move around or be renamed.
+ *
+ * Beyond this point we're walking up the directory tree, which means
+ * that we can acquire and drop the ILOCK on an alias of sc->ip. The
+ * ILOCK state is no longer tracked in the scrub context. Hence we
+ * must drop @sc->ip's ILOCK during the walk.
+ */
+ mutex_unlock(&dl->lock);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the first step in the walk towards the root by checking the
+ * start of this path, which is a direct parent pointer of @sc->ip.
+ * If we see any kind of error here (including corruptions), the parent
+ * pointer of @sc->ip is corrupt. Stop the whole scan.
+ */
+ error = xchk_dirpath_step_up(dl, path);
+ if (error) {
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ mutex_lock(&dl->lock);
+ return error;
+ }
+
+ /*
+ * Take steps upward from the second step in this path towards the
+ * root. If we hit corruption errors here, there's a problem
+ * *somewhere* in the path, but we don't need to stop scanning.
+ */
+ while (!error && path->outcome == XCHK_DIRPATH_SCANNING)
+ error = xchk_dirpath_step_up(dl, path);
+
+ /* Retake the locks we had, mark paths, etc. */
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ mutex_lock(&dl->lock);
+ if (error == -EFSCORRUPTED) {
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_CORRUPT);
+ error = 0;
+ }
+ if (!error && dl->stale)
+ return -ESTALE;
+ return error;
+}
+
+/*
+ * Decide if this path step has been touched by this live update. Returns
+ * 1 for yes, 0 for no, or a negative errno.
+ */
+STATIC int
+xchk_dirpath_step_is_stale(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ unsigned int step_nr,
+ xfarray_idx_t step_idx,
+ struct xfs_dir_update_params *p,
+ xfs_ino_t *cursor)
+{
+ struct xchk_dirpath_step step;
+ xfs_ino_t child_ino = *cursor;
+ int error;
+
+ error = xfarray_load(dl->path_steps, step_idx, &step);
+ if (error)
+ return error;
+ *cursor = be64_to_cpu(step.pptr_rec.p_ino);
+
+ /*
+ * If the parent and child being updated are not the ones mentioned in
+ * this path step, the scan data is still ok.
+ */
+ if (p->ip->i_ino != child_ino || p->dp->i_ino != *cursor)
+ return 0;
+
+ /*
+ * If the dirent name lengths or byte sequences are different, the scan
+ * data is still ok.
+ */
+ if (p->name->len != step.name_len)
+ return 0;
+
+ error = xfblob_loadname(dl->path_names, step.name_cookie,
+ &dl->hook_xname, step.name_len);
+ if (error)
+ return error;
+
+ if (memcmp(dl->hook_xname.name, p->name->name, p->name->len) != 0)
+ return 0;
+
+ /*
+ * If the update comes from the repair code itself, walk the state
+ * machine forward.
+ */
+ if (p->ip->i_ino == dl->scan_ino &&
+ path->outcome == XREP_DIRPATH_ADOPTING) {
+ xchk_dirpath_set_outcome(dl, path, XREP_DIRPATH_ADOPTED);
+ return 0;
+ }
+
+ if (p->ip->i_ino == dl->scan_ino &&
+ path->outcome == XREP_DIRPATH_DELETING) {
+ xchk_dirpath_set_outcome(dl, path, XREP_DIRPATH_DELETED);
+ return 0;
+ }
+
+ /* Exact match, scan data is out of date. */
+ trace_xchk_dirpath_changed(dl->sc, path->path_nr, step_nr, p->dp,
+ p->ip, p->name);
+ return 1;
+}
+
+/*
+ * Decide if this path has been touched by this live update. Returns 1 for
+ * yes, 0 for no, or a negative errno.
+ */
+STATIC int
+xchk_dirpath_is_stale(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ struct xfs_dir_update_params *p)
+{
+ xfs_ino_t cursor = dl->scan_ino;
+ xfarray_idx_t idx = path->first_step;
+ unsigned int i;
+ int ret;
+
+ /*
+ * The child being updated has not been seen by this path at all; this
+ * path cannot be stale.
+ */
+ if (!xino_bitmap_test(&path->seen_inodes, p->ip->i_ino))
+ return 0;
+
+ ret = xchk_dirpath_step_is_stale(dl, path, 0, idx, p, &cursor);
+ if (ret != 0)
+ return ret;
+
+ for (i = 1, idx = path->second_step; i < path->nr_steps; i++, idx++) {
+ ret = xchk_dirpath_step_is_stale(dl, path, i, idx, p, &cursor);
+ if (ret != 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Decide if a directory update from the regular filesystem touches any of the
+ * paths we've scanned, and invalidate the scan data if true.
+ */
+STATIC int
+xchk_dirtree_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xchk_dirtree *dl;
+ struct xchk_dirpath *path;
+ int ret;
+
+ dl = container_of(nb, struct xchk_dirtree, dhook.dirent_hook.nb);
+
+ trace_xchk_dirtree_live_update(dl->sc, p->dp, action, p->ip, p->delta,
+ p->name);
+
+ mutex_lock(&dl->lock);
+
+ if (dl->stale || dl->aborted)
+ goto out_unlock;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ ret = xchk_dirpath_is_stale(dl, path, p);
+ if (ret < 0) {
+ dl->aborted = true;
+ break;
+ }
+ if (ret == 1) {
+ dl->stale = true;
+ break;
+ }
+ }
+
+out_unlock:
+ mutex_unlock(&dl->lock);
+ return NOTIFY_DONE;
+}
+
+/* Delete all the collected path information. */
+STATIC void
+xchk_dirtree_reset(
+ void *buf)
+{
+ struct xchk_dirtree *dl = buf;
+ struct xchk_dirpath *path, *n;
+
+ ASSERT(dl->sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ xchk_dirtree_for_each_path_safe(dl, path, n) {
+ list_del_init(&path->list);
+ xino_bitmap_destroy(&path->seen_inodes);
+ kfree(path);
+ }
+ dl->nr_paths = 0;
+
+ xfarray_truncate(dl->path_steps);
+ xfblob_truncate(dl->path_names);
+
+ dl->stale = false;
+}
+
+/*
+ * Load the name/pptr from the first step in this path into @dl->pptr_rec and
+ * @dl->xname.
+ */
+STATIC int
+xchk_dirtree_load_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xchk_dirpath_step step;
+ int error;
+
+ error = xfarray_load(dl->path_steps, path->first_step, &step);
+ if (error)
+ return error;
+
+ error = xfblob_loadname(dl->path_names, step.name_cookie, &dl->xname,
+ step.name_len);
+ if (error)
+ return error;
+
+ dl->pptr_rec = step.pptr_rec; /* struct copy */
+ return 0;
+}
+
+/*
+ * For each parent pointer of this subdir, trace a path upwards towards the
+ * root directory and record what we find. Returns 0 for success;
+ * -EFSCORRUPTED if walking the parent pointers of @sc->ip failed, -ELNRNG if a
+ * path was too deep; -ENOSR if there were too many parent pointers; or
+ * a negative errno.
+ */
+int
+xchk_dirtree_find_paths_to_root(
+ struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+ struct xchk_dirpath *path;
+ int error = 0;
+
+ do {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ xchk_dirtree_reset(dl);
+
+ /*
+ * If the extended attributes look as though they has been
+ * zapped by the inode record repair code, we cannot scan for
+ * parent pointers.
+ */
+ if (xchk_pptr_looks_zapped(sc->ip)) {
+ xchk_set_incomplete(sc);
+ return -EBUSY;
+ }
+
+ /*
+ * Create path walk contexts for each parent of the directory
+ * that is being scanned. Directories are supposed to have
+ * only one parent, but this is how we detect multiple parents.
+ */
+ error = xchk_xattr_walk(sc, sc->ip, xchk_dirtree_create_path,
+ NULL, dl);
+ if (error)
+ return error;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ /* Load path components into dl->pptr/xname */
+ error = xchk_dirtree_load_path(dl, path);
+ if (error)
+ return error;
+
+ /*
+ * Try to walk up each path to the root. This enables
+ * us to find directory loops in ancestors, and the
+ * like.
+ */
+ error = xchk_dirpath_walk_upwards(dl, path);
+ if (error == -EFSCORRUPTED) {
+ /*
+ * A parent pointer of @sc->ip is bad, don't
+ * bother continuing.
+ */
+ break;
+ }
+ if (error == -ESTALE) {
+ /* This had better be an invalidation. */
+ ASSERT(dl->stale);
+ break;
+ }
+ if (error)
+ return error;
+ if (dl->aborted)
+ return 0;
+ }
+ } while (dl->stale);
+
+ return error;
+}
+
+/*
+ * Figure out what to do with the paths we tried to find. Do not call this
+ * if the scan results are stale.
+ */
+void
+xchk_dirtree_evaluate(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+
+ ASSERT(!dl->stale);
+
+ /* Scan the paths we have to decide what to do. */
+ memset(oc, 0, sizeof(struct xchk_dirtree_outcomes));
+ xchk_dirtree_for_each_path(dl, path) {
+ trace_xchk_dirpath_evaluate_path(dl->sc, path->path_nr,
+ path->nr_steps, path->outcome);
+
+ switch (path->outcome) {
+ case XCHK_DIRPATH_SCANNING:
+ /* shouldn't get here */
+ ASSERT(0);
+ break;
+ case XCHK_DIRPATH_DELETE:
+ /* This one is already going away. */
+ oc->bad++;
+ break;
+ case XCHK_DIRPATH_CORRUPT:
+ case XCHK_DIRPATH_LOOP:
+ /* Couldn't find the end of this path. */
+ oc->suspect++;
+ break;
+ case XCHK_DIRPATH_STALE:
+ /* shouldn't get here either */
+ ASSERT(0);
+ break;
+ case XCHK_DIRPATH_OK:
+ /* This path got all the way to the root. */
+ oc->good++;
+ break;
+ case XREP_DIRPATH_DELETING:
+ case XREP_DIRPATH_DELETED:
+ case XREP_DIRPATH_ADOPTING:
+ case XREP_DIRPATH_ADOPTED:
+ /* These should not be in progress! */
+ ASSERT(0);
+ break;
+ }
+ }
+
+ trace_xchk_dirtree_evaluate(dl, oc);
+}
+
+/* Look for directory loops. */
+int
+xchk_dirtree(
+ struct xfs_scrub *sc)
+{
+ struct xchk_dirtree_outcomes oc;
+ struct xchk_dirtree *dl = sc->buf;
+ int error;
+
+ /*
+ * Nondirectories do not point downwards to other files, so they cannot
+ * cause a cycle in the directory tree.
+ */
+ if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
+ return -ENOENT;
+
+ ASSERT(xfs_has_parent(sc->mp));
+
+ /*
+ * Find the root of the directory tree. Remember which directory to
+ * scan, because the hook doesn't detach until after sc->ip gets
+ * released during teardown.
+ */
+ dl->root_ino = sc->mp->m_rootip->i_ino;
+ dl->scan_ino = sc->ip->i_ino;
+
+ trace_xchk_dirtree_start(sc->ip, sc->sm, 0);
+
+ /*
+ * Hook into the directory entry code so that we can capture updates to
+ * paths that we have already scanned. The scanner thread takes each
+ * directory's ILOCK, which means that any in-progress directory update
+ * will finish before we can scan the directory.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+ xfs_dir_hook_setup(&dl->dhook, xchk_dirtree_live_update);
+ error = xfs_dir_hook_add(sc->mp, &dl->dhook);
+ if (error)
+ goto out;
+
+ mutex_lock(&dl->lock);
+
+ /* Trace each parent pointer's path to the root. */
+ error = xchk_dirtree_find_paths_to_root(dl);
+ if (error == -EFSCORRUPTED || error == -ELNRNG || error == -ENOSR) {
+ /*
+ * Don't bother walking the paths if the xattr structure or the
+ * parent pointers are corrupt; this scan cannot be completed
+ * without full information.
+ */
+ xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
+ error = 0;
+ goto out_scanlock;
+ }
+ if (error == -EBUSY) {
+ /*
+ * We couldn't scan some directory's parent pointers because
+ * the attr fork looked like it had been zapped. The
+ * scan was marked incomplete, so no further error code
+ * is necessary.
+ */
+ error = 0;
+ goto out_scanlock;
+ }
+ if (error)
+ goto out_scanlock;
+ if (dl->aborted) {
+ xchk_set_incomplete(sc);
+ goto out_scanlock;
+ }
+
+ /* Assess what we found in our path evaluation. */
+ xchk_dirtree_evaluate(dl, &oc);
+ if (xchk_dirtree_parentless(dl)) {
+ if (oc.good || oc.bad || oc.suspect)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ } else {
+ if (oc.bad || oc.good + oc.suspect != 1)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ if (oc.suspect)
+ xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
+ }
+
+out_scanlock:
+ mutex_unlock(&dl->lock);
+out:
+ trace_xchk_dirtree_done(sc->ip, sc->sm, error);
+ return error;
+}
diff --git a/fs/xfs/scrub/dirtree.h b/fs/xfs/scrub/dirtree.h
new file mode 100644
index 000000000000..1e1686365c61
--- /dev/null
+++ b/fs/xfs/scrub/dirtree.h
@@ -0,0 +1,178 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_DIRTREE_H__
+#define __XFS_SCRUB_DIRTREE_H__
+
+/*
+ * Each of these represents one parent pointer path step in a chain going
+ * up towards the directory tree root. These are stored inside an xfarray.
+ */
+struct xchk_dirpath_step {
+ /* Directory entry name associated with this parent link. */
+ xfblob_cookie name_cookie;
+ unsigned int name_len;
+
+ /* Handle of the parent directory. */
+ struct xfs_parent_rec pptr_rec;
+};
+
+enum xchk_dirpath_outcome {
+ XCHK_DIRPATH_SCANNING = 0, /* still being put together */
+ XCHK_DIRPATH_DELETE, /* delete this path */
+ XCHK_DIRPATH_CORRUPT, /* corruption detected in path */
+ XCHK_DIRPATH_LOOP, /* cycle detected further up */
+ XCHK_DIRPATH_STALE, /* path is stale */
+ XCHK_DIRPATH_OK, /* path reaches the root */
+
+ XREP_DIRPATH_DELETING, /* path is being deleted */
+ XREP_DIRPATH_DELETED, /* path has been deleted */
+ XREP_DIRPATH_ADOPTING, /* path is being adopted */
+ XREP_DIRPATH_ADOPTED, /* path has been adopted */
+};
+
+/*
+ * Each of these represents one parent pointer path out of the directory being
+ * scanned. These exist in-core, and hopefully there aren't more than a
+ * handful of them.
+ */
+struct xchk_dirpath {
+ struct list_head list;
+
+ /* Index of the first step in this path. */
+ xfarray_idx_t first_step;
+
+ /* Index of the second step in this path. */
+ xfarray_idx_t second_step;
+
+ /* Inodes seen while walking this path. */
+ struct xino_bitmap seen_inodes;
+
+ /* Number of steps in this path. */
+ unsigned int nr_steps;
+
+ /* Which path is this? */
+ unsigned int path_nr;
+
+ /* What did we conclude from following this path? */
+ enum xchk_dirpath_outcome outcome;
+};
+
+struct xchk_dirtree_outcomes {
+ /* Number of XCHK_DIRPATH_DELETE */
+ unsigned int bad;
+
+ /* Number of XCHK_DIRPATH_CORRUPT or XCHK_DIRPATH_LOOP */
+ unsigned int suspect;
+
+ /* Number of XCHK_DIRPATH_OK */
+ unsigned int good;
+
+ /* Directory needs to be added to lost+found */
+ bool needs_adoption;
+};
+
+struct xchk_dirtree {
+ struct xfs_scrub *sc;
+
+ /* Root inode that we're looking for. */
+ xfs_ino_t root_ino;
+
+ /*
+ * This is the inode that we're scanning. The live update hook can
+ * continue to be called after xchk_teardown drops sc->ip but before
+ * it calls buf_cleanup, so we keep a copy.
+ */
+ xfs_ino_t scan_ino;
+
+ /*
+ * If we start deleting redundant paths to this subdirectory, this is
+ * the inode number of the surviving parent and the dotdot entry will
+ * be set to this value. If the value is NULLFSINO, then use @root_ino
+ * as a stand-in until the orphanage can adopt the subdirectory.
+ */
+ xfs_ino_t parent_ino;
+
+ /* Scratch buffer for scanning pptr xattrs */
+ struct xfs_parent_rec pptr_rec;
+ struct xfs_da_args pptr_args;
+
+ /* Name buffer */
+ struct xfs_name xname;
+ char namebuf[MAXNAMELEN];
+
+ /* Information for reparenting this directory. */
+ struct xrep_adoption adoption;
+
+ /*
+ * Hook into directory updates so that we can receive live updates
+ * from other writer threads.
+ */
+ struct xfs_dir_hook dhook;
+
+ /* Parent pointer update arguments. */
+ struct xfs_parent_args ppargs;
+
+ /* lock for everything below here */
+ struct mutex lock;
+
+ /* buffer for the live update functions to use for dirent names */
+ struct xfs_name hook_xname;
+ unsigned char hook_namebuf[MAXNAMELEN];
+
+ /*
+ * All path steps observed during this scan. Each of the path
+ * steps for a particular pathwalk are recorded in sequential
+ * order in the xfarray. A pathwalk ends either with a step
+ * pointing to the root directory (success) or pointing to NULLFSINO
+ * (loop detected, empty dir detected, etc).
+ */
+ struct xfarray *path_steps;
+
+ /* All names observed during this scan. */
+ struct xfblob *path_names;
+
+ /* All paths being tracked by this scanner. */
+ struct list_head path_list;
+
+ /* Number of paths in path_list. */
+ unsigned int nr_paths;
+
+ /* Number of parents found by a pptr scan. */
+ unsigned int parents_found;
+
+ /* Have the path data been invalidated by a concurrent update? */
+ bool stale:1;
+
+ /* Has the scan been aborted? */
+ bool aborted:1;
+};
+
+#define xchk_dirtree_for_each_path_safe(dl, path, n) \
+ list_for_each_entry_safe((path), (n), &(dl)->path_list, list)
+
+#define xchk_dirtree_for_each_path(dl, path) \
+ list_for_each_entry((path), &(dl)->path_list, list)
+
+static inline bool
+xchk_dirtree_parentless(const struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+
+ if (sc->ip == sc->mp->m_rootip)
+ return true;
+ if (VFS_I(sc->ip)->i_nlink == 0)
+ return true;
+ return false;
+}
+
+int xchk_dirtree_find_paths_to_root(struct xchk_dirtree *dl);
+int xchk_dirpath_append(struct xchk_dirtree *dl, struct xfs_inode *ip,
+ struct xchk_dirpath *path, const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr);
+void xchk_dirtree_evaluate(struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc);
+
+#endif /* __XFS_SCRUB_DIRTREE_H__ */
diff --git a/fs/xfs/scrub/dirtree_repair.c b/fs/xfs/scrub/dirtree_repair.c
new file mode 100644
index 000000000000..5c04e70ba951
--- /dev/null
+++ b/fs/xfs/scrub/dirtree_repair.c
@@ -0,0 +1,821 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_trans_space.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr.h"
+#include "xfs_parent.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/bitmap.h"
+#include "scrub/ino_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/listxattr.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/orphanage.h"
+#include "scrub/dirtree.h"
+#include "scrub/readdir.h"
+
+/*
+ * Directory Tree Structure Repairs
+ * ================================
+ *
+ * If we decide that the directory being scanned is participating in a
+ * directory loop, the only change we can make is to remove directory entries
+ * pointing down to @sc->ip. If that leaves it with no parents, the directory
+ * should be adopted by the orphanage.
+ */
+
+/* Set up to repair directory loops. */
+int
+xrep_setup_dirtree(
+ struct xfs_scrub *sc)
+{
+ return xrep_orphanage_try_create(sc);
+}
+
+/* Change the outcome of this path. */
+static inline void
+xrep_dirpath_set_outcome(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ enum xchk_dirpath_outcome outcome)
+{
+ trace_xrep_dirpath_set_outcome(dl->sc, path->path_nr, path->nr_steps,
+ outcome);
+
+ path->outcome = outcome;
+}
+
+/* Delete all paths. */
+STATIC void
+xrep_dirtree_delete_all_paths(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ switch (path->outcome) {
+ case XCHK_DIRPATH_CORRUPT:
+ case XCHK_DIRPATH_LOOP:
+ oc->suspect--;
+ oc->bad++;
+ xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ break;
+ case XCHK_DIRPATH_OK:
+ oc->good--;
+ oc->bad++;
+ xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ break;
+ default:
+ break;
+ }
+ }
+
+ ASSERT(oc->suspect == 0);
+ ASSERT(oc->good == 0);
+}
+
+/* Since this is the surviving path, set the dotdot entry to this value. */
+STATIC void
+xrep_dirpath_retain_parent(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xchk_dirpath_step step;
+ int error;
+
+ error = xfarray_load(dl->path_steps, path->first_step, &step);
+ if (error)
+ return;
+
+ dl->parent_ino = be64_to_cpu(step.pptr_rec.p_ino);
+}
+
+/* Find the one surviving path so we know how to set dotdot. */
+STATIC void
+xrep_dirtree_find_surviving_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+ bool foundit = false;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ switch (path->outcome) {
+ case XCHK_DIRPATH_CORRUPT:
+ case XCHK_DIRPATH_LOOP:
+ case XCHK_DIRPATH_OK:
+ if (!foundit) {
+ xrep_dirpath_retain_parent(dl, path);
+ foundit = true;
+ continue;
+ }
+ ASSERT(foundit == false);
+ break;
+ default:
+ break;
+ }
+ }
+
+ ASSERT(oc->suspect + oc->good == 1);
+}
+
+/* Delete all paths except for the one good one. */
+STATIC void
+xrep_dirtree_keep_one_good_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+ bool foundit = false;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ switch (path->outcome) {
+ case XCHK_DIRPATH_CORRUPT:
+ case XCHK_DIRPATH_LOOP:
+ oc->suspect--;
+ oc->bad++;
+ xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ break;
+ case XCHK_DIRPATH_OK:
+ if (!foundit) {
+ xrep_dirpath_retain_parent(dl, path);
+ foundit = true;
+ continue;
+ }
+ oc->good--;
+ oc->bad++;
+ xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ break;
+ default:
+ break;
+ }
+ }
+
+ ASSERT(oc->suspect == 0);
+ ASSERT(oc->good < 2);
+}
+
+/* Delete all paths except for one suspect one. */
+STATIC void
+xrep_dirtree_keep_one_suspect_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+ bool foundit = false;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ switch (path->outcome) {
+ case XCHK_DIRPATH_CORRUPT:
+ case XCHK_DIRPATH_LOOP:
+ if (!foundit) {
+ xrep_dirpath_retain_parent(dl, path);
+ foundit = true;
+ continue;
+ }
+ oc->suspect--;
+ oc->bad++;
+ xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ break;
+ case XCHK_DIRPATH_OK:
+ ASSERT(0);
+ break;
+ default:
+ break;
+ }
+ }
+
+ ASSERT(oc->suspect == 1);
+ ASSERT(oc->good == 0);
+}
+
+/*
+ * Figure out what to do with the paths we tried to find. Returns -EDEADLOCK
+ * if the scan results have become stale.
+ */
+STATIC void
+xrep_dirtree_decide_fate(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ xchk_dirtree_evaluate(dl, oc);
+
+ /* Parentless directories should not have any paths at all. */
+ if (xchk_dirtree_parentless(dl)) {
+ xrep_dirtree_delete_all_paths(dl, oc);
+ return;
+ }
+
+ /* One path is exactly the number of paths we want. */
+ if (oc->good + oc->suspect == 1) {
+ xrep_dirtree_find_surviving_path(dl, oc);
+ return;
+ }
+
+ /* Zero paths means we should reattach the subdir to the orphanage. */
+ if (oc->good + oc->suspect == 0) {
+ if (dl->sc->orphanage)
+ oc->needs_adoption = true;
+ return;
+ }
+
+ /*
+ * Otherwise, this subdirectory has too many parents. If there's at
+ * least one good path, keep it and delete the others.
+ */
+ if (oc->good > 0) {
+ xrep_dirtree_keep_one_good_path(dl, oc);
+ return;
+ }
+
+ /*
+ * There are no good paths and there are too many suspect paths.
+ * Keep the first suspect path and delete the rest.
+ */
+ xrep_dirtree_keep_one_suspect_path(dl, oc);
+}
+
+/*
+ * Load the first step of this path into @step and @dl->xname/pptr
+ * for later repair work.
+ */
+STATIC int
+xrep_dirtree_prep_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ struct xchk_dirpath_step *step)
+{
+ int error;
+
+ error = xfarray_load(dl->path_steps, path->first_step, step);
+ if (error)
+ return error;
+
+ error = xfblob_loadname(dl->path_names, step->name_cookie, &dl->xname,
+ step->name_len);
+ if (error)
+ return error;
+
+ dl->pptr_rec = step->pptr_rec; /* struct copy */
+ return 0;
+}
+
+/* Delete the VFS dentry for a removed child. */
+STATIC int
+xrep_dirtree_purge_dentry(
+ struct xchk_dirtree *dl,
+ struct xfs_inode *dp,
+ const struct xfs_name *name)
+{
+ struct qstr qname = QSTR_INIT(name->name, name->len);
+ struct dentry *parent_dentry, *child_dentry;
+ int error = 0;
+
+ /*
+ * Find the dentry for the parent directory. If there isn't one, we're
+ * done. Caller already holds i_rwsem for parent and child.
+ */
+ parent_dentry = d_find_alias(VFS_I(dp));
+ if (!parent_dentry)
+ return 0;
+
+ /* The VFS thinks the parent is a directory, right? */
+ if (!d_is_dir(parent_dentry)) {
+ ASSERT(d_is_dir(parent_dentry));
+ error = -EFSCORRUPTED;
+ goto out_dput_parent;
+ }
+
+ /*
+ * Try to find the dirent pointing to the child. If there isn't one,
+ * we're done.
+ */
+ qname.hash = full_name_hash(parent_dentry, name->name, name->len);
+ child_dentry = d_lookup(parent_dentry, &qname);
+ if (!child_dentry) {
+ error = 0;
+ goto out_dput_parent;
+ }
+
+ trace_xrep_dirtree_delete_child(dp->i_mount, child_dentry);
+
+ /* Child is not a directory? We're screwed. */
+ if (!d_is_dir(child_dentry)) {
+ ASSERT(d_is_dir(child_dentry));
+ error = -EFSCORRUPTED;
+ goto out_dput_child;
+ }
+
+ /* Replace the child dentry with a negative one. */
+ d_delete(child_dentry);
+
+out_dput_child:
+ dput(child_dentry);
+out_dput_parent:
+ dput(parent_dentry);
+ return error;
+}
+
+/*
+ * Prepare to delete a link by taking the IOLOCK of the parent and the child
+ * (scrub target). Caller must hold IOLOCK_EXCL on @sc->ip. Returns 0 if we
+ * took both locks, or a negative errno if we couldn't lock the parent in time.
+ */
+static inline int
+xrep_dirtree_unlink_iolock(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp)
+{
+ int error;
+
+ ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL);
+
+ if (xfs_ilock_nowait(dp, XFS_IOLOCK_EXCL))
+ return 0;
+
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+ do {
+ xfs_ilock(dp, XFS_IOLOCK_EXCL);
+ if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+ break;
+ xfs_iunlock(dp, XFS_IOLOCK_EXCL);
+
+ if (xchk_should_terminate(sc, &error)) {
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+ return error;
+ }
+
+ delay(1);
+ } while (1);
+
+ return 0;
+}
+
+/*
+ * Remove a link from the directory tree and update the dcache. Returns
+ * -ESTALE if the scan data are now out of date.
+ */
+STATIC int
+xrep_dirtree_unlink(
+ struct xchk_dirtree *dl,
+ struct xfs_inode *dp,
+ struct xchk_dirpath *path,
+ struct xchk_dirpath_step *step)
+{
+ struct xfs_scrub *sc = dl->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t dotdot_ino;
+ xfs_ino_t parent_ino = dl->parent_ino;
+ unsigned int resblks;
+ int dontcare;
+ int error;
+
+ /* Take IOLOCK_EXCL of the parent and child. */
+ error = xrep_dirtree_unlink_iolock(sc, dp);
+ if (error)
+ return error;
+
+ /*
+ * Create the transaction that we need to sever the path. Ignore
+ * EDQUOT and ENOSPC being returned via nospace_error because the
+ * directory code can handle a reservationless update.
+ */
+ resblks = xfs_remove_space_res(mp, step->name_len);
+ error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, sc->ip,
+ &resblks, &sc->tp, &dontcare);
+ if (error)
+ goto out_iolock;
+
+ /*
+ * Cancel if someone invalidate the paths while we were trying to get
+ * the ILOCK.
+ */
+ mutex_lock(&dl->lock);
+ if (dl->stale) {
+ mutex_unlock(&dl->lock);
+ error = -ESTALE;
+ goto out_trans_cancel;
+ }
+ xrep_dirpath_set_outcome(dl, path, XREP_DIRPATH_DELETING);
+ mutex_unlock(&dl->lock);
+
+ trace_xrep_dirtree_delete_path(dl->sc, sc->ip, path->path_nr,
+ &dl->xname, &dl->pptr_rec);
+
+ /*
+ * Decide if we need to reset the dotdot entry. Rules:
+ *
+ * - If there's a surviving parent, we want dotdot to point there.
+ * - If we don't have any surviving parents, then point dotdot at the
+ * root dir.
+ * - If dotdot is already set to the value we want, pass in NULLFSINO
+ * for no change necessary.
+ *
+ * Do this /before/ we dirty anything, in case the dotdot lookup
+ * fails.
+ */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &dotdot_ino);
+ if (error)
+ goto out_trans_cancel;
+ if (parent_ino == NULLFSINO)
+ parent_ino = dl->root_ino;
+ if (dotdot_ino == parent_ino)
+ parent_ino = NULLFSINO;
+
+ /* Drop the link from sc->ip's dotdot entry. */
+ error = xfs_droplink(sc->tp, dp);
+ if (error)
+ goto out_trans_cancel;
+
+ /* Reset the dotdot entry to a surviving parent. */
+ if (parent_ino != NULLFSINO) {
+ error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot,
+ parent_ino, 0);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /* Drop the link from dp to sc->ip. */
+ error = xfs_droplink(sc->tp, sc->ip);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_dir_removename(sc->tp, dp, &dl->xname, sc->ip->i_ino,
+ resblks);
+ if (error) {
+ ASSERT(error != -ENOENT);
+ goto out_trans_cancel;
+ }
+
+ if (xfs_has_parent(sc->mp)) {
+ error = xfs_parent_removename(sc->tp, &dl->ppargs, dp,
+ &dl->xname, sc->ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Notify dirent hooks that we removed the bad link, invalidate the
+ * dcache, and commit the repair.
+ */
+ xfs_dir_update_hook(dp, sc->ip, -1, &dl->xname);
+ error = xrep_dirtree_purge_dentry(dl, dp, &dl->xname);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xrep_trans_commit(sc);
+ goto out_ilock;
+
+out_trans_cancel:
+ xchk_trans_cancel(sc);
+out_ilock:
+ xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+out_iolock:
+ xfs_iunlock(dp, XFS_IOLOCK_EXCL);
+ return error;
+}
+
+/*
+ * Delete a directory entry that points to this directory. Returns -ESTALE
+ * if the scan data are now out of date.
+ */
+STATIC int
+xrep_dirtree_delete_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xchk_dirpath_step step;
+ struct xfs_scrub *sc = dl->sc;
+ struct xfs_inode *dp;
+ int error;
+
+ /*
+ * Load the parent pointer and directory inode for this path, then
+ * drop the scan lock, the ILOCK, and the transaction so that
+ * _delete_path can reserve the proper transaction. This sets up
+ * @dl->xname for the deletion.
+ */
+ error = xrep_dirtree_prep_path(dl, path, &step);
+ if (error)
+ return error;
+
+ error = xchk_iget(sc, be64_to_cpu(step.pptr_rec.p_ino), &dp);
+ if (error)
+ return error;
+
+ mutex_unlock(&dl->lock);
+ xchk_trans_cancel(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /* Delete the directory link and release the parent. */
+ error = xrep_dirtree_unlink(dl, dp, path, &step);
+ xchk_irele(sc, dp);
+
+ /*
+ * Retake all the resources we had at the beginning even if the repair
+ * failed or the scan data are now stale. This keeps things simple for
+ * the caller.
+ */
+ xchk_trans_alloc_empty(sc);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ mutex_lock(&dl->lock);
+
+ if (!error && dl->stale)
+ error = -ESTALE;
+ return error;
+}
+
+/* Add a new path to represent our in-progress adoption. */
+STATIC int
+xrep_dirtree_create_adoption_path(
+ struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+ struct xchk_dirpath *path;
+ int error;
+
+ /*
+ * We should have capped the number of paths at XFS_MAXLINK-1 in the
+ * scanner.
+ */
+ if (dl->nr_paths > XFS_MAXLINK) {
+ ASSERT(dl->nr_paths <= XFS_MAXLINK);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Create a new xchk_path structure to remember this parent pointer
+ * and record the first name step.
+ */
+ path = kmalloc(sizeof(struct xchk_dirpath), XCHK_GFP_FLAGS);
+ if (!path)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&path->list);
+ xino_bitmap_init(&path->seen_inodes);
+ path->nr_steps = 0;
+ path->outcome = XREP_DIRPATH_ADOPTING;
+
+ /*
+ * Record the new link that we just created in the orphanage. Because
+ * adoption is the last repair that we perform, we don't bother filling
+ * in the path all the way back to the root.
+ */
+ xfs_inode_to_parent_rec(&dl->pptr_rec, sc->orphanage);
+
+ error = xino_bitmap_set(&path->seen_inodes, sc->orphanage->i_ino);
+ if (error)
+ goto out_path;
+
+ trace_xrep_dirtree_create_adoption(sc, sc->ip, dl->nr_paths,
+ &dl->xname, &dl->pptr_rec);
+
+ error = xchk_dirpath_append(dl, sc->ip, path, &dl->xname,
+ &dl->pptr_rec);
+ if (error)
+ goto out_path;
+
+ path->first_step = xfarray_length(dl->path_steps) - 1;
+ path->second_step = XFARRAY_NULLIDX;
+ path->path_nr = dl->nr_paths;
+
+ list_add_tail(&path->list, &dl->path_list);
+ dl->nr_paths++;
+ return 0;
+
+out_path:
+ kfree(path);
+ return error;
+}
+
+/*
+ * Prepare to move a file to the orphanage by taking the IOLOCK of the
+ * orphanage and the child (scrub target). Caller must hold IOLOCK_EXCL on
+ * @sc->ip. Returns 0 if we took both locks, or a negative errno if we
+ * couldn't lock the orphanage in time.
+ */
+static inline int
+xrep_dirtree_adopt_iolock(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL);
+
+ if (xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+ return 0;
+
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+ do {
+ xrep_orphanage_ilock(sc, XFS_IOLOCK_EXCL);
+ if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+ break;
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+
+ if (xchk_should_terminate(sc, &error)) {
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+ return error;
+ }
+
+ delay(1);
+ } while (1);
+
+ return 0;
+}
+
+/*
+ * Reattach this orphaned directory to the orphanage. Do not call this with
+ * any resources held. Returns -ESTALE if the scan data have become out of
+ * date.
+ */
+STATIC int
+xrep_dirtree_adopt(
+ struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+ int error;
+
+ /* Take the IOLOCK of the orphanage and the scrub target. */
+ error = xrep_dirtree_adopt_iolock(sc);
+ if (error)
+ return error;
+
+ /*
+ * Set up for an adoption. The directory tree fixer runs after the
+ * link counts have been corrected. Therefore, we must bump the
+ * child's link count since there will be no further opportunity to fix
+ * errors.
+ */
+ error = xrep_adoption_trans_alloc(sc, &dl->adoption);
+ if (error)
+ goto out_iolock;
+ dl->adoption.bump_child_nlink = true;
+
+ /* Figure out what name we're going to use here. */
+ error = xrep_adoption_compute_name(&dl->adoption, &dl->xname);
+ if (error)
+ goto out_trans;
+
+ /*
+ * Now that we have a proposed name for the orphanage entry, create
+ * a faux path so that the live update hook will see it.
+ */
+ mutex_lock(&dl->lock);
+ if (dl->stale) {
+ mutex_unlock(&dl->lock);
+ error = -ESTALE;
+ goto out_trans;
+ }
+ error = xrep_dirtree_create_adoption_path(dl);
+ mutex_unlock(&dl->lock);
+ if (error)
+ goto out_trans;
+
+ /* Reparent the directory. */
+ error = xrep_adoption_move(&dl->adoption);
+ if (error)
+ goto out_trans;
+
+ /*
+ * Commit the name and release all inode locks except for the scrub
+ * target's IOLOCK.
+ */
+ error = xrep_trans_commit(sc);
+ goto out_ilock;
+
+out_trans:
+ xchk_trans_cancel(sc);
+out_ilock:
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+out_iolock:
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ return error;
+}
+
+/*
+ * This newly orphaned directory needs to be adopted by the orphanage.
+ * Make this happen.
+ */
+STATIC int
+xrep_dirtree_move_to_orphanage(
+ struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+ int error;
+
+ /*
+ * Start by dropping all the resources that we hold so that we can grab
+ * all the resources that we need for the adoption.
+ */
+ mutex_unlock(&dl->lock);
+ xchk_trans_cancel(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /* Perform the adoption. */
+ error = xrep_dirtree_adopt(dl);
+
+ /*
+ * Retake all the resources we had at the beginning even if the repair
+ * failed or the scan data are now stale. This keeps things simple for
+ * the caller.
+ */
+ xchk_trans_alloc_empty(sc);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ mutex_lock(&dl->lock);
+
+ if (!error && dl->stale)
+ error = -ESTALE;
+ return error;
+}
+
+/*
+ * Try to fix all the problems. Returns -ESTALE if the scan data have become
+ * out of date.
+ */
+STATIC int
+xrep_dirtree_fix_problems(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+ int error;
+
+ /* Delete all the paths we don't want. */
+ xchk_dirtree_for_each_path(dl, path) {
+ if (path->outcome != XCHK_DIRPATH_DELETE)
+ continue;
+
+ error = xrep_dirtree_delete_path(dl, path);
+ if (error)
+ return error;
+ }
+
+ /* Reparent this directory to the orphanage. */
+ if (oc->needs_adoption) {
+ if (xrep_orphanage_can_adopt(dl->sc))
+ return xrep_dirtree_move_to_orphanage(dl);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/* Fix directory loops involving this directory. */
+int
+xrep_dirtree(
+ struct xfs_scrub *sc)
+{
+ struct xchk_dirtree *dl = sc->buf;
+ struct xchk_dirtree_outcomes oc;
+ int error;
+
+ /*
+ * Prepare to fix the directory tree by retaking the scan lock. The
+ * order of resource acquisition is still IOLOCK -> transaction ->
+ * ILOCK -> scan lock.
+ */
+ mutex_lock(&dl->lock);
+ do {
+ /*
+ * Decide what we're going to do, then do it. An -ESTALE
+ * return here means the scan results are invalid and we have
+ * to walk again.
+ */
+ if (!dl->stale) {
+ xrep_dirtree_decide_fate(dl, &oc);
+
+ trace_xrep_dirtree_decided_fate(dl, &oc);
+
+ error = xrep_dirtree_fix_problems(dl, &oc);
+ if (!error || error != -ESTALE)
+ break;
+ }
+ error = xchk_dirtree_find_paths_to_root(dl);
+ if (error == -ELNRNG || error == -ENOSR)
+ error = -EFSCORRUPTED;
+ } while (!error);
+ mutex_unlock(&dl->lock);
+
+ return error;
+}
diff --git a/fs/xfs/scrub/findparent.c b/fs/xfs/scrub/findparent.c
new file mode 100644
index 000000000000..01766041ba2c
--- /dev/null
+++ b/fs/xfs/scrub/findparent.c
@@ -0,0 +1,454 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trans_space.h"
+#include "xfs_health.h"
+#include "xfs_exchmaps.h"
+#include "xfs_parent.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/findparent.h"
+#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+#include "scrub/listxattr.h"
+
+/*
+ * Finding the Parent of a Directory
+ * =================================
+ *
+ * Directories have parent pointers, in the sense that each directory contains
+ * a dotdot entry that points to the single allowed parent. The brute force
+ * way to find the parent of a given directory is to scan every directory in
+ * the filesystem looking for a child dirent that references this directory.
+ *
+ * This module wraps the process of scanning the directory tree. It requires
+ * that @sc->ip is the directory whose parent we want to find, and that the
+ * caller hold only the IOLOCK on that directory. The scan itself needs to
+ * take the ILOCK of each directory visited.
+ *
+ * Because we cannot hold @sc->ip's ILOCK during a scan of the whole fs, it is
+ * necessary to use dirent hook to update the parent scan results. Callers
+ * must not read the scan results without re-taking @sc->ip's ILOCK.
+ *
+ * There are a few shortcuts that we can take to avoid scanning the entire
+ * filesystem, such as noticing directory tree roots and querying the dentry
+ * cache for parent information.
+ */
+
+struct xrep_findparent_info {
+ /* The directory currently being scanned. */
+ struct xfs_inode *dp;
+
+ /*
+ * Scrub context. We're looking for a @dp containing a directory
+ * entry pointing to sc->ip->i_ino.
+ */
+ struct xfs_scrub *sc;
+
+ /* Optional scan information for a xrep_findparent_scan call. */
+ struct xrep_parent_scan_info *parent_scan;
+
+ /*
+ * Parent that we've found for sc->ip. If we're scanning the entire
+ * directory tree, we need this to ensure that we only find /one/
+ * parent directory.
+ */
+ xfs_ino_t found_parent;
+
+ /*
+ * This is set to true if @found_parent was not observed directly from
+ * the directory scan but by noticing a change in dotdot entries after
+ * cycling the sc->ip IOLOCK.
+ */
+ bool parent_tentative;
+};
+
+/*
+ * If this directory entry points to the scrub target inode, then the directory
+ * we're scanning is the parent of the scrub target inode.
+ */
+STATIC int
+xrep_findparent_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xrep_findparent_info *fpi = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(fpi->sc, &error))
+ return error;
+
+ if (ino != fpi->sc->ip->i_ino)
+ return 0;
+
+ /* Ignore garbage directory entry names. */
+ if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
+ return -EFSCORRUPTED;
+
+ /*
+ * Ignore dotdot and dot entries -- we're looking for parent -> child
+ * links only.
+ */
+ if (name->name[0] == '.' && (name->len == 1 ||
+ (name->len == 2 && name->name[1] == '.')))
+ return 0;
+
+ /* Uhoh, more than one parent for a dir? */
+ if (fpi->found_parent != NULLFSINO &&
+ !(fpi->parent_tentative && fpi->found_parent == fpi->dp->i_ino)) {
+ trace_xrep_findparent_dirent(fpi->sc->ip, 0);
+ return -EFSCORRUPTED;
+ }
+
+ /* We found a potential parent; remember this. */
+ trace_xrep_findparent_dirent(fpi->sc->ip, fpi->dp->i_ino);
+ fpi->found_parent = fpi->dp->i_ino;
+ fpi->parent_tentative = false;
+
+ if (fpi->parent_scan)
+ xrep_findparent_scan_found(fpi->parent_scan, fpi->dp->i_ino);
+
+ return 0;
+}
+
+/*
+ * If this is a directory, walk the dirents looking for any that point to the
+ * scrub target inode.
+ */
+STATIC int
+xrep_findparent_walk_directory(
+ struct xrep_findparent_info *fpi)
+{
+ struct xfs_scrub *sc = fpi->sc;
+ struct xfs_inode *dp = fpi->dp;
+ unsigned int lock_mode;
+ int error = 0;
+
+ /*
+ * The inode being scanned cannot be its own parent, nor can any
+ * temporary directory we created to stage this repair.
+ */
+ if (dp == sc->ip || dp == sc->tempip)
+ return 0;
+
+ /*
+ * Similarly, temporary files created to stage a repair cannot be the
+ * parent of this inode.
+ */
+ if (xrep_is_tempfile(dp))
+ return 0;
+
+ /*
+ * Scan the directory to see if there it contains an entry pointing to
+ * the directory that we are repairing.
+ */
+ lock_mode = xfs_ilock_data_map_shared(dp);
+
+ /*
+ * If this directory is known to be sick, we cannot scan it reliably
+ * and must abort.
+ */
+ if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
+ XFS_SICK_INO_BMBTD |
+ XFS_SICK_INO_DIR)) {
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ /*
+ * We cannot complete our parent pointer scan if a directory looks as
+ * though it has been zapped by the inode record repair code.
+ */
+ if (xchk_dir_looks_zapped(dp)) {
+ error = -EBUSY;
+ goto out_unlock;
+ }
+
+ error = xchk_dir_walk(sc, dp, xrep_findparent_dirent, fpi);
+ if (error)
+ goto out_unlock;
+
+out_unlock:
+ xfs_iunlock(dp, lock_mode);
+ return error;
+}
+
+/*
+ * Update this directory's dotdot pointer based on ongoing dirent updates.
+ */
+STATIC int
+xrep_findparent_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xrep_parent_scan_info *pscan;
+ struct xfs_scrub *sc;
+
+ pscan = container_of(nb, struct xrep_parent_scan_info,
+ dhook.dirent_hook.nb);
+ sc = pscan->sc;
+
+ /*
+ * If @p->ip is the subdirectory that we're interested in and we've
+ * already scanned @p->dp, update the dotdot target inumber to the
+ * parent inode.
+ */
+ if (p->ip->i_ino == sc->ip->i_ino &&
+ xchk_iscan_want_live_update(&pscan->iscan, p->dp->i_ino)) {
+ if (p->delta > 0) {
+ xrep_findparent_scan_found(pscan, p->dp->i_ino);
+ } else {
+ xrep_findparent_scan_found(pscan, NULLFSINO);
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+/*
+ * Set up a scan to find the parent of a directory. The provided dirent hook
+ * will be called when there is a dotdot update for the inode being repaired.
+ */
+int
+__xrep_findparent_scan_start(
+ struct xfs_scrub *sc,
+ struct xrep_parent_scan_info *pscan,
+ notifier_fn_t custom_fn)
+{
+ int error;
+
+ if (!(sc->flags & XCHK_FSGATES_DIRENTS)) {
+ ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+ return -EINVAL;
+ }
+
+ pscan->sc = sc;
+ pscan->parent_ino = NULLFSINO;
+
+ mutex_init(&pscan->lock);
+
+ xchk_iscan_start(sc, 30000, 100, &pscan->iscan);
+
+ /*
+ * Hook into the dirent update code. The hook only operates on inodes
+ * that were already scanned, and the scanner thread takes each inode's
+ * ILOCK, which means that any in-progress inode updates will finish
+ * before we can scan the inode.
+ */
+ if (custom_fn)
+ xfs_dir_hook_setup(&pscan->dhook, custom_fn);
+ else
+ xfs_dir_hook_setup(&pscan->dhook, xrep_findparent_live_update);
+ error = xfs_dir_hook_add(sc->mp, &pscan->dhook);
+ if (error)
+ goto out_iscan;
+
+ return 0;
+out_iscan:
+ xchk_iscan_teardown(&pscan->iscan);
+ mutex_destroy(&pscan->lock);
+ return error;
+}
+
+/*
+ * Scan the entire filesystem looking for a parent inode for the inode being
+ * scrubbed. @sc->ip must not be the root of a directory tree. Callers must
+ * not hold a dirty transaction or any lock that would interfere with taking
+ * an ILOCK.
+ *
+ * Returns 0 with @pscan->parent_ino set to the parent that we found.
+ * Returns 0 with @pscan->parent_ino set to NULLFSINO if we found no parents.
+ * Returns the usual negative errno if something else happened.
+ */
+int
+xrep_findparent_scan(
+ struct xrep_parent_scan_info *pscan)
+{
+ struct xrep_findparent_info fpi = {
+ .sc = pscan->sc,
+ .found_parent = NULLFSINO,
+ .parent_scan = pscan,
+ };
+ struct xfs_scrub *sc = pscan->sc;
+ int ret;
+
+ ASSERT(S_ISDIR(VFS_IC(sc->ip)->i_mode));
+
+ while ((ret = xchk_iscan_iter(&pscan->iscan, &fpi.dp)) == 1) {
+ if (S_ISDIR(VFS_I(fpi.dp)->i_mode))
+ ret = xrep_findparent_walk_directory(&fpi);
+ else
+ ret = 0;
+ xchk_iscan_mark_visited(&pscan->iscan, fpi.dp);
+ xchk_irele(sc, fpi.dp);
+ if (ret)
+ break;
+
+ if (xchk_should_terminate(sc, &ret))
+ break;
+ }
+ xchk_iscan_iter_finish(&pscan->iscan);
+
+ return ret;
+}
+
+/* Tear down a parent scan. */
+void
+xrep_findparent_scan_teardown(
+ struct xrep_parent_scan_info *pscan)
+{
+ xfs_dir_hook_del(pscan->sc->mp, &pscan->dhook);
+ xchk_iscan_teardown(&pscan->iscan);
+ mutex_destroy(&pscan->lock);
+}
+
+/* Finish a parent scan early. */
+void
+xrep_findparent_scan_finish_early(
+ struct xrep_parent_scan_info *pscan,
+ xfs_ino_t ino)
+{
+ xrep_findparent_scan_found(pscan, ino);
+ xchk_iscan_finish_early(&pscan->iscan);
+}
+
+/*
+ * Confirm that the directory @parent_ino actually contains a directory entry
+ * pointing to the child @sc->ip->ino. This function returns one of several
+ * ways:
+ *
+ * Returns 0 with @parent_ino unchanged if the parent was confirmed.
+ * Returns 0 with @parent_ino set to NULLFSINO if the parent was not valid.
+ * Returns the usual negative errno if something else happened.
+ */
+int
+xrep_findparent_confirm(
+ struct xfs_scrub *sc,
+ xfs_ino_t *parent_ino)
+{
+ struct xrep_findparent_info fpi = {
+ .sc = sc,
+ .found_parent = NULLFSINO,
+ };
+ int error;
+
+ /*
+ * The root directory always points to itself. Unlinked dirs can point
+ * anywhere, so we point them at the root dir too.
+ */
+ if (sc->ip == sc->mp->m_rootip || VFS_I(sc->ip)->i_nlink == 0) {
+ *parent_ino = sc->mp->m_sb.sb_rootino;
+ return 0;
+ }
+
+ /* Reject garbage parent inode numbers and self-referential parents. */
+ if (*parent_ino == NULLFSINO)
+ return 0;
+ if (!xfs_verify_dir_ino(sc->mp, *parent_ino) ||
+ *parent_ino == sc->ip->i_ino) {
+ *parent_ino = NULLFSINO;
+ return 0;
+ }
+
+ error = xchk_iget(sc, *parent_ino, &fpi.dp);
+ if (error)
+ return error;
+
+ if (!S_ISDIR(VFS_I(fpi.dp)->i_mode)) {
+ *parent_ino = NULLFSINO;
+ goto out_rele;
+ }
+
+ error = xrep_findparent_walk_directory(&fpi);
+ if (error)
+ goto out_rele;
+
+ *parent_ino = fpi.found_parent;
+out_rele:
+ xchk_irele(sc, fpi.dp);
+ return error;
+}
+
+/*
+ * If we're the root of a directory tree, we are our own parent. If we're an
+ * unlinked directory, the parent /won't/ have a link to us. Set the parent
+ * directory to the root for both cases. Returns NULLFSINO if we don't know
+ * what to do.
+ */
+xfs_ino_t
+xrep_findparent_self_reference(
+ struct xfs_scrub *sc)
+{
+ if (sc->ip->i_ino == sc->mp->m_sb.sb_rootino)
+ return sc->mp->m_sb.sb_rootino;
+
+ if (VFS_I(sc->ip)->i_nlink == 0)
+ return sc->mp->m_sb.sb_rootino;
+
+ return NULLFSINO;
+}
+
+/* Check the dentry cache to see if knows of a parent for the scrub target. */
+xfs_ino_t
+xrep_findparent_from_dcache(
+ struct xfs_scrub *sc)
+{
+ struct inode *pip = NULL;
+ struct dentry *dentry, *parent;
+ xfs_ino_t ret = NULLFSINO;
+
+ dentry = d_find_alias(VFS_I(sc->ip));
+ if (!dentry)
+ goto out;
+
+ parent = dget_parent(dentry);
+ if (!parent)
+ goto out_dput;
+
+ ASSERT(parent->d_sb == sc->ip->i_mount->m_super);
+
+ pip = igrab(d_inode(parent));
+ dput(parent);
+
+ if (S_ISDIR(pip->i_mode)) {
+ trace_xrep_findparent_from_dcache(sc->ip, XFS_I(pip)->i_ino);
+ ret = XFS_I(pip)->i_ino;
+ }
+
+ xchk_irele(sc, XFS_I(pip));
+
+out_dput:
+ dput(dentry);
+out:
+ return ret;
+}
diff --git a/fs/xfs/scrub/findparent.h b/fs/xfs/scrub/findparent.h
new file mode 100644
index 000000000000..d998c7a88152
--- /dev/null
+++ b/fs/xfs/scrub/findparent.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_FINDPARENT_H__
+#define __XFS_SCRUB_FINDPARENT_H__
+
+struct xrep_parent_scan_info {
+ struct xfs_scrub *sc;
+
+ /* Inode scan cursor. */
+ struct xchk_iscan iscan;
+
+ /* Hook to capture directory entry updates. */
+ struct xfs_dir_hook dhook;
+
+ /* Lock protecting parent_ino. */
+ struct mutex lock;
+
+ /* Parent inode that we've found. */
+ xfs_ino_t parent_ino;
+
+ bool lookup_parent;
+};
+
+int __xrep_findparent_scan_start(struct xfs_scrub *sc,
+ struct xrep_parent_scan_info *pscan,
+ notifier_fn_t custom_fn);
+static inline int xrep_findparent_scan_start(struct xfs_scrub *sc,
+ struct xrep_parent_scan_info *pscan)
+{
+ return __xrep_findparent_scan_start(sc, pscan, NULL);
+}
+int xrep_findparent_scan(struct xrep_parent_scan_info *pscan);
+void xrep_findparent_scan_teardown(struct xrep_parent_scan_info *pscan);
+
+static inline void
+xrep_findparent_scan_found(
+ struct xrep_parent_scan_info *pscan,
+ xfs_ino_t ino)
+{
+ mutex_lock(&pscan->lock);
+ pscan->parent_ino = ino;
+ mutex_unlock(&pscan->lock);
+}
+
+void xrep_findparent_scan_finish_early(struct xrep_parent_scan_info *pscan,
+ xfs_ino_t ino);
+
+int xrep_findparent_confirm(struct xfs_scrub *sc, xfs_ino_t *parent_ino);
+
+xfs_ino_t xrep_findparent_self_reference(struct xfs_scrub *sc);
+xfs_ino_t xrep_findparent_from_dcache(struct xfs_scrub *sc);
+
+#endif /* __XFS_SCRUB_FINDPARENT_H__ */
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index d310737c8823..1d3e98346933 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -85,7 +85,7 @@ xchk_fscount_warmup(
continue;
/* Lock both AG headers. */
- error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
+ error = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp);
if (error)
break;
error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
@@ -412,10 +412,11 @@ xchk_fscount_count_frextents(
int error;
fsc->frextents = 0;
+ fsc->frextents_delayed = 0;
if (!xfs_has_realtime(mp))
return 0;
- xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xfs_rtbitmap_lock_shared(sc->mp, XFS_RBMLOCK_BITMAP);
error = xfs_rtalloc_query_all(sc->mp, sc->tp,
xchk_fscount_add_frextent, fsc);
if (error) {
@@ -423,8 +424,10 @@ xchk_fscount_count_frextents(
goto out_unlock;
}
+ fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents);
+
out_unlock:
- xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xfs_rtbitmap_unlock_shared(sc->mp, XFS_RBMLOCK_BITMAP);
return error;
}
#else
@@ -434,6 +437,7 @@ xchk_fscount_count_frextents(
struct xchk_fscounters *fsc)
{
fsc->frextents = 0;
+ fsc->frextents_delayed = 0;
return 0;
}
#endif /* CONFIG_XFS_RT */
@@ -517,7 +521,7 @@ xchk_fscounters(
/*
* If the filesystem is not frozen, the counter summation calls above
- * can race with xfs_mod_freecounter, which subtracts a requested space
+ * can race with xfs_dec_freecounter, which subtracts a requested space
* reservation from the counter and undoes the subtraction if that made
* the counter go negative. Therefore, it's possible to see negative
* values here, and we should only flag that as a corruption if we
@@ -593,7 +597,7 @@ xchk_fscounters(
}
if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
- fsc->frextents)) {
+ fsc->frextents - fsc->frextents_delayed)) {
if (fsc->frozen)
xchk_set_corrupt(sc);
else
diff --git a/fs/xfs/scrub/fscounters.h b/fs/xfs/scrub/fscounters.h
index 461a13d25f4b..bcf56e1c36f9 100644
--- a/fs/xfs/scrub/fscounters.h
+++ b/fs/xfs/scrub/fscounters.h
@@ -12,6 +12,7 @@ struct xchk_fscounters {
uint64_t ifree;
uint64_t fdblocks;
uint64_t frextents;
+ uint64_t frextents_delayed;
unsigned long long icount_min;
unsigned long long icount_max;
bool frozen;
diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c
index 94cdb852bee4..469bf645dbea 100644
--- a/fs/xfs/scrub/fscounters_repair.c
+++ b/fs/xfs/scrub/fscounters_repair.c
@@ -65,7 +65,17 @@ xrep_fscounters(
percpu_counter_set(&mp->m_icount, fsc->icount);
percpu_counter_set(&mp->m_ifree, fsc->ifree);
percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks);
- percpu_counter_set(&mp->m_frextents, fsc->frextents);
+
+ /*
+ * Online repair is only supported on v5 file systems, which require
+ * lazy sb counters and thus no update of sb_fdblocks here. But as of
+ * now we don't support lazy counting sb_frextents yet, and thus need
+ * to also update it directly here. And for that we need to keep
+ * track of the delalloc reservations separately, as they are are
+ * subtracted from m_frextents, but not included in sb_frextents.
+ */
+ percpu_counter_set(&mp->m_frextents,
+ fsc->frextents - fsc->frextents_delayed);
mp->m_sb.sb_frextents = fsc->frextents;
return 0;
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 9020a6bef7f1..b712a8bd34f5 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -108,6 +108,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS },
[XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK },
[XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS },
+ [XFS_SCRUB_TYPE_DIRTREE] = { XHG_INO, XFS_SICK_INO_DIRTREE },
};
/* Return the health status mask for this scrub type. */
diff --git a/fs/xfs/scrub/ino_bitmap.h b/fs/xfs/scrub/ino_bitmap.h
new file mode 100644
index 000000000000..1300833679ab
--- /dev/null
+++ b/fs/xfs/scrub/ino_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_INO_BITMAP_H__
+#define __XFS_SCRUB_INO_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_ino_t */
+
+struct xino_bitmap {
+ struct xbitmap64 inobitmap;
+};
+
+static inline void xino_bitmap_init(struct xino_bitmap *bitmap)
+{
+ xbitmap64_init(&bitmap->inobitmap);
+}
+
+static inline void xino_bitmap_destroy(struct xino_bitmap *bitmap)
+{
+ xbitmap64_destroy(&bitmap->inobitmap);
+}
+
+static inline int xino_bitmap_set(struct xino_bitmap *bitmap, xfs_ino_t ino)
+{
+ return xbitmap64_set(&bitmap->inobitmap, ino, 1);
+}
+
+static inline int xino_bitmap_test(struct xino_bitmap *bitmap, xfs_ino_t ino)
+{
+ uint64_t len = 1;
+
+ return xbitmap64_test(&bitmap->inobitmap, ino, &len);
+}
+
+#endif /* __XFS_SCRUB_INO_BITMAP_H__ */
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 6e2fe2d6250b..d32716fb2fec 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -739,6 +739,23 @@ xchk_inode_check_reflink_iflag(
xchk_ino_set_corrupt(sc, ino);
}
+/*
+ * If this inode has zero link count, it must be on the unlinked list. If
+ * it has nonzero link count, it must not be on the unlinked list.
+ */
+STATIC void
+xchk_inode_check_unlinked(
+ struct xfs_scrub *sc)
+{
+ if (VFS_I(sc->ip)->i_nlink == 0) {
+ if (!xfs_inode_on_unlinked_list(sc->ip))
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ } else {
+ if (xfs_inode_on_unlinked_list(sc->ip))
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ }
+}
+
/* Scrub an inode. */
int
xchk_inode(
@@ -771,6 +788,8 @@ xchk_inode(
if (S_ISREG(VFS_I(sc->ip)->i_mode))
xchk_inode_check_reflink_iflag(sc, sc->ip->i_ino);
+ xchk_inode_check_unlinked(sc);
+
xchk_inode_xref(sc, sc->ip->i_ino, &di);
out:
return error;
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index eab380e95ef4..daf9f1ee7c2c 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -46,6 +46,7 @@
#include "scrub/repair.h"
#include "scrub/iscan.h"
#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
/*
* Inode Record Repair
@@ -282,6 +283,51 @@ xrep_dinode_findmode_dirent(
return 0;
}
+/* Try to lock a directory, or wait a jiffy. */
+static inline int
+xrep_dinode_ilock_nowait(
+ struct xfs_inode *dp,
+ unsigned int lock_mode)
+{
+ if (xfs_ilock_nowait(dp, lock_mode))
+ return true;
+
+ schedule_timeout_killable(1);
+ return false;
+}
+
+/*
+ * Try to lock a directory to look for ftype hints. Since we already hold the
+ * AGI buffer, we cannot block waiting for the ILOCK because rename can take
+ * the ILOCK and then try to lock AGIs.
+ */
+STATIC int
+xrep_dinode_trylock_directory(
+ struct xrep_inode *ri,
+ struct xfs_inode *dp,
+ unsigned int *lock_modep)
+{
+ unsigned long deadline = jiffies + msecs_to_jiffies(30000);
+ unsigned int lock_mode;
+ int error = 0;
+
+ do {
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ if (xfs_need_iread_extents(&dp->i_df))
+ lock_mode = XFS_ILOCK_EXCL;
+ else
+ lock_mode = XFS_ILOCK_SHARED;
+
+ if (xrep_dinode_ilock_nowait(dp, lock_mode)) {
+ *lock_modep = lock_mode;
+ return 0;
+ }
+ } while (!time_is_before_jiffies(deadline));
+ return -EBUSY;
+}
+
/*
* If this is a directory, walk the dirents looking for any that point to the
* scrub target inode.
@@ -295,11 +341,17 @@ xrep_dinode_findmode_walk_directory(
unsigned int lock_mode;
int error = 0;
+ /* Ignore temporary repair directories. */
+ if (xrep_is_tempfile(dp))
+ return 0;
+
/*
* Scan the directory to see if there it contains an entry pointing to
* the directory that we are repairing.
*/
- lock_mode = xfs_ilock_data_map_shared(dp);
+ error = xrep_dinode_trylock_directory(ri, dp, &lock_mode);
+ if (error)
+ return error;
/*
* If this directory is known to be sick, we cannot scan it reliably
@@ -356,6 +408,7 @@ xrep_dinode_find_mode(
* so there's a real possibility that _iscan_iter can return EBUSY.
*/
xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan);
+ xchk_iscan_set_agi_trylock(&ri->ftype_iscan);
ri->ftype_iscan.skip_ino = sc->sm->sm_ino;
ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN;
while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) {
@@ -463,6 +516,17 @@ xrep_dinode_mode(
return 0;
}
+/* Fix unused link count fields having nonzero values. */
+STATIC void
+xrep_dinode_nlinks(
+ struct xfs_dinode *dip)
+{
+ if (dip->di_version > 1)
+ dip->di_onlink = 0;
+ else
+ dip->di_nlink = 0;
+}
+
/* Fix any conflicting flags that the verifiers complain about. */
STATIC void
xrep_dinode_flags(
@@ -1324,6 +1388,7 @@ xrep_dinode_core(
iget_error = xrep_dinode_mode(ri, dip);
if (iget_error)
goto write;
+ xrep_dinode_nlinks(dip);
xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
xrep_dinode_size(ri, dip);
xrep_dinode_extsize_hints(sc, dip);
@@ -1671,6 +1736,44 @@ xrep_inode_extsize(
}
}
+/* Ensure this file has an attr fork if it needs to hold a parent pointer. */
+STATIC int
+xrep_inode_pptr(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ struct inode *inode = VFS_I(ip);
+
+ if (!xfs_has_parent(mp))
+ return 0;
+
+ /*
+ * Unlinked inodes that cannot be added to the directory tree will not
+ * have a parent pointer.
+ */
+ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ return 0;
+
+ /* The root directory doesn't have a parent pointer. */
+ if (ip == mp->m_rootip)
+ return 0;
+
+ /*
+ * Metadata inodes are rooted in the superblock and do not have any
+ * parents.
+ */
+ if (xfs_is_metadata_inode(ip))
+ return 0;
+
+ /* Inode already has an attr fork; no further work possible here. */
+ if (xfs_inode_has_attr_fork(ip))
+ return 0;
+
+ return xfs_bmap_add_attrfork(sc->tp, ip,
+ sizeof(struct xfs_attr_sf_hdr), true);
+}
+
/* Fix any irregularities in an inode that the verifiers don't catch. */
STATIC int
xrep_inode_problems(
@@ -1681,6 +1784,9 @@ xrep_inode_problems(
error = xrep_inode_blockcounts(sc);
if (error)
return error;
+ error = xrep_inode_pptr(sc);
+ if (error)
+ return error;
xrep_inode_timestamps(sc->ip);
xrep_inode_flags(sc);
xrep_inode_ids(sc);
@@ -1697,6 +1803,46 @@ xrep_inode_problems(
return xrep_roll_trans(sc);
}
+/*
+ * Make sure this inode's unlinked list pointers are consistent with its
+ * link count.
+ */
+STATIC int
+xrep_inode_unlinked(
+ struct xfs_scrub *sc)
+{
+ unsigned int nlink = VFS_I(sc->ip)->i_nlink;
+ int error;
+
+ /*
+ * If this inode is linked from the directory tree and on the unlinked
+ * list, remove it from the unlinked list.
+ */
+ if (nlink > 0 && xfs_inode_on_unlinked_list(sc->ip)) {
+ struct xfs_perag *pag;
+ int error;
+
+ pag = xfs_perag_get(sc->mp,
+ XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino));
+ error = xfs_iunlink_remove(sc->tp, pag, sc->ip);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If this inode is not linked from the directory tree yet not on the
+ * unlinked list, put it on the unlinked list.
+ */
+ if (nlink == 0 && !xfs_inode_on_unlinked_list(sc->ip)) {
+ error = xfs_iunlink(sc->tp, sc->ip);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
/* Repair an inode's fields. */
int
xrep_inode(
@@ -1746,5 +1892,10 @@ xrep_inode(
return error;
}
+ /* Reconnect incore unlinked list */
+ error = xrep_inode_unlinked(sc);
+ if (error)
+ return error;
+
return xrep_defer_finish(sc);
}
diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c
index ec3478bc505e..cf9d983667ce 100644
--- a/fs/xfs/scrub/iscan.c
+++ b/fs/xfs/scrub/iscan.c
@@ -243,6 +243,51 @@ xchk_iscan_finish(
mutex_unlock(&iscan->lock);
}
+/* Mark an inode scan finished before we actually scan anything. */
+void
+xchk_iscan_finish_early(
+ struct xchk_iscan *iscan)
+{
+ ASSERT(iscan->cursor_ino == iscan->scan_start_ino);
+ ASSERT(iscan->__visited_ino == iscan->scan_start_ino);
+
+ xchk_iscan_finish(iscan);
+}
+
+/*
+ * Grab the AGI to advance the inode scan. Returns 0 if *agi_bpp is now set,
+ * -ECANCELED if the live scan aborted, -EBUSY if the AGI could not be grabbed,
+ * or the usual negative errno.
+ */
+STATIC int
+xchk_iscan_read_agi(
+ struct xchk_iscan *iscan,
+ struct xfs_perag *pag,
+ struct xfs_buf **agi_bpp)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ unsigned long relax;
+ int ret;
+
+ if (!xchk_iscan_agi_needs_trylock(iscan))
+ return xfs_ialloc_read_agi(pag, sc->tp, 0, agi_bpp);
+
+ relax = msecs_to_jiffies(iscan->iget_retry_delay);
+ do {
+ ret = xfs_ialloc_read_agi(pag, sc->tp, XFS_IALLOC_FLAG_TRYLOCK,
+ agi_bpp);
+ if (ret != -EAGAIN)
+ return ret;
+ if (!iscan->iget_timeout ||
+ time_is_before_jiffies(iscan->__iget_deadline))
+ return -EBUSY;
+
+ trace_xchk_iscan_agi_retry_wait(iscan);
+ } while (!schedule_timeout_killable(relax) &&
+ !xchk_iscan_aborted(iscan));
+ return -ECANCELED;
+}
+
/*
* Advance ino to the next inode that the inobt thinks is allocated, being
* careful to jump to the next AG if we've reached the right end of this AG's
@@ -281,7 +326,7 @@ xchk_iscan_advance(
if (!pag)
return -ECANCELED;
- ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
+ ret = xchk_iscan_read_agi(iscan, pag, &agi_bp);
if (ret)
goto out_pag;
@@ -363,6 +408,15 @@ xchk_iscan_iget_retry(
}
/*
+ * For an inode scan, we hold the AGI and want to try to grab a batch of
+ * inodes. Holding the AGI prevents inodegc from clearing freed inodes,
+ * so we must use noretry here. For every inode after the first one in the
+ * batch, we don't want to wait, so we use retry there too. Finally, use
+ * dontcache to avoid polluting the cache.
+ */
+#define ISCAN_IGET_FLAGS (XFS_IGET_NORETRY | XFS_IGET_DONTCACHE)
+
+/*
* Grab an inode as part of an inode scan. While scanning this inode, the
* caller must ensure that no other threads can modify the inode until a call
* to xchk_iscan_visit succeeds.
@@ -389,7 +443,7 @@ xchk_iscan_iget(
ASSERT(iscan->__inodes[0] == NULL);
/* Fill the first slot in the inode array. */
- error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+ error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0,
&iscan->__inodes[idx]);
trace_xchk_iscan_iget(iscan, error);
@@ -402,8 +456,13 @@ xchk_iscan_iget(
* It's possible that this inode has lost all of its links but
* hasn't yet been inactivated. If we don't have a transaction
* or it's not writable, flush the inodegc workers and wait.
+ * If we have a non-empty transaction, we must not block on
+ * inodegc, which allocates its own transactions.
*/
- xfs_inodegc_flush(mp);
+ if (sc->tp && !(sc->tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
+ xfs_inodegc_push(mp);
+ else
+ xfs_inodegc_flush(mp);
return xchk_iscan_iget_retry(iscan, true);
}
@@ -457,7 +516,7 @@ xchk_iscan_iget(
ASSERT(iscan->__inodes[idx] == NULL);
- error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+ error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0,
&iscan->__inodes[idx]);
if (error)
break;
diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h
index 71f657552dfa..f9f47fa01a9e 100644
--- a/fs/xfs/scrub/iscan.h
+++ b/fs/xfs/scrub/iscan.h
@@ -59,6 +59,9 @@ struct xchk_iscan {
/* Set if the scan has been aborted due to some event in the fs. */
#define XCHK_ISCAN_OPSTATE_ABORTED (1)
+/* Use trylock to acquire the AGI */
+#define XCHK_ISCAN_OPSTATE_TRYLOCK_AGI (2)
+
static inline bool
xchk_iscan_aborted(const struct xchk_iscan *iscan)
{
@@ -71,8 +74,21 @@ xchk_iscan_abort(struct xchk_iscan *iscan)
set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
}
+static inline bool
+xchk_iscan_agi_needs_trylock(const struct xchk_iscan *iscan)
+{
+ return test_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate);
+}
+
+static inline void
+xchk_iscan_set_agi_trylock(struct xchk_iscan *iscan)
+{
+ set_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate);
+}
+
void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout,
unsigned int iget_retry_delay, struct xchk_iscan *iscan);
+void xchk_iscan_finish_early(struct xchk_iscan *iscan);
void xchk_iscan_teardown(struct xchk_iscan *iscan);
int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp);
diff --git a/fs/xfs/scrub/listxattr.c b/fs/xfs/scrub/listxattr.c
new file mode 100644
index 000000000000..256ff7700c94
--- /dev/null
+++ b/fs/xfs/scrub/listxattr.c
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_sf.h"
+#include "xfs_trans.h"
+#include "scrub/scrub.h"
+#include "scrub/bitmap.h"
+#include "scrub/dab_bitmap.h"
+#include "scrub/listxattr.h"
+
+/* Call a function for every entry in a shortform xattr structure. */
+STATIC int
+xchk_xattr_walk_sf(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn,
+ void *priv)
+{
+ struct xfs_attr_sf_hdr *hdr = ip->i_af.if_data;
+ struct xfs_attr_sf_entry *sfe;
+ unsigned int i;
+ int error;
+
+ sfe = xfs_attr_sf_firstentry(hdr);
+ for (i = 0; i < hdr->count; i++) {
+ error = attr_fn(sc, ip, sfe->flags, sfe->nameval, sfe->namelen,
+ &sfe->nameval[sfe->namelen], sfe->valuelen,
+ priv);
+ if (error)
+ return error;
+
+ sfe = xfs_attr_sf_nextentry(sfe);
+ }
+
+ return 0;
+}
+
+/* Call a function for every entry in this xattr leaf block. */
+STATIC int
+xchk_xattr_walk_leaf_entries(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn,
+ struct xfs_buf *bp,
+ void *priv)
+{
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_attr_leaf_entry *entry;
+ unsigned int i;
+ int error;
+
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
+ entry = xfs_attr3_leaf_entryp(leaf);
+
+ for (i = 0; i < ichdr.count; entry++, i++) {
+ void *value;
+ unsigned char *name;
+ unsigned int namelen, valuelen;
+
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ struct xfs_attr_leaf_name_local *name_loc;
+
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
+ name = name_loc->nameval;
+ namelen = name_loc->namelen;
+ value = &name_loc->nameval[name_loc->namelen];
+ valuelen = be16_to_cpu(name_loc->valuelen);
+ } else {
+ struct xfs_attr_leaf_name_remote *name_rmt;
+
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+ name = name_rmt->name;
+ namelen = name_rmt->namelen;
+ value = NULL;
+ valuelen = be32_to_cpu(name_rmt->valuelen);
+ }
+
+ error = attr_fn(sc, ip, entry->flags, name, namelen, value,
+ valuelen, priv);
+ if (error)
+ return error;
+
+ }
+
+ return 0;
+}
+
+/*
+ * Call a function for every entry in a leaf-format xattr structure. Avoid
+ * memory allocations for the loop detector since there's only one block.
+ */
+STATIC int
+xchk_xattr_walk_leaf(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn,
+ void *priv)
+{
+ struct xfs_buf *leaf_bp;
+ int error;
+
+ error = xfs_attr3_leaf_read(sc->tp, ip, ip->i_ino, 0, &leaf_bp);
+ if (error)
+ return error;
+
+ error = xchk_xattr_walk_leaf_entries(sc, ip, attr_fn, leaf_bp, priv);
+ xfs_trans_brelse(sc->tp, leaf_bp);
+ return error;
+}
+
+/* Find the leftmost leaf in the xattr dabtree. */
+STATIC int
+xchk_xattr_find_leftmost_leaf(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ struct xdab_bitmap *seen_dablks,
+ struct xfs_buf **leaf_bpp)
+{
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = sc->tp;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_buf *bp;
+ xfs_failaddr_t fa;
+ xfs_dablk_t blkno = 0;
+ unsigned int expected_level = 0;
+ int error;
+
+ for (;;) {
+ xfs_extlen_t len = 1;
+ uint16_t magic;
+
+ /* Make sure we haven't seen this new block already. */
+ if (xdab_bitmap_test(seen_dablks, blkno, &len))
+ return -EFSCORRUPTED;
+
+ error = xfs_da3_node_read(tp, ip, blkno, &bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ node = bp->b_addr;
+ magic = be16_to_cpu(node->hdr.info.magic);
+ if (magic == XFS_ATTR_LEAF_MAGIC ||
+ magic == XFS_ATTR3_LEAF_MAGIC)
+ break;
+
+ error = -EFSCORRUPTED;
+ if (magic != XFS_DA_NODE_MAGIC &&
+ magic != XFS_DA3_NODE_MAGIC)
+ goto out_buf;
+
+ fa = xfs_da3_node_header_check(bp, ip->i_ino);
+ if (fa)
+ goto out_buf;
+
+ xfs_da3_node_hdr_from_disk(mp, &nodehdr, node);
+
+ if (nodehdr.count == 0 || nodehdr.level >= XFS_DA_NODE_MAXDEPTH)
+ goto out_buf;
+
+ /* Check the level from the root node. */
+ if (blkno == 0)
+ expected_level = nodehdr.level - 1;
+ else if (expected_level != nodehdr.level)
+ goto out_buf;
+ else
+ expected_level--;
+
+ /* Remember that we've seen this node. */
+ error = xdab_bitmap_set(seen_dablks, blkno, 1);
+ if (error)
+ goto out_buf;
+
+ /* Find the next level towards the leaves of the dabtree. */
+ btree = nodehdr.btree;
+ blkno = be32_to_cpu(btree->before);
+ xfs_trans_brelse(tp, bp);
+ }
+
+ error = -EFSCORRUPTED;
+ fa = xfs_attr3_leaf_header_check(bp, ip->i_ino);
+ if (fa)
+ goto out_buf;
+
+ if (expected_level != 0)
+ goto out_buf;
+
+ /* Remember that we've seen this leaf. */
+ error = xdab_bitmap_set(seen_dablks, blkno, 1);
+ if (error)
+ goto out_buf;
+
+ *leaf_bpp = bp;
+ return 0;
+
+out_buf:
+ xfs_trans_brelse(tp, bp);
+ return error;
+}
+
+/* Call a function for every entry in a node-format xattr structure. */
+STATIC int
+xchk_xattr_walk_node(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn,
+ xchk_xattrleaf_fn leaf_fn,
+ void *priv)
+{
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct xdab_bitmap seen_dablks;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_buf *leaf_bp;
+ int error;
+
+ xdab_bitmap_init(&seen_dablks);
+
+ error = xchk_xattr_find_leftmost_leaf(sc, ip, &seen_dablks, &leaf_bp);
+ if (error)
+ goto out_bitmap;
+
+ for (;;) {
+ xfs_extlen_t len;
+
+ error = xchk_xattr_walk_leaf_entries(sc, ip, attr_fn, leaf_bp,
+ priv);
+ if (error)
+ goto out_leaf;
+
+ /* Find the right sibling of this leaf block. */
+ leaf = leaf_bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+ if (leafhdr.forw == 0)
+ goto out_leaf;
+
+ xfs_trans_brelse(sc->tp, leaf_bp);
+
+ if (leaf_fn) {
+ error = leaf_fn(sc, priv);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Make sure we haven't seen this new leaf already. */
+ len = 1;
+ if (xdab_bitmap_test(&seen_dablks, leafhdr.forw, &len)) {
+ error = -EFSCORRUPTED;
+ goto out_bitmap;
+ }
+
+ error = xfs_attr3_leaf_read(sc->tp, ip, ip->i_ino,
+ leafhdr.forw, &leaf_bp);
+ if (error)
+ goto out_bitmap;
+
+ /* Remember that we've seen this new leaf. */
+ error = xdab_bitmap_set(&seen_dablks, leafhdr.forw, 1);
+ if (error)
+ goto out_leaf;
+ }
+
+out_leaf:
+ xfs_trans_brelse(sc->tp, leaf_bp);
+out_bitmap:
+ xdab_bitmap_destroy(&seen_dablks);
+ return error;
+}
+
+/*
+ * Call a function for every extended attribute in a file.
+ *
+ * Callers must hold the ILOCK. No validation or cursor restarts allowed.
+ * Returns -EFSCORRUPTED on any problem, including loops in the dabtree.
+ */
+int
+xchk_xattr_walk(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn,
+ xchk_xattrleaf_fn leaf_fn,
+ void *priv)
+{
+ int error;
+
+ xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
+
+ if (!xfs_inode_hasattr(ip))
+ return 0;
+
+ if (ip->i_af.if_format == XFS_DINODE_FMT_LOCAL)
+ return xchk_xattr_walk_sf(sc, ip, attr_fn, priv);
+
+ /* attr functions require that the attr fork is loaded */
+ error = xfs_iread_extents(sc->tp, ip, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ if (xfs_attr_is_leaf(ip))
+ return xchk_xattr_walk_leaf(sc, ip, attr_fn, priv);
+
+ return xchk_xattr_walk_node(sc, ip, attr_fn, leaf_fn, priv);
+}
diff --git a/fs/xfs/scrub/listxattr.h b/fs/xfs/scrub/listxattr.h
new file mode 100644
index 000000000000..703cfb7b14cf
--- /dev/null
+++ b/fs/xfs/scrub/listxattr.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_LISTXATTR_H__
+#define __XFS_SCRUB_LISTXATTR_H__
+
+typedef int (*xchk_xattr_fn)(struct xfs_scrub *sc, struct xfs_inode *ip,
+ unsigned int attr_flags, const unsigned char *name,
+ unsigned int namelen, const void *value, unsigned int valuelen,
+ void *priv);
+
+typedef int (*xchk_xattrleaf_fn)(struct xfs_scrub *sc, void *priv);
+
+int xchk_xattr_walk(struct xfs_scrub *sc, struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn, xchk_xattrleaf_fn leaf_fn, void *priv);
+
+#endif /* __XFS_SCRUB_LISTXATTR_H__ */
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
index 8a7d9557897c..80aee30886c4 100644
--- a/fs/xfs/scrub/nlinks.c
+++ b/fs/xfs/scrub/nlinks.c
@@ -18,15 +18,19 @@
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ag.h"
+#include "xfs_parent.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/iscan.h"
+#include "scrub/orphanage.h"
#include "scrub/nlinks.h"
#include "scrub/trace.h"
#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+#include "scrub/listxattr.h"
/*
* Live Inode Link Count Checking
@@ -43,11 +47,23 @@ int
xchk_setup_nlinks(
struct xfs_scrub *sc)
{
+ struct xchk_nlink_ctrs *xnc;
+ int error;
+
xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
- sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS);
- if (!sc->buf)
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_nlinks(sc);
+ if (error)
+ return error;
+ }
+
+ xnc = kvzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS);
+ if (!xnc)
return -ENOMEM;
+ xnc->xname.name = xnc->namebuf;
+ xnc->sc = sc;
+ sc->buf = xnc;
return xchk_setup_fs(sc);
}
@@ -152,6 +168,13 @@ xchk_nlinks_live_update(
xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb);
+ /*
+ * Ignore temporary directories being used to stage dir repairs, since
+ * we don't bump the link counts of the children.
+ */
+ if (xrep_is_tempfile(p->dp))
+ return NOTIFY_DONE;
+
trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino,
p->delta, p->name->name, p->name->len);
@@ -251,12 +274,17 @@ xchk_nlinks_collect_dirent(
* number of parents of the root directory.
*
* Otherwise, increment the number of backrefs pointing back to ino.
+ *
+ * If the filesystem has parent pointers, we walk the pptrs to
+ * determine the backref count.
*/
if (dotdot) {
if (dp == sc->mp->m_rootip)
error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
- else
+ else if (!xfs_has_parent(sc->mp))
error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0);
+ else
+ error = 0;
if (error)
goto out_unlock;
}
@@ -293,6 +321,61 @@ out_incomplete:
return error;
}
+/* Bump the backref count for the inode referenced by this parent pointer. */
+STATIC int
+xchk_nlinks_collect_pptr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xfs_name xname = {
+ .name = name,
+ .len = namelen,
+ };
+ struct xchk_nlink_ctrs *xnc = priv;
+ const struct xfs_parent_rec *pptr_rec = value;
+ xfs_ino_t parent_ino;
+ int error;
+
+ /* Update the shadow link counts if we haven't already failed. */
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ error = -ECANCELED;
+ goto out_incomplete;
+ }
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, &parent_ino, NULL);
+ if (error)
+ return error;
+
+ trace_xchk_nlinks_collect_pptr(sc->mp, ip, &xname, pptr_rec);
+
+ mutex_lock(&xnc->lock);
+
+ error = xchk_nlinks_update_incore(xnc, parent_ino, 0, 1, 0);
+ if (error)
+ goto out_unlock;
+
+ mutex_unlock(&xnc->lock);
+ return 0;
+
+out_unlock:
+ mutex_unlock(&xnc->lock);
+ xchk_iscan_abort(&xnc->collect_iscan);
+out_incomplete:
+ xchk_set_incomplete(sc);
+ return error;
+}
+
/* Walk a directory to bump the observed link counts of the children. */
STATIC int
xchk_nlinks_collect_dir(
@@ -303,6 +386,13 @@ xchk_nlinks_collect_dir(
unsigned int lock_mode;
int error = 0;
+ /*
+ * Ignore temporary directories being used to stage dir repairs, since
+ * we don't bump the link counts of the children.
+ */
+ if (xrep_is_tempfile(dp))
+ return 0;
+
/* Prevent anyone from changing this directory while we walk it. */
xfs_ilock(dp, XFS_IOLOCK_SHARED);
lock_mode = xfs_ilock_data_map_shared(dp);
@@ -332,6 +422,28 @@ xchk_nlinks_collect_dir(
if (error)
goto out_abort;
+ /* Walk the parent pointers to get real backref counts. */
+ if (xfs_has_parent(sc->mp)) {
+ /*
+ * If the extended attributes look as though they has been
+ * zapped by the inode record repair code, we cannot scan for
+ * parent pointers.
+ */
+ if (xchk_pptr_looks_zapped(dp)) {
+ error = -EBUSY;
+ goto out_unlock;
+ }
+
+ error = xchk_xattr_walk(sc, dp, xchk_nlinks_collect_pptr, NULL,
+ xnc);
+ if (error == -ECANCELED) {
+ error = 0;
+ goto out_unlock;
+ }
+ if (error)
+ goto out_abort;
+ }
+
xchk_iscan_mark_visited(&xnc->collect_iscan, dp);
goto out_unlock;
@@ -537,6 +649,14 @@ xchk_nlinks_compare_inode(
unsigned int actual_nlink;
int error;
+ /*
+ * Ignore temporary files being used to stage repairs, since we assume
+ * they're correct for non-directories, and the directory repair code
+ * doesn't bump the link counts for the children.
+ */
+ if (xrep_is_tempfile(ip))
+ return 0;
+
xfs_ilock(ip, XFS_ILOCK_SHARED);
mutex_lock(&xnc->lock);
@@ -571,9 +691,11 @@ xchk_nlinks_compare_inode(
* this as a corruption. The VFS won't let users increase the link
* count, but it will let them decrease it.
*/
- if (total_links > XFS_MAXLINK) {
+ if (total_links > XFS_NLINK_PINNED) {
xchk_ino_set_corrupt(sc, ip->i_ino);
goto out_corrupt;
+ } else if (total_links > XFS_MAXLINK) {
+ xchk_ino_set_warning(sc, ip->i_ino);
}
/* Link counts should match. */
@@ -850,9 +972,6 @@ xchk_nlinks_setup_scan(
xfs_agino_t first_agino, last_agino;
int error;
- ASSERT(xnc->sc == NULL);
- xnc->sc = sc;
-
mutex_init(&xnc->lock);
/* Retry iget every tenth of a second for up to 30 seconds. */
diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h
index a950f3daf204..b820712bfd87 100644
--- a/fs/xfs/scrub/nlinks.h
+++ b/fs/xfs/scrub/nlinks.h
@@ -28,6 +28,13 @@ struct xchk_nlink_ctrs {
* from other writer threads.
*/
struct xfs_dir_hook dhook;
+
+ /* Orphanage reparenting request. */
+ struct xrep_adoption adoption;
+
+ /* Directory entry name, plus the trailing null. */
+ struct xfs_name xname;
+ char namebuf[MAXNAMELEN];
};
/*
diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c
index b87618322f55..b3e707f47b7b 100644
--- a/fs/xfs/scrub/nlinks_repair.c
+++ b/fs/xfs/scrub/nlinks_repair.c
@@ -17,14 +17,19 @@
#include "xfs_iwalk.h"
#include "xfs_ialloc.h"
#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_parent.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/iscan.h"
+#include "scrub/orphanage.h"
#include "scrub/nlinks.h"
#include "scrub/trace.h"
+#include "scrub/tempfile.h"
/*
* Live Inode Link Count Repair
@@ -36,6 +41,48 @@
* inode is locked.
*/
+/* Set up to repair inode link counts. */
+int
+xrep_setup_nlinks(
+ struct xfs_scrub *sc)
+{
+ return xrep_orphanage_try_create(sc);
+}
+
+/*
+ * Inodes that aren't the root directory or the orphanage, have a nonzero link
+ * count, and no observed parents should be moved to the orphanage.
+ */
+static inline bool
+xrep_nlinks_is_orphaned(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int actual_nlink,
+ const struct xchk_nlink *obs)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (obs->parents != 0)
+ return false;
+ if (ip == mp->m_rootip || ip == sc->orphanage)
+ return false;
+ return actual_nlink != 0;
+}
+
+/* Remove an inode from the unlinked list. */
+STATIC int
+xrep_nlinks_iunlink_remove(
+ struct xfs_scrub *sc)
+{
+ struct xfs_perag *pag;
+ int error;
+
+ pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino));
+ error = xfs_iunlink_remove(sc->tp, pag, sc->ip);
+ xfs_perag_put(pag);
+ return error;
+}
+
/*
* Correct the link count of the given inode. Because we have to grab locks
* and resources in a certain order, it's possible that this will be a no-op.
@@ -50,17 +97,55 @@ xrep_nlinks_repair_inode(
struct xfs_inode *ip = sc->ip;
uint64_t total_links;
uint64_t actual_nlink;
+ bool orphanage_available = false;
bool dirty = false;
int error;
- xchk_ilock(sc, XFS_IOLOCK_EXCL);
+ /*
+ * Ignore temporary files being used to stage repairs, since we assume
+ * they're correct for non-directories, and the directory repair code
+ * doesn't bump the link counts for the children.
+ */
+ if (xrep_is_tempfile(ip))
+ return 0;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &sc->tp);
- if (error)
- return error;
+ /*
+ * If the filesystem has an orphanage attached to the scrub context,
+ * prepare for a link count repair that could involve @ip being adopted
+ * by the lost+found.
+ */
+ if (xrep_orphanage_can_adopt(sc)) {
+ error = xrep_orphanage_iolock_two(sc);
+ if (error)
+ return error;
- xchk_ilock(sc, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(sc->tp, ip, 0);
+ error = xrep_adoption_trans_alloc(sc, &xnc->adoption);
+ if (error) {
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ } else {
+ orphanage_available = true;
+ }
+ }
+
+ /*
+ * Either there is no orphanage or we couldn't allocate resources for
+ * that kind of update. Let's try again with only the resources we
+ * need for a simple link count update, since that's much more common.
+ */
+ if (!orphanage_available) {
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0,
+ &sc->tp);
+ if (error) {
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+ return error;
+ }
+
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(sc->tp, ip, 0);
+ }
mutex_lock(&xnc->lock);
@@ -99,28 +184,68 @@ xrep_nlinks_repair_inode(
}
/*
- * We did not find any links to this inode. If the inode agrees, we
- * have nothing further to do. If not, the inode has a nonzero link
- * count and we don't have anywhere to graft the child onto. Dropping
- * a live inode's link count to zero can cause unexpected shutdowns in
- * inactivation, so leave it alone.
+ * Decide if we're going to move this file to the orphanage, and fix
+ * up the incore link counts if we are.
*/
- if (total_links == 0) {
- if (actual_nlink != 0)
- trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
- goto out_trans;
+ if (orphanage_available &&
+ xrep_nlinks_is_orphaned(sc, ip, actual_nlink, &obs)) {
+ /* Figure out what name we're going to use here. */
+ error = xrep_adoption_compute_name(&xnc->adoption, &xnc->xname);
+ if (error)
+ goto out_trans;
+
+ /*
+ * Reattach this file to the directory tree by moving it to
+ * the orphanage per the adoption parameters that we already
+ * computed.
+ */
+ error = xrep_adoption_move(&xnc->adoption);
+ if (error)
+ goto out_trans;
+
+ /*
+ * Re-read the link counts since the reparenting will have
+ * updated our scan info.
+ */
+ mutex_lock(&xnc->lock);
+ error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs);
+ mutex_unlock(&xnc->lock);
+ if (error)
+ goto out_trans;
+
+ total_links = xchk_nlink_total(ip, &obs);
+ actual_nlink = VFS_I(ip)->i_nlink;
+ dirty = true;
}
- /* Commit the new link count if it changed. */
- if (total_links != actual_nlink) {
- if (total_links > XFS_MAXLINK) {
- trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+ /*
+ * If this inode is linked from the directory tree and on the unlinked
+ * list, remove it from the unlinked list.
+ */
+ if (total_links > 0 && xfs_inode_on_unlinked_list(ip)) {
+ error = xrep_nlinks_iunlink_remove(sc);
+ if (error)
goto out_trans;
- }
+ dirty = true;
+ }
+ /*
+ * If this inode is not linked from the directory tree yet not on the
+ * unlinked list, put it on the unlinked list.
+ */
+ if (total_links == 0 && !xfs_inode_on_unlinked_list(ip)) {
+ error = xfs_iunlink(sc->tp, ip);
+ if (error)
+ goto out_trans;
+ dirty = true;
+ }
+
+ /* Commit the new link count if it changed. */
+ if (total_links != actual_nlink) {
trace_xrep_nlinks_update_inode(mp, ip, &obs);
- set_nlink(VFS_I(ip), total_links);
+ set_nlink(VFS_I(ip), min_t(unsigned long long, total_links,
+ XFS_NLINK_PINNED));
dirty = true;
}
@@ -132,14 +257,19 @@ xrep_nlinks_repair_inode(
xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE);
error = xrep_trans_commit(sc);
- xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- return error;
+ goto out_unlock;
out_scanlock:
mutex_unlock(&xnc->lock);
out_trans:
xchk_trans_cancel(sc);
- xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+out_unlock:
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ if (orphanage_available) {
+ xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ }
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
return error;
}
@@ -172,10 +302,10 @@ xrep_nlinks(
/*
* We need ftype for an accurate count of the number of child
* subdirectory links. Child subdirectories with a back link (dotdot
- * entry) but no forward link are unfixable, so we cannot repair the
- * link count of the parent directory based on the back link count
- * alone. Filesystems without ftype support are rare (old V4) so we
- * just skip out here.
+ * entry) but no forward link are moved to the orphanage, so we cannot
+ * repair the link count of the parent directory based on the back link
+ * count alone. Filesystems without ftype support are rare (old V4) so
+ * we just skip out here.
*/
if (!xfs_has_ftype(sc->mp))
return -EOPNOTSUPP;
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
new file mode 100644
index 000000000000..7148d8362db8
--- /dev/null
+++ b/fs/xfs/scrub/orphanage.c
@@ -0,0 +1,627 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_dir2.h"
+#include "xfs_icache.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_parent.h"
+#include "xfs_attr_sf.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/trace.h"
+#include "scrub/orphanage.h"
+#include "scrub/readdir.h"
+
+#include <linux/namei.h>
+
+/*
+ * The Orphanage
+ * =============
+ *
+ * If the directory tree is damaged, children of that directory become
+ * inaccessible via that file path. If a child has no other parents, the file
+ * is said to be orphaned. xfs_repair fixes this situation by creating a
+ * orphanage directory (specifically, /lost+found) and creating a directory
+ * entry pointing to the orphaned file.
+ *
+ * Online repair follows this tactic by creating a root-owned /lost+found
+ * directory if one does not exist. If an orphan is found, it will move that
+ * files into orphanage.
+ */
+
+/* Make the orphanage owned by root. */
+STATIC int
+xrep_chown_orphanage(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_dquot *udqp = NULL, *gdqp = NULL, *pdqp = NULL;
+ struct xfs_dquot *oldu = NULL, *oldg = NULL, *oldp = NULL;
+ struct inode *inode = VFS_I(dp);
+ int error;
+
+ error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
+ XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp);
+ if (error)
+ return error;
+
+ error = xfs_trans_alloc_ichange(dp, udqp, gdqp, pdqp, true, &tp);
+ if (error)
+ goto out_dqrele;
+
+ /*
+ * Always clear setuid/setgid/sticky on the orphanage since we don't
+ * normally want that functionality on this directory and xfs_repair
+ * doesn't create it this way either. Leave the other access bits
+ * unchanged.
+ */
+ inode->i_mode &= ~(S_ISUID | S_ISGID | S_ISVTX);
+
+ /*
+ * Change the ownerships and register quota modifications
+ * in the transaction.
+ */
+ if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) {
+ if (XFS_IS_UQUOTA_ON(mp))
+ oldu = xfs_qm_vop_chown(tp, dp, &dp->i_udquot, udqp);
+ inode->i_uid = GLOBAL_ROOT_UID;
+ }
+ if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID)) {
+ if (XFS_IS_GQUOTA_ON(mp))
+ oldg = xfs_qm_vop_chown(tp, dp, &dp->i_gdquot, gdqp);
+ inode->i_gid = GLOBAL_ROOT_GID;
+ }
+ if (dp->i_projid != 0) {
+ if (XFS_IS_PQUOTA_ON(mp))
+ oldp = xfs_qm_vop_chown(tp, dp, &dp->i_pdquot, pdqp);
+ dp->i_projid = 0;
+ }
+
+ dp->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT);
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+ XFS_STATS_INC(mp, xs_ig_attrchg);
+
+ if (xfs_has_wsync(mp))
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp);
+
+ xfs_qm_dqrele(oldu);
+ xfs_qm_dqrele(oldg);
+ xfs_qm_dqrele(oldp);
+
+out_dqrele:
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+ return error;
+}
+
+#define ORPHANAGE "lost+found"
+
+/* Create the orphanage directory, and set sc->orphanage to it. */
+int
+xrep_orphanage_create(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct dentry *root_dentry, *orphanage_dentry;
+ struct inode *root_inode = VFS_I(sc->mp->m_rootip);
+ struct inode *orphanage_inode;
+ int error;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+ if (xfs_is_readonly(mp)) {
+ sc->orphanage = NULL;
+ return 0;
+ }
+
+ ASSERT(sc->tp == NULL);
+ ASSERT(sc->orphanage == NULL);
+
+ /* Find the dentry for the root directory... */
+ root_dentry = d_find_alias(root_inode);
+ if (!root_dentry) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* ...which is a directory, right? */
+ if (!d_is_dir(root_dentry)) {
+ error = -EFSCORRUPTED;
+ goto out_dput_root;
+ }
+
+ /* Try to find the orphanage directory. */
+ inode_lock_nested(root_inode, I_MUTEX_PARENT);
+ orphanage_dentry = lookup_one_len(ORPHANAGE, root_dentry,
+ strlen(ORPHANAGE));
+ if (IS_ERR(orphanage_dentry)) {
+ error = PTR_ERR(orphanage_dentry);
+ goto out_unlock_root;
+ }
+
+ /*
+ * Nothing found? Call mkdir to create the orphanage. Create the
+ * directory without other-user access because we're live and someone
+ * could have been relying partly on minimal access to a parent
+ * directory to control access to a file we put in here.
+ */
+ if (d_really_is_negative(orphanage_dentry)) {
+ error = vfs_mkdir(&nop_mnt_idmap, root_inode, orphanage_dentry,
+ 0750);
+ if (error)
+ goto out_dput_orphanage;
+ }
+
+ /* Not a directory? Bail out. */
+ if (!d_is_dir(orphanage_dentry)) {
+ error = -ENOTDIR;
+ goto out_dput_orphanage;
+ }
+
+ /*
+ * Grab a reference to the orphanage. This /should/ succeed since
+ * we hold the root directory locked and therefore nobody can delete
+ * the orphanage.
+ */
+ orphanage_inode = igrab(d_inode(orphanage_dentry));
+ if (!orphanage_inode) {
+ error = -ENOENT;
+ goto out_dput_orphanage;
+ }
+
+ /* Make sure the orphanage is owned by root. */
+ error = xrep_chown_orphanage(sc, XFS_I(orphanage_inode));
+ if (error)
+ goto out_dput_orphanage;
+
+ /* Stash the reference for later and bail out. */
+ sc->orphanage = XFS_I(orphanage_inode);
+ sc->orphanage_ilock_flags = 0;
+
+out_dput_orphanage:
+ dput(orphanage_dentry);
+out_unlock_root:
+ inode_unlock(VFS_I(sc->mp->m_rootip));
+out_dput_root:
+ dput(root_dentry);
+out:
+ return error;
+}
+
+void
+xrep_orphanage_ilock(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ sc->orphanage_ilock_flags |= ilock_flags;
+ xfs_ilock(sc->orphanage, ilock_flags);
+}
+
+bool
+xrep_orphanage_ilock_nowait(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ if (xfs_ilock_nowait(sc->orphanage, ilock_flags)) {
+ sc->orphanage_ilock_flags |= ilock_flags;
+ return true;
+ }
+
+ return false;
+}
+
+void
+xrep_orphanage_iunlock(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ xfs_iunlock(sc->orphanage, ilock_flags);
+ sc->orphanage_ilock_flags &= ~ilock_flags;
+}
+
+/* Grab the IOLOCK of the orphanage and sc->ip. */
+int
+xrep_orphanage_iolock_two(
+ struct xfs_scrub *sc)
+{
+ int error = 0;
+
+ while (true) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /*
+ * Normal XFS takes the IOLOCK before grabbing a transaction.
+ * Scrub holds a transaction, which means that we can't block
+ * on either IOLOCK.
+ */
+ if (xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
+ if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+ break;
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ }
+ delay(1);
+ }
+
+ return 0;
+}
+
+/* Release the orphanage. */
+void
+xrep_orphanage_rele(
+ struct xfs_scrub *sc)
+{
+ if (!sc->orphanage)
+ return;
+
+ if (sc->orphanage_ilock_flags)
+ xfs_iunlock(sc->orphanage, sc->orphanage_ilock_flags);
+
+ xchk_irele(sc, sc->orphanage);
+ sc->orphanage = NULL;
+}
+
+/* Adoption moves a file into /lost+found */
+
+/* Can the orphanage adopt @sc->ip? */
+bool
+xrep_orphanage_can_adopt(
+ struct xfs_scrub *sc)
+{
+ ASSERT(sc->ip != NULL);
+
+ if (!sc->orphanage)
+ return false;
+ if (sc->ip == sc->orphanage)
+ return false;
+ if (xfs_internal_inum(sc->mp, sc->ip->i_ino))
+ return false;
+ return true;
+}
+
+/*
+ * Create a new transaction to send a child to the orphanage.
+ *
+ * Allocate a new transaction with sufficient disk space to handle the
+ * adoption, take ILOCK_EXCL of the orphanage and sc->ip, joins them to the
+ * transaction, and reserve quota to reparent the latter. Caller must hold the
+ * IOLOCK of the orphanage and sc->ip.
+ */
+int
+xrep_adoption_trans_alloc(
+ struct xfs_scrub *sc,
+ struct xrep_adoption *adopt)
+{
+ struct xfs_mount *mp = sc->mp;
+ unsigned int child_blkres = 0;
+ int error;
+
+ ASSERT(sc->tp == NULL);
+ ASSERT(sc->ip != NULL);
+ ASSERT(sc->orphanage != NULL);
+ ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL);
+ ASSERT(sc->orphanage_ilock_flags & XFS_IOLOCK_EXCL);
+ ASSERT(!(sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)));
+ ASSERT(!(sc->orphanage_ilock_flags &
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)));
+
+ /* Compute the worst case space reservation that we need. */
+ adopt->sc = sc;
+ adopt->orphanage_blkres = xfs_link_space_res(mp, MAXNAMELEN);
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode))
+ child_blkres = xfs_rename_space_res(mp, 0, false,
+ xfs_name_dotdot.len, false);
+ if (xfs_has_parent(mp))
+ child_blkres += XFS_ADDAFORK_SPACE_RES(mp);
+ adopt->child_blkres = child_blkres;
+
+ /*
+ * Allocate a transaction to link the child into the parent, along with
+ * enough disk space to handle expansion of both the orphanage and the
+ * dotdot entry of a child directory.
+ */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link,
+ adopt->orphanage_blkres + adopt->child_blkres, 0, 0,
+ &sc->tp);
+ if (error)
+ return error;
+
+ xfs_lock_two_inodes(sc->orphanage, XFS_ILOCK_EXCL,
+ sc->ip, XFS_ILOCK_EXCL);
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ sc->orphanage_ilock_flags |= XFS_ILOCK_EXCL;
+
+ xfs_trans_ijoin(sc->tp, sc->orphanage, 0);
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /*
+ * Reserve enough quota in the orphan directory to add the new name.
+ * Normally the orphanage should have user/group/project ids of zero
+ * and hence is not subject to quota enforcement, but we're allowed to
+ * exceed quota to reattach disconnected parts of the directory tree.
+ */
+ error = xfs_trans_reserve_quota_nblks(sc->tp, sc->orphanage,
+ adopt->orphanage_blkres, 0, true);
+ if (error)
+ goto out_cancel;
+
+ /*
+ * Reserve enough quota in the child directory to change dotdot.
+ * Here we're also allowed to exceed file quota to repair inconsistent
+ * metadata.
+ */
+ if (adopt->child_blkres) {
+ error = xfs_trans_reserve_quota_nblks(sc->tp, sc->ip,
+ adopt->child_blkres, 0, true);
+ if (error)
+ goto out_cancel;
+ }
+
+ return 0;
+out_cancel:
+ xchk_trans_cancel(sc);
+ xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Compute the xfs_name for the directory entry that we're adding to the
+ * orphanage. Caller must hold ILOCKs of sc->ip and the orphanage and must not
+ * reuse namebuf until the adoption completes or is dissolved.
+ */
+int
+xrep_adoption_compute_name(
+ struct xrep_adoption *adopt,
+ struct xfs_name *xname)
+{
+ struct xfs_scrub *sc = adopt->sc;
+ char *namebuf = (void *)xname->name;
+ xfs_ino_t ino;
+ unsigned int incr = 0;
+ int error = 0;
+
+ adopt->xname = xname;
+ xname->len = snprintf(namebuf, MAXNAMELEN, "%llu", sc->ip->i_ino);
+ xname->type = xfs_mode_to_ftype(VFS_I(sc->ip)->i_mode);
+
+ /* Make sure the filename is unique in the lost+found. */
+ error = xchk_dir_lookup(sc, sc->orphanage, xname, &ino);
+ while (error == 0 && incr < 10000) {
+ xname->len = snprintf(namebuf, MAXNAMELEN, "%llu.%u",
+ sc->ip->i_ino, ++incr);
+ error = xchk_dir_lookup(sc, sc->orphanage, xname, &ino);
+ }
+ if (error == 0) {
+ /* We already have 10,000 entries in the orphanage? */
+ return -EFSCORRUPTED;
+ }
+
+ if (error != -ENOENT)
+ return error;
+ return 0;
+}
+
+/*
+ * Make sure the dcache does not have a positive dentry for the name we've
+ * chosen. The caller should have checked with the ondisk directory, so any
+ * discrepancy is a sign that something is seriously wrong.
+ */
+static int
+xrep_adoption_check_dcache(
+ struct xrep_adoption *adopt)
+{
+ struct qstr qname = QSTR_INIT(adopt->xname->name,
+ adopt->xname->len);
+ struct xfs_scrub *sc = adopt->sc;
+ struct dentry *d_orphanage, *d_child;
+ int error = 0;
+
+ d_orphanage = d_find_alias(VFS_I(sc->orphanage));
+ if (!d_orphanage)
+ return 0;
+
+ d_child = d_hash_and_lookup(d_orphanage, &qname);
+ if (d_child) {
+ trace_xrep_adoption_check_child(sc->mp, d_child);
+
+ if (d_is_positive(d_child)) {
+ ASSERT(d_is_negative(d_child));
+ error = -EFSCORRUPTED;
+ }
+
+ dput(d_child);
+ }
+
+ dput(d_orphanage);
+ return error;
+}
+
+/*
+ * Invalidate all dentries for the name that was added to the orphanage
+ * directory, and all dentries pointing to the child inode that was moved.
+ *
+ * There should not be any positive entries for the name, since we've
+ * maintained our lock on the orphanage directory.
+ */
+static void
+xrep_adoption_zap_dcache(
+ struct xrep_adoption *adopt)
+{
+ struct qstr qname = QSTR_INIT(adopt->xname->name,
+ adopt->xname->len);
+ struct xfs_scrub *sc = adopt->sc;
+ struct dentry *d_orphanage, *d_child;
+
+ /* Invalidate all dentries for the adoption name */
+ d_orphanage = d_find_alias(VFS_I(sc->orphanage));
+ if (!d_orphanage)
+ return;
+
+ d_child = d_hash_and_lookup(d_orphanage, &qname);
+ while (d_child != NULL) {
+ trace_xrep_adoption_invalidate_child(sc->mp, d_child);
+
+ ASSERT(d_is_negative(d_child));
+ d_invalidate(d_child);
+ dput(d_child);
+ d_child = d_lookup(d_orphanage, &qname);
+ }
+
+ dput(d_orphanage);
+
+ /* Invalidate all the dentries pointing down to this file. */
+ while ((d_child = d_find_alias(VFS_I(sc->ip))) != NULL) {
+ trace_xrep_adoption_invalidate_child(sc->mp, d_child);
+
+ d_invalidate(d_child);
+ dput(d_child);
+ }
+}
+
+/*
+ * If we have to add an attr fork ahead of a parent pointer update, how much
+ * space should we ask for?
+ */
+static inline int
+xrep_adoption_attr_sizeof(
+ const struct xrep_adoption *adopt)
+{
+ return sizeof(struct xfs_attr_sf_hdr) +
+ xfs_attr_sf_entsize_byname(sizeof(struct xfs_parent_rec),
+ adopt->xname->len);
+}
+
+/*
+ * Move the current file to the orphanage under the computed name.
+ *
+ * Returns with a dirty transaction so that the caller can handle any other
+ * work, such as fixing up unlinked lists or resetting link counts.
+ */
+int
+xrep_adoption_move(
+ struct xrep_adoption *adopt)
+{
+ struct xfs_scrub *sc = adopt->sc;
+ bool isdir = S_ISDIR(VFS_I(sc->ip)->i_mode);
+ int error;
+
+ trace_xrep_adoption_reparent(sc->orphanage, adopt->xname,
+ sc->ip->i_ino);
+
+ error = xrep_adoption_check_dcache(adopt);
+ if (error)
+ return error;
+
+ /*
+ * If this filesystem has parent pointers, ensure that the file being
+ * moved to the orphanage has an attribute fork. This is required
+ * because the parent pointer code does not itself add attr forks.
+ */
+ if (!xfs_inode_has_attr_fork(sc->ip) && xfs_has_parent(sc->mp)) {
+ int sf_size = xrep_adoption_attr_sizeof(adopt);
+
+ error = xfs_bmap_add_attrfork(sc->tp, sc->ip, sf_size, true);
+ if (error)
+ return error;
+ }
+
+ /* Create the new name in the orphanage. */
+ error = xfs_dir_createname(sc->tp, sc->orphanage, adopt->xname,
+ sc->ip->i_ino, adopt->orphanage_blkres);
+ if (error)
+ return error;
+
+ /*
+ * Bump the link count of the orphanage if we just added a
+ * subdirectory, and update its timestamps.
+ */
+ xfs_trans_ichgtime(sc->tp, sc->orphanage,
+ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ if (isdir)
+ xfs_bumplink(sc->tp, sc->orphanage);
+ xfs_trans_log_inode(sc->tp, sc->orphanage, XFS_ILOG_CORE);
+
+ /* Bump the link count of the child. */
+ if (adopt->bump_child_nlink) {
+ xfs_bumplink(sc->tp, sc->ip);
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ }
+
+ /* Replace the dotdot entry if the child is a subdirectory. */
+ if (isdir) {
+ error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot,
+ sc->orphanage->i_ino, adopt->child_blkres);
+ if (error)
+ return error;
+ }
+
+ /* Add a parent pointer from the file back to the lost+found. */
+ if (xfs_has_parent(sc->mp)) {
+ error = xfs_parent_addname(sc->tp, &adopt->ppargs,
+ sc->orphanage, adopt->xname, sc->ip);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Notify dirent hooks that we moved the file to /lost+found, and
+ * finish all the deferred work so that we know the adoption is fully
+ * recorded in the log.
+ */
+ xfs_dir_update_hook(sc->orphanage, sc->ip, 1, adopt->xname);
+
+ /* Remove negative dentries from the lost+found's dcache */
+ xrep_adoption_zap_dcache(adopt);
+ return 0;
+}
+
+/*
+ * Roll to a clean scrub transaction so that we can release the orphanage,
+ * even if xrep_adoption_move was not called.
+ *
+ * Commits all the work and deferred ops attached to an adoption request and
+ * rolls to a clean scrub transaction. On success, returns 0 with the scrub
+ * context holding a clean transaction with no inodes joined. On failure,
+ * returns negative errno with no scrub transaction. All inode locks are
+ * still held after this function returns.
+ */
+int
+xrep_adoption_trans_roll(
+ struct xrep_adoption *adopt)
+{
+ struct xfs_scrub *sc = adopt->sc;
+ int error;
+
+ trace_xrep_adoption_trans_roll(sc->orphanage, sc->ip,
+ !!(sc->tp->t_flags & XFS_TRANS_DIRTY));
+
+ /* Finish all the deferred ops to commit all repairs. */
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+
+ /* Roll the transaction once more to detach the inodes. */
+ return xfs_trans_roll(&sc->tp);
+}
diff --git a/fs/xfs/scrub/orphanage.h b/fs/xfs/scrub/orphanage.h
new file mode 100644
index 000000000000..7c7a2e7d81db
--- /dev/null
+++ b/fs/xfs/scrub/orphanage.h
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_ORPHANAGE_H__
+#define __XFS_SCRUB_ORPHANAGE_H__
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+int xrep_orphanage_create(struct xfs_scrub *sc);
+
+/*
+ * If we're doing a repair, ensure that the orphanage exists and attach it to
+ * the scrub context.
+ */
+static inline int
+xrep_orphanage_try_create(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ ASSERT(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR);
+
+ error = xrep_orphanage_create(sc);
+ switch (error) {
+ case 0:
+ case -ENOENT:
+ case -ENOTDIR:
+ case -ENOSPC:
+ /*
+ * If the orphanage can't be found or isn't a directory, we'll
+ * keep going, but we won't be able to attach the file to the
+ * orphanage if we can't find the parent.
+ */
+ return 0;
+ }
+
+ return error;
+}
+
+int xrep_orphanage_iolock_two(struct xfs_scrub *sc);
+
+void xrep_orphanage_ilock(struct xfs_scrub *sc, unsigned int ilock_flags);
+bool xrep_orphanage_ilock_nowait(struct xfs_scrub *sc,
+ unsigned int ilock_flags);
+void xrep_orphanage_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags);
+
+void xrep_orphanage_rele(struct xfs_scrub *sc);
+
+/* Information about a request to add a file to the orphanage. */
+struct xrep_adoption {
+ struct xfs_scrub *sc;
+
+ /* Name used for the adoption. */
+ struct xfs_name *xname;
+
+ /* Parent pointer context tracking */
+ struct xfs_parent_args ppargs;
+
+ /* Block reservations for orphanage and child (if directory). */
+ unsigned int orphanage_blkres;
+ unsigned int child_blkres;
+
+ /*
+ * Does the caller want us to bump the child link count? This is not
+ * needed when reattaching files that have become disconnected but have
+ * nlink > 1. It is necessary when changing the directory tree
+ * structure.
+ */
+ bool bump_child_nlink:1;
+};
+
+bool xrep_orphanage_can_adopt(struct xfs_scrub *sc);
+
+int xrep_adoption_trans_alloc(struct xfs_scrub *sc,
+ struct xrep_adoption *adopt);
+int xrep_adoption_compute_name(struct xrep_adoption *adopt,
+ struct xfs_name *xname);
+int xrep_adoption_move(struct xrep_adoption *adopt);
+int xrep_adoption_trans_roll(struct xrep_adoption *adopt);
+#else
+struct xrep_adoption { /* empty */ };
+# define xrep_orphanage_rele(sc) ((void)0)
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_ORPHANAGE_H__ */
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 7db873672146..733c410a2279 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -10,19 +10,37 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
+#include "xfs_attr.h"
+#include "xfs_parent.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+#include "scrub/repair.h"
+#include "scrub/listxattr.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/trace.h"
/* Set us up to scrub parents. */
int
xchk_setup_parent(
struct xfs_scrub *sc)
{
+ int error;
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_parent(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_inode_contents(sc, 0);
}
@@ -143,7 +161,8 @@ xchk_parent_validate(
}
if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
return error;
- if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) {
+ if (dp == sc->ip || xrep_is_tempfile(dp) ||
+ !S_ISDIR(VFS_I(dp)->i_mode)) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
goto out_rele;
}
@@ -185,6 +204,621 @@ out_rele:
return error;
}
+/*
+ * Checking of Parent Pointers
+ * ===========================
+ *
+ * On filesystems with directory parent pointers, we check the referential
+ * integrity by visiting each parent pointer of a child file and checking that
+ * the directory referenced by the pointer actually has a dirent pointing
+ * forward to the child file.
+ */
+
+/* Deferred parent pointer entry that we saved for later. */
+struct xchk_pptr {
+ /* Cookie for retrieval of the pptr name. */
+ xfblob_cookie name_cookie;
+
+ /* Parent pointer record. */
+ struct xfs_parent_rec pptr_rec;
+
+ /* Length of the pptr name. */
+ uint8_t namelen;
+};
+
+struct xchk_pptrs {
+ struct xfs_scrub *sc;
+
+ /* How many parent pointers did we find at the end? */
+ unsigned long long pptrs_found;
+
+ /* Parent of this directory. */
+ xfs_ino_t parent_ino;
+
+ /* Fixed-size array of xchk_pptr structures. */
+ struct xfarray *pptr_entries;
+
+ /* Blobs containing parent pointer names. */
+ struct xfblob *pptr_names;
+
+ /* Scratch buffer for scanning pptr xattrs */
+ struct xfs_da_args pptr_args;
+
+ /* If we've cycled the ILOCK, we must revalidate all deferred pptrs. */
+ bool need_revalidate;
+
+ /* Name buffer */
+ struct xfs_name xname;
+ char namebuf[MAXNAMELEN];
+};
+
+/* Does this parent pointer match the dotdot entry? */
+STATIC int
+xchk_parent_scan_dotdot(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xchk_pptrs *pp = priv;
+ xfs_ino_t parent_ino;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, &parent_ino, NULL);
+ if (error)
+ return error;
+
+ if (pp->parent_ino == parent_ino)
+ return -ECANCELED;
+
+ return 0;
+}
+
+/* Look up the dotdot entry so that we can check it as we walk the pptrs. */
+STATIC int
+xchk_parent_pptr_and_dotdot(
+ struct xchk_pptrs *pp)
+{
+ struct xfs_scrub *sc = pp->sc;
+ int error;
+
+ /* Look up '..' */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &pp->parent_ino);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ return error;
+ if (!xfs_verify_dir_ino(sc->mp, pp->parent_ino)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+
+ /* Is this the root dir? Then '..' must point to itself. */
+ if (sc->ip == sc->mp->m_rootip) {
+ if (sc->ip->i_ino != pp->parent_ino)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+
+ /*
+ * If this is now an unlinked directory, the dotdot value is
+ * meaningless as long as it points to a valid inode.
+ */
+ if (VFS_I(sc->ip)->i_nlink == 0)
+ return 0;
+
+ if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /* Otherwise, walk the pptrs again, and check. */
+ error = xchk_xattr_walk(sc, sc->ip, xchk_parent_scan_dotdot, NULL, pp);
+ if (error == -ECANCELED) {
+ /* Found a parent pointer that matches dotdot. */
+ return 0;
+ }
+ if (!error || error == -EFSCORRUPTED) {
+ /* Found a broken parent pointer or no match. */
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return 0;
+ }
+ return error;
+}
+
+/*
+ * Try to lock a parent directory for checking dirents. Returns the inode
+ * flags for the locks we now hold, or zero if we failed.
+ */
+STATIC unsigned int
+xchk_parent_lock_dir(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp)
+{
+ if (!xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED))
+ return 0;
+
+ if (!xfs_ilock_nowait(dp, XFS_ILOCK_SHARED)) {
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ return 0;
+ }
+
+ if (!xfs_need_iread_extents(&dp->i_df))
+ return XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED;
+
+ xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+ if (!xfs_ilock_nowait(dp, XFS_ILOCK_EXCL)) {
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ return 0;
+ }
+
+ return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL;
+}
+
+/* Check the forward link (dirent) associated with this parent pointer. */
+STATIC int
+xchk_parent_dirent(
+ struct xchk_pptrs *pp,
+ const struct xfs_name *xname,
+ struct xfs_inode *dp)
+{
+ struct xfs_scrub *sc = pp->sc;
+ xfs_ino_t child_ino;
+ int error;
+
+ /*
+ * Use the name attached to this parent pointer to look up the
+ * directory entry in the alleged parent.
+ */
+ error = xchk_dir_lookup(sc, dp, xname, &child_ino);
+ if (error == -ENOENT) {
+ xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return 0;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error))
+ return error;
+
+ /* Does the inode number match? */
+ if (child_ino != sc->ip->i_ino) {
+ xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return 0;
+ }
+
+ return 0;
+}
+
+/* Try to grab a parent directory. */
+STATIC int
+xchk_parent_iget(
+ struct xchk_pptrs *pp,
+ const struct xfs_parent_rec *pptr,
+ struct xfs_inode **dpp)
+{
+ struct xfs_scrub *sc = pp->sc;
+ struct xfs_inode *ip;
+ xfs_ino_t parent_ino = be64_to_cpu(pptr->p_ino);
+ int error;
+
+ /* Validate inode number. */
+ error = xfs_dir_ino_validate(sc->mp, parent_ino);
+ if (error) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return -ECANCELED;
+ }
+
+ error = xchk_iget(sc, parent_ino, &ip);
+ if (error == -EINVAL || error == -ENOENT) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return -ECANCELED;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error))
+ return error;
+
+ /* The parent must be a directory. */
+ if (!S_ISDIR(VFS_I(ip)->i_mode)) {
+ xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ goto out_rele;
+ }
+
+ /* Validate generation number. */
+ if (VFS_I(ip)->i_generation != be32_to_cpu(pptr->p_gen)) {
+ xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ goto out_rele;
+ }
+
+ *dpp = ip;
+ return 0;
+out_rele:
+ xchk_irele(sc, ip);
+ return 0;
+}
+
+/*
+ * Walk an xattr of a file. If this xattr is a parent pointer, follow it up
+ * to a parent directory and check that the parent has a dirent pointing back
+ * to us.
+ */
+STATIC int
+xchk_parent_scan_attr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xfs_name xname = {
+ .name = name,
+ .len = namelen,
+ };
+ struct xchk_pptrs *pp = priv;
+ struct xfs_inode *dp = NULL;
+ const struct xfs_parent_rec *pptr_rec = value;
+ xfs_ino_t parent_ino;
+ unsigned int lockmode;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, &parent_ino, NULL);
+ if (error) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return error;
+ }
+
+ /* No self-referential parent pointers. */
+ if (parent_ino == sc->ip->i_ino) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return -ECANCELED;
+ }
+
+ pp->pptrs_found++;
+
+ error = xchk_parent_iget(pp, pptr_rec, &dp);
+ if (error)
+ return error;
+ if (!dp)
+ return 0;
+
+ /* Try to lock the inode. */
+ lockmode = xchk_parent_lock_dir(sc, dp);
+ if (!lockmode) {
+ struct xchk_pptr save_pp = {
+ .pptr_rec = *pptr_rec, /* struct copy */
+ .namelen = namelen,
+ };
+
+ /* Couldn't lock the inode, so save the pptr for later. */
+ trace_xchk_parent_defer(sc->ip, &xname, dp->i_ino);
+
+ error = xfblob_storename(pp->pptr_names, &save_pp.name_cookie,
+ &xname);
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0,
+ &error))
+ goto out_rele;
+
+ error = xfarray_append(pp->pptr_entries, &save_pp);
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0,
+ &error))
+ goto out_rele;
+
+ goto out_rele;
+ }
+
+ error = xchk_parent_dirent(pp, &xname, dp);
+ if (error)
+ goto out_unlock;
+
+out_unlock:
+ xfs_iunlock(dp, lockmode);
+out_rele:
+ xchk_irele(sc, dp);
+ return error;
+}
+
+/*
+ * Revalidate a parent pointer that we collected in the past but couldn't check
+ * because of lock contention. Returns 0 if the parent pointer is still valid,
+ * -ENOENT if it has gone away on us, or a negative errno.
+ */
+STATIC int
+xchk_parent_revalidate_pptr(
+ struct xchk_pptrs *pp,
+ const struct xfs_name *xname,
+ struct xfs_parent_rec *pptr)
+{
+ struct xfs_scrub *sc = pp->sc;
+ int error;
+
+ error = xfs_parent_lookup(sc->tp, sc->ip, xname, pptr, &pp->pptr_args);
+ if (error == -ENOATTR) {
+ /* Parent pointer went away, nothing to revalidate. */
+ return -ENOENT;
+ }
+
+ return error;
+}
+
+/*
+ * Check a parent pointer the slow way, which means we cycle locks a bunch
+ * and put up with revalidation until we get it done.
+ */
+STATIC int
+xchk_parent_slow_pptr(
+ struct xchk_pptrs *pp,
+ const struct xfs_name *xname,
+ struct xfs_parent_rec *pptr)
+{
+ struct xfs_scrub *sc = pp->sc;
+ struct xfs_inode *dp = NULL;
+ unsigned int lockmode;
+ int error;
+
+ /* Check that the deferred parent pointer still exists. */
+ if (pp->need_revalidate) {
+ error = xchk_parent_revalidate_pptr(pp, xname, pptr);
+ if (error == -ENOENT)
+ return 0;
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0,
+ &error))
+ return error;
+ }
+
+ error = xchk_parent_iget(pp, pptr, &dp);
+ if (error)
+ return error;
+ if (!dp)
+ return 0;
+
+ /*
+ * If we can grab both IOLOCK and ILOCK of the alleged parent, we
+ * can proceed with the validation.
+ */
+ lockmode = xchk_parent_lock_dir(sc, dp);
+ if (lockmode) {
+ trace_xchk_parent_slowpath(sc->ip, xname, dp->i_ino);
+ goto check_dirent;
+ }
+
+ /*
+ * We couldn't lock the parent dir. Drop all the locks and try to
+ * get them again, one at a time.
+ */
+ xchk_iunlock(sc, sc->ilock_flags);
+ pp->need_revalidate = true;
+
+ trace_xchk_parent_ultraslowpath(sc->ip, xname, dp->i_ino);
+
+ error = xchk_dir_trylock_for_pptrs(sc, dp, &lockmode);
+ if (error)
+ goto out_rele;
+
+ /* Revalidate the parent pointer now that we cycled locks. */
+ error = xchk_parent_revalidate_pptr(pp, xname, pptr);
+ if (error == -ENOENT) {
+ error = 0;
+ goto out_unlock;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error))
+ goto out_unlock;
+
+check_dirent:
+ error = xchk_parent_dirent(pp, xname, dp);
+out_unlock:
+ xfs_iunlock(dp, lockmode);
+out_rele:
+ xchk_irele(sc, dp);
+ return error;
+}
+
+/* Check all the parent pointers that we deferred the first time around. */
+STATIC int
+xchk_parent_finish_slow_pptrs(
+ struct xchk_pptrs *pp)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ foreach_xfarray_idx(pp->pptr_entries, array_cur) {
+ struct xchk_pptr pptr;
+
+ if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ error = xfarray_load(pp->pptr_entries, array_cur, &pptr);
+ if (error)
+ return error;
+
+ error = xfblob_loadname(pp->pptr_names, pptr.name_cookie,
+ &pp->xname, pptr.namelen);
+ if (error)
+ return error;
+
+ error = xchk_parent_slow_pptr(pp, &pp->xname, &pptr.pptr_rec);
+ if (error)
+ return error;
+ }
+
+ /* Empty out both xfiles now that we've checked everything. */
+ xfarray_truncate(pp->pptr_entries);
+ xfblob_truncate(pp->pptr_names);
+ return 0;
+}
+
+/* Count the number of parent pointers. */
+STATIC int
+xchk_parent_count_pptr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xchk_pptrs *pp = priv;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, NULL, NULL);
+ if (error)
+ return error;
+
+ pp->pptrs_found++;
+ return 0;
+}
+
+/*
+ * Compare the number of parent pointers to the link count. For
+ * non-directories these should be the same. For unlinked directories the
+ * count should be zero; for linked directories, it should be nonzero.
+ */
+STATIC int
+xchk_parent_count_pptrs(
+ struct xchk_pptrs *pp)
+{
+ struct xfs_scrub *sc = pp->sc;
+ int error;
+
+ /*
+ * If we cycled the ILOCK while cross-checking parent pointers with
+ * dirents, then we need to recalculate the number of parent pointers.
+ */
+ if (pp->need_revalidate) {
+ pp->pptrs_found = 0;
+ error = xchk_xattr_walk(sc, sc->ip, xchk_parent_count_pptr,
+ NULL, pp);
+ if (error == -EFSCORRUPTED) {
+ /* Found a bad parent pointer */
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return 0;
+ }
+ if (error)
+ return error;
+ }
+
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+ if (sc->ip == sc->mp->m_rootip)
+ pp->pptrs_found++;
+
+ if (VFS_I(sc->ip)->i_nlink == 0 && pp->pptrs_found > 0)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ else if (VFS_I(sc->ip)->i_nlink > 0 &&
+ pp->pptrs_found == 0)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ } else {
+ if (VFS_I(sc->ip)->i_nlink != pp->pptrs_found)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ }
+
+ return 0;
+}
+
+/* Check parent pointers of a file. */
+STATIC int
+xchk_parent_pptr(
+ struct xfs_scrub *sc)
+{
+ struct xchk_pptrs *pp;
+ char *descr;
+ int error;
+
+ pp = kvzalloc(sizeof(struct xchk_pptrs), XCHK_GFP_FLAGS);
+ if (!pp)
+ return -ENOMEM;
+ pp->sc = sc;
+ pp->xname.name = pp->namebuf;
+
+ /*
+ * Set up some staging memory for parent pointers that we can't check
+ * due to locking contention.
+ */
+ descr = xchk_xfile_ino_descr(sc, "slow parent pointer entries");
+ error = xfarray_create(descr, 0, sizeof(struct xchk_pptr),
+ &pp->pptr_entries);
+ kfree(descr);
+ if (error)
+ goto out_pp;
+
+ descr = xchk_xfile_ino_descr(sc, "slow parent pointer names");
+ error = xfblob_create(descr, &pp->pptr_names);
+ kfree(descr);
+ if (error)
+ goto out_entries;
+
+ error = xchk_xattr_walk(sc, sc->ip, xchk_parent_scan_attr, NULL, pp);
+ if (error == -ECANCELED) {
+ error = 0;
+ goto out_names;
+ }
+ if (error)
+ goto out_names;
+
+ error = xchk_parent_finish_slow_pptrs(pp);
+ if (error == -ETIMEDOUT) {
+ /* Couldn't grab a lock, scrub was marked incomplete */
+ error = 0;
+ goto out_names;
+ }
+ if (error)
+ goto out_names;
+
+ if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out_names;
+
+ /*
+ * For subdirectories, make sure the dotdot entry references the same
+ * inode as the parent pointers.
+ *
+ * If we're scanning a /consistent/ directory, there should only be
+ * one parent pointer, and it should point to the same directory as
+ * the dotdot entry.
+ *
+ * However, a corrupt directory tree might feature a subdirectory with
+ * multiple parents. The directory loop scanner is responsible for
+ * correcting that kind of problem, so for now we only validate that
+ * the dotdot entry matches /one/ of the parents.
+ */
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+ error = xchk_parent_pptr_and_dotdot(pp);
+ if (error)
+ goto out_names;
+ }
+
+ if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out_pp;
+
+ /*
+ * Complain if the number of parent pointers doesn't match the link
+ * count. This could be a sign of missing parent pointers (or an
+ * incorrect link count).
+ */
+ error = xchk_parent_count_pptrs(pp);
+ if (error)
+ goto out_names;
+
+out_names:
+ xfblob_destroy(pp->pptr_names);
+out_entries:
+ xfarray_destroy(pp->pptr_entries);
+out_pp:
+ kvfree(pp);
+ return error;
+}
+
/* Scrub a parent pointer. */
int
xchk_parent(
@@ -194,6 +828,9 @@ xchk_parent(
xfs_ino_t parent_ino;
int error = 0;
+ if (xfs_has_parent(mp))
+ return xchk_parent_pptr(sc);
+
/*
* If we're a directory, check that the '..' link points up to
* a directory that has one entry pointing to us.
@@ -237,3 +874,64 @@ xchk_parent(
return error;
}
+
+/*
+ * Decide if this file's extended attributes (and therefore its parent
+ * pointers) have been zapped to satisfy the inode and ifork verifiers.
+ * Checking and repairing should be postponed until the extended attribute
+ * structure is fixed.
+ */
+bool
+xchk_pptr_looks_zapped(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+
+ ASSERT(xfs_has_parent(mp));
+
+ /*
+ * Temporary files that cannot be linked into the directory tree do not
+ * have attr forks because they cannot ever have parents.
+ */
+ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ return false;
+
+ /*
+ * Directory tree roots do not have parents, so the expected outcome
+ * of a parent pointer scan is always the empty set. It's safe to scan
+ * them even if the attr fork was zapped.
+ */
+ if (ip == mp->m_rootip)
+ return false;
+
+ /*
+ * Metadata inodes are all rooted in the superblock and do not have
+ * any parents. Hence the attr fork will not be initialized, but
+ * there are no parent pointers that might have been zapped.
+ */
+ if (xfs_is_metadata_inode(ip))
+ return false;
+
+ /*
+ * Linked and linkable non-rootdir files should always have an
+ * attribute fork because that is where parent pointers are
+ * stored. If the fork is absent, something is amiss.
+ */
+ if (!xfs_inode_has_attr_fork(ip))
+ return true;
+
+ /* Repair zapped this file's attr fork a short time ago */
+ if (xfs_ifork_zapped(ip, XFS_ATTR_FORK))
+ return true;
+
+ /*
+ * If the dinode repair found a bad attr fork, it will reset the fork
+ * to extents format with zero records and wait for the bmapbta
+ * scrubber to reconstruct the block mappings. The extended attribute
+ * structure always contain some content when parent pointers are
+ * enabled, so this is a clear sign of a zapped attr fork.
+ */
+ return ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_af.if_nextents == 0;
+}
diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c
new file mode 100644
index 000000000000..7b42b7f65a0b
--- /dev/null
+++ b/fs/xfs/scrub/parent_repair.c
@@ -0,0 +1,1612 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trans_space.h"
+#include "xfs_health.h"
+#include "xfs_exchmaps.h"
+#include "xfs_parent.h"
+#include "xfs_attr.h"
+#include "xfs_bmap.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/findparent.h"
+#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/orphanage.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/attr_repair.h"
+#include "scrub/listxattr.h"
+
+/*
+ * Repairing The Directory Parent Pointer
+ * ======================================
+ *
+ * Currently, only directories support parent pointers (in the form of '..'
+ * entries), so we simply scan the filesystem and update the '..' entry.
+ *
+ * Note that because the only parent pointer is the dotdot entry, we won't
+ * touch an unhealthy directory, since the directory repair code is perfectly
+ * capable of rebuilding a directory with the proper parent inode.
+ *
+ * See the section on locking issues in dir_repair.c for more information about
+ * conflicts with the VFS. The findparent code wll keep our incore parent
+ * inode up to date.
+ *
+ * If parent pointers are enabled, we instead reconstruct the parent pointer
+ * information by visiting every directory entry of every directory in the
+ * system and translating the relevant dirents into parent pointers. In this
+ * case, it is advantageous to stash all parent pointers created from dirents
+ * from a single parent file before replaying them into the temporary file. To
+ * save memory, the live filesystem scan reuses the findparent object. Parent
+ * pointer repair chooses either directory scanning or findparent, but not
+ * both.
+ *
+ * When salvaging completes, the remaining stashed entries are replayed to the
+ * temporary file. All non-parent pointer extended attributes are copied to
+ * the temporary file's extended attributes. An atomic file mapping exchange
+ * is used to commit the new xattr blocks to the file being repaired. This
+ * will disrupt attrmulti cursors.
+ */
+
+/* Create a parent pointer in the tempfile. */
+#define XREP_PPTR_ADD (1)
+
+/* Remove a parent pointer from the tempfile. */
+#define XREP_PPTR_REMOVE (2)
+
+/* A stashed parent pointer update. */
+struct xrep_pptr {
+ /* Cookie for retrieval of the pptr name. */
+ xfblob_cookie name_cookie;
+
+ /* Parent pointer record. */
+ struct xfs_parent_rec pptr_rec;
+
+ /* Length of the pptr name. */
+ uint8_t namelen;
+
+ /* XREP_PPTR_{ADD,REMOVE} */
+ uint8_t action;
+};
+
+/*
+ * Stash up to 8 pages of recovered parent pointers in pptr_recs and
+ * pptr_names before we write them to the temp file.
+ */
+#define XREP_PARENT_MAX_STASH_BYTES (PAGE_SIZE * 8)
+
+struct xrep_parent {
+ struct xfs_scrub *sc;
+
+ /* Fixed-size array of xrep_pptr structures. */
+ struct xfarray *pptr_recs;
+
+ /* Blobs containing parent pointer names. */
+ struct xfblob *pptr_names;
+
+ /* xattr keys */
+ struct xfarray *xattr_records;
+
+ /* xattr values */
+ struct xfblob *xattr_blobs;
+
+ /* Scratch buffers for saving extended attributes */
+ unsigned char *xattr_name;
+ void *xattr_value;
+ unsigned int xattr_value_sz;
+
+ /*
+ * Information used to exchange the attr fork mappings, if the fs
+ * supports parent pointers.
+ */
+ struct xrep_tempexch tx;
+
+ /*
+ * Information used to scan the filesystem to find the inumber of the
+ * dotdot entry for this directory. On filesystems without parent
+ * pointers, we use the findparent_* functions on this object and
+ * access only the parent_ino field directly.
+ *
+ * When parent pointers are enabled, the directory entry scanner uses
+ * the iscan, hooks, and lock fields of this object directly.
+ * @pscan.lock coordinates access to pptr_recs, pptr_names, pptr, and
+ * pptr_scratch. This reduces the memory requirements of this
+ * structure.
+ *
+ * The lock also controls access to xattr_records and xattr_blobs(?)
+ */
+ struct xrep_parent_scan_info pscan;
+
+ /* Orphanage reparenting request. */
+ struct xrep_adoption adoption;
+
+ /* Directory entry name, plus the trailing null. */
+ struct xfs_name xname;
+ unsigned char namebuf[MAXNAMELEN];
+
+ /* Scratch buffer for scanning pptr xattrs */
+ struct xfs_da_args pptr_args;
+
+ /* Have we seen any live updates of parent pointers recently? */
+ bool saw_pptr_updates;
+
+ /* Number of parents we found after all other repairs */
+ unsigned long long parents;
+};
+
+struct xrep_parent_xattr {
+ /* Cookie for retrieval of the xattr name. */
+ xfblob_cookie name_cookie;
+
+ /* Cookie for retrieval of the xattr value. */
+ xfblob_cookie value_cookie;
+
+ /* XFS_ATTR_* flags */
+ int flags;
+
+ /* Length of the value and name. */
+ uint32_t valuelen;
+ uint16_t namelen;
+};
+
+/*
+ * Stash up to 8 pages of attrs in xattr_records/xattr_blobs before we write
+ * them to the temp file.
+ */
+#define XREP_PARENT_XATTR_MAX_STASH_BYTES (PAGE_SIZE * 8)
+
+/* Tear down all the incore stuff we created. */
+static void
+xrep_parent_teardown(
+ struct xrep_parent *rp)
+{
+ xrep_findparent_scan_teardown(&rp->pscan);
+ kvfree(rp->xattr_name);
+ rp->xattr_name = NULL;
+ kvfree(rp->xattr_value);
+ rp->xattr_value = NULL;
+ if (rp->xattr_blobs)
+ xfblob_destroy(rp->xattr_blobs);
+ rp->xattr_blobs = NULL;
+ if (rp->xattr_records)
+ xfarray_destroy(rp->xattr_records);
+ rp->xattr_records = NULL;
+ if (rp->pptr_names)
+ xfblob_destroy(rp->pptr_names);
+ rp->pptr_names = NULL;
+ if (rp->pptr_recs)
+ xfarray_destroy(rp->pptr_recs);
+ rp->pptr_recs = NULL;
+}
+
+/* Set up for a parent repair. */
+int
+xrep_setup_parent(
+ struct xfs_scrub *sc)
+{
+ struct xrep_parent *rp;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+ rp = kvzalloc(sizeof(struct xrep_parent), XCHK_GFP_FLAGS);
+ if (!rp)
+ return -ENOMEM;
+ rp->sc = sc;
+ rp->xname.name = rp->namebuf;
+ sc->buf = rp;
+
+ error = xrep_tempfile_create(sc, S_IFREG);
+ if (error)
+ return error;
+
+ return xrep_orphanage_try_create(sc);
+}
+
+/*
+ * Scan all files in the filesystem for a child dirent that we can turn into
+ * the dotdot entry for this directory.
+ */
+STATIC int
+xrep_parent_find_dotdot(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ xfs_ino_t ino;
+ unsigned int sick, checked;
+ int error;
+
+ /*
+ * Avoid sick directories. There shouldn't be anyone else clearing the
+ * directory's sick status.
+ */
+ xfs_inode_measure_sickness(sc->ip, &sick, &checked);
+ if (sick & XFS_SICK_INO_DIR)
+ return -EFSCORRUPTED;
+
+ ino = xrep_findparent_self_reference(sc);
+ if (ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rp->pscan, ino);
+ return 0;
+ }
+
+ /*
+ * Drop the ILOCK on this directory so that we can scan for the dotdot
+ * entry. Figure out who is going to be the parent of this directory,
+ * then retake the ILOCK so that we can salvage directory entries.
+ */
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /* Does the VFS dcache have an answer for us? */
+ ino = xrep_findparent_from_dcache(sc);
+ if (ino != NULLFSINO) {
+ error = xrep_findparent_confirm(sc, &ino);
+ if (!error && ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rp->pscan, ino);
+ goto out_relock;
+ }
+ }
+
+ /* Scan the entire filesystem for a parent. */
+ error = xrep_findparent_scan(&rp->pscan);
+out_relock:
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+
+ return error;
+}
+
+/*
+ * Add this stashed incore parent pointer to the temporary file.
+ * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and
+ * must not be in transaction context.
+ */
+STATIC int
+xrep_parent_replay_update(
+ struct xrep_parent *rp,
+ const struct xfs_name *xname,
+ struct xrep_pptr *pptr)
+{
+ struct xfs_scrub *sc = rp->sc;
+
+ switch (pptr->action) {
+ case XREP_PPTR_ADD:
+ /* Create parent pointer. */
+ trace_xrep_parent_replay_parentadd(sc->tempip, xname,
+ &pptr->pptr_rec);
+
+ return xfs_parent_set(sc->tempip, sc->ip->i_ino, xname,
+ &pptr->pptr_rec, &rp->pptr_args);
+ case XREP_PPTR_REMOVE:
+ /* Remove parent pointer. */
+ trace_xrep_parent_replay_parentremove(sc->tempip, xname,
+ &pptr->pptr_rec);
+
+ return xfs_parent_unset(sc->tempip, sc->ip->i_ino, xname,
+ &pptr->pptr_rec, &rp->pptr_args);
+ }
+
+ ASSERT(0);
+ return -EIO;
+}
+
+/*
+ * Flush stashed parent pointer updates that have been recorded by the scanner.
+ * This is done to reduce the memory requirements of the parent pointer
+ * rebuild, since files can have a lot of hardlinks and the fs can be busy.
+ *
+ * Caller must not hold transactions or ILOCKs. Caller must hold the tempfile
+ * IOLOCK.
+ */
+STATIC int
+xrep_parent_replay_updates(
+ struct xrep_parent *rp)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ mutex_lock(&rp->pscan.lock);
+ foreach_xfarray_idx(rp->pptr_recs, array_cur) {
+ struct xrep_pptr pptr;
+
+ error = xfarray_load(rp->pptr_recs, array_cur, &pptr);
+ if (error)
+ goto out_unlock;
+
+ error = xfblob_loadname(rp->pptr_names, pptr.name_cookie,
+ &rp->xname, pptr.namelen);
+ if (error)
+ goto out_unlock;
+ rp->xname.len = pptr.namelen;
+ mutex_unlock(&rp->pscan.lock);
+
+ error = xrep_parent_replay_update(rp, &rp->xname, &pptr);
+ if (error)
+ return error;
+
+ mutex_lock(&rp->pscan.lock);
+ }
+
+ /* Empty out both arrays now that we've added the entries. */
+ xfarray_truncate(rp->pptr_recs);
+ xfblob_truncate(rp->pptr_names);
+ mutex_unlock(&rp->pscan.lock);
+ return 0;
+out_unlock:
+ mutex_unlock(&rp->pscan.lock);
+ return error;
+}
+
+/*
+ * Remember that we want to create a parent pointer in the tempfile. These
+ * stashed actions will be replayed later.
+ */
+STATIC int
+xrep_parent_stash_parentadd(
+ struct xrep_parent *rp,
+ const struct xfs_name *name,
+ const struct xfs_inode *dp)
+{
+ struct xrep_pptr pptr = {
+ .action = XREP_PPTR_ADD,
+ .namelen = name->len,
+ };
+ int error;
+
+ trace_xrep_parent_stash_parentadd(rp->sc->tempip, dp, name);
+
+ xfs_inode_to_parent_rec(&pptr.pptr_rec, dp);
+ error = xfblob_storename(rp->pptr_names, &pptr.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rp->pptr_recs, &pptr);
+}
+
+/*
+ * Remember that we want to remove a parent pointer from the tempfile. These
+ * stashed actions will be replayed later.
+ */
+STATIC int
+xrep_parent_stash_parentremove(
+ struct xrep_parent *rp,
+ const struct xfs_name *name,
+ const struct xfs_inode *dp)
+{
+ struct xrep_pptr pptr = {
+ .action = XREP_PPTR_REMOVE,
+ .namelen = name->len,
+ };
+ int error;
+
+ trace_xrep_parent_stash_parentremove(rp->sc->tempip, dp, name);
+
+ xfs_inode_to_parent_rec(&pptr.pptr_rec, dp);
+ error = xfblob_storename(rp->pptr_names, &pptr.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rp->pptr_recs, &pptr);
+}
+
+/*
+ * Examine an entry of a directory. If this dirent leads us back to the file
+ * whose parent pointers we're rebuilding, add a pptr to the temporary
+ * directory.
+ */
+STATIC int
+xrep_parent_scan_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xrep_parent *rp = priv;
+ int error;
+
+ /* Dirent doesn't point to this directory. */
+ if (ino != rp->sc->ip->i_ino)
+ return 0;
+
+ /* No weird looking names. */
+ if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
+ return -EFSCORRUPTED;
+
+ /* No mismatching ftypes. */
+ if (name->type != xfs_mode_to_ftype(VFS_I(sc->ip)->i_mode))
+ return -EFSCORRUPTED;
+
+ /* Don't pick up dot or dotdot entries; we only want child dirents. */
+ if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
+ xfs_dir2_samename(name, &xfs_name_dot))
+ return 0;
+
+ /*
+ * Transform this dirent into a parent pointer and queue it for later
+ * addition to the temporary file.
+ */
+ mutex_lock(&rp->pscan.lock);
+ error = xrep_parent_stash_parentadd(rp, name, dp);
+ mutex_unlock(&rp->pscan.lock);
+ return error;
+}
+
+/*
+ * Decide if we want to look for dirents in this directory. Skip the file
+ * being repaired and any files being used to stage repairs.
+ */
+static inline bool
+xrep_parent_want_scan(
+ struct xrep_parent *rp,
+ const struct xfs_inode *ip)
+{
+ return ip != rp->sc->ip && !xrep_is_tempfile(ip);
+}
+
+/*
+ * Take ILOCK on a file that we want to scan.
+ *
+ * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt.
+ * Otherwise, take ILOCK_SHARED.
+ */
+static inline unsigned int
+xrep_parent_scan_ilock(
+ struct xrep_parent *rp,
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ /* Still need to take the shared ILOCK to advance the iscan cursor. */
+ if (!xrep_parent_want_scan(rp, ip))
+ goto lock;
+
+ if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) {
+ lock_mode = XFS_ILOCK_EXCL;
+ goto lock;
+ }
+
+lock:
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
+
+/*
+ * Scan this file for relevant child dirents that point to the file whose
+ * parent pointers we're rebuilding.
+ */
+STATIC int
+xrep_parent_scan_file(
+ struct xrep_parent *rp,
+ struct xfs_inode *ip)
+{
+ unsigned int lock_mode;
+ int error = 0;
+
+ lock_mode = xrep_parent_scan_ilock(rp, ip);
+
+ if (!xrep_parent_want_scan(rp, ip))
+ goto scan_done;
+
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ /*
+ * If the directory looks as though it has been zapped by the
+ * inode record repair code, we cannot scan for child dirents.
+ */
+ if (xchk_dir_looks_zapped(ip)) {
+ error = -EBUSY;
+ goto scan_done;
+ }
+
+ error = xchk_dir_walk(rp->sc, ip, xrep_parent_scan_dirent, rp);
+ if (error)
+ goto scan_done;
+ }
+
+scan_done:
+ xchk_iscan_mark_visited(&rp->pscan.iscan, ip);
+ xfs_iunlock(ip, lock_mode);
+ return error;
+}
+
+/* Decide if we've stashed too much pptr data in memory. */
+static inline bool
+xrep_parent_want_flush_stashed(
+ struct xrep_parent *rp)
+{
+ unsigned long long bytes;
+
+ bytes = xfarray_bytes(rp->pptr_recs) + xfblob_bytes(rp->pptr_names);
+ return bytes > XREP_PARENT_MAX_STASH_BYTES;
+}
+
+/*
+ * Scan all directories in the filesystem to look for dirents that we can turn
+ * into parent pointers.
+ */
+STATIC int
+xrep_parent_scan_dirtree(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ struct xfs_inode *ip;
+ int error;
+
+ /*
+ * Filesystem scans are time consuming. Drop the file ILOCK and all
+ * other resources for the duration of the scan and hope for the best.
+ * The live update hooks will keep our scan information up to date.
+ */
+ xchk_trans_cancel(sc);
+ if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
+ xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
+ XFS_ILOCK_EXCL));
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ while ((error = xchk_iscan_iter(&rp->pscan.iscan, &ip)) == 1) {
+ bool flush;
+
+ error = xrep_parent_scan_file(rp, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ /* Flush stashed pptr updates to constrain memory usage. */
+ mutex_lock(&rp->pscan.lock);
+ flush = xrep_parent_want_flush_stashed(rp);
+ mutex_unlock(&rp->pscan.lock);
+ if (flush) {
+ xchk_trans_cancel(sc);
+
+ error = xrep_tempfile_iolock_polled(sc);
+ if (error)
+ break;
+
+ error = xrep_parent_replay_updates(rp);
+ xrep_tempfile_iounlock(sc);
+ if (error)
+ break;
+
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ break;
+ }
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&rp->pscan.iscan);
+ if (error) {
+ /*
+ * If we couldn't grab an inode that was busy with a state
+ * change, change the error code so that we exit to userspace
+ * as quickly as possible.
+ */
+ if (error == -EBUSY)
+ return -ECANCELED;
+ return error;
+ }
+
+ /*
+ * Retake sc->ip's ILOCK now that we're done flushing stashed parent
+ * pointers. We end this function with an empty transaction and the
+ * ILOCK.
+ */
+ xchk_ilock(rp->sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Capture dirent updates being made by other threads which are relevant to the
+ * file being repaired.
+ */
+STATIC int
+xrep_parent_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xrep_parent *rp;
+ struct xfs_scrub *sc;
+ int error;
+
+ rp = container_of(nb, struct xrep_parent, pscan.dhook.dirent_hook.nb);
+ sc = rp->sc;
+
+ /*
+ * This thread updated a dirent that points to the file that we're
+ * repairing, so stash the update for replay against the temporary
+ * file.
+ */
+ if (p->ip->i_ino == sc->ip->i_ino &&
+ xchk_iscan_want_live_update(&rp->pscan.iscan, p->dp->i_ino)) {
+ mutex_lock(&rp->pscan.lock);
+ if (p->delta > 0)
+ error = xrep_parent_stash_parentadd(rp, p->name, p->dp);
+ else
+ error = xrep_parent_stash_parentremove(rp, p->name,
+ p->dp);
+ if (!error)
+ rp->saw_pptr_updates = true;
+ mutex_unlock(&rp->pscan.lock);
+ if (error)
+ goto out_abort;
+ }
+
+ return NOTIFY_DONE;
+out_abort:
+ xchk_iscan_abort(&rp->pscan.iscan);
+ return NOTIFY_DONE;
+}
+
+/* Reset a directory's dotdot entry, if needed. */
+STATIC int
+xrep_parent_reset_dotdot(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ xfs_ino_t ino;
+ unsigned int spaceres;
+ int error = 0;
+
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &ino);
+ if (error || ino == rp->pscan.parent_ino)
+ return error;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ trace_xrep_parent_reset_dotdot(sc->ip, rp->pscan.parent_ino);
+
+ /*
+ * Reserve more space just in case we have to expand the dir. We're
+ * allowed to exceed quota to repair inconsistent metadata.
+ */
+ spaceres = xfs_rename_space_res(sc->mp, 0, false, xfs_name_dotdot.len,
+ false);
+ error = xfs_trans_reserve_more_inode(sc->tp, sc->ip, spaceres, 0,
+ true);
+ if (error)
+ return error;
+
+ error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot,
+ rp->pscan.parent_ino, spaceres);
+ if (error)
+ return error;
+
+ /*
+ * Roll transaction to detach the inode from the transaction but retain
+ * ILOCK_EXCL.
+ */
+ return xfs_trans_roll(&sc->tp);
+}
+
+/* Pass back the parent inumber if this a parent pointer */
+STATIC int
+xrep_parent_lookup_pptr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ xfs_ino_t *inop = priv;
+ xfs_ino_t parent_ino;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, &parent_ino, NULL);
+ if (error)
+ return error;
+
+ *inop = parent_ino;
+ return -ECANCELED;
+}
+
+/*
+ * Find the first parent of the scrub target by walking parent pointers for
+ * the purpose of deciding if we're going to move it to the orphanage.
+ * We don't care if the attr fork is zapped.
+ */
+STATIC int
+xrep_parent_lookup_pptrs(
+ struct xfs_scrub *sc,
+ xfs_ino_t *inop)
+{
+ int error;
+
+ *inop = NULLFSINO;
+
+ error = xchk_xattr_walk(sc, sc->ip, xrep_parent_lookup_pptr, NULL,
+ inop);
+ if (error && error != -ECANCELED)
+ return error;
+ return 0;
+}
+
+/*
+ * Move the current file to the orphanage.
+ *
+ * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon
+ * successful return, the scrub transaction will have enough extra reservation
+ * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the
+ * orphanage; and both inodes will be ijoined.
+ */
+STATIC int
+xrep_parent_move_to_orphanage(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ xfs_ino_t orig_parent, new_parent;
+ int error;
+
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+ /*
+ * We are about to drop the ILOCK on sc->ip to lock the
+ * orphanage and prepare for the adoption. Therefore, look up
+ * the old dotdot entry for sc->ip so that we can compare it
+ * after we re-lock sc->ip.
+ */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot,
+ &orig_parent);
+ if (error)
+ return error;
+ } else {
+ /*
+ * We haven't dropped the ILOCK since we committed the new
+ * xattr structure (and hence the new parent pointer records),
+ * which means that the file cannot have been moved in the
+ * directory tree, and there are no parents.
+ */
+ orig_parent = NULLFSINO;
+ }
+
+ /*
+ * Drop the ILOCK on the scrub target and commit the transaction.
+ * Adoption computes its own resource requirements and gathers the
+ * necessary components.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /* If we can take the orphanage's iolock then we're ready to move. */
+ if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
+ xchk_iunlock(sc, sc->ilock_flags);
+ error = xrep_orphanage_iolock_two(sc);
+ if (error)
+ return error;
+ }
+
+ /* Grab transaction and ILOCK the two files. */
+ error = xrep_adoption_trans_alloc(sc, &rp->adoption);
+ if (error)
+ return error;
+
+ error = xrep_adoption_compute_name(&rp->adoption, &rp->xname);
+ if (error)
+ return error;
+
+ /*
+ * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
+ * entry again. If the parent changed or the child was unlinked while
+ * the child directory was unlocked, we don't need to move the child to
+ * the orphanage after all. For a non-directory, we have to scan for
+ * the first parent pointer to see if one has been added.
+ */
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode))
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot,
+ &new_parent);
+ else
+ error = xrep_parent_lookup_pptrs(sc, &new_parent);
+ if (error)
+ return error;
+
+ /*
+ * Attach to the orphanage if we still have a linked directory and it
+ * hasn't been moved.
+ */
+ if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) {
+ error = xrep_adoption_move(&rp->adoption);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Launder the scrub transaction so we can drop the orphanage ILOCK
+ * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK.
+ */
+ error = xrep_adoption_trans_roll(&rp->adoption);
+ if (error)
+ return error;
+
+ xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ return 0;
+}
+
+/* Ensure that the xattr value buffer is large enough. */
+STATIC int
+xrep_parent_alloc_xattr_value(
+ struct xrep_parent *rp,
+ size_t bufsize)
+{
+ void *new_val;
+
+ if (rp->xattr_value_sz >= bufsize)
+ return 0;
+
+ if (rp->xattr_value) {
+ kvfree(rp->xattr_value);
+ rp->xattr_value = NULL;
+ rp->xattr_value_sz = 0;
+ }
+
+ new_val = kvmalloc(bufsize, XCHK_GFP_FLAGS);
+ if (!new_val)
+ return -ENOMEM;
+
+ rp->xattr_value = new_val;
+ rp->xattr_value_sz = bufsize;
+ return 0;
+}
+
+/* Retrieve the (remote) value of a non-pptr xattr. */
+STATIC int
+xrep_parent_fetch_xattr_remote(
+ struct xrep_parent *rp,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ unsigned int valuelen)
+{
+ struct xfs_scrub *sc = rp->sc;
+ struct xfs_da_args args = {
+ .attr_filter = attr_flags & XFS_ATTR_NSP_ONDISK_MASK,
+ .geo = sc->mp->m_attr_geo,
+ .whichfork = XFS_ATTR_FORK,
+ .dp = ip,
+ .name = name,
+ .namelen = namelen,
+ .trans = sc->tp,
+ .valuelen = valuelen,
+ .owner = ip->i_ino,
+ };
+ int error;
+
+ /*
+ * If we need a larger value buffer, try to allocate one. If that
+ * fails, return with -EDEADLOCK to try harder.
+ */
+ error = xrep_parent_alloc_xattr_value(rp, valuelen);
+ if (error == -ENOMEM)
+ return -EDEADLOCK;
+ if (error)
+ return error;
+
+ args.value = rp->xattr_value;
+ xfs_attr_sethash(&args);
+ return xfs_attr_get_ilocked(&args);
+}
+
+/* Stash non-pptr attributes for later replay into the temporary file. */
+STATIC int
+xrep_parent_stash_xattr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xrep_parent_xattr key = {
+ .valuelen = valuelen,
+ .namelen = namelen,
+ .flags = attr_flags & XFS_ATTR_NSP_ONDISK_MASK,
+ };
+ struct xrep_parent *rp = priv;
+ int error;
+
+ if (attr_flags & (XFS_ATTR_INCOMPLETE | XFS_ATTR_PARENT))
+ return 0;
+
+ if (!value) {
+ error = xrep_parent_fetch_xattr_remote(rp, ip, attr_flags,
+ name, namelen, valuelen);
+ if (error)
+ return error;
+
+ value = rp->xattr_value;
+ }
+
+ trace_xrep_parent_stash_xattr(rp->sc->tempip, key.flags, (void *)name,
+ key.namelen, key.valuelen);
+
+ error = xfblob_store(rp->xattr_blobs, &key.name_cookie, name,
+ key.namelen);
+ if (error)
+ return error;
+
+ error = xfblob_store(rp->xattr_blobs, &key.value_cookie, value,
+ key.valuelen);
+ if (error)
+ return error;
+
+ return xfarray_append(rp->xattr_records, &key);
+}
+
+/* Insert one xattr key/value. */
+STATIC int
+xrep_parent_insert_xattr(
+ struct xrep_parent *rp,
+ const struct xrep_parent_xattr *key)
+{
+ struct xfs_da_args args = {
+ .dp = rp->sc->tempip,
+ .attr_filter = key->flags,
+ .namelen = key->namelen,
+ .valuelen = key->valuelen,
+ .owner = rp->sc->ip->i_ino,
+ .geo = rp->sc->mp->m_attr_geo,
+ .whichfork = XFS_ATTR_FORK,
+ .op_flags = XFS_DA_OP_OKNOENT,
+ };
+ int error;
+
+ ASSERT(!(key->flags & XFS_ATTR_PARENT));
+
+ /*
+ * Grab pointers to the scrub buffer so that we can use them to insert
+ * attrs into the temp file.
+ */
+ args.name = rp->xattr_name;
+ args.value = rp->xattr_value;
+
+ /*
+ * The attribute name is stored near the end of the in-core buffer,
+ * though we reserve one more byte to ensure null termination.
+ */
+ rp->xattr_name[XATTR_NAME_MAX] = 0;
+
+ error = xfblob_load(rp->xattr_blobs, key->name_cookie, rp->xattr_name,
+ key->namelen);
+ if (error)
+ return error;
+
+ error = xfblob_free(rp->xattr_blobs, key->name_cookie);
+ if (error)
+ return error;
+
+ error = xfblob_load(rp->xattr_blobs, key->value_cookie, args.value,
+ key->valuelen);
+ if (error)
+ return error;
+
+ error = xfblob_free(rp->xattr_blobs, key->value_cookie);
+ if (error)
+ return error;
+
+ rp->xattr_name[key->namelen] = 0;
+
+ trace_xrep_parent_insert_xattr(rp->sc->tempip, key->flags,
+ rp->xattr_name, key->namelen, key->valuelen);
+
+ xfs_attr_sethash(&args);
+ return xfs_attr_set(&args, XFS_ATTRUPDATE_UPSERT, false);
+}
+
+/*
+ * Periodically flush salvaged attributes to the temporary file. This is done
+ * to reduce the memory requirements of the xattr rebuild because files can
+ * contain millions of attributes.
+ */
+STATIC int
+xrep_parent_flush_xattrs(
+ struct xrep_parent *rp)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ /*
+ * Entering this function, the scrub context has a reference to the
+ * inode being repaired, the temporary file, and the empty scrub
+ * transaction that we created for the xattr scan. We hold ILOCK_EXCL
+ * on the inode being repaired.
+ *
+ * To constrain kernel memory use, we occasionally flush salvaged
+ * xattrs from the xfarray and xfblob structures into the temporary
+ * file in preparation for exchanging the xattr structures at the end.
+ * Updating the temporary file requires a transaction, so we commit the
+ * scrub transaction and drop the ILOCK so that xfs_attr_set can
+ * allocate whatever transaction it wants.
+ *
+ * We still hold IOLOCK_EXCL on the inode being repaired, which
+ * prevents anyone from adding xattrs (or parent pointers) while we're
+ * flushing.
+ */
+ xchk_trans_cancel(rp->sc);
+ xchk_iunlock(rp->sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the IOLOCK of the temporary file while we modify xattrs. This
+ * isn't strictly required because the temporary file is never revealed
+ * to userspace, but we follow the same locking rules. We still hold
+ * sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rp->sc);
+ if (error)
+ return error;
+
+ /* Add all the salvaged attrs to the temporary file. */
+ foreach_xfarray_idx(rp->xattr_records, array_cur) {
+ struct xrep_parent_xattr key;
+
+ error = xfarray_load(rp->xattr_records, array_cur, &key);
+ if (error)
+ return error;
+
+ error = xrep_parent_insert_xattr(rp, &key);
+ if (error)
+ return error;
+ }
+
+ /* Empty out both arrays now that we've added the entries. */
+ xfarray_truncate(rp->xattr_records);
+ xfblob_truncate(rp->xattr_blobs);
+
+ xrep_tempfile_iounlock(rp->sc);
+
+ /* Recreate the empty transaction and relock the inode. */
+ error = xchk_trans_alloc_empty(rp->sc);
+ if (error)
+ return error;
+ xchk_ilock(rp->sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/* Decide if we've stashed too much xattr data in memory. */
+static inline bool
+xrep_parent_want_flush_xattrs(
+ struct xrep_parent *rp)
+{
+ unsigned long long bytes;
+
+ bytes = xfarray_bytes(rp->xattr_records) +
+ xfblob_bytes(rp->xattr_blobs);
+ return bytes > XREP_PARENT_XATTR_MAX_STASH_BYTES;
+}
+
+/* Flush staged attributes to the temporary file if we're over the limit. */
+STATIC int
+xrep_parent_try_flush_xattrs(
+ struct xfs_scrub *sc,
+ void *priv)
+{
+ struct xrep_parent *rp = priv;
+ int error;
+
+ if (!xrep_parent_want_flush_xattrs(rp))
+ return 0;
+
+ error = xrep_parent_flush_xattrs(rp);
+ if (error)
+ return error;
+
+ /*
+ * If there were any parent pointer updates to the xattr structure
+ * while we dropped the ILOCK, the xattr structure is now stale.
+ * Signal to the attr copy process that we need to start over, but
+ * this time without opportunistic attr flushing.
+ *
+ * This is unlikely to happen, so we're ok with restarting the copy.
+ */
+ mutex_lock(&rp->pscan.lock);
+ if (rp->saw_pptr_updates)
+ error = -ESTALE;
+ mutex_unlock(&rp->pscan.lock);
+ return error;
+}
+
+/* Copy all the non-pptr extended attributes into the temporary file. */
+STATIC int
+xrep_parent_copy_xattrs(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ int error;
+
+ /*
+ * Clear the pptr updates flag. We hold sc->ip ILOCKed, so there
+ * can't be any parent pointer updates in progress.
+ */
+ mutex_lock(&rp->pscan.lock);
+ rp->saw_pptr_updates = false;
+ mutex_unlock(&rp->pscan.lock);
+
+ /* Copy xattrs, stopping periodically to flush the incore buffers. */
+ error = xchk_xattr_walk(sc, sc->ip, xrep_parent_stash_xattr,
+ xrep_parent_try_flush_xattrs, rp);
+ if (error && error != -ESTALE)
+ return error;
+
+ if (error == -ESTALE) {
+ /*
+ * The xattr copy collided with a parent pointer update.
+ * Restart the copy, but this time hold the ILOCK all the way
+ * to the end to lock out any directory parent pointer updates.
+ */
+ error = xchk_xattr_walk(sc, sc->ip, xrep_parent_stash_xattr,
+ NULL, rp);
+ if (error)
+ return error;
+ }
+
+ /* Flush any remaining stashed xattrs to the temporary file. */
+ if (xfarray_bytes(rp->xattr_records) == 0)
+ return 0;
+
+ return xrep_parent_flush_xattrs(rp);
+}
+
+/*
+ * Ensure that @sc->ip and @sc->tempip both have attribute forks before we head
+ * into the attr fork exchange transaction. All files on a filesystem with
+ * parent pointers must have an attr fork because the parent pointer code does
+ * not itself add attribute forks.
+ *
+ * Note: Unlinkable unlinked files don't need one, but the overhead of having
+ * an unnecessary attr fork is not justified by the additional code complexity
+ * that would be needed to track that state correctly.
+ */
+STATIC int
+xrep_parent_ensure_attr_fork(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ int error;
+
+ error = xfs_attr_add_fork(sc->tempip,
+ sizeof(struct xfs_attr_sf_hdr), 1);
+ if (error)
+ return error;
+ return xfs_attr_add_fork(sc->ip, sizeof(struct xfs_attr_sf_hdr), 1);
+}
+
+/*
+ * Finish replaying stashed parent pointer updates, allocate a transaction for
+ * exchanging extent mappings, and take the ILOCKs of both files before we
+ * commit the new attribute structure.
+ */
+STATIC int
+xrep_parent_finalize_tempfile(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ int error;
+
+ /*
+ * Repair relies on the ILOCK to quiesce all possible xattr updates.
+ * Replay all queued parent pointer updates into the tempfile before
+ * exchanging the contents, even if that means dropping the ILOCKs and
+ * the transaction.
+ */
+ do {
+ error = xrep_parent_replay_updates(rp);
+ if (error)
+ return error;
+
+ error = xrep_parent_ensure_attr_fork(rp);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rp->tx);
+ if (error)
+ return error;
+
+ if (xfarray_length(rp->pptr_recs) == 0)
+ break;
+
+ xchk_trans_cancel(sc);
+ xrep_tempfile_iunlock_both(sc);
+ } while (!xchk_should_terminate(sc, &error));
+ return error;
+}
+
+/*
+ * Replay all the stashed parent pointers into the temporary file, copy all
+ * the non-pptr xattrs from the file being repaired into the temporary file,
+ * and exchange the attr fork contents atomically.
+ */
+STATIC int
+xrep_parent_rebuild_pptrs(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ xfs_ino_t parent_ino = NULLFSINO;
+ int error;
+
+ /*
+ * Copy non-ppttr xattrs from the file being repaired into the
+ * temporary file's xattr structure. We hold sc->ip's IOLOCK, which
+ * prevents setxattr/removexattr calls from occurring, but renames
+ * update the parent pointers without holding IOLOCK. If we detect
+ * stale attr structures, we restart the scan but only flush at the
+ * end.
+ */
+ error = xrep_parent_copy_xattrs(rp);
+ if (error)
+ return error;
+
+ /*
+ * Cancel the empty transaction that we used to walk and copy attrs,
+ * and drop the ILOCK so that we can take the IOLOCK on the temporary
+ * file. We still hold sc->ip's IOLOCK.
+ */
+ xchk_trans_cancel(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ error = xrep_tempfile_iolock_polled(sc);
+ if (error)
+ return error;
+
+ /*
+ * Allocate transaction, lock inodes, and make sure that we've replayed
+ * all the stashed pptr updates to the tempdir. After this point,
+ * we're ready to exchange the attr fork mappings.
+ */
+ error = xrep_parent_finalize_tempfile(rp);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing pptr fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ if (xchk_iscan_aborted(&rp->pscan.iscan))
+ return -ECANCELED;
+
+ /*
+ * Exchange the attr fork contents and junk the old attr fork contents,
+ * which are now in the tempfile.
+ */
+ error = xrep_xattr_swap(sc, &rp->tx);
+ if (error)
+ return error;
+ error = xrep_xattr_reset_tempfile_fork(sc);
+ if (error)
+ return error;
+
+ /*
+ * Roll to get a transaction without any inodes joined to it. Then we
+ * can drop the tempfile's ILOCK and IOLOCK before doing more work on
+ * the scrub target file.
+ */
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+ xrep_tempfile_iunlock(sc);
+ xrep_tempfile_iounlock(sc);
+
+ /*
+ * We've committed the new parent pointers. Find at least one parent
+ * so that we can decide if we're moving this file to the orphanage.
+ * For this purpose, root directories are their own parents.
+ */
+ if (sc->ip == sc->mp->m_rootip) {
+ xrep_findparent_scan_found(&rp->pscan, sc->ip->i_ino);
+ } else {
+ error = xrep_parent_lookup_pptrs(sc, &parent_ino);
+ if (error)
+ return error;
+ if (parent_ino != NULLFSINO)
+ xrep_findparent_scan_found(&rp->pscan, parent_ino);
+ }
+ return 0;
+}
+
+/*
+ * Commit the new parent pointer structure (currently only the dotdot entry) to
+ * the file that we're repairing.
+ */
+STATIC int
+xrep_parent_rebuild_tree(
+ struct xrep_parent *rp)
+{
+ int error;
+
+ if (xfs_has_parent(rp->sc->mp)) {
+ error = xrep_parent_rebuild_pptrs(rp);
+ if (error)
+ return error;
+ }
+
+ if (rp->pscan.parent_ino == NULLFSINO) {
+ if (xrep_orphanage_can_adopt(rp->sc))
+ return xrep_parent_move_to_orphanage(rp);
+ return -EFSCORRUPTED;
+ }
+
+ if (S_ISDIR(VFS_I(rp->sc->ip)->i_mode))
+ return xrep_parent_reset_dotdot(rp);
+
+ return 0;
+}
+
+/* Count the number of parent pointers. */
+STATIC int
+xrep_parent_count_pptr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xrep_parent *rp = priv;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, NULL, NULL);
+ if (error)
+ return error;
+
+ rp->parents++;
+ return 0;
+}
+
+/*
+ * After all parent pointer rebuilding and adoption activity completes, reset
+ * the link count of this nondirectory, having scanned the fs to rebuild all
+ * parent pointers.
+ */
+STATIC int
+xrep_parent_set_nondir_nlink(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_perag *pag;
+ bool joined = false;
+ int error;
+
+ /* Count parent pointers so we can reset the file link count. */
+ rp->parents = 0;
+ error = xchk_xattr_walk(sc, ip, xrep_parent_count_pptr, NULL, rp);
+ if (error)
+ return error;
+
+ if (rp->parents > 0 && xfs_inode_on_unlinked_list(ip)) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ joined = true;
+
+ /*
+ * The file is on the unlinked list but we found parents.
+ * Remove the file from the unlinked list.
+ */
+ pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, ip->i_ino));
+ if (!pag) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = xfs_iunlink_remove(sc->tp, pag, ip);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+ } else if (rp->parents == 0 && !xfs_inode_on_unlinked_list(ip)) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ joined = true;
+
+ /*
+ * The file is not on the unlinked list but we found no
+ * parents. Add the file to the unlinked list.
+ */
+ error = xfs_iunlink(sc->tp, ip);
+ if (error)
+ return error;
+ }
+
+ /* Set the correct link count. */
+ if (VFS_I(ip)->i_nlink != rp->parents) {
+ if (!joined) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ joined = true;
+ }
+
+ set_nlink(VFS_I(ip), min_t(unsigned long long, rp->parents,
+ XFS_NLINK_PINNED));
+ }
+
+ /* Log the inode to keep it moving forward if we dirtied anything. */
+ if (joined)
+ xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE);
+ return 0;
+}
+
+/* Set up the filesystem scan so we can look for parents. */
+STATIC int
+xrep_parent_setup_scan(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ char *descr;
+ struct xfs_da_geometry *geo = sc->mp->m_attr_geo;
+ int max_len;
+ int error;
+
+ if (!xfs_has_parent(sc->mp))
+ return xrep_findparent_scan_start(sc, &rp->pscan);
+
+ /* Buffers for copying non-pptr attrs to the tempfile */
+ rp->xattr_name = kvmalloc(XATTR_NAME_MAX + 1, XCHK_GFP_FLAGS);
+ if (!rp->xattr_name)
+ return -ENOMEM;
+
+ /*
+ * Allocate enough memory to handle loading local attr values from the
+ * xfblob data while flushing stashed attrs to the temporary file.
+ * We only realloc the buffer when salvaging remote attr values, so
+ * TRY_HARDER means we allocate the maximal attr value size.
+ */
+ if (sc->flags & XCHK_TRY_HARDER)
+ max_len = XATTR_SIZE_MAX;
+ else
+ max_len = xfs_attr_leaf_entsize_local_max(geo->blksize);
+ error = xrep_parent_alloc_xattr_value(rp, max_len);
+ if (error)
+ goto out_xattr_name;
+
+ /* Set up some staging memory for logging parent pointer updates. */
+ descr = xchk_xfile_ino_descr(sc, "parent pointer entries");
+ error = xfarray_create(descr, 0, sizeof(struct xrep_pptr),
+ &rp->pptr_recs);
+ kfree(descr);
+ if (error)
+ goto out_xattr_value;
+
+ descr = xchk_xfile_ino_descr(sc, "parent pointer names");
+ error = xfblob_create(descr, &rp->pptr_names);
+ kfree(descr);
+ if (error)
+ goto out_recs;
+
+ /* Set up some storage for copying attrs before the mapping exchange */
+ descr = xchk_xfile_ino_descr(sc,
+ "parent pointer retained xattr entries");
+ error = xfarray_create(descr, 0, sizeof(struct xrep_parent_xattr),
+ &rp->xattr_records);
+ kfree(descr);
+ if (error)
+ goto out_names;
+
+ descr = xchk_xfile_ino_descr(sc,
+ "parent pointer retained xattr values");
+ error = xfblob_create(descr, &rp->xattr_blobs);
+ kfree(descr);
+ if (error)
+ goto out_attr_keys;
+
+ error = __xrep_findparent_scan_start(sc, &rp->pscan,
+ xrep_parent_live_update);
+ if (error)
+ goto out_attr_values;
+
+ return 0;
+
+out_attr_values:
+ xfblob_destroy(rp->xattr_blobs);
+ rp->xattr_blobs = NULL;
+out_attr_keys:
+ xfarray_destroy(rp->xattr_records);
+ rp->xattr_records = NULL;
+out_names:
+ xfblob_destroy(rp->pptr_names);
+ rp->pptr_names = NULL;
+out_recs:
+ xfarray_destroy(rp->pptr_recs);
+ rp->pptr_recs = NULL;
+out_xattr_value:
+ kvfree(rp->xattr_value);
+ rp->xattr_value = NULL;
+out_xattr_name:
+ kvfree(rp->xattr_name);
+ rp->xattr_name = NULL;
+ return error;
+}
+
+int
+xrep_parent(
+ struct xfs_scrub *sc)
+{
+ struct xrep_parent *rp = sc->buf;
+ int error;
+
+ /*
+ * When the parent pointers feature is enabled, repairs are committed
+ * by atomically committing a new xattr structure and reaping the old
+ * attr fork. Reaping requires rmap and exchange-range to be enabled.
+ */
+ if (xfs_has_parent(sc->mp)) {
+ if (!xfs_has_rmapbt(sc->mp))
+ return -EOPNOTSUPP;
+ if (!xfs_has_exchange_range(sc->mp))
+ return -EOPNOTSUPP;
+ }
+
+ error = xrep_parent_setup_scan(rp);
+ if (error)
+ return error;
+
+ if (xfs_has_parent(sc->mp))
+ error = xrep_parent_scan_dirtree(rp);
+ else
+ error = xrep_parent_find_dotdot(rp);
+ if (error)
+ goto out_teardown;
+
+ /* Last chance to abort before we start committing dotdot fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto out_teardown;
+
+ error = xrep_parent_rebuild_tree(rp);
+ if (error)
+ goto out_teardown;
+ if (xfs_has_parent(sc->mp) && !S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+ error = xrep_parent_set_nondir_nlink(rp);
+ if (error)
+ goto out_teardown;
+ }
+
+ error = xrep_defer_finish(sc);
+
+out_teardown:
+ xrep_parent_teardown(rp);
+ return error;
+}
diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c
index 0bab4c30cb85..90cd1512bba9 100644
--- a/fs/xfs/scrub/quota_repair.c
+++ b/fs/xfs/scrub/quota_repair.c
@@ -77,8 +77,6 @@ xrep_quota_item_fill_bmap_hole(
irec, &nmaps);
if (error)
return error;
- if (nmaps != 1)
- return -ENOSPC;
dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock);
@@ -444,10 +442,6 @@ xrep_quota_data_fork(
XFS_BMAPI_CONVERT, 0, &nrec, &nmap);
if (error)
goto out;
- if (nmap != 1) {
- error = -ENOSPC;
- goto out;
- }
ASSERT(nrec.br_startoff == irec.br_startoff);
ASSERT(nrec.br_blockcount == irec.br_blockcount);
diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c
index dfdcb96b6c16..01c9a2dc0f2c 100644
--- a/fs/xfs/scrub/readdir.c
+++ b/fs/xfs/scrub/readdir.c
@@ -18,6 +18,7 @@
#include "xfs_trans.h"
#include "xfs_error.h"
#include "scrub/scrub.h"
+#include "scrub/common.h"
#include "scrub/readdir.h"
/* Call a function for every entry in a shortform directory. */
@@ -99,7 +100,7 @@ xchk_dir_walk_block(
unsigned int off, next_off, end;
int error;
- error = xfs_dir3_block_read(sc->tp, dp, &bp);
+ error = xfs_dir3_block_read(sc->tp, dp, dp->i_ino, &bp);
if (error)
return error;
@@ -175,7 +176,7 @@ xchk_read_leaf_dir_buf(
if (new_off > *curoff)
*curoff = new_off;
- return xfs_dir3_data_read(tp, dp, map.br_startoff, 0, bpp);
+ return xfs_dir3_data_read(tp, dp, dp->i_ino, map.br_startoff, 0, bpp);
}
/* Call a function for every entry in a leaf directory. */
@@ -273,8 +274,8 @@ xchk_dir_walk(
.dp = dp,
.geo = dp->i_mount->m_dir_geo,
.trans = sc->tp,
+ .owner = dp->i_ino,
};
- bool isblock;
int error;
if (xfs_is_shutdown(dp->i_mount))
@@ -283,22 +284,17 @@ xchk_dir_walk(
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
- if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
+ switch (xfs_dir2_format(&args, &error)) {
+ case XFS_DIR2_FMT_SF:
return xchk_dir_walk_sf(sc, dp, dirent_fn, priv);
-
- /* dir2 functions require that the data fork is loaded */
- error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK);
- if (error)
- return error;
-
- error = xfs_dir2_isblock(&args, &isblock);
- if (error)
- return error;
-
- if (isblock)
+ case XFS_DIR2_FMT_BLOCK:
return xchk_dir_walk_block(sc, dp, dirent_fn, priv);
-
- return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv);
+ case XFS_DIR2_FMT_LEAF:
+ case XFS_DIR2_FMT_NODE:
+ return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv);
+ default:
+ return error;
+ }
}
/*
@@ -324,50 +320,102 @@ xchk_dir_lookup(
.hashval = xfs_dir2_hashname(dp->i_mount, name),
.whichfork = XFS_DATA_FORK,
.op_flags = XFS_DA_OP_OKNOENT,
+ .owner = dp->i_ino,
};
- bool isblock, isleaf;
int error;
if (xfs_is_shutdown(dp->i_mount))
return -EIO;
+ /*
+ * A temporary directory's block headers are written with the owner
+ * set to sc->ip, so we must switch the owner here for the lookup.
+ */
+ if (dp == sc->tempip)
+ args.owner = sc->ip->i_ino;
+
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
- if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- error = xfs_dir2_sf_lookup(&args);
- goto out_check_rval;
- }
+ error = xfs_dir_lookup_args(&args);
+ if (!error)
+ *ino = args.inumber;
+ return error;
+}
- /* dir2 functions require that the data fork is loaded */
- error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK);
- if (error)
- return error;
+/*
+ * Try to grab the IOLOCK and ILOCK of sc->ip and ip, returning @ip's lock
+ * state. The caller may have a transaction, so we must use trylock for both
+ * IOLOCKs.
+ */
+static inline unsigned int
+xchk_dir_trylock_both(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ if (!xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+ return 0;
- error = xfs_dir2_isblock(&args, &isblock);
- if (error)
- return error;
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+ goto parent_iolock;
- if (isblock) {
- error = xfs_dir2_block_lookup(&args);
- goto out_check_rval;
- }
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ goto parent_ilock;
- error = xfs_dir2_isleaf(&args, &isleaf);
- if (error)
- return error;
+ return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL;
+
+parent_ilock:
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+parent_iolock:
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Try for a limited time to grab the IOLOCK and ILOCK of both the scrub target
+ * (@sc->ip) and the inode at the other end (@ip) of a directory or parent
+ * pointer link so that we can check that link.
+ *
+ * We do not know ahead of time that the directory tree is /not/ corrupt, so we
+ * cannot use the "lock two inode" functions because we do not know that there
+ * is not a racing thread trying to take the locks in opposite order. First
+ * take IOLOCK_EXCL of the scrub target, and then try to take IOLOCK_SHARED
+ * of @ip to synchronize with the VFS. Next, take ILOCK_EXCL of the scrub
+ * target and @ip to synchronize with XFS.
+ *
+ * If the trylocks succeed, *lockmode will be set to the locks held for @ip;
+ * @sc->ilock_flags will be set for the locks held for @sc->ip; and zero will
+ * be returned. If not, returns -EDEADLOCK to try again; or -ETIMEDOUT if
+ * XCHK_TRY_HARDER was set. Returns -EINTR if the process has been killed.
+ */
+int
+xchk_dir_trylock_for_pptrs(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int *lockmode)
+{
+ unsigned int nr;
+ int error = 0;
+
+ ASSERT(sc->ilock_flags == 0);
+
+ for (nr = 0; nr < HZ; nr++) {
+ *lockmode = xchk_dir_trylock_both(sc, ip);
+ if (*lockmode)
+ return 0;
- if (isleaf) {
- error = xfs_dir2_leaf_lookup(&args);
- goto out_check_rval;
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ delay(1);
}
- error = xfs_dir2_node_lookup(&args);
+ if (sc->flags & XCHK_TRY_HARDER) {
+ xchk_set_incomplete(sc);
+ return -ETIMEDOUT;
+ }
-out_check_rval:
- if (error == -EEXIST)
- error = 0;
- if (!error)
- *ino = args.inumber;
- return error;
+ return -EDEADLOCK;
}
diff --git a/fs/xfs/scrub/readdir.h b/fs/xfs/scrub/readdir.h
index 55787f4df123..da501877a64d 100644
--- a/fs/xfs/scrub/readdir.h
+++ b/fs/xfs/scrub/readdir.h
@@ -16,4 +16,7 @@ int xchk_dir_walk(struct xfs_scrub *sc, struct xfs_inode *dp,
int xchk_dir_lookup(struct xfs_scrub *sc, struct xfs_inode *dp,
const struct xfs_name *name, xfs_ino_t *ino);
+int xchk_dir_trylock_for_pptrs(struct xfs_scrub *sc, struct xfs_inode *ip,
+ unsigned int *lockmode);
+
#endif /* __XFS_SCRUB_READDIR_H__ */
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 0252a3b5b65a..be283153c254 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -211,6 +211,48 @@ static inline void xreap_defer_finish_reset(struct xreap_state *rs)
rs->force_roll = false;
}
+/*
+ * Compute the maximum length of a buffer cache scan (in units of sectors),
+ * given a quantity of fs blocks.
+ */
+xfs_daddr_t
+xrep_bufscan_max_sectors(
+ struct xfs_mount *mp,
+ xfs_extlen_t fsblocks)
+{
+ int max_fsbs;
+
+ /* Remote xattr values are the largest buffers that we support. */
+ max_fsbs = xfs_attr3_max_rmt_blocks(mp);
+
+ return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs));
+}
+
+/*
+ * Return an incore buffer from a sector scan, or NULL if there are no buffers
+ * left to return.
+ */
+struct xfs_buf *
+xrep_bufscan_advance(
+ struct xfs_mount *mp,
+ struct xrep_bufscan *scan)
+{
+ scan->__sector_count += scan->daddr_step;
+ while (scan->__sector_count <= scan->max_sectors) {
+ struct xfs_buf *bp = NULL;
+ int error;
+
+ error = xfs_buf_incore(mp->m_ddev_targp, scan->daddr,
+ scan->__sector_count, XBF_LIVESCAN, &bp);
+ if (!error)
+ return bp;
+
+ scan->__sector_count += scan->daddr_step;
+ }
+
+ return NULL;
+}
+
/* Try to invalidate the incore buffers for an extent that we're freeing. */
STATIC void
xreap_agextent_binval(
@@ -241,28 +283,15 @@ xreap_agextent_binval(
* of any plausible size.
*/
while (bno < agbno_next) {
- xfs_agblock_t fsbcount;
- xfs_agblock_t max_fsbs;
-
- /*
- * Max buffer size is the max remote xattr buffer size, which
- * is one fs block larger than 64k.
- */
- max_fsbs = min_t(xfs_agblock_t, agbno_next - bno,
- xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX));
-
- for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) {
- struct xfs_buf *bp = NULL;
- xfs_daddr_t daddr;
- int error;
-
- daddr = XFS_AGB_TO_DADDR(mp, agno, bno);
- error = xfs_buf_incore(mp->m_ddev_targp, daddr,
- XFS_FSB_TO_BB(mp, fsbcount),
- XBF_LIVESCAN, &bp);
- if (error)
- continue;
-
+ struct xrep_bufscan scan = {
+ .daddr = XFS_AGB_TO_DADDR(mp, agno, bno),
+ .max_sectors = xrep_bufscan_max_sectors(mp,
+ agbno_next - bno),
+ .daddr_step = XFS_FSB_TO_BB(mp, 1),
+ };
+ struct xfs_buf *bp;
+
+ while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
xfs_trans_bjoin(sc->tp, bp);
xfs_trans_binval(sc->tp, bp);
rs->invalidated++;
@@ -646,3 +675,375 @@ xrep_reap_fsblocks(
return 0;
}
+
+/*
+ * Metadata files are not supposed to share blocks with anything else.
+ * If blocks are shared, we remove the reverse mapping (thus reducing the
+ * crosslink factor); if blocks are not shared, we also need to free them.
+ *
+ * This first step determines the longest subset of the passed-in imap
+ * (starting at its beginning) that is either crosslinked or not crosslinked.
+ * The blockcount will be adjust down as needed.
+ */
+STATIC int
+xreap_bmapi_select(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *imap,
+ bool *crosslinked)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_btree_cur *cur;
+ xfs_filblks_t len = 1;
+ xfs_agblock_t bno;
+ xfs_agblock_t agbno;
+ xfs_agblock_t agbno_next;
+ int error;
+
+ agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
+ agbno_next = agbno + imap->br_blockcount;
+
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+
+ xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff);
+ error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
+ if (error)
+ goto out_cur;
+
+ bno = agbno + 1;
+ while (bno < agbno_next) {
+ bool also_crosslinked;
+
+ oinfo.oi_offset++;
+ error = xfs_rmap_has_other_keys(cur, bno, 1, &oinfo,
+ &also_crosslinked);
+ if (error)
+ goto out_cur;
+
+ if (also_crosslinked != *crosslinked)
+ break;
+
+ len++;
+ bno++;
+ }
+
+ imap->br_blockcount = len;
+ trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Decide if this buffer can be joined to a transaction. This is true for most
+ * buffers, but there are two cases that we want to catch: large remote xattr
+ * value buffers are not logged and can overflow the buffer log item dirty
+ * bitmap size; and oversized cached buffers if things have really gone
+ * haywire.
+ */
+static inline bool
+xreap_buf_loggable(
+ const struct xfs_buf *bp)
+{
+ int i;
+
+ for (i = 0; i < bp->b_map_count; i++) {
+ int chunks;
+ int map_size;
+
+ chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
+ XFS_BLF_CHUNK);
+ map_size = DIV_ROUND_UP(chunks, NBWORD);
+ if (map_size > XFS_BLF_DATAMAP_SIZE)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Invalidate any buffers for this file mapping. The @imap blockcount may be
+ * adjusted downward if we need to roll the transaction.
+ */
+STATIC int
+xreap_bmapi_binval(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *imap)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_perag *pag = sc->sa.pag;
+ int bmap_flags = xfs_bmapi_aflag(whichfork);
+ xfs_fileoff_t off;
+ xfs_fileoff_t max_off;
+ xfs_extlen_t scan_blocks;
+ xfs_agnumber_t agno = sc->sa.pag->pag_agno;
+ xfs_agblock_t bno;
+ xfs_agblock_t agbno;
+ xfs_agblock_t agbno_next;
+ unsigned int invalidated = 0;
+ int error;
+
+ /*
+ * Avoid invalidating AG headers and post-EOFS blocks because we never
+ * own those.
+ */
+ agbno = bno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
+ agbno_next = agbno + imap->br_blockcount;
+ if (!xfs_verify_agbno(pag, agbno) ||
+ !xfs_verify_agbno(pag, agbno_next - 1))
+ return 0;
+
+ /*
+ * Buffers for file blocks can span multiple contiguous mappings. This
+ * means that for each block in the mapping, there could exist an
+ * xfs_buf indexed by that block with any length up to the maximum
+ * buffer size (remote xattr values) or to the next hole in the fork.
+ * To set up our binval scan, first we need to figure out the location
+ * of the next hole.
+ */
+ off = imap->br_startoff + imap->br_blockcount;
+ max_off = off + xfs_attr3_max_rmt_blocks(mp);
+ while (off < max_off) {
+ struct xfs_bmbt_irec hmap;
+ int nhmaps = 1;
+
+ error = xfs_bmapi_read(ip, off, max_off - off, &hmap,
+ &nhmaps, bmap_flags);
+ if (error)
+ return error;
+ if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_bmap_is_real_extent(&hmap))
+ break;
+
+ off = hmap.br_startoff + hmap.br_blockcount;
+ }
+ scan_blocks = off - imap->br_startoff;
+
+ trace_xreap_bmapi_binval_scan(sc, imap, scan_blocks);
+
+ /*
+ * If there are incore buffers for these blocks, invalidate them. If
+ * we can't (try)lock the buffer we assume it's owned by someone else
+ * and leave it alone. The buffer cache cannot detect aliasing, so
+ * employ nested loops to detect incore buffers of any plausible size.
+ */
+ while (bno < agbno_next) {
+ struct xrep_bufscan scan = {
+ .daddr = XFS_AGB_TO_DADDR(mp, agno, bno),
+ .max_sectors = xrep_bufscan_max_sectors(mp,
+ scan_blocks),
+ .daddr_step = XFS_FSB_TO_BB(mp, 1),
+ };
+ struct xfs_buf *bp;
+
+ while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
+ if (xreap_buf_loggable(bp)) {
+ xfs_trans_bjoin(sc->tp, bp);
+ xfs_trans_binval(sc->tp, bp);
+ } else {
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
+ }
+ invalidated++;
+
+ /*
+ * Stop invalidating if we've hit the limit; we should
+ * still have enough reservation left to free however
+ * much of the mapping we've seen so far.
+ */
+ if (invalidated > XREAP_MAX_BINVAL) {
+ imap->br_blockcount = agbno_next - bno;
+ goto out;
+ }
+ }
+
+ bno++;
+ scan_blocks--;
+ }
+
+out:
+ trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount);
+ return 0;
+}
+
+/*
+ * Dispose of as much of the beginning of this file fork mapping as possible.
+ * The number of blocks disposed of is returned in @imap->br_blockcount.
+ */
+STATIC int
+xrep_reap_bmapi_iter(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *imap,
+ bool crosslinked)
+{
+ int error;
+
+ if (crosslinked) {
+ /*
+ * If there are other rmappings, this block is cross linked and
+ * must not be freed. Remove the reverse mapping, leave the
+ * buffer cache in its possibly confused state, and move on.
+ * We don't want to risk discarding valid data buffers from
+ * anybody else who thinks they own the block, even though that
+ * runs the risk of stale buffer warnings in the future.
+ */
+ trace_xreap_dispose_unmap_extent(sc->sa.pag,
+ XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
+ imap->br_blockcount);
+
+ /*
+ * Schedule removal of the mapping from the fork. We use
+ * deferred log intents in this function to control the exact
+ * sequence of metadata updates.
+ */
+ xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
+ xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
+ -(int64_t)imap->br_blockcount);
+ xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap);
+ return 0;
+ }
+
+ /*
+ * If the block is not crosslinked, we can invalidate all the incore
+ * buffers for the extent, and then free the extent. This is a bit of
+ * a mess since we don't detect discontiguous buffers that are indexed
+ * by a block starting before the first block of the extent but overlap
+ * anyway.
+ */
+ trace_xreap_dispose_free_extent(sc->sa.pag,
+ XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
+ imap->br_blockcount);
+
+ /*
+ * Invalidate as many buffers as we can, starting at the beginning of
+ * this mapping. If this function sets blockcount to zero, the
+ * transaction is full of logged buffer invalidations, so we need to
+ * return early so that we can roll and retry.
+ */
+ error = xreap_bmapi_binval(sc, ip, whichfork, imap);
+ if (error || imap->br_blockcount == 0)
+ return error;
+
+ /*
+ * Schedule removal of the mapping from the fork. We use deferred log
+ * intents in this function to control the exact sequence of metadata
+ * updates.
+ */
+ xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
+ xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
+ -(int64_t)imap->br_blockcount);
+ return xfs_free_extent_later(sc->tp, imap->br_startblock,
+ imap->br_blockcount, NULL, XFS_AG_RESV_NONE, true);
+}
+
+/*
+ * Dispose of as much of this file extent as we can. Upon successful return,
+ * the imap will reflect the mapping that was removed from the fork.
+ */
+STATIC int
+xreap_ifork_extent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *imap)
+{
+ xfs_agnumber_t agno;
+ bool crosslinked;
+ int error;
+
+ ASSERT(sc->sa.pag == NULL);
+
+ trace_xreap_ifork_extent(sc, ip, whichfork, imap);
+
+ agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
+ sc->sa.pag = xfs_perag_get(sc->mp, agno);
+ if (!sc->sa.pag)
+ return -EFSCORRUPTED;
+
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
+ if (error)
+ goto out_pag;
+
+ /*
+ * Decide the fate of the blocks at the beginning of the mapping, then
+ * update the mapping to use it with the unmap calls.
+ */
+ error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked);
+ if (error)
+ goto out_agf;
+
+ error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked);
+ if (error)
+ goto out_agf;
+
+out_agf:
+ xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
+ sc->sa.agf_bp = NULL;
+out_pag:
+ xfs_perag_put(sc->sa.pag);
+ sc->sa.pag = NULL;
+ return error;
+}
+
+/*
+ * Dispose of each block mapped to the given fork of the given file. Callers
+ * must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip. The fork
+ * must not have any delalloc reservations.
+ */
+int
+xrep_reap_ifork(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ xfs_fileoff_t off = 0;
+ int bmap_flags = xfs_bmapi_aflag(whichfork);
+ int error;
+
+ ASSERT(xfs_has_rmapbt(sc->mp));
+ ASSERT(ip == sc->ip || ip == sc->tempip);
+ ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));
+
+ while (off < XFS_MAX_FILEOFF) {
+ struct xfs_bmbt_irec imap;
+ int nimaps = 1;
+
+ /* Read the next extent, skip past holes and delalloc. */
+ error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap,
+ &nimaps, bmap_flags);
+ if (error)
+ return error;
+ if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * If this is a real space mapping, reap as much of it as we
+ * can in a single transaction.
+ */
+ if (xfs_bmap_is_real_extent(&imap)) {
+ error = xreap_ifork_extent(sc, ip, whichfork, &imap);
+ if (error)
+ return error;
+
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ off = imap.br_startoff + imap.br_blockcount;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h
index 0b69f16dd98f..3f2f1775e29d 100644
--- a/fs/xfs/scrub/reap.h
+++ b/fs/xfs/scrub/reap.h
@@ -13,5 +13,26 @@ int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap,
const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap,
const struct xfs_owner_info *oinfo);
+int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork);
+
+/* Buffer cache scan context. */
+struct xrep_bufscan {
+ /* Disk address for the buffers we want to scan. */
+ xfs_daddr_t daddr;
+
+ /* Maximum number of sectors to scan. */
+ xfs_daddr_t max_sectors;
+
+ /* Each round, increment the search length by this number of sectors. */
+ xfs_daddr_t daddr_step;
+
+ /* Internal scan state; initialize to zero. */
+ xfs_daddr_t __sector_count;
+};
+
+xfs_daddr_t xrep_bufscan_max_sectors(struct xfs_mount *mp,
+ xfs_extlen_t fsblocks);
+struct xfs_buf *xrep_bufscan_advance(struct xfs_mount *mp,
+ struct xrep_bufscan *scan);
#endif /* __XFS_SCRUB_REAP_H__ */
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index f43dce771cdd..67478294f11a 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -32,6 +32,10 @@
#include "xfs_reflink.h"
#include "xfs_health.h"
#include "xfs_buf_mem.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_dir2.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -39,6 +43,7 @@
#include "scrub/bitmap.h"
#include "scrub/stats.h"
#include "scrub/xfile.h"
+#include "scrub/attr_repair.h"
/*
* Attempt to repair some metadata, if the metadata is corrupt and userspace
@@ -290,7 +295,7 @@ xrep_calc_ag_resblks(
icount = pag->pagi_count;
} else {
/* Try to get the actual counters from disk. */
- error = xfs_ialloc_read_agi(pag, NULL, &bp);
+ error = xfs_ialloc_read_agi(pag, NULL, 0, &bp);
if (!error) {
icount = pag->pagi_count;
xfs_buf_relse(bp);
@@ -724,7 +729,7 @@ xrep_update_qflags(
xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1);
no_update:
- mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+ mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
}
/* Force a quotacheck the next time we mount. */
@@ -908,7 +913,7 @@ xrep_reinit_pagi(
ASSERT(xfs_perag_initialised_agi(pag));
clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
- error = xfs_ialloc_read_agi(pag, sc->tp, &bp);
+ error = xfs_ialloc_read_agi(pag, sc->tp, 0, &bp);
if (error)
return error;
@@ -934,7 +939,7 @@ xrep_ag_init(
ASSERT(!sa->pag);
- error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp);
+ error = xfs_ialloc_read_agi(pag, sc->tp, 0, &sa->agi_bp);
if (error)
return error;
@@ -963,9 +968,7 @@ xrep_reset_perag_resv(
ASSERT(sc->tp);
sc->flags &= ~XREP_RESET_PERAG_RESV;
- error = xfs_ag_resv_free(sc->sa.pag);
- if (error)
- goto out;
+ xfs_ag_resv_free(sc->sa.pag);
error = xfs_ag_resv_init(sc->sa.pag, sc->tp);
if (error == -ENOSPC) {
xfs_err(sc->mp,
@@ -974,7 +977,6 @@ xrep_reset_perag_resv(
error = 0;
}
-out:
return error;
}
@@ -1004,55 +1006,27 @@ xrep_metadata_inode_subtype(
struct xfs_scrub *sc,
unsigned int scrub_type)
{
- __u32 smtype = sc->sm->sm_type;
- __u32 smflags = sc->sm->sm_flags;
- unsigned int sick_mask = sc->sick_mask;
+ struct xfs_scrub_subord *sub;
int error;
/*
- * Let's see if the inode needs repair. We're going to open-code calls
- * to the scrub and repair functions so that we can hang on to the
+ * Let's see if the inode needs repair. Use a subordinate scrub context
+ * to call the scrub and repair functions so that we can hang on to the
* resources that we already acquired instead of using the standard
* setup/teardown routines.
*/
- sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
- sc->sm->sm_type = scrub_type;
-
- switch (scrub_type) {
- case XFS_SCRUB_TYPE_INODE:
- error = xchk_inode(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTD:
- error = xchk_bmap_data(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTA:
- error = xchk_bmap_attr(sc);
- break;
- default:
- ASSERT(0);
- error = -EFSCORRUPTED;
- }
+ sub = xchk_scrub_create_subord(sc, scrub_type);
+ error = sub->sc.ops->scrub(&sub->sc);
if (error)
goto out;
-
- if (!xrep_will_attempt(sc))
+ if (!xrep_will_attempt(&sub->sc))
goto out;
/*
* Repair some part of the inode. This will potentially join the inode
* to the transaction.
*/
- switch (scrub_type) {
- case XFS_SCRUB_TYPE_INODE:
- error = xrep_inode(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTD:
- error = xrep_bmap(sc, XFS_DATA_FORK, false);
- break;
- case XFS_SCRUB_TYPE_BMBTA:
- error = xrep_bmap(sc, XFS_ATTR_FORK, false);
- break;
- }
+ error = sub->sc.ops->repair(&sub->sc);
if (error)
goto out;
@@ -1061,10 +1035,10 @@ xrep_metadata_inode_subtype(
* that the inode will not be joined to the transaction when we exit
* the function.
*/
- error = xfs_defer_finish(&sc->tp);
+ error = xfs_defer_finish(&sub->sc.tp);
if (error)
goto out;
- error = xfs_trans_roll(&sc->tp);
+ error = xfs_trans_roll(&sub->sc.tp);
if (error)
goto out;
@@ -1072,31 +1046,18 @@ xrep_metadata_inode_subtype(
* Clear the corruption flags and re-check the metadata that we just
* repaired.
*/
- sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
-
- switch (scrub_type) {
- case XFS_SCRUB_TYPE_INODE:
- error = xchk_inode(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTD:
- error = xchk_bmap_data(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTA:
- error = xchk_bmap_attr(sc);
- break;
- }
+ sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ error = sub->sc.ops->scrub(&sub->sc);
if (error)
goto out;
/* If corruption persists, the repair has failed. */
- if (xchk_needs_repair(sc->sm)) {
+ if (xchk_needs_repair(sub->sc.sm)) {
error = -EFSCORRUPTED;
goto out;
}
out:
- sc->sick_mask = sick_mask;
- sc->sm->sm_type = smtype;
- sc->sm->sm_flags = smflags;
+ xchk_scrub_free_subord(sub);
return error;
}
@@ -1136,6 +1097,17 @@ xrep_metadata_inode_forks(
return error;
}
+ /* Clear the attr forks since metadata shouldn't have that. */
+ if (xfs_inode_hasattr(sc->ip)) {
+ if (!dirty) {
+ dirty = true;
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ }
+ error = xrep_xattr_reset_fork(sc);
+ if (error)
+ return error;
+ }
+
/*
* If we modified the inode, roll the transaction but don't rejoin the
* inode to the new transaction because xrep_bmap_data can do that.
@@ -1201,3 +1173,34 @@ xrep_trans_cancel_hook_dummy(
current->journal_info = *cookiep;
*cookiep = NULL;
}
+
+/*
+ * See if this buffer can pass the given ->verify_struct() function.
+ *
+ * If the buffer already has ops attached and they're not the ones that were
+ * passed in, we reject the buffer. Otherwise, we perform the structure test
+ * (note that we do not check CRCs) and return the outcome of the test. The
+ * buffer ops and error state are left unchanged.
+ */
+bool
+xrep_buf_verify_struct(
+ struct xfs_buf *bp,
+ const struct xfs_buf_ops *ops)
+{
+ const struct xfs_buf_ops *old_ops = bp->b_ops;
+ xfs_failaddr_t fa;
+ int old_error;
+
+ if (old_ops) {
+ if (old_ops != ops)
+ return false;
+ }
+
+ old_error = bp->b_error;
+ bp->b_ops = ops;
+ fa = bp->b_ops->verify_struct(bp);
+ bp->b_ops = old_ops;
+ bp->b_error = old_error;
+
+ return fa == NULL;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index ce082d941459..0e0dc2bf985c 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -90,6 +90,12 @@ int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten);
int xrep_metadata_inode_forks(struct xfs_scrub *sc);
int xrep_setup_ag_rmapbt(struct xfs_scrub *sc);
int xrep_setup_ag_refcountbt(struct xfs_scrub *sc);
+int xrep_setup_xattr(struct xfs_scrub *sc);
+int xrep_setup_directory(struct xfs_scrub *sc);
+int xrep_setup_parent(struct xfs_scrub *sc);
+int xrep_setup_nlinks(struct xfs_scrub *sc);
+int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks);
+int xrep_setup_dirtree(struct xfs_scrub *sc);
/* Repair setup functions */
int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
@@ -123,11 +129,18 @@ int xrep_bmap_attr(struct xfs_scrub *sc);
int xrep_bmap_cow(struct xfs_scrub *sc);
int xrep_nlinks(struct xfs_scrub *sc);
int xrep_fscounters(struct xfs_scrub *sc);
+int xrep_xattr(struct xfs_scrub *sc);
+int xrep_directory(struct xfs_scrub *sc);
+int xrep_parent(struct xfs_scrub *sc);
+int xrep_symlink(struct xfs_scrub *sc);
+int xrep_dirtree(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xrep_rtbitmap(struct xfs_scrub *sc);
+int xrep_rtsummary(struct xfs_scrub *sc);
#else
# define xrep_rtbitmap xrep_notsupported
+# define xrep_rtsummary xrep_notsupported
#endif /* CONFIG_XFS_RT */
#ifdef CONFIG_XFS_QUOTA
@@ -145,6 +158,8 @@ int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep,
struct xfs_trans **tpp);
void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp);
+bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+
#else
#define xrep_ino_dqattach(sc) (0)
@@ -188,9 +203,19 @@ xrep_setup_nothing(
#define xrep_setup_ag_allocbt xrep_setup_nothing
#define xrep_setup_ag_rmapbt xrep_setup_nothing
#define xrep_setup_ag_refcountbt xrep_setup_nothing
+#define xrep_setup_xattr xrep_setup_nothing
+#define xrep_setup_directory xrep_setup_nothing
+#define xrep_setup_parent xrep_setup_nothing
+#define xrep_setup_nlinks xrep_setup_nothing
+#define xrep_setup_dirtree xrep_setup_nothing
#define xrep_setup_inode(sc, imap) ((void)0)
+static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x)
+{
+ return 0;
+}
+
#define xrep_revalidate_allocbt (NULL)
#define xrep_revalidate_iallocbt (NULL)
@@ -212,6 +237,12 @@ xrep_setup_nothing(
#define xrep_quotacheck xrep_notsupported
#define xrep_nlinks xrep_notsupported
#define xrep_fscounters xrep_notsupported
+#define xrep_rtsummary xrep_notsupported
+#define xrep_xattr xrep_notsupported
+#define xrep_directory xrep_notsupported
+#define xrep_parent xrep_notsupported
+#define xrep_symlink xrep_notsupported
+#define xrep_dirtree xrep_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
index e8e07b683eab..e8080eba37d2 100644
--- a/fs/xfs/scrub/rmap_repair.c
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -432,14 +432,6 @@ out:
return error;
}
-static inline bool
-is_rt_data_fork(
- struct xfs_inode *ip,
- int whichfork)
-{
- return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK;
-}
-
/*
* Iterate the block mapping btree to collect rmap records for anything in this
* fork that matches the AG. Sets @mappings_done to true if we've scanned the
@@ -578,23 +570,9 @@ xrep_rmap_scan_inode(
struct xrep_rmap *rr,
struct xfs_inode *ip)
{
- unsigned int lock_mode = 0;
+ unsigned int lock_mode = xrep_rmap_scan_ilock(ip);
int error;
- /*
- * Directory updates (create/link/unlink/rename) drop the directory's
- * ILOCK before finishing any rmapbt updates associated with directory
- * shape changes. For this scan to coordinate correctly with the live
- * update hook, we must take the only lock (i_rwsem) that is held all
- * the way to dir op completion. This will get fixed by the parent
- * pointer patchset.
- */
- if (S_ISDIR(VFS_I(ip)->i_mode)) {
- lock_mode = XFS_IOLOCK_SHARED;
- xfs_ilock(ip, lock_mode);
- }
- lock_mode |= xrep_rmap_scan_ilock(ip);
-
/* Check the data fork. */
error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
if (error)
diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c
index 46f5d5f605c9..0fef98e9f834 100644
--- a/fs/xfs/scrub/rtbitmap_repair.c
+++ b/fs/xfs/scrub/rtbitmap_repair.c
@@ -108,8 +108,6 @@ xrep_rtbitmap_data_mappings(
0, &map, &nmaps);
if (error)
return error;
- if (nmaps != 1)
- return -EFSCORRUPTED;
/* Commit new extent and all deferred work. */
error = xrep_defer_finish(sc);
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index 5055092bd9e8..3fee603f5244 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -17,10 +17,14 @@
#include "xfs_bit.h"
#include "xfs_bmap.h"
#include "xfs_sb.h"
+#include "xfs_exchmaps.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/xfile.h"
+#include "scrub/repair.h"
+#include "scrub/tempexch.h"
+#include "scrub/rtsummary.h"
/*
* Realtime Summary
@@ -32,18 +36,6 @@
* (potentially large) amount of data in pageable memory.
*/
-struct xchk_rtsummary {
- struct xfs_rtalloc_args args;
-
- uint64_t rextents;
- uint64_t rbmblocks;
- uint64_t rsumsize;
- unsigned int rsumlevels;
-
- /* Memory buffer for the summary comparison. */
- union xfs_suminfo_raw words[];
-};
-
/* Set us up to check the rtsummary file. */
int
xchk_setup_rtsummary(
@@ -60,6 +52,12 @@ xchk_setup_rtsummary(
return -ENOMEM;
sc->buf = rts;
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_rtsummary(sc, rts);
+ if (error)
+ return error;
+ }
+
/*
* Create an xfile to construct a new rtsummary file. The xfile allows
* us to avoid pinning kernel memory for this purpose.
@@ -70,7 +68,7 @@ xchk_setup_rtsummary(
if (error)
return error;
- error = xchk_trans_alloc(sc, 0);
+ error = xchk_trans_alloc(sc, rts->resblks);
if (error)
return error;
@@ -135,7 +133,7 @@ xfsum_store(
sumoff << XFS_WORDLOG);
}
-static inline int
+inline int
xfsum_copyout(
struct xfs_scrub *sc,
xfs_rtsumoff_t sumoff,
@@ -362,7 +360,12 @@ xchk_rtsummary(
error = xchk_rtsum_compare(sc);
out_rbm:
- /* Unlock the rtbitmap since we're done with it. */
+ /*
+ * Unlock the rtbitmap since we're done with it. All other writers of
+ * the rt free space metadata grab the bitmap and summary ILOCKs in
+ * that order, so we're still protected against allocation activities
+ * even if we continue on to the repair function.
+ */
xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
return error;
}
diff --git a/fs/xfs/scrub/rtsummary.h b/fs/xfs/scrub/rtsummary.h
new file mode 100644
index 000000000000..e1d50304d8d4
--- /dev/null
+++ b/fs/xfs/scrub/rtsummary.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RTSUMMARY_H__
+#define __XFS_SCRUB_RTSUMMARY_H__
+
+struct xchk_rtsummary {
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ struct xrep_tempexch tempexch;
+#endif
+ struct xfs_rtalloc_args args;
+
+ uint64_t rextents;
+ uint64_t rbmblocks;
+ uint64_t rsumsize;
+ unsigned int rsumlevels;
+ unsigned int resblks;
+
+ /* suminfo position of xfile as we write buffers to disk. */
+ xfs_rtsumoff_t prep_wordoff;
+
+ /* Memory buffer for the summary comparison. */
+ union xfs_suminfo_raw words[];
+};
+
+int xfsum_copyout(struct xfs_scrub *sc, xfs_rtsumoff_t sumoff,
+ union xfs_suminfo_raw *rawinfo, unsigned int nr_words);
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+int xrep_setup_rtsummary(struct xfs_scrub *sc, struct xchk_rtsummary *rts);
+#else
+# define xrep_setup_rtsummary(sc, rts) (0)
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_RTSUMMARY_H__ */
diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c
new file mode 100644
index 000000000000..d9e971c4c79f
--- /dev/null
+++ b/fs/xfs/scrub/rtsummary_repair.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_rtalloc.h"
+#include "xfs_inode.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_exchmaps.h"
+#include "xfs_rtbitmap.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/reap.h"
+#include "scrub/xfile.h"
+#include "scrub/rtsummary.h"
+
+/* Set us up to repair the rtsummary file. */
+int
+xrep_setup_rtsummary(
+ struct xfs_scrub *sc,
+ struct xchk_rtsummary *rts)
+{
+ struct xfs_mount *mp = sc->mp;
+ unsigned long long blocks;
+ int error;
+
+ error = xrep_tempfile_create(sc, S_IFREG);
+ if (error)
+ return error;
+
+ /*
+ * If we're doing a repair, we reserve enough blocks to write out a
+ * completely new summary file, plus twice as many blocks as we would
+ * need if we can only allocate one block per data fork mapping. This
+ * should cover the preallocation of the temporary file and exchanging
+ * the extent mappings.
+ *
+ * We cannot use xfs_exchmaps_estimate because we have not yet
+ * constructed the replacement rtsummary and therefore do not know how
+ * many extents it will use. By the time we do, we will have a dirty
+ * transaction (which we cannot drop because we cannot drop the
+ * rtsummary ILOCK) and cannot ask for more reservation.
+ */
+ blocks = XFS_B_TO_FSB(mp, mp->m_rsumsize);
+ blocks += xfs_bmbt_calc_size(mp, blocks) * 2;
+ if (blocks > UINT_MAX)
+ return -EOPNOTSUPP;
+
+ rts->resblks += blocks;
+ return 0;
+}
+
+static int
+xrep_rtsummary_prep_buf(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp,
+ void *data)
+{
+ struct xchk_rtsummary *rts = data;
+ struct xfs_mount *mp = sc->mp;
+ union xfs_suminfo_raw *ondisk;
+ int error;
+
+ rts->args.mp = sc->mp;
+ rts->args.tp = sc->tp;
+ rts->args.sumbp = bp;
+ ondisk = xfs_rsumblock_infoptr(&rts->args, 0);
+ rts->args.sumbp = NULL;
+
+ bp->b_ops = &xfs_rtbuf_ops;
+
+ error = xfsum_copyout(sc, rts->prep_wordoff, ondisk, mp->m_blockwsize);
+ if (error)
+ return error;
+
+ rts->prep_wordoff += mp->m_blockwsize;
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTSUMMARY_BUF);
+ return 0;
+}
+
+/* Repair the realtime summary. */
+int
+xrep_rtsummary(
+ struct xfs_scrub *sc)
+{
+ struct xchk_rtsummary *rts = sc->buf;
+ struct xfs_mount *mp = sc->mp;
+ xfs_filblks_t rsumblocks;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+ /* We require atomic file exchange range to rebuild anything. */
+ if (!xfs_has_exchange_range(mp))
+ return -EOPNOTSUPP;
+
+ /* Walk away if we disagree on the size of the rt bitmap. */
+ if (rts->rbmblocks != mp->m_sb.sb_rbmblocks)
+ return 0;
+
+ /* Make sure any problems with the fork are fixed. */
+ error = xrep_metadata_inode_forks(sc);
+ if (error)
+ return error;
+
+ /*
+ * Try to take ILOCK_EXCL of the temporary file. We had better be the
+ * only ones holding onto this inode, but we can't block while holding
+ * the rtsummary file's ILOCK_EXCL.
+ */
+ while (!xrep_tempfile_ilock_nowait(sc)) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+ delay(1);
+ }
+
+ /* Make sure we have space allocated for the entire summary file. */
+ rsumblocks = XFS_B_TO_FSB(mp, rts->rsumsize);
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+ error = xrep_tempfile_prealloc(sc, 0, rsumblocks);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /* Copy the rtsummary file that we generated. */
+ error = xrep_tempfile_copyin(sc, 0, rsumblocks,
+ xrep_rtsummary_prep_buf, rts);
+ if (error)
+ return error;
+ error = xrep_tempfile_set_isize(sc, rts->rsumsize);
+ if (error)
+ return error;
+
+ /*
+ * Now exchange the contents. Nothing in repair uses the temporary
+ * buffer, so we can reuse it for the tempfile exchrange information.
+ */
+ error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, &rts->tempexch);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_contents(sc, &rts->tempexch);
+ if (error)
+ return error;
+
+ /* Reset incore state and blow out the summary cache. */
+ if (mp->m_rsum_cache)
+ memset(mp->m_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks);
+
+ mp->m_rsumlevels = rts->rsumlevels;
+ mp->m_rsumsize = rts->rsumsize;
+
+ /* Free the old rtsummary blocks if they're not in use. */
+ return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
+}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 20fac9723c08..c013f0ba4f36 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -17,6 +17,11 @@
#include "xfs_scrub.h"
#include "xfs_buf_mem.h"
#include "xfs_rmap.h"
+#include "xfs_exchrange.h"
+#include "xfs_exchmaps.h"
+#include "xfs_dir2.h"
+#include "xfs_parent.h"
+#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -24,6 +29,8 @@
#include "scrub/health.h"
#include "scrub/stats.h"
#include "scrub/xfile.h"
+#include "scrub/tempfile.h"
+#include "scrub/orphanage.h"
/*
* Online Scrub and Repair
@@ -171,6 +178,39 @@ xchk_fsgates_disable(
sc->flags &= ~XCHK_FSGATES_ALL;
}
+/* Free the resources associated with a scrub subtype. */
+void
+xchk_scrub_free_subord(
+ struct xfs_scrub_subord *sub)
+{
+ struct xfs_scrub *sc = sub->parent_sc;
+
+ ASSERT(sc->ip == sub->sc.ip);
+ ASSERT(sc->orphanage == sub->sc.orphanage);
+ ASSERT(sc->tempip == sub->sc.tempip);
+
+ sc->sm->sm_type = sub->old_smtype;
+ sc->sm->sm_flags = sub->old_smflags |
+ (sc->sm->sm_flags & XFS_SCRUB_FLAGS_OUT);
+ sc->tp = sub->sc.tp;
+
+ if (sub->sc.buf) {
+ if (sub->sc.buf_cleanup)
+ sub->sc.buf_cleanup(sub->sc.buf);
+ kvfree(sub->sc.buf);
+ }
+ if (sub->sc.xmbtp)
+ xmbuf_free(sub->sc.xmbtp);
+ if (sub->sc.xfile)
+ xfile_destroy(sub->sc.xfile);
+
+ sc->ilock_flags = sub->sc.ilock_flags;
+ sc->orphanage_ilock_flags = sub->sc.orphanage_ilock_flags;
+ sc->temp_ilock_flags = sub->sc.temp_ilock_flags;
+
+ kfree(sub);
+}
+
/* Free all the resources and finish the transactions. */
STATIC int
xchk_teardown(
@@ -211,6 +251,8 @@ xchk_teardown(
sc->buf = NULL;
}
+ xrep_tempfile_rele(sc);
+ xrep_orphanage_rele(sc);
xchk_fsgates_disable(sc);
return error;
}
@@ -319,25 +361,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_INODE,
.setup = xchk_setup_directory,
.scrub = xchk_directory,
- .repair = xrep_notsupported,
+ .repair = xrep_directory,
},
[XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */
.type = ST_INODE,
.setup = xchk_setup_xattr,
.scrub = xchk_xattr,
- .repair = xrep_notsupported,
+ .repair = xrep_xattr,
},
[XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */
.type = ST_INODE,
.setup = xchk_setup_symlink,
.scrub = xchk_symlink,
- .repair = xrep_notsupported,
+ .repair = xrep_symlink,
},
[XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */
.type = ST_INODE,
.setup = xchk_setup_parent,
.scrub = xchk_parent,
- .repair = xrep_notsupported,
+ .repair = xrep_parent,
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_FS,
@@ -349,7 +391,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_FS,
.setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
- .repair = xrep_notsupported,
+ .repair = xrep_rtsummary,
},
[XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */
.type = ST_FS,
@@ -393,6 +435,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.scrub = xchk_health_record,
.repair = xrep_notsupported,
},
+ [XFS_SCRUB_TYPE_DIRTREE] = { /* directory tree structure */
+ .type = ST_INODE,
+ .setup = xchk_setup_dirtree,
+ .scrub = xchk_dirtree,
+ .has = xfs_has_parent,
+ .repair = xrep_dirtree,
+ },
};
static int
@@ -497,8 +546,38 @@ static inline void xchk_postmortem(struct xfs_scrub *sc)
}
#endif /* CONFIG_XFS_ONLINE_REPAIR */
+/*
+ * Create a new scrub context from an existing one, but with a different scrub
+ * type.
+ */
+struct xfs_scrub_subord *
+xchk_scrub_create_subord(
+ struct xfs_scrub *sc,
+ unsigned int subtype)
+{
+ struct xfs_scrub_subord *sub;
+
+ sub = kzalloc(sizeof(*sub), XCHK_GFP_FLAGS);
+ if (!sub)
+ return ERR_PTR(-ENOMEM);
+
+ sub->old_smtype = sc->sm->sm_type;
+ sub->old_smflags = sc->sm->sm_flags;
+ sub->parent_sc = sc;
+ memcpy(&sub->sc, sc, sizeof(struct xfs_scrub));
+ sub->sc.ops = &meta_scrub_ops[subtype];
+ sub->sc.sm->sm_type = subtype;
+ sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ sub->sc.buf = NULL;
+ sub->sc.buf_cleanup = NULL;
+ sub->sc.xfile = NULL;
+ sub->sc.xmbtp = NULL;
+
+ return sub;
+}
+
/* Dispatch metadata scrubbing. */
-int
+STATIC int
xfs_scrub_metadata(
struct file *file,
struct xfs_scrub_metadata *sm)
@@ -540,6 +619,7 @@ xfs_scrub_metadata(
sc->sm = sm;
sc->ops = &meta_scrub_ops[sm->sm_type];
sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
+ sc->relax = INIT_XCHK_RELAX;
retry_op:
/*
* When repairs are allowed, prevent freezing or readonly remount while
@@ -643,3 +723,221 @@ try_harder:
run.retries++;
goto retry_op;
}
+
+/* Scrub one aspect of one piece of metadata. */
+int
+xfs_ioc_scrub_metadata(
+ struct file *file,
+ void __user *arg)
+{
+ struct xfs_scrub_metadata scrub;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&scrub, arg, sizeof(scrub)))
+ return -EFAULT;
+
+ error = xfs_scrub_metadata(file, &scrub);
+ if (error)
+ return error;
+
+ if (copy_to_user(arg, &scrub, sizeof(scrub)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* Decide if there have been any scrub failures up to this point. */
+static inline int
+xfs_scrubv_check_barrier(
+ struct xfs_mount *mp,
+ const struct xfs_scrub_vec *vectors,
+ const struct xfs_scrub_vec *stop_vec)
+{
+ const struct xfs_scrub_vec *v;
+ __u32 failmask;
+
+ failmask = stop_vec->sv_flags & XFS_SCRUB_FLAGS_OUT;
+
+ for (v = vectors; v < stop_vec; v++) {
+ if (v->sv_type == XFS_SCRUB_TYPE_BARRIER)
+ continue;
+
+ /*
+ * Runtime errors count as a previous failure, except the ones
+ * used to ask userspace to retry.
+ */
+ switch (v->sv_ret) {
+ case -EBUSY:
+ case -ENOENT:
+ case -EUSERS:
+ case 0:
+ break;
+ default:
+ return -ECANCELED;
+ }
+
+ /*
+ * If any of the out-flags on the scrub vector match the mask
+ * that was set on the barrier vector, that's a previous fail.
+ */
+ if (v->sv_flags & failmask)
+ return -ECANCELED;
+ }
+
+ return 0;
+}
+
+/*
+ * If the caller provided us with a nonzero inode number that isn't the ioctl
+ * file, try to grab a reference to it to eliminate all further untrusted inode
+ * lookups. If we can't get the inode, let each scrub function try again.
+ */
+STATIC struct xfs_inode *
+xchk_scrubv_open_by_handle(
+ struct xfs_mount *mp,
+ const struct xfs_scrub_vec_head *head)
+{
+ struct xfs_trans *tp;
+ struct xfs_inode *ip;
+ int error;
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return NULL;
+
+ error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip);
+ xfs_trans_cancel(tp);
+ if (error)
+ return NULL;
+
+ if (VFS_I(ip)->i_generation != head->svh_gen) {
+ xfs_irele(ip);
+ return NULL;
+ }
+
+ return ip;
+}
+
+/* Vectored scrub implementation to reduce ioctl calls. */
+int
+xfs_ioc_scrubv_metadata(
+ struct file *file,
+ void __user *arg)
+{
+ struct xfs_scrub_vec_head head;
+ struct xfs_scrub_vec_head __user *uhead = arg;
+ struct xfs_scrub_vec *vectors;
+ struct xfs_scrub_vec __user *uvectors;
+ struct xfs_inode *ip_in = XFS_I(file_inode(file));
+ struct xfs_mount *mp = ip_in->i_mount;
+ struct xfs_inode *handle_ip = NULL;
+ struct xfs_scrub_vec *v;
+ size_t vec_bytes;
+ unsigned int i;
+ int error = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&head, uhead, sizeof(head)))
+ return -EFAULT;
+
+ if (head.svh_reserved)
+ return -EINVAL;
+ if (head.svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL)
+ return -EINVAL;
+ if (head.svh_nr == 0)
+ return 0;
+
+ vec_bytes = array_size(head.svh_nr, sizeof(struct xfs_scrub_vec));
+ if (vec_bytes > PAGE_SIZE)
+ return -ENOMEM;
+
+ uvectors = (void __user *)(uintptr_t)head.svh_vectors;
+ vectors = memdup_user(uvectors, vec_bytes);
+ if (IS_ERR(vectors))
+ return PTR_ERR(vectors);
+
+ trace_xchk_scrubv_start(ip_in, &head);
+
+ for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
+ if (v->sv_reserved) {
+ error = -EINVAL;
+ goto out_free;
+ }
+
+ if (v->sv_type == XFS_SCRUB_TYPE_BARRIER &&
+ (v->sv_flags & ~XFS_SCRUB_FLAGS_OUT)) {
+ error = -EINVAL;
+ goto out_free;
+ }
+
+ trace_xchk_scrubv_item(mp, &head, i, v);
+ }
+
+ /*
+ * If the caller wants us to do a scrub-by-handle and the file used to
+ * call the ioctl is not the same file, load the incore inode and pin
+ * it across all the scrubv actions to avoid repeated UNTRUSTED
+ * lookups. The reference is not passed to deeper layers of scrub
+ * because each scrubber gets to decide its own strategy and return
+ * values for getting an inode.
+ */
+ if (head.svh_ino && head.svh_ino != ip_in->i_ino)
+ handle_ip = xchk_scrubv_open_by_handle(mp, &head);
+
+ /* Run all the scrubbers. */
+ for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
+ struct xfs_scrub_metadata sm = {
+ .sm_type = v->sv_type,
+ .sm_flags = v->sv_flags,
+ .sm_ino = head.svh_ino,
+ .sm_gen = head.svh_gen,
+ .sm_agno = head.svh_agno,
+ };
+
+ if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) {
+ v->sv_ret = xfs_scrubv_check_barrier(mp, vectors, v);
+ if (v->sv_ret) {
+ trace_xchk_scrubv_barrier_fail(mp, &head, i, v);
+ break;
+ }
+
+ continue;
+ }
+
+ v->sv_ret = xfs_scrub_metadata(file, &sm);
+ v->sv_flags = sm.sm_flags;
+
+ trace_xchk_scrubv_outcome(mp, &head, i, v);
+
+ if (head.svh_rest_us) {
+ ktime_t expires;
+
+ expires = ktime_add_ns(ktime_get(),
+ head.svh_rest_us * 1000);
+ set_current_state(TASK_KILLABLE);
+ schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+ }
+
+ if (fatal_signal_pending(current)) {
+ error = -EINTR;
+ goto out_free;
+ }
+ }
+
+ if (copy_to_user(uvectors, vectors, vec_bytes) ||
+ copy_to_user(uhead, &head, sizeof(head))) {
+ error = -EFAULT;
+ goto out_free;
+ }
+
+out_free:
+ if (handle_ip)
+ xfs_irele(handle_ip);
+ kfree(vectors);
+ return error;
+}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 9ad65b604fe1..1bc33f010d0e 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -8,6 +8,49 @@
struct xfs_scrub;
+struct xchk_relax {
+ unsigned long next_resched;
+ unsigned int resched_nr;
+ bool interruptible;
+};
+
+/* Yield to the scheduler at most 10x per second. */
+#define XCHK_RELAX_NEXT (jiffies + (HZ / 10))
+
+#define INIT_XCHK_RELAX \
+ (struct xchk_relax){ \
+ .next_resched = XCHK_RELAX_NEXT, \
+ .resched_nr = 0, \
+ .interruptible = true, \
+ }
+
+/*
+ * Relax during a scrub operation and exit if there's a fatal signal pending.
+ *
+ * If preemption is disabled, we need to yield to the scheduler every now and
+ * then so that we don't run afoul of the soft lockup watchdog or RCU stall
+ * detector. cond_resched calls are somewhat expensive (~5ns) so we want to
+ * ratelimit this to 10x per second. Amortize the cost of the other checks by
+ * only doing it once every 100 calls.
+ */
+static inline int xchk_maybe_relax(struct xchk_relax *widget)
+{
+ /* Amortize the cost of scheduling and checking signals. */
+ if (likely(++widget->resched_nr < 100))
+ return 0;
+ widget->resched_nr = 0;
+
+ if (unlikely(widget->next_resched <= jiffies)) {
+ cond_resched();
+ widget->next_resched = XCHK_RELAX_NEXT;
+ }
+
+ if (widget->interruptible && fatal_signal_pending(current))
+ return -EINTR;
+
+ return 0;
+}
+
/*
* Standard flags for allocating memory within scrub. NOFS context is
* configured by the process allocation scope. Scrub and repair must be able
@@ -17,6 +60,13 @@ struct xfs_scrub;
#define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \
__GFP_RETRY_MAYFAIL))
+/*
+ * For opening files by handle for fsck operations, we don't trust the inumber
+ * or the allocation state; therefore, perform an untrusted lookup. We don't
+ * want these inodes to pollute the cache, so mark them for immediate removal.
+ */
+#define XCHK_IGET_FLAGS (XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE)
+
/* Type info and names for the scrub types. */
enum xchk_type {
ST_NONE = 1, /* disabled */
@@ -105,6 +155,14 @@ struct xfs_scrub {
/* Lock flags for @ip. */
uint ilock_flags;
+ /* The orphanage, for stashing files that have lost their parent. */
+ uint orphanage_ilock_flags;
+ struct xfs_inode *orphanage;
+
+ /* A temporary file on this filesystem, for staging new metadata. */
+ struct xfs_inode *tempip;
+ uint temp_ilock_flags;
+
/* See the XCHK/XREP state flags below. */
unsigned int flags;
@@ -115,6 +173,9 @@ struct xfs_scrub {
*/
unsigned int sick_mask;
+ /* next time we want to cond_resched() */
+ struct xchk_relax relax;
+
/* State tracking for single-AG operations. */
struct xchk_ag sa;
};
@@ -141,6 +202,35 @@ struct xfs_scrub {
XCHK_FSGATES_DIRENTS | \
XCHK_FSGATES_RMAP)
+struct xfs_scrub_subord {
+ struct xfs_scrub sc;
+ struct xfs_scrub *parent_sc;
+ unsigned int old_smtype;
+ unsigned int old_smflags;
+};
+
+struct xfs_scrub_subord *xchk_scrub_create_subord(struct xfs_scrub *sc,
+ unsigned int subtype);
+void xchk_scrub_free_subord(struct xfs_scrub_subord *sub);
+
+/*
+ * We /could/ terminate a scrub/repair operation early. If we're not
+ * in a good place to continue (fatal signal, etc.) then bail out.
+ * Note that we're careful not to make any judgements about *error.
+ */
+static inline bool
+xchk_should_terminate(
+ struct xfs_scrub *sc,
+ int *error)
+{
+ if (xchk_maybe_relax(&sc->relax)) {
+ if (*error == 0)
+ *error = -EINTR;
+ return true;
+ }
+ return false;
+}
+
/* Metadata scrubbers */
int xchk_tester(struct xfs_scrub *sc);
int xchk_superblock(struct xfs_scrub *sc);
@@ -159,6 +249,7 @@ int xchk_directory(struct xfs_scrub *sc);
int xchk_xattr(struct xfs_scrub *sc);
int xchk_symlink(struct xfs_scrub *sc);
int xchk_parent(struct xfs_scrub *sc);
+int xchk_dirtree(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xchk_rtbitmap(struct xfs_scrub *sc);
int xchk_rtsummary(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index 42cafbed94ac..7996c2335476 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -79,6 +79,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters",
[XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck",
[XFS_SCRUB_TYPE_NLINKS] = "nlinks",
+ [XFS_SCRUB_TYPE_DIRTREE] = "dirtree",
};
/* Format the scrub stats into a text buffer, similar to pcp style. */
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index d77d8a9598f6..c848bcc07cd5 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -10,6 +10,7 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_symlink.h"
#include "xfs_health.h"
@@ -17,18 +18,28 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/health.h"
+#include "scrub/repair.h"
/* Set us up to scrub a symbolic link. */
int
xchk_setup_symlink(
struct xfs_scrub *sc)
{
+ unsigned int resblks = 0;
+ int error;
+
/* Allocate the buffer without the inode lock held. */
sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS);
if (!sc->buf)
return -ENOMEM;
- return xchk_setup_inode_contents(sc, 0);
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_symlink(sc, &resblks);
+ if (error)
+ return error;
+ }
+
+ return xchk_setup_inode_contents(sc, resblks);
}
/* Symbolic links. */
diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c
new file mode 100644
index 000000000000..d015a86ef460
--- /dev/null
+++ b/fs/xfs/scrub/symlink_repair.c
@@ -0,0 +1,509 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_symlink.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_symlink_remote.h"
+#include "xfs_exchmaps.h"
+#include "xfs_exchrange.h"
+#include "xfs_health.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/reap.h"
+
+/*
+ * Symbolic Link Repair
+ * ====================
+ *
+ * We repair symbolic links by reading whatever target data we can find, up to
+ * the first NULL byte. If the recovered target strlen matches i_size, then
+ * we rewrite the target. In all other cases, we replace the target with an
+ * overly long string that cannot possibly resolve. The new target is written
+ * into a private hidden temporary file, and then a file contents exchange
+ * commits the new symlink target to the file being repaired.
+ */
+
+/* Set us up to repair the symlink file. */
+int
+xrep_setup_symlink(
+ struct xfs_scrub *sc,
+ unsigned int *resblks)
+{
+ struct xfs_mount *mp = sc->mp;
+ unsigned long long blocks;
+ int error;
+
+ error = xrep_tempfile_create(sc, S_IFLNK);
+ if (error)
+ return error;
+
+ /*
+ * If we're doing a repair, we reserve enough blocks to write out a
+ * completely new symlink file, plus twice as many blocks as we would
+ * need if we can only allocate one block per data fork mapping. This
+ * should cover the preallocation of the temporary file and exchanging
+ * the extent mappings.
+ *
+ * We cannot use xfs_exchmaps_estimate because we have not yet
+ * constructed the replacement symlink and therefore do not know how
+ * many extents it will use. By the time we do, we will have a dirty
+ * transaction (which we cannot drop because we cannot drop the
+ * symlink ILOCK) and cannot ask for more reservation.
+ */
+ blocks = xfs_symlink_blocks(sc->mp, XFS_SYMLINK_MAXLEN);
+ blocks += xfs_bmbt_calc_size(mp, blocks) * 2;
+ if (blocks > UINT_MAX)
+ return -EOPNOTSUPP;
+
+ *resblks += blocks;
+ return 0;
+}
+
+/*
+ * Try to salvage the pathname from remote blocks. Returns the number of bytes
+ * salvaged or a negative errno.
+ */
+STATIC ssize_t
+xrep_symlink_salvage_remote(
+ struct xfs_scrub *sc)
+{
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_buf *bp;
+ char *target_buf = sc->buf;
+ xfs_failaddr_t fa;
+ xfs_filblks_t fsblocks;
+ xfs_daddr_t d;
+ loff_t len;
+ loff_t offset = 0;
+ unsigned int byte_cnt;
+ bool magic_ok;
+ bool hdr_ok;
+ int n;
+ int nmaps = XFS_SYMLINK_MAPS;
+ int error;
+
+ /* We'll only read until the buffer is full. */
+ len = min_t(loff_t, ip->i_disk_size, XFS_SYMLINK_MAXLEN);
+ fsblocks = xfs_symlink_blocks(sc->mp, len);
+ error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
+ if (error)
+ return error;
+
+ for (n = 0; n < nmaps; n++) {
+ struct xfs_dsymlink_hdr *dsl;
+
+ d = XFS_FSB_TO_DADDR(sc->mp, mval[n].br_startblock);
+
+ /* Read the rmt block. We'll run the verifiers manually. */
+ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
+ d, XFS_FSB_TO_BB(sc->mp, mval[n].br_blockcount),
+ 0, &bp, NULL);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ /* How many bytes do we expect to get out of this buffer? */
+ byte_cnt = XFS_FSB_TO_B(sc->mp, mval[n].br_blockcount);
+ byte_cnt = XFS_SYMLINK_BUF_SPACE(sc->mp, byte_cnt);
+ byte_cnt = min_t(unsigned int, byte_cnt, len);
+
+ /*
+ * See if the verifiers accept this block. We're willing to
+ * salvage if the if the offset/byte/ino are ok and either the
+ * verifier passed or the magic is ok. Anything else and we
+ * stop dead in our tracks.
+ */
+ fa = bp->b_ops->verify_struct(bp);
+ dsl = bp->b_addr;
+ magic_ok = dsl->sl_magic == cpu_to_be32(XFS_SYMLINK_MAGIC);
+ hdr_ok = xfs_symlink_hdr_ok(ip->i_ino, offset, byte_cnt, bp);
+ if (!hdr_ok || (fa != NULL && !magic_ok))
+ break;
+
+ memcpy(target_buf + offset, dsl + 1, byte_cnt);
+
+ len -= byte_cnt;
+ offset += byte_cnt;
+ }
+ return offset;
+}
+
+/*
+ * Try to salvage an inline symlink's contents. Returns the number of bytes
+ * salvaged or a negative errno.
+ */
+STATIC ssize_t
+xrep_symlink_salvage_inline(
+ struct xfs_scrub *sc)
+{
+ struct xfs_inode *ip = sc->ip;
+ char *target_buf = sc->buf;
+ char *old_target;
+ struct xfs_ifork *ifp;
+ unsigned int nr;
+
+ ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ if (!ifp->if_data)
+ return 0;
+
+ /*
+ * If inode repair zapped the link target, pretend that we didn't find
+ * any bytes at all so that we can replace the (now totally lost) link
+ * target with a warning message.
+ */
+ old_target = ifp->if_data;
+ if (xfs_inode_has_sickness(sc->ip, XFS_SICK_INO_SYMLINK_ZAPPED) &&
+ sc->ip->i_disk_size == 1 && old_target[0] == '?')
+ return 0;
+
+ nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip));
+ strncpy(target_buf, ifp->if_data, nr);
+ return nr;
+}
+
+#define DUMMY_TARGET \
+ "The target of this symbolic link could not be recovered at all and " \
+ "has been replaced with this explanatory message. To avoid " \
+ "accidentally pointing to an existing file path, this message is " \
+ "longer than the maximum supported file name length. That is an " \
+ "acceptable length for a symlink target on XFS but will produce " \
+ "File Name Too Long errors if resolved."
+
+/* Salvage whatever we can of the target. */
+STATIC int
+xrep_symlink_salvage(
+ struct xfs_scrub *sc)
+{
+ char *target_buf = sc->buf;
+ ssize_t buflen = 0;
+
+ BUILD_BUG_ON(sizeof(DUMMY_TARGET) - 1 <= NAME_MAX);
+
+ /*
+ * Salvage the target if there weren't any corruption problems observed
+ * while scanning it.
+ */
+ if (!(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+ if (sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL)
+ buflen = xrep_symlink_salvage_inline(sc);
+ else
+ buflen = xrep_symlink_salvage_remote(sc);
+ if (buflen < 0)
+ return buflen;
+
+ /*
+ * NULL-terminate the buffer because the ondisk target does not
+ * do that for us. If salvage didn't find the exact amount of
+ * data that we expected to find, don't salvage anything.
+ */
+ target_buf[buflen] = 0;
+ if (strlen(target_buf) != sc->ip->i_disk_size)
+ buflen = 0;
+ }
+
+ /*
+ * Change an empty target into a dummy target and clear the symlink
+ * target zapped flag.
+ */
+ if (buflen == 0) {
+ sc->sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED;
+ sprintf(target_buf, DUMMY_TARGET);
+ }
+
+ trace_xrep_symlink_salvage_target(sc->ip, target_buf,
+ strlen(target_buf));
+ return 0;
+}
+
+STATIC void
+xrep_symlink_local_to_remote(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp,
+ void *priv)
+{
+ struct xfs_scrub *sc = priv;
+ struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+ xfs_symlink_local_to_remote(tp, bp, ip, ifp, NULL);
+
+ if (!xfs_has_crc(sc->mp))
+ return;
+
+ dsl->sl_owner = cpu_to_be64(sc->ip->i_ino);
+ xfs_trans_log_buf(tp, bp, 0,
+ sizeof(struct xfs_dsymlink_hdr) + ifp->if_bytes - 1);
+}
+
+/*
+ * Prepare both links' data forks for an exchange. Promote the tempfile from
+ * local format to extents format, and if the file being repaired has a short
+ * format data fork, turn it into an empty extent list.
+ */
+STATIC int
+xrep_symlink_swap_prep(
+ struct xfs_scrub *sc,
+ bool temp_local,
+ bool ip_local)
+{
+ int error;
+
+ /*
+ * If the temp link is in shortform format, convert that to a remote
+ * target so that we can use the atomic mapping exchange.
+ */
+ if (temp_local) {
+ int logflags = XFS_ILOG_CORE;
+
+ error = xfs_bmap_local_to_extents(sc->tp, sc->tempip, 1,
+ &logflags, XFS_DATA_FORK,
+ xrep_symlink_local_to_remote,
+ sc);
+ if (error)
+ return error;
+
+ xfs_trans_log_inode(sc->tp, sc->ip, 0);
+
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If the file being repaired had a shortform data fork, convert that
+ * to an empty extent list in preparation for the atomic mapping
+ * exchange.
+ */
+ if (ip_local) {
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ xfs_idestroy_fork(ifp);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ ifp->if_nextents = 0;
+ ifp->if_bytes = 0;
+ ifp->if_data = NULL;
+ ifp->if_height = 0;
+
+ xfs_trans_log_inode(sc->tp, sc->ip,
+ XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ }
+
+ return 0;
+}
+
+/* Exchange the temporary symlink's data fork with the one being repaired. */
+STATIC int
+xrep_symlink_swap(
+ struct xfs_scrub *sc)
+{
+ struct xrep_tempexch *tx = sc->buf;
+ bool ip_local, temp_local;
+ int error;
+
+ ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+ temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+
+ /*
+ * If the both links have a local format data fork and the rebuilt
+ * remote data would fit in the repaired file's data fork, copy the
+ * contents from the tempfile and declare ourselves done.
+ */
+ if (ip_local && temp_local &&
+ sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
+ xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
+ return 0;
+ }
+
+ /* Otherwise, make sure both data forks are in block-mapping mode. */
+ error = xrep_symlink_swap_prep(sc, temp_local, ip_local);
+ if (error)
+ return error;
+
+ return xrep_tempexch_contents(sc, tx);
+}
+
+/*
+ * Free all the remote blocks and reset the data fork. The caller must join
+ * the inode to the transaction. This function returns with the inode joined
+ * to a clean scrub transaction.
+ */
+STATIC int
+xrep_symlink_reset_fork(
+ struct xfs_scrub *sc)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK);
+ int error;
+
+ /* Unmap all the remote target buffers. */
+ if (xfs_ifork_has_extents(ifp)) {
+ error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
+ if (error)
+ return error;
+ }
+
+ trace_xrep_symlink_reset_fork(sc->tempip);
+
+ /* Reset the temp symlink target to dummy content. */
+ xfs_idestroy_fork(ifp);
+ return xfs_symlink_write_target(sc->tp, sc->tempip, sc->tempip->i_ino,
+ "?", 1, 0, 0);
+}
+
+/*
+ * Reinitialize a link target. Caller must ensure the inode is joined to
+ * the transaction.
+ */
+STATIC int
+xrep_symlink_rebuild(
+ struct xfs_scrub *sc)
+{
+ struct xrep_tempexch *tx;
+ char *target_buf = sc->buf;
+ xfs_fsblock_t fs_blocks;
+ unsigned int target_len;
+ unsigned int resblks;
+ int error;
+
+ /* How many blocks do we need? */
+ target_len = strlen(target_buf);
+ ASSERT(target_len != 0);
+ if (target_len == 0 || target_len > XFS_SYMLINK_MAXLEN)
+ return -EFSCORRUPTED;
+
+ trace_xrep_symlink_rebuild(sc->ip);
+
+ /*
+ * In preparation to write the new symlink target to the temporary
+ * file, drop the ILOCK of the file being repaired (it shouldn't be
+ * joined) and take the ILOCK of the temporary file.
+ *
+ * The VFS does not take the IOLOCK while reading a symlink (and new
+ * symlinks are hidden with INEW until they've been written) so it's
+ * possible that a readlink() could see the old corrupted contents
+ * while we're doing this.
+ */
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ xrep_tempfile_ilock(sc);
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+
+ /*
+ * Reserve resources to reinitialize the target. We're allowed to
+ * exceed file quota to repair inconsistent metadata, though this is
+ * unlikely.
+ */
+ fs_blocks = xfs_symlink_blocks(sc->mp, target_len);
+ resblks = xfs_symlink_space_res(sc->mp, target_len, fs_blocks);
+ error = xfs_trans_reserve_quota_nblks(sc->tp, sc->tempip, resblks, 0,
+ true);
+ if (error)
+ return error;
+
+ /* Erase the dummy target set up by the tempfile initialization. */
+ xfs_idestroy_fork(&sc->tempip->i_df);
+ sc->tempip->i_df.if_bytes = 0;
+ sc->tempip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
+
+ /* Write the salvaged target to the temporary link. */
+ error = xfs_symlink_write_target(sc->tp, sc->tempip, sc->ip->i_ino,
+ target_buf, target_len, fs_blocks, resblks);
+ if (error)
+ return error;
+
+ /*
+ * Commit the repair transaction so that we can use the atomic mapping
+ * exchange functions to compute the correct block reservations and
+ * re-lock the inodes.
+ */
+ target_buf = NULL;
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ xrep_tempfile_iunlock(sc);
+
+ /*
+ * We're done with the temporary buffer, so we can reuse it for the
+ * tempfile contents exchange information.
+ */
+ tx = sc->buf;
+ error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, tx);
+ if (error)
+ return error;
+
+ /*
+ * Exchange the temp link's data fork with the file being repaired.
+ * This recreates the transaction and takes the ILOCKs of the file
+ * being repaired and the temporary file.
+ */
+ error = xrep_symlink_swap(sc);
+ if (error)
+ return error;
+
+ /*
+ * Release the old symlink blocks and reset the data fork of the temp
+ * link to an empty shortform link. This is the last repair action we
+ * perform on the symlink, so we don't need to clean the transaction.
+ */
+ return xrep_symlink_reset_fork(sc);
+}
+
+/* Repair a symbolic link. */
+int
+xrep_symlink(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ /* The rmapbt is required to reap the old data fork. */
+ if (!xfs_has_rmapbt(sc->mp))
+ return -EOPNOTSUPP;
+ /* We require atomic file exchange range to rebuild anything. */
+ if (!xfs_has_exchange_range(sc->mp))
+ return -EOPNOTSUPP;
+
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ error = xrep_symlink_salvage(sc);
+ if (error)
+ return error;
+
+ /* Now reset the target. */
+ error = xrep_symlink_rebuild(sc);
+ if (error)
+ return error;
+
+ return xrep_trans_commit(sc);
+}
diff --git a/fs/xfs/scrub/tempexch.h b/fs/xfs/scrub/tempexch.h
new file mode 100644
index 000000000000..995ba187c5aa
--- /dev/null
+++ b/fs/xfs/scrub/tempexch.h
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_TEMPEXCH_H__
+#define __XFS_SCRUB_TEMPEXCH_H__
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+struct xrep_tempexch {
+ struct xfs_exchmaps_req req;
+};
+
+int xrep_tempexch_trans_reserve(struct xfs_scrub *sc, int whichfork,
+ struct xrep_tempexch *ti);
+int xrep_tempexch_trans_alloc(struct xfs_scrub *sc, int whichfork,
+ struct xrep_tempexch *ti);
+
+int xrep_tempexch_contents(struct xfs_scrub *sc, struct xrep_tempexch *ti);
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_TEMPEXCH_H__ */
diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c
new file mode 100644
index 000000000000..b747b625c5ee
--- /dev/null
+++ b/fs/xfs/scrub/tempfile.c
@@ -0,0 +1,851 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_quota.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_dir2.h"
+#include "xfs_exchrange.h"
+#include "xfs_exchmaps.h"
+#include "xfs_defer.h"
+#include "xfs_symlink_remote.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/trace.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/xfile.h"
+
+/*
+ * Create a temporary file for reconstructing metadata, with the intention of
+ * atomically exchanging the temporary file's contents with the file that's
+ * being repaired.
+ */
+int
+xrep_tempfile_create(
+ struct xfs_scrub *sc,
+ uint16_t mode)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = NULL;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
+ struct xfs_trans_res *tres;
+ struct xfs_inode *dp = mp->m_rootip;
+ xfs_ino_t ino;
+ unsigned int resblks;
+ bool is_dir = S_ISDIR(mode);
+ int error;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+ if (xfs_is_readonly(mp))
+ return -EROFS;
+
+ ASSERT(sc->tp == NULL);
+ ASSERT(sc->tempip == NULL);
+
+ /*
+ * Make sure that we have allocated dquot(s) on disk. The temporary
+ * inode should be completely root owned so that we don't fail due to
+ * quota limits.
+ */
+ error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
+ XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp);
+ if (error)
+ return error;
+
+ if (is_dir) {
+ resblks = xfs_mkdir_space_res(mp, 0);
+ tres = &M_RES(mp)->tr_mkdir;
+ } else {
+ resblks = XFS_IALLOC_SPACE_RES(mp);
+ tres = &M_RES(mp)->tr_create_tmpfile;
+ }
+
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
+ &tp);
+ if (error)
+ goto out_release_dquots;
+
+ /* Allocate inode, set up directory. */
+ error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+ if (error)
+ goto out_trans_cancel;
+ error = xfs_init_new_inode(&nop_mnt_idmap, tp, dp, ino, mode, 0, 0,
+ 0, false, &sc->tempip);
+ if (error)
+ goto out_trans_cancel;
+
+ /* Change the ownership of the inode to root. */
+ VFS_I(sc->tempip)->i_uid = GLOBAL_ROOT_UID;
+ VFS_I(sc->tempip)->i_gid = GLOBAL_ROOT_GID;
+ sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT);
+ xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE);
+
+ /*
+ * Mark our temporary file as private so that LSMs and the ACL code
+ * don't try to add their own metadata or reason about these files.
+ * The file should never be exposed to userspace.
+ */
+ VFS_I(sc->tempip)->i_flags |= S_PRIVATE;
+ VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR;
+
+ if (is_dir) {
+ error = xfs_dir_init(tp, sc->tempip, dp);
+ if (error)
+ goto out_trans_cancel;
+ } else if (S_ISLNK(VFS_I(sc->tempip)->i_mode)) {
+ /*
+ * Initialize the temporary symlink with a meaningless target
+ * that won't trip the verifiers. Repair must rewrite the
+ * target with meaningful content before swapping with the file
+ * being repaired. A single-byte target will not write a
+ * remote target block, so the owner is irrelevant.
+ */
+ error = xfs_symlink_write_target(tp, sc->tempip,
+ sc->tempip->i_ino, ".", 1, 0, 0);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Attach the dquot(s) to the inodes and modify them incore.
+ * These ids of the inode couldn't have changed since the new
+ * inode has been locked ever since it was created.
+ */
+ xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp);
+
+ /*
+ * Put our temp file on the unlinked list so it's purged automatically.
+ * All file-based metadata being reconstructed using this file must be
+ * atomically exchanged with the original file because the contents
+ * here will be purged when the inode is dropped or log recovery cleans
+ * out the unlinked list.
+ */
+ error = xfs_iunlink(tp, sc->tempip);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_release_inode;
+
+ trace_xrep_tempfile_create(sc);
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ /* Finish setting up the incore / vfs context. */
+ xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
+ xfs_setup_iops(sc->tempip);
+ xfs_finish_inode_setup(sc->tempip);
+
+ sc->temp_ilock_flags = 0;
+ return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+out_release_inode:
+ /*
+ * Wait until after the current transaction is aborted to finish the
+ * setup of the inode and release the inode. This prevents recursive
+ * transactions and deadlocks from xfs_inactive.
+ */
+ if (sc->tempip) {
+ xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
+ xfs_finish_inode_setup(sc->tempip);
+ xchk_irele(sc, sc->tempip);
+ }
+out_release_dquots:
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ return error;
+}
+
+/* Take IOLOCK_EXCL on the temporary file, maybe. */
+bool
+xrep_tempfile_iolock_nowait(
+ struct xfs_scrub *sc)
+{
+ if (xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL)) {
+ sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Take the temporary file's IOLOCK while holding a different inode's IOLOCK.
+ * In theory nobody else should hold the tempfile's IOLOCK, but we use trylock
+ * to avoid deadlocks and lockdep complaints.
+ */
+int
+xrep_tempfile_iolock_polled(
+ struct xfs_scrub *sc)
+{
+ int error = 0;
+
+ while (!xrep_tempfile_iolock_nowait(sc)) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+ delay(1);
+ }
+
+ return 0;
+}
+
+/* Release IOLOCK_EXCL on the temporary file. */
+void
+xrep_tempfile_iounlock(
+ struct xfs_scrub *sc)
+{
+ xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL);
+ sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL;
+}
+
+/* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */
+void
+xrep_tempfile_ilock(
+ struct xfs_scrub *sc)
+{
+ sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_ilock(sc->tempip, XFS_ILOCK_EXCL);
+}
+
+/* Try to grab ILOCK_EXCL on the temporary file. */
+bool
+xrep_tempfile_ilock_nowait(
+ struct xfs_scrub *sc)
+{
+ if (xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL)) {
+ sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
+ return true;
+ }
+
+ return false;
+}
+
+/* Unlock ILOCK_EXCL on the temporary file after an update. */
+void
+xrep_tempfile_iunlock(
+ struct xfs_scrub *sc)
+{
+ xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
+ sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL;
+}
+
+/*
+ * Begin the process of making changes to both the file being scrubbed and
+ * the temporary file by taking ILOCK_EXCL on both.
+ */
+void
+xrep_tempfile_ilock_both(
+ struct xfs_scrub *sc)
+{
+ xfs_lock_two_inodes(sc->ip, XFS_ILOCK_EXCL, sc->tempip, XFS_ILOCK_EXCL);
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
+}
+
+/* Unlock ILOCK_EXCL on both files. */
+void
+xrep_tempfile_iunlock_both(
+ struct xfs_scrub *sc)
+{
+ xrep_tempfile_iunlock(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+}
+
+/* Release the temporary file. */
+void
+xrep_tempfile_rele(
+ struct xfs_scrub *sc)
+{
+ if (!sc->tempip)
+ return;
+
+ if (sc->temp_ilock_flags) {
+ xfs_iunlock(sc->tempip, sc->temp_ilock_flags);
+ sc->temp_ilock_flags = 0;
+ }
+
+ xchk_irele(sc, sc->tempip);
+ sc->tempip = NULL;
+}
+
+/*
+ * Make sure that the given range of the data fork of the temporary file is
+ * mapped to written blocks. The caller must ensure that both inodes are
+ * joined to the transaction.
+ */
+int
+xrep_tempfile_prealloc(
+ struct xfs_scrub *sc,
+ xfs_fileoff_t off,
+ xfs_filblks_t len)
+{
+ struct xfs_bmbt_irec map;
+ xfs_fileoff_t end = off + len;
+ int error;
+
+ ASSERT(sc->tempip != NULL);
+ ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip));
+
+ for (; off < end; off = map.br_startoff + map.br_blockcount) {
+ int nmaps = 1;
+
+ /*
+ * If we have a real extent mapping this block then we're
+ * in ok shape.
+ */
+ error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps,
+ XFS_DATA_FORK);
+ if (error)
+ return error;
+ if (nmaps == 0) {
+ ASSERT(nmaps != 0);
+ return -EFSCORRUPTED;
+ }
+
+ if (xfs_bmap_is_written_extent(&map))
+ continue;
+
+ /*
+ * If we find a delalloc reservation then something is very
+ * very wrong. Bail out.
+ */
+ if (map.br_startblock == DELAYSTARTBLOCK)
+ return -EFSCORRUPTED;
+
+ /*
+ * Make sure this block has a real zeroed extent allocated to
+ * it.
+ */
+ nmaps = 1;
+ error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off,
+ XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map,
+ &nmaps);
+ if (error)
+ return error;
+ if (nmaps != 1)
+ return -EFSCORRUPTED;
+
+ trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map);
+
+ /* Commit new extent and all deferred work. */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Write data to each block of a file. The given range of the tempfile's data
+ * fork must already be populated with written extents.
+ */
+int
+xrep_tempfile_copyin(
+ struct xfs_scrub *sc,
+ xfs_fileoff_t off,
+ xfs_filblks_t len,
+ xrep_tempfile_copyin_fn prep_fn,
+ void *data)
+{
+ LIST_HEAD(buffers_list);
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ xfs_fileoff_t flush_mask;
+ xfs_fileoff_t end = off + len;
+ loff_t pos = XFS_FSB_TO_B(mp, off);
+ int error = 0;
+
+ ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode));
+
+ /* Flush buffers to disk every 512K */
+ flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1;
+
+ for (; off < end; off++, pos += mp->m_sb.sb_blocksize) {
+ struct xfs_bmbt_irec map;
+ int nmaps = 1;
+
+ /* Read block mapping for this file block. */
+ error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0);
+ if (error)
+ goto out_err;
+ if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) {
+ error = -EFSCORRUPTED;
+ goto out_err;
+ }
+
+ /* Get the metadata buffer for this offset in the file. */
+ error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, map.br_startblock),
+ mp->m_bsize, 0, &bp);
+ if (error)
+ goto out_err;
+
+ trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map);
+
+ /* Read in a block's worth of data from the xfile. */
+ error = prep_fn(sc, bp, data);
+ if (error) {
+ xfs_trans_brelse(sc->tp, bp);
+ goto out_err;
+ }
+
+ /* Queue buffer, and flush if we have too much dirty data. */
+ xfs_buf_delwri_queue_here(bp, &buffers_list);
+ xfs_trans_brelse(sc->tp, bp);
+
+ if (!(off & flush_mask)) {
+ error = xfs_buf_delwri_submit(&buffers_list);
+ if (error)
+ goto out_err;
+ }
+ }
+
+ /*
+ * Write the new blocks to disk. If the ordered list isn't empty after
+ * that, then something went wrong and we have to fail. This should
+ * never happen, but we'll check anyway.
+ */
+ error = xfs_buf_delwri_submit(&buffers_list);
+ if (error)
+ goto out_err;
+
+ if (!list_empty(&buffers_list)) {
+ ASSERT(list_empty(&buffers_list));
+ error = -EIO;
+ goto out_err;
+ }
+
+ return 0;
+
+out_err:
+ xfs_buf_delwri_cancel(&buffers_list);
+ return error;
+}
+
+/*
+ * Set the temporary file's size. Caller must join the tempfile to the scrub
+ * transaction and is responsible for adjusting block mappings as needed.
+ */
+int
+xrep_tempfile_set_isize(
+ struct xfs_scrub *sc,
+ unsigned long long isize)
+{
+ if (sc->tempip->i_disk_size == isize)
+ return 0;
+
+ sc->tempip->i_disk_size = isize;
+ i_size_write(VFS_I(sc->tempip), isize);
+ return xrep_tempfile_roll_trans(sc);
+}
+
+/*
+ * Roll a repair transaction involving the temporary file. Caller must join
+ * both the temporary file and the file being scrubbed to the transaction.
+ * This function return with both inodes joined to a new scrub transaction,
+ * or the usual negative errno.
+ */
+int
+xrep_tempfile_roll_trans(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE);
+ error = xrep_roll_trans(sc);
+ if (error)
+ return error;
+
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+ return 0;
+}
+
+/*
+ * Fill out the mapping exchange request in preparation for atomically
+ * committing the contents of a metadata file that we've rebuilt in the temp
+ * file.
+ */
+STATIC int
+xrep_tempexch_prep_request(
+ struct xfs_scrub *sc,
+ int whichfork,
+ struct xrep_tempexch *tx)
+{
+ struct xfs_exchmaps_req *req = &tx->req;
+
+ memset(tx, 0, sizeof(struct xrep_tempexch));
+
+ /* COW forks don't exist on disk. */
+ if (whichfork == XFS_COW_FORK) {
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ /* Both files should have the relevant forks. */
+ if (!xfs_ifork_ptr(sc->ip, whichfork) ||
+ !xfs_ifork_ptr(sc->tempip, whichfork)) {
+ ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
+ ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL);
+ return -EINVAL;
+ }
+
+ /* Exchange all mappings in both forks. */
+ req->ip1 = sc->tempip;
+ req->ip2 = sc->ip;
+ req->startoff1 = 0;
+ req->startoff2 = 0;
+ switch (whichfork) {
+ case XFS_ATTR_FORK:
+ req->flags |= XFS_EXCHMAPS_ATTR_FORK;
+ break;
+ case XFS_DATA_FORK:
+ /* Always exchange sizes when exchanging data fork mappings. */
+ req->flags |= XFS_EXCHMAPS_SET_SIZES;
+ break;
+ }
+ req->blockcount = XFS_MAX_FILEOFF;
+
+ return 0;
+}
+
+/*
+ * Fill out the mapping exchange resource estimation structures in preparation
+ * for exchanging the contents of a metadata file that we've rebuilt in the
+ * temp file. Caller must hold IOLOCK_EXCL but not ILOCK_EXCL on both files.
+ */
+STATIC int
+xrep_tempexch_estimate(
+ struct xfs_scrub *sc,
+ struct xrep_tempexch *tx)
+{
+ struct xfs_exchmaps_req *req = &tx->req;
+ struct xfs_ifork *ifp;
+ struct xfs_ifork *tifp;
+ int whichfork = xfs_exchmaps_reqfork(req);
+ int state = 0;
+
+ /*
+ * The exchmaps code only knows how to exchange file fork space
+ * mappings. Any fork data in local format must be promoted to a
+ * single block before the exchange can take place.
+ */
+ ifp = xfs_ifork_ptr(sc->ip, whichfork);
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
+ state |= 1;
+
+ tifp = xfs_ifork_ptr(sc->tempip, whichfork);
+ if (tifp->if_format == XFS_DINODE_FMT_LOCAL)
+ state |= 2;
+
+ switch (state) {
+ case 0:
+ /* Both files have mapped extents; use the regular estimate. */
+ return xfs_exchrange_estimate(req);
+ case 1:
+ /*
+ * The file being repaired is in local format, but the temp
+ * file has mapped extents. To perform the exchange, the file
+ * being repaired must have its shorform data converted to an
+ * ondisk block so that the forks will be in extents format.
+ * We need one resblk for the conversion; the number of
+ * exchanges is (worst case) the temporary file's extent count
+ * plus the block we converted.
+ */
+ req->ip1_bcount = sc->tempip->i_nblocks;
+ req->ip2_bcount = 1;
+ req->nr_exchanges = 1 + tifp->if_nextents;
+ req->resblks = 1;
+ break;
+ case 2:
+ /*
+ * The temporary file is in local format, but the file being
+ * repaired has mapped extents. To perform the exchange, the
+ * temp file must have its shortform data converted to an
+ * ondisk block, and the fork changed to extents format. We
+ * need one resblk for the conversion; the number of exchanges
+ * is (worst case) the extent count of the file being repaired
+ * plus the block we converted.
+ */
+ req->ip1_bcount = 1;
+ req->ip2_bcount = sc->ip->i_nblocks;
+ req->nr_exchanges = 1 + ifp->if_nextents;
+ req->resblks = 1;
+ break;
+ case 3:
+ /*
+ * Both forks are in local format. To perform the exchange,
+ * both files must have their shortform data converted to
+ * fsblocks, and both forks must be converted to extents
+ * format. We need two resblks for the two conversions, and
+ * the number of exchanges is 1 since there's only one block at
+ * fileoff 0. Presumably, the caller could not exchange the
+ * two inode fork areas directly.
+ */
+ req->ip1_bcount = 1;
+ req->ip2_bcount = 1;
+ req->nr_exchanges = 1;
+ req->resblks = 2;
+ break;
+ }
+
+ return xfs_exchmaps_estimate_overhead(req);
+}
+
+/*
+ * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip
+ * this if quota enforcement is disabled or if both inodes' dquots are the
+ * same. The qretry structure must be initialized to zeroes before the first
+ * call to this function.
+ */
+STATIC int
+xrep_tempexch_reserve_quota(
+ struct xfs_scrub *sc,
+ const struct xrep_tempexch *tx)
+{
+ struct xfs_trans *tp = sc->tp;
+ const struct xfs_exchmaps_req *req = &tx->req;
+ int64_t ddelta, rdelta;
+ int error;
+
+ /*
+ * Don't bother with a quota reservation if we're not enforcing them
+ * or the two inodes have the same dquots.
+ */
+ if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
+ (req->ip1->i_udquot == req->ip2->i_udquot &&
+ req->ip1->i_gdquot == req->ip2->i_gdquot &&
+ req->ip1->i_pdquot == req->ip2->i_pdquot))
+ return 0;
+
+ /*
+ * Quota reservation for each file comes from two sources. First, we
+ * need to account for any net gain in mapped blocks during the
+ * exchange. Second, we need reservation for the gross gain in mapped
+ * blocks so that we don't trip over any quota block reservation
+ * assertions. We must reserve the gross gain because the quota code
+ * subtracts from bcount the number of blocks that we unmap; it does
+ * not add that quantity back to the quota block reservation.
+ */
+ ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount);
+ rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount);
+ error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
+ ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount,
+ true);
+ if (error)
+ return error;
+
+ ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount);
+ rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount);
+ return xfs_trans_reserve_quota_nblks(tp, req->ip2,
+ ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount,
+ true);
+}
+
+/*
+ * Prepare an existing transaction for an atomic file contents exchange.
+ *
+ * This function fills out the mapping exchange request and resource estimation
+ * structures in preparation for exchanging the contents of a metadata file
+ * that has been rebuilt in the temp file. Next, it reserves space and quota
+ * for the transaction.
+ *
+ * The caller must hold ILOCK_EXCL of the scrub target file and the temporary
+ * file. The caller must join both inodes to the transaction with no unlock
+ * flags, and is responsible for dropping both ILOCKs when appropriate. Only
+ * use this when those ILOCKs cannot be dropped.
+ */
+int
+xrep_tempexch_trans_reserve(
+ struct xfs_scrub *sc,
+ int whichfork,
+ struct xrep_tempexch *tx)
+{
+ int error;
+
+ ASSERT(sc->tp != NULL);
+ xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL);
+ xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL);
+
+ error = xrep_tempexch_prep_request(sc, whichfork, tx);
+ if (error)
+ return error;
+
+ error = xfs_exchmaps_estimate(&tx->req);
+ if (error)
+ return error;
+
+ error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0);
+ if (error)
+ return error;
+
+ return xrep_tempexch_reserve_quota(sc, tx);
+}
+
+/*
+ * Create a new transaction for a file contents exchange.
+ *
+ * This function fills out the mapping excahange request and resource
+ * estimation structures in preparation for exchanging the contents of a
+ * metadata file that has been rebuilt in the temp file. Next, it reserves
+ * space, takes ILOCK_EXCL of both inodes, joins them to the transaction and
+ * reserves quota for the transaction.
+ *
+ * The caller is responsible for dropping both ILOCKs when appropriate.
+ */
+int
+xrep_tempexch_trans_alloc(
+ struct xfs_scrub *sc,
+ int whichfork,
+ struct xrep_tempexch *tx)
+{
+ unsigned int flags = 0;
+ int error;
+
+ ASSERT(sc->tp == NULL);
+ ASSERT(xfs_has_exchange_range(sc->mp));
+
+ error = xrep_tempexch_prep_request(sc, whichfork, tx);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_estimate(sc, tx);
+ if (error)
+ return error;
+
+ if (xfs_has_lazysbcount(sc->mp))
+ flags |= XFS_TRANS_RES_FDBLKS;
+
+ error = xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
+ tx->req.resblks, 0, flags, &sc->tp);
+ if (error)
+ return error;
+
+ sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_exchrange_ilock(sc->tp, sc->ip, sc->tempip);
+
+ return xrep_tempexch_reserve_quota(sc, tx);
+}
+
+/*
+ * Exchange file mappings (and hence file contents) between the file being
+ * repaired and the temporary file. Returns with both inodes locked and joined
+ * to a clean scrub transaction.
+ */
+int
+xrep_tempexch_contents(
+ struct xfs_scrub *sc,
+ struct xrep_tempexch *tx)
+{
+ int error;
+
+ ASSERT(xfs_has_exchange_range(sc->mp));
+
+ xfs_exchange_mappings(sc->tp, &tx->req);
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+
+ /*
+ * If we exchanged the ondisk sizes of two metadata files, we must
+ * exchanged the incore sizes as well.
+ */
+ if (tx->req.flags & XFS_EXCHMAPS_SET_SIZES) {
+ loff_t temp;
+
+ temp = i_size_read(VFS_I(sc->ip));
+ i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
+ i_size_write(VFS_I(sc->tempip), temp);
+ }
+
+ return 0;
+}
+
+/*
+ * Write local format data from one of the temporary file's forks into the same
+ * fork of file being repaired, and exchange the file sizes, if appropriate.
+ * Caller must ensure that the file being repaired has enough fork space to
+ * hold all the bytes.
+ */
+void
+xrep_tempfile_copyout_local(
+ struct xfs_scrub *sc,
+ int whichfork)
+{
+ struct xfs_ifork *temp_ifp;
+ struct xfs_ifork *ifp;
+ unsigned int ilog_flags = XFS_ILOG_CORE;
+
+ temp_ifp = xfs_ifork_ptr(sc->tempip, whichfork);
+ ifp = xfs_ifork_ptr(sc->ip, whichfork);
+
+ ASSERT(temp_ifp != NULL);
+ ASSERT(ifp != NULL);
+ ASSERT(temp_ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ ASSERT(sc->tempip->i_disk_size <=
+ xfs_inode_data_fork_size(sc->ip));
+ break;
+ case XFS_ATTR_FORK:
+ ASSERT(sc->tempip->i_forkoff >= sc->ip->i_forkoff);
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ /* Recreate @sc->ip's incore fork (ifp) with data from temp_ifp. */
+ xfs_idestroy_fork(ifp);
+ xfs_init_local_fork(sc->ip, whichfork, temp_ifp->if_data,
+ temp_ifp->if_bytes);
+
+ if (whichfork == XFS_DATA_FORK) {
+ i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
+ sc->ip->i_disk_size = sc->tempip->i_disk_size;
+ }
+
+ ilog_flags |= xfs_ilog_fdata(whichfork);
+ xfs_trans_log_inode(sc->tp, sc->ip, ilog_flags);
+}
+
+/* Decide if a given XFS inode is a temporary file for a repair. */
+bool
+xrep_is_tempfile(
+ const struct xfs_inode *ip)
+{
+ const struct inode *inode = &ip->i_vnode;
+
+ if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR))
+ return true;
+
+ return false;
+}
diff --git a/fs/xfs/scrub/tempfile.h b/fs/xfs/scrub/tempfile.h
new file mode 100644
index 000000000000..e51399f595fe
--- /dev/null
+++ b/fs/xfs/scrub/tempfile.h
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_TEMPFILE_H__
+#define __XFS_SCRUB_TEMPFILE_H__
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+int xrep_tempfile_create(struct xfs_scrub *sc, uint16_t mode);
+void xrep_tempfile_rele(struct xfs_scrub *sc);
+
+bool xrep_tempfile_iolock_nowait(struct xfs_scrub *sc);
+int xrep_tempfile_iolock_polled(struct xfs_scrub *sc);
+void xrep_tempfile_iounlock(struct xfs_scrub *sc);
+
+void xrep_tempfile_ilock(struct xfs_scrub *sc);
+bool xrep_tempfile_ilock_nowait(struct xfs_scrub *sc);
+void xrep_tempfile_iunlock(struct xfs_scrub *sc);
+void xrep_tempfile_iunlock_both(struct xfs_scrub *sc);
+void xrep_tempfile_ilock_both(struct xfs_scrub *sc);
+
+int xrep_tempfile_prealloc(struct xfs_scrub *sc, xfs_fileoff_t off,
+ xfs_filblks_t len);
+
+enum xfs_blft;
+
+typedef int (*xrep_tempfile_copyin_fn)(struct xfs_scrub *sc,
+ struct xfs_buf *bp, void *data);
+
+int xrep_tempfile_copyin(struct xfs_scrub *sc, xfs_fileoff_t off,
+ xfs_filblks_t len, xrep_tempfile_copyin_fn fn, void *data);
+
+int xrep_tempfile_set_isize(struct xfs_scrub *sc, unsigned long long isize);
+
+int xrep_tempfile_roll_trans(struct xfs_scrub *sc);
+void xrep_tempfile_copyout_local(struct xfs_scrub *sc, int whichfork);
+bool xrep_is_tempfile(const struct xfs_inode *ip);
+#else
+static inline void xrep_tempfile_iolock_both(struct xfs_scrub *sc)
+{
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+}
+# define xrep_is_tempfile(ip) (false)
+# define xrep_tempfile_rele(sc)
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_TEMPFILE_H__ */
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 3dd281d6d185..4470ad0533b8 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -19,13 +19,19 @@
#include "xfs_da_format.h"
#include "xfs_dir2.h"
#include "xfs_rmap.h"
+#include "xfs_parent.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/quota.h"
#include "scrub/iscan.h"
+#include "scrub/orphanage.h"
#include "scrub/nlinks.h"
#include "scrub/fscounters.h"
+#include "scrub/bitmap.h"
+#include "scrub/ino_bitmap.h"
+#include "scrub/xfblob.h"
+#include "scrub/dirtree.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 5b294be52c55..92ef4cdc486e 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -26,6 +26,10 @@ struct xchk_iscan;
struct xchk_nlink;
struct xchk_fscounters;
struct xfs_rmap_update_params;
+struct xfs_parent_rec;
+enum xchk_dirpath_outcome;
+struct xchk_dirtree;
+struct xchk_dirtree_outcomes;
/*
* ftrace's __print_symbolic requires that all enum values be wrapped in the
@@ -64,6 +68,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER);
#define XFS_SCRUB_TYPE_STRINGS \
{ XFS_SCRUB_TYPE_PROBE, "probe" }, \
@@ -93,7 +99,9 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY);
{ XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \
{ XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \
{ XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \
- { XFS_SCRUB_TYPE_HEALTHY, "healthy" }
+ { XFS_SCRUB_TYPE_HEALTHY, "healthy" }, \
+ { XFS_SCRUB_TYPE_DIRTREE, "dirtree" }, \
+ { XFS_SCRUB_TYPE_BARRIER, "barrier" }
#define XFS_SCRUB_FLAG_STRINGS \
{ XFS_SCRUB_IFLAG_REPAIR, "repair" }, \
@@ -169,6 +177,8 @@ DEFINE_EVENT(xchk_class, name, \
DEFINE_SCRUB_EVENT(xchk_start);
DEFINE_SCRUB_EVENT(xchk_done);
DEFINE_SCRUB_EVENT(xchk_deadlock_retry);
+DEFINE_SCRUB_EVENT(xchk_dirtree_start);
+DEFINE_SCRUB_EVENT(xchk_dirtree_done);
DEFINE_SCRUB_EVENT(xrep_attempt);
DEFINE_SCRUB_EVENT(xrep_done);
@@ -199,6 +209,81 @@ DEFINE_EVENT(xchk_fsgate_class, name, \
DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_enable);
DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_disable);
+DECLARE_EVENT_CLASS(xchk_vector_head_class,
+ TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead),
+ TP_ARGS(ip, vhead),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, inum)
+ __field(unsigned int, gen)
+ __field(unsigned int, flags)
+ __field(unsigned short, rest_us)
+ __field(unsigned short, nr_vecs)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->agno = vhead->svh_agno;
+ __entry->inum = vhead->svh_ino;
+ __entry->gen = vhead->svh_gen;
+ __entry->flags = vhead->svh_flags;
+ __entry->rest_us = vhead->svh_rest_us;
+ __entry->nr_vecs = vhead->svh_nr;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx agno 0x%x inum 0x%llx gen 0x%x flags 0x%x rest_us %u nr_vecs %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->agno,
+ __entry->inum,
+ __entry->gen,
+ __entry->flags,
+ __entry->rest_us,
+ __entry->nr_vecs)
+)
+#define DEFINE_SCRUBV_HEAD_EVENT(name) \
+DEFINE_EVENT(xchk_vector_head_class, name, \
+ TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead), \
+ TP_ARGS(ip, vhead))
+
+DEFINE_SCRUBV_HEAD_EVENT(xchk_scrubv_start);
+
+DECLARE_EVENT_CLASS(xchk_vector_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead,
+ unsigned int vec_nr, struct xfs_scrub_vec *v),
+ TP_ARGS(mp, vhead, vec_nr, v),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, vec_nr)
+ __field(unsigned int, vec_type)
+ __field(unsigned int, vec_flags)
+ __field(int, vec_ret)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->vec_nr = vec_nr;
+ __entry->vec_type = v->sv_type;
+ __entry->vec_flags = v->sv_flags;
+ __entry->vec_ret = v->sv_ret;
+ ),
+ TP_printk("dev %d:%d vec[%u] type %s flags %s ret %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->vec_nr,
+ __print_symbolic(__entry->vec_type, XFS_SCRUB_TYPE_STRINGS),
+ __print_flags(__entry->vec_flags, "|", XFS_SCRUB_FLAG_STRINGS),
+ __entry->vec_ret)
+)
+#define DEFINE_SCRUBV_EVENT(name) \
+DEFINE_EVENT(xchk_vector_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead, \
+ unsigned int vec_nr, struct xfs_scrub_vec *v), \
+ TP_ARGS(mp, vhead, vec_nr, v))
+
+DEFINE_SCRUBV_EVENT(xchk_scrubv_barrier_fail);
+DEFINE_SCRUBV_EVENT(xchk_scrubv_item);
+DEFINE_SCRUBV_EVENT(xchk_scrubv_outcome);
+
TRACE_EVENT(xchk_op_error,
TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno,
xfs_agblock_t bno, int error, void *ret_ip),
@@ -364,6 +449,7 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \
DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error);
DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning);
+DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_preen);
#ifdef CONFIG_XFS_QUOTA
DECLARE_EVENT_CLASS(xchk_dqiter_class,
@@ -475,7 +561,7 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
@@ -518,7 +604,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__entry->ino = sc->ip->i_ino;
__entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->level = level;
__entry->ptr = cur->bc_levels[level].ptr;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
@@ -558,7 +644,7 @@ TRACE_EVENT(xchk_btree_error,
xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
@@ -598,7 +684,7 @@ TRACE_EVENT(xchk_ifork_btree_error,
__entry->ino = sc->ip->i_ino;
__entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
@@ -637,7 +723,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->level = level;
@@ -947,6 +1033,7 @@ DEFINE_XFILE_EVENT(xfile_store);
DEFINE_XFILE_EVENT(xfile_seek_data);
DEFINE_XFILE_EVENT(xfile_get_folio);
DEFINE_XFILE_EVENT(xfile_put_folio);
+DEFINE_XFILE_EVENT(xfile_discard);
TRACE_EVENT(xfarray_create,
TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity),
@@ -1300,7 +1387,7 @@ TRACE_EVENT(xchk_iscan_iget_batch,
__entry->unavail)
);
-TRACE_EVENT(xchk_iscan_iget_retry_wait,
+DECLARE_EVENT_CLASS(xchk_iscan_retry_wait_class,
TP_PROTO(struct xchk_iscan *iscan),
TP_ARGS(iscan),
TP_STRUCT__entry(
@@ -1326,7 +1413,13 @@ TRACE_EVENT(xchk_iscan_iget_retry_wait,
__entry->remaining,
__entry->iget_timeout,
__entry->retry_delay)
-);
+)
+#define DEFINE_ISCAN_RETRY_WAIT_EVENT(name) \
+DEFINE_EVENT(xchk_iscan_retry_wait_class, name, \
+ TP_PROTO(struct xchk_iscan *iscan), \
+ TP_ARGS(iscan))
+DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_iget_retry_wait);
+DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_agi_retry_wait);
TRACE_EVENT(xchk_nlinks_collect_dirent,
TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp,
@@ -1354,6 +1447,33 @@ TRACE_EVENT(xchk_nlinks_collect_dirent,
__get_str(name))
);
+TRACE_EVENT(xchk_nlinks_collect_pptr,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp,
+ const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr),
+ TP_ARGS(mp, dp, name, pptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->dir = dp->i_ino;
+ __entry->ino = be64_to_cpu(pptr->p_ino);
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir,
+ __entry->ino,
+ __entry->namelen,
+ __get_str(name))
+);
+
TRACE_EVENT(xchk_nlinks_collect_metafile,
TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino),
TP_ARGS(mp, ino),
@@ -1502,6 +1622,300 @@ DEFINE_EVENT(xchk_nlinks_diff_class, name, \
TP_ARGS(mp, ip, live))
DEFINE_SCRUB_NLINKS_DIFF_EVENT(xchk_nlinks_compare_inode);
+DECLARE_EVENT_CLASS(xchk_pptr_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name,
+ xfs_ino_t far_ino),
+ TP_ARGS(ip, name, far_ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ __field(xfs_ino_t, far_ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name, name->len);
+ __entry->far_ino = far_ino;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx name '%.*s' far_ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->namelen,
+ __get_str(name),
+ __entry->far_ino)
+)
+#define DEFINE_XCHK_PPTR_EVENT(name) \
+DEFINE_EVENT(xchk_pptr_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, \
+ xfs_ino_t far_ino), \
+ TP_ARGS(ip, name, far_ino))
+DEFINE_XCHK_PPTR_EVENT(xchk_dir_defer);
+DEFINE_XCHK_PPTR_EVENT(xchk_dir_slowpath);
+DEFINE_XCHK_PPTR_EVENT(xchk_dir_ultraslowpath);
+DEFINE_XCHK_PPTR_EVENT(xchk_parent_defer);
+DEFINE_XCHK_PPTR_EVENT(xchk_parent_slowpath);
+DEFINE_XCHK_PPTR_EVENT(xchk_parent_ultraslowpath);
+
+DECLARE_EVENT_CLASS(xchk_dirtree_class,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip,
+ unsigned int path_nr, const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr),
+ TP_ARGS(sc, ip, path_nr, name, pptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, path_nr)
+ __field(xfs_ino_t, child_ino)
+ __field(unsigned int, child_gen)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->path_nr = path_nr;
+ __entry->child_ino = ip->i_ino;
+ __entry->child_gen = VFS_I(ip)->i_generation;
+ __entry->parent_ino = be64_to_cpu(pptr->p_ino);
+ __entry->parent_gen = be32_to_cpu(pptr->p_gen);
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d path %u child_ino 0x%llx child_gen 0x%x parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->path_nr,
+ __entry->child_ino,
+ __entry->child_gen,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+);
+#define DEFINE_XCHK_DIRTREE_EVENT(name) \
+DEFINE_EVENT(xchk_dirtree_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, \
+ unsigned int path_nr, const struct xfs_name *name, \
+ const struct xfs_parent_rec *pptr), \
+ TP_ARGS(sc, ip, path_nr, name, pptr))
+DEFINE_XCHK_DIRTREE_EVENT(xchk_dirtree_create_path);
+DEFINE_XCHK_DIRTREE_EVENT(xchk_dirpath_walk_upwards);
+
+DECLARE_EVENT_CLASS(xchk_dirpath_class,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip,
+ unsigned int path_nr, unsigned int step_nr,
+ const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr),
+ TP_ARGS(sc, ip, path_nr, step_nr, name, pptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, path_nr)
+ __field(unsigned int, step_nr)
+ __field(xfs_ino_t, child_ino)
+ __field(unsigned int, child_gen)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->path_nr = path_nr;
+ __entry->step_nr = step_nr;
+ __entry->child_ino = ip->i_ino;
+ __entry->child_gen = VFS_I(ip)->i_generation;
+ __entry->parent_ino = be64_to_cpu(pptr->p_ino);
+ __entry->parent_gen = be32_to_cpu(pptr->p_gen);
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d path %u step %u child_ino 0x%llx child_gen 0x%x parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->path_nr,
+ __entry->step_nr,
+ __entry->child_ino,
+ __entry->child_gen,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+);
+#define DEFINE_XCHK_DIRPATH_EVENT(name) \
+DEFINE_EVENT(xchk_dirpath_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, \
+ unsigned int path_nr, unsigned int step_nr, \
+ const struct xfs_name *name, \
+ const struct xfs_parent_rec *pptr), \
+ TP_ARGS(sc, ip, path_nr, step_nr, name, pptr))
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_disappeared);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_badgen);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_nondir_parent);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_unlinked_parent);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_found_next_step);
+
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_SCANNING);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_DELETE);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_CORRUPT);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_LOOP);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_STALE);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_OK);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_DELETING);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_DELETED);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_ADOPTING);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_ADOPTED);
+
+#define XCHK_DIRPATH_OUTCOME_STRINGS \
+ { XCHK_DIRPATH_SCANNING, "scanning" }, \
+ { XCHK_DIRPATH_DELETE, "delete" }, \
+ { XCHK_DIRPATH_CORRUPT, "corrupt" }, \
+ { XCHK_DIRPATH_LOOP, "loop" }, \
+ { XCHK_DIRPATH_STALE, "stale" }, \
+ { XCHK_DIRPATH_OK, "ok" }, \
+ { XREP_DIRPATH_DELETING, "deleting" }, \
+ { XREP_DIRPATH_DELETED, "deleted" }, \
+ { XREP_DIRPATH_ADOPTING, "adopting" }, \
+ { XREP_DIRPATH_ADOPTED, "adopted" }
+
+DECLARE_EVENT_CLASS(xchk_dirpath_outcome_class,
+ TP_PROTO(struct xfs_scrub *sc, unsigned long long path_nr,
+ unsigned int nr_steps, \
+ unsigned int outcome),
+ TP_ARGS(sc, path_nr, nr_steps, outcome),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long long, path_nr)
+ __field(unsigned int, nr_steps)
+ __field(unsigned int, outcome)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->path_nr = path_nr;
+ __entry->nr_steps = nr_steps;
+ __entry->outcome = outcome;
+ ),
+ TP_printk("dev %d:%d path %llu steps %u outcome %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->path_nr,
+ __entry->nr_steps,
+ __print_symbolic(__entry->outcome, XCHK_DIRPATH_OUTCOME_STRINGS))
+);
+#define DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(name) \
+DEFINE_EVENT(xchk_dirpath_outcome_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, unsigned long long path_nr, \
+ unsigned int nr_steps, \
+ unsigned int outcome), \
+ TP_ARGS(sc, path_nr, nr_steps, outcome))
+DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xchk_dirpath_set_outcome);
+DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xchk_dirpath_evaluate_path);
+
+DECLARE_EVENT_CLASS(xchk_dirtree_evaluate_class,
+ TP_PROTO(const struct xchk_dirtree *dl,
+ const struct xchk_dirtree_outcomes *oc),
+ TP_ARGS(dl, oc),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, rootino)
+ __field(unsigned int, nr_paths)
+ __field(unsigned int, bad)
+ __field(unsigned int, suspect)
+ __field(unsigned int, good)
+ __field(bool, needs_adoption)
+ ),
+ TP_fast_assign(
+ __entry->dev = dl->sc->mp->m_super->s_dev;
+ __entry->ino = dl->sc->ip->i_ino;
+ __entry->rootino = dl->root_ino;
+ __entry->nr_paths = dl->nr_paths;
+ __entry->bad = oc->bad;
+ __entry->suspect = oc->suspect;
+ __entry->good = oc->good;
+ __entry->needs_adoption = oc->needs_adoption ? 1 : 0;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx rootino 0x%llx nr_paths %u bad %u suspect %u good %u adopt? %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->rootino,
+ __entry->nr_paths,
+ __entry->bad,
+ __entry->suspect,
+ __entry->good,
+ __entry->needs_adoption)
+);
+#define DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(name) \
+DEFINE_EVENT(xchk_dirtree_evaluate_class, name, \
+ TP_PROTO(const struct xchk_dirtree *dl, \
+ const struct xchk_dirtree_outcomes *oc), \
+ TP_ARGS(dl, oc))
+DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xchk_dirtree_evaluate);
+
+TRACE_EVENT(xchk_dirpath_changed,
+ TP_PROTO(struct xfs_scrub *sc, unsigned int path_nr,
+ unsigned int step_nr, const struct xfs_inode *dp,
+ const struct xfs_inode *ip, const struct xfs_name *xname),
+ TP_ARGS(sc, path_nr, step_nr, dp, ip, xname),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, path_nr)
+ __field(unsigned int, step_nr)
+ __field(xfs_ino_t, child_ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, xname->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->path_nr = path_nr;
+ __entry->step_nr = step_nr;
+ __entry->child_ino = ip->i_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->namelen = xname->len;
+ memcpy(__get_str(name), xname->name, xname->len);
+ ),
+ TP_printk("dev %d:%d path %u step %u child_ino 0x%llx parent_ino 0x%llx name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->path_nr,
+ __entry->step_nr,
+ __entry->child_ino,
+ __entry->parent_ino,
+ __entry->namelen,
+ __get_str(name))
+);
+
+TRACE_EVENT(xchk_dirtree_live_update,
+ TP_PROTO(struct xfs_scrub *sc, const struct xfs_inode *dp,
+ int action, const struct xfs_inode *ip, int delta,
+ const struct xfs_name *xname),
+ TP_ARGS(sc, dp, action, ip, delta, xname),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, parent_ino)
+ __field(int, action)
+ __field(xfs_ino_t, child_ino)
+ __field(int, delta)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, xname->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->parent_ino = dp->i_ino;
+ __entry->action = action;
+ __entry->child_ino = ip->i_ino;
+ __entry->delta = delta;
+ __entry->namelen = xname->len;
+ memcpy(__get_str(name), xname->name, xname->len);
+ ),
+ TP_printk("dev %d:%d parent_ino 0x%llx child_ino 0x%llx nlink_delta %d name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->parent_ino,
+ __entry->child_ino,
+ __entry->delta,
+ __entry->namelen,
+ __get_str(name))
+);
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
@@ -1533,6 +1947,7 @@ DEFINE_EVENT(xrep_extent_class, name, \
DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent);
DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent);
DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
+DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval);
DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
DECLARE_EVENT_CLASS(xrep_reap_find_class,
@@ -1566,6 +1981,7 @@ DEFINE_EVENT(xrep_reap_find_class, name, \
bool crosslinked), \
TP_ARGS(pag, agbno, len, crosslinked))
DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select);
+DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select);
DECLARE_EVENT_CLASS(xrep_rmap_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
@@ -2273,6 +2689,891 @@ TRACE_EVENT(xrep_rmap_live_update,
__entry->flags)
);
+TRACE_EVENT(xrep_tempfile_create,
+ TP_PROTO(struct xfs_scrub *sc),
+ TP_ARGS(sc),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, inum)
+ __field(unsigned int, gen)
+ __field(unsigned int, flags)
+ __field(xfs_ino_t, temp_inum)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->file ? XFS_I(file_inode(sc->file))->i_ino : 0;
+ __entry->type = sc->sm->sm_type;
+ __entry->agno = sc->sm->sm_agno;
+ __entry->inum = sc->sm->sm_ino;
+ __entry->gen = sc->sm->sm_gen;
+ __entry->flags = sc->sm->sm_flags;
+ __entry->temp_inum = sc->tempip->i_ino;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx type %s inum 0x%llx gen 0x%x flags 0x%x temp_inum 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __entry->inum,
+ __entry->gen,
+ __entry->flags,
+ __entry->temp_inum)
+);
+
+DECLARE_EVENT_CLASS(xrep_tempfile_class,
+ TP_PROTO(struct xfs_scrub *sc, int whichfork,
+ struct xfs_bmbt_irec *irec),
+ TP_ARGS(sc, whichfork, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, lblk)
+ __field(xfs_filblks_t, len)
+ __field(xfs_fsblock_t, pblk)
+ __field(int, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->tempip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->lblk = irec->br_startoff;
+ __entry->len = irec->br_blockcount;
+ __entry->pblk = irec->br_startblock;
+ __entry->state = irec->br_state;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->lblk,
+ __entry->len,
+ __entry->pblk,
+ __entry->state)
+);
+#define DEFINE_XREP_TEMPFILE_EVENT(name) \
+DEFINE_EVENT(xrep_tempfile_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, int whichfork, \
+ struct xfs_bmbt_irec *irec), \
+ TP_ARGS(sc, whichfork, irec))
+DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_prealloc);
+DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_copyin);
+
+TRACE_EVENT(xreap_ifork_extent,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork,
+ const struct xfs_bmbt_irec *irec),
+ TP_ARGS(sc, ip, whichfork, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, fileoff)
+ __field(xfs_filblks_t, len)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(int, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->fileoff = irec->br_startoff;
+ __entry->len = irec->br_blockcount;
+ __entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock);
+ __entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock);
+ __entry->state = irec->br_state;
+ ),
+ TP_printk("dev %d:%d ip 0x%llx whichfork %s agno 0x%x agbno 0x%x fileoff 0x%llx fsbcount 0x%llx state 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->agno,
+ __entry->agbno,
+ __entry->fileoff,
+ __entry->len,
+ __entry->state)
+);
+
+TRACE_EVENT(xreap_bmapi_binval_scan,
+ TP_PROTO(struct xfs_scrub *sc, const struct xfs_bmbt_irec *irec,
+ xfs_extlen_t scan_blocks),
+ TP_ARGS(sc, irec, scan_blocks),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_filblks_t, len)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, scan_blocks)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->len = irec->br_blockcount;
+ __entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock);
+ __entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock);
+ __entry->scan_blocks = scan_blocks;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%llx scan_blocks 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->scan_blocks)
+);
+
+TRACE_EVENT(xrep_xattr_recover_leafblock,
+ TP_PROTO(struct xfs_inode *ip, xfs_dablk_t dabno, uint16_t magic),
+ TP_ARGS(ip, dabno, magic),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_dablk_t, dabno)
+ __field(uint16_t, magic)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->dabno = dabno;
+ __entry->magic = magic;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx dablk 0x%x magic 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->dabno,
+ __entry->magic)
+);
+
+DECLARE_EVENT_CLASS(xrep_xattr_salvage_class,
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags, char *name,
+ unsigned int namelen, unsigned int valuelen),
+ TP_ARGS(ip, flags, name, namelen, valuelen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, flags)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, namelen)
+ __field(unsigned int, valuelen)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->flags = flags;
+ __entry->namelen = namelen;
+ memcpy(__get_str(name), name, namelen);
+ __entry->valuelen = valuelen;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx flags %s name '%.*s' valuelen 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_flags(__entry->flags, "|", XFS_ATTR_NAMESPACE_STR),
+ __entry->namelen,
+ __get_str(name),
+ __entry->valuelen)
+);
+#define DEFINE_XREP_XATTR_SALVAGE_EVENT(name) \
+DEFINE_EVENT(xrep_xattr_salvage_class, name, \
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags, char *name, \
+ unsigned int namelen, unsigned int valuelen), \
+ TP_ARGS(ip, flags, name, namelen, valuelen))
+DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_xattr_salvage_rec);
+DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_xattr_insert_rec);
+DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_parent_stash_xattr);
+DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_parent_insert_xattr);
+
+DECLARE_EVENT_CLASS(xrep_pptr_salvage_class,
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags, const void *name,
+ unsigned int namelen, const void *value, unsigned int valuelen),
+ TP_ARGS(ip, flags, name, namelen, value, valuelen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, namelen)
+ ),
+ TP_fast_assign(
+ const struct xfs_parent_rec *rec = value;
+
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->parent_ino = be64_to_cpu(rec->p_ino);
+ __entry->parent_gen = be32_to_cpu(rec->p_gen);
+ __entry->namelen = namelen;
+ memcpy(__get_str(name), name, namelen);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+)
+#define DEFINE_XREP_PPTR_SALVAGE_EVENT(name) \
+DEFINE_EVENT(xrep_pptr_salvage_class, name, \
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags, const void *name, \
+ unsigned int namelen, const void *value, unsigned int valuelen), \
+ TP_ARGS(ip, flags, name, namelen, value, valuelen))
+DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_salvage_pptr);
+DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_insert_pptr);
+
+TRACE_EVENT(xrep_xattr_class,
+ TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip),
+ TP_ARGS(ip, arg_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, src_ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->src_ino = arg_ip->i_ino;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx src 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->src_ino)
+)
+#define DEFINE_XREP_XATTR_EVENT(name) \
+DEFINE_EVENT(xrep_xattr_class, name, \
+ TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip), \
+ TP_ARGS(ip, arg_ip))
+DEFINE_XREP_XATTR_EVENT(xrep_xattr_rebuild_tree);
+DEFINE_XREP_XATTR_EVENT(xrep_xattr_reset_fork);
+DEFINE_XREP_XATTR_EVENT(xrep_xattr_full_reset);
+
+DECLARE_EVENT_CLASS(xrep_xattr_pptr_scan_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp,
+ const struct xfs_name *name),
+ TP_ARGS(ip, dp, name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->parent_gen = VFS_IC(dp)->i_generation;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+)
+#define DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(name) \
+DEFINE_EVENT(xrep_xattr_pptr_scan_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, \
+ const struct xfs_name *name), \
+ TP_ARGS(ip, dp, name))
+DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(xrep_xattr_stash_parentadd);
+DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(xrep_xattr_stash_parentremove);
+
+TRACE_EVENT(xrep_dir_recover_dirblock,
+ TP_PROTO(struct xfs_inode *dp, xfs_dablk_t dabno, uint32_t magic,
+ uint32_t magic_guess),
+ TP_ARGS(dp, dabno, magic, magic_guess),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir_ino)
+ __field(xfs_dablk_t, dabno)
+ __field(uint32_t, magic)
+ __field(uint32_t, magic_guess)
+ ),
+ TP_fast_assign(
+ __entry->dev = dp->i_mount->m_super->s_dev;
+ __entry->dir_ino = dp->i_ino;
+ __entry->dabno = dabno;
+ __entry->magic = magic;
+ __entry->magic_guess = magic_guess;
+ ),
+ TP_printk("dev %d:%d dir 0x%llx dablk 0x%x magic 0x%x magic_guess 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir_ino,
+ __entry->dabno,
+ __entry->magic,
+ __entry->magic_guess)
+);
+
+DECLARE_EVENT_CLASS(xrep_dir_class,
+ TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino),
+ TP_ARGS(dp, parent_ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir_ino)
+ __field(xfs_ino_t, parent_ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = dp->i_mount->m_super->s_dev;
+ __entry->dir_ino = dp->i_ino;
+ __entry->parent_ino = parent_ino;
+ ),
+ TP_printk("dev %d:%d dir 0x%llx parent 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir_ino,
+ __entry->parent_ino)
+)
+#define DEFINE_XREP_DIR_EVENT(name) \
+DEFINE_EVENT(xrep_dir_class, name, \
+ TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino), \
+ TP_ARGS(dp, parent_ino))
+DEFINE_XREP_DIR_EVENT(xrep_dir_rebuild_tree);
+DEFINE_XREP_DIR_EVENT(xrep_dir_reset_fork);
+DEFINE_XREP_DIR_EVENT(xrep_parent_reset_dotdot);
+
+DECLARE_EVENT_CLASS(xrep_dirent_class,
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name,
+ xfs_ino_t ino),
+ TP_ARGS(dp, name, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir_ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ __field(xfs_ino_t, ino)
+ __field(uint8_t, ftype)
+ ),
+ TP_fast_assign(
+ __entry->dev = dp->i_mount->m_super->s_dev;
+ __entry->dir_ino = dp->i_ino;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ __entry->ino = ino;
+ __entry->ftype = name->type;
+ ),
+ TP_printk("dev %d:%d dir 0x%llx ftype %s name '%.*s' ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir_ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+ __entry->namelen,
+ __get_str(name),
+ __entry->ino)
+)
+#define DEFINE_XREP_DIRENT_EVENT(name) \
+DEFINE_EVENT(xrep_dirent_class, name, \
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name, \
+ xfs_ino_t ino), \
+ TP_ARGS(dp, name, ino))
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_salvage_entry);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_stash_createname);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_replay_createname);
+DEFINE_XREP_DIRENT_EVENT(xrep_adoption_reparent);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_stash_removename);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_replay_removename);
+
+DECLARE_EVENT_CLASS(xrep_adoption_class,
+ TP_PROTO(struct xfs_inode *dp, struct xfs_inode *ip, bool moved),
+ TP_ARGS(dp, ip, moved),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir_ino)
+ __field(xfs_ino_t, child_ino)
+ __field(bool, moved)
+ ),
+ TP_fast_assign(
+ __entry->dev = dp->i_mount->m_super->s_dev;
+ __entry->dir_ino = dp->i_ino;
+ __entry->child_ino = ip->i_ino;
+ __entry->moved = moved;
+ ),
+ TP_printk("dev %d:%d dir 0x%llx child 0x%llx moved? %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir_ino,
+ __entry->child_ino,
+ __entry->moved)
+);
+#define DEFINE_XREP_ADOPTION_EVENT(name) \
+DEFINE_EVENT(xrep_adoption_class, name, \
+ TP_PROTO(struct xfs_inode *dp, struct xfs_inode *ip, bool moved), \
+ TP_ARGS(dp, ip, moved))
+DEFINE_XREP_ADOPTION_EVENT(xrep_adoption_trans_roll);
+
+DECLARE_EVENT_CLASS(xrep_parent_salvage_class,
+ TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino),
+ TP_ARGS(dp, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir_ino)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = dp->i_mount->m_super->s_dev;
+ __entry->dir_ino = dp->i_ino;
+ __entry->ino = ino;
+ ),
+ TP_printk("dev %d:%d dir 0x%llx parent 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir_ino,
+ __entry->ino)
+)
+#define DEFINE_XREP_PARENT_SALVAGE_EVENT(name) \
+DEFINE_EVENT(xrep_parent_salvage_class, name, \
+ TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino), \
+ TP_ARGS(dp, ino))
+DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_dir_salvaged_parent);
+DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_dirent);
+DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_from_dcache);
+
+DECLARE_EVENT_CLASS(xrep_pptr_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr),
+ TP_ARGS(ip, name, pptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->parent_ino = be64_to_cpu(pptr->p_ino);
+ __entry->parent_gen = be32_to_cpu(pptr->p_gen);
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+)
+#define DEFINE_XREP_PPTR_EVENT(name) \
+DEFINE_EVENT(xrep_pptr_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, \
+ const struct xfs_parent_rec *pptr), \
+ TP_ARGS(ip, name, pptr))
+DEFINE_XREP_PPTR_EVENT(xrep_xattr_replay_parentadd);
+DEFINE_XREP_PPTR_EVENT(xrep_xattr_replay_parentremove);
+DEFINE_XREP_PPTR_EVENT(xrep_parent_replay_parentadd);
+DEFINE_XREP_PPTR_EVENT(xrep_parent_replay_parentremove);
+
+DECLARE_EVENT_CLASS(xrep_pptr_scan_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp,
+ const struct xfs_name *name),
+ TP_ARGS(ip, dp, name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->parent_gen = VFS_IC(dp)->i_generation;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+)
+#define DEFINE_XREP_PPTR_SCAN_EVENT(name) \
+DEFINE_EVENT(xrep_pptr_scan_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, \
+ const struct xfs_name *name), \
+ TP_ARGS(ip, dp, name))
+DEFINE_XREP_PPTR_SCAN_EVENT(xrep_parent_stash_parentadd);
+DEFINE_XREP_PPTR_SCAN_EVENT(xrep_parent_stash_parentremove);
+
+TRACE_EVENT(xrep_nlinks_set_record,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino,
+ const struct xchk_nlink *obs),
+ TP_ARGS(mp, ino, obs),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_nlink_t, parents)
+ __field(xfs_nlink_t, backrefs)
+ __field(xfs_nlink_t, children)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->parents = obs->parents;
+ __entry->backrefs = obs->backrefs;
+ __entry->children = obs->children;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parents,
+ __entry->backrefs,
+ __entry->children)
+);
+
+DECLARE_EVENT_CLASS(xrep_dentry_class,
+ TP_PROTO(struct xfs_mount *mp, const struct dentry *dentry),
+ TP_ARGS(mp, dentry),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, flags)
+ __field(unsigned long, ino)
+ __field(bool, positive)
+ __field(unsigned long, parent_ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, dentry->d_name.len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->flags = dentry->d_flags;
+ __entry->positive = d_is_positive(dentry);
+ if (dentry->d_parent && d_inode(dentry->d_parent))
+ __entry->parent_ino = d_inode(dentry->d_parent)->i_ino;
+ else
+ __entry->parent_ino = -1UL;
+ __entry->ino = d_inode(dentry) ? d_inode(dentry)->i_ino : 0;
+ __entry->namelen = dentry->d_name.len;
+ memcpy(__get_str(name), dentry->d_name.name, dentry->d_name.len);
+ ),
+ TP_printk("dev %d:%d flags 0x%x positive? %d parent_ino 0x%lx ino 0x%lx name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->flags,
+ __entry->positive,
+ __entry->parent_ino,
+ __entry->ino,
+ __entry->namelen,
+ __get_str(name))
+);
+#define DEFINE_REPAIR_DENTRY_EVENT(name) \
+DEFINE_EVENT(xrep_dentry_class, name, \
+ TP_PROTO(struct xfs_mount *mp, const struct dentry *dentry), \
+ TP_ARGS(mp, dentry))
+DEFINE_REPAIR_DENTRY_EVENT(xrep_adoption_check_child);
+DEFINE_REPAIR_DENTRY_EVENT(xrep_adoption_invalidate_child);
+DEFINE_REPAIR_DENTRY_EVENT(xrep_dirtree_delete_child);
+
+TRACE_EVENT(xrep_symlink_salvage_target,
+ TP_PROTO(struct xfs_inode *ip, char *target, unsigned int targetlen),
+ TP_ARGS(ip, target, targetlen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, targetlen)
+ __dynamic_array(char, target, targetlen + 1)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->targetlen = targetlen;
+ memcpy(__get_str(target), target, targetlen);
+ __get_str(target)[targetlen] = 0;
+ ),
+ TP_printk("dev %d:%d ip 0x%llx target '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->targetlen,
+ __get_str(target))
+);
+
+DECLARE_EVENT_CLASS(xrep_symlink_class,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ ),
+ TP_printk("dev %d:%d ip 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino)
+);
+
+#define DEFINE_XREP_SYMLINK_EVENT(name) \
+DEFINE_EVENT(xrep_symlink_class, name, \
+ TP_PROTO(struct xfs_inode *ip), \
+ TP_ARGS(ip))
+DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_rebuild);
+DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_reset_fork);
+
+TRACE_EVENT(xrep_iunlink_visit,
+ TP_PROTO(struct xfs_perag *pag, unsigned int bucket,
+ xfs_agino_t bucket_agino, struct xfs_inode *ip),
+ TP_ARGS(pag, bucket, bucket_agino, ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, bucket_agino)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->agino = XFS_INO_TO_AGINO(pag->pag_mount, ip->i_ino);
+ __entry->bucket = bucket;
+ __entry->bucket_agino = bucket_agino;
+ __entry->prev_agino = ip->i_prev_unlinked;
+ __entry->next_agino = ip->i_next_unlinked;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x bucket_agino 0x%x prev_agino 0x%x next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->agino,
+ __entry->bucket_agino,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_reload_next,
+ TP_PROTO(struct xfs_inode *ip, xfs_agino_t prev_agino),
+ TP_ARGS(ip, prev_agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, old_prev_agino)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ __field(unsigned int, nlink)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->old_prev_agino = ip->i_prev_unlinked;
+ __entry->prev_agino = prev_agino;
+ __entry->next_agino = ip->i_next_unlinked;
+ __entry->nlink = VFS_I(ip)->i_nlink;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x nlink %u old_prev_agino %u prev_agino 0x%x next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino % XFS_AGI_UNLINKED_BUCKETS,
+ __entry->agino,
+ __entry->nlink,
+ __entry->old_prev_agino,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_reload_ondisk,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(unsigned int, nlink)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->nlink = VFS_I(ip)->i_nlink;
+ __entry->next_agino = ip->i_next_unlinked;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x nlink %u next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino % XFS_AGI_UNLINKED_BUCKETS,
+ __entry->agino,
+ __entry->nlink,
+ __entry->next_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_walk_ondisk_bucket,
+ TP_PROTO(struct xfs_perag *pag, unsigned int bucket,
+ xfs_agino_t prev_agino, xfs_agino_t next_agino),
+ TP_ARGS(pag, bucket, prev_agino, next_agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->bucket = bucket;
+ __entry->prev_agino = prev_agino;
+ __entry->next_agino = next_agino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u prev_agino 0x%x next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+
+DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class,
+ TP_PROTO(struct xfs_perag *pag, unsigned int bucket,
+ xfs_agino_t prev_agino, xfs_agino_t next_agino),
+ TP_ARGS(pag, bucket, prev_agino, next_agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->bucket = bucket;
+ __entry->prev_agino = prev_agino;
+ __entry->next_agino = next_agino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u prev_agino 0x%x next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+#define DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(name) \
+DEFINE_EVENT(xrep_iunlink_resolve_class, name, \
+ TP_PROTO(struct xfs_perag *pag, unsigned int bucket, \
+ xfs_agino_t prev_agino, xfs_agino_t next_agino), \
+ TP_ARGS(pag, bucket, prev_agino, next_agino))
+DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_uncached);
+DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_wronglist);
+DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_nolist);
+DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_ok);
+
+TRACE_EVENT(xrep_iunlink_relink_next,
+ TP_PROTO(struct xfs_inode *ip, xfs_agino_t next_agino),
+ TP_ARGS(ip, next_agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, next_agino)
+ __field(xfs_agino_t, new_next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->next_agino = ip->i_next_unlinked;
+ __entry->new_next_agino = next_agino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x next_agino 0x%x -> 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino % XFS_AGI_UNLINKED_BUCKETS,
+ __entry->agino,
+ __entry->next_agino,
+ __entry->new_next_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_relink_prev,
+ TP_PROTO(struct xfs_inode *ip, xfs_agino_t prev_agino),
+ TP_ARGS(ip, prev_agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, new_prev_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->prev_agino = ip->i_prev_unlinked;
+ __entry->new_prev_agino = prev_agino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x prev_agino 0x%x -> 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino % XFS_AGI_UNLINKED_BUCKETS,
+ __entry->agino,
+ __entry->prev_agino,
+ __entry->new_prev_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_add_to_bucket,
+ TP_PROTO(struct xfs_perag *pag, unsigned int bucket,
+ xfs_agino_t agino, xfs_agino_t curr_head),
+ TP_ARGS(pag, bucket, agino, curr_head),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->bucket = bucket;
+ __entry->agino = agino;
+ __entry->next_agino = curr_head;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->agino,
+ __entry->next_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_commit_bucket,
+ TP_PROTO(struct xfs_perag *pag, unsigned int bucket,
+ xfs_agino_t old_agino, xfs_agino_t agino),
+ TP_ARGS(pag, bucket, old_agino, agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, old_agino)
+ __field(xfs_agino_t, agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->bucket = bucket;
+ __entry->old_agino = old_agino;
+ __entry->agino = agino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x -> 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->old_agino,
+ __entry->agino)
+);
+
+DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xrep_dirpath_set_outcome);
+DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_delete_path);
+DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_create_adoption);
+DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xrep_dirtree_decided_fate);
+
#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
index 17c982a4821d..9185ae7088d4 100644
--- a/fs/xfs/scrub/xfarray.c
+++ b/fs/xfs/scrub/xfarray.c
@@ -7,9 +7,9 @@
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
+#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
-#include "scrub/scrub.h"
#include "scrub/trace.h"
/*
@@ -486,6 +486,9 @@ xfarray_sortinfo_alloc(
xfarray_sortinfo_lo(si)[0] = 0;
xfarray_sortinfo_hi(si)[0] = array->nr - 1;
+ si->relax = INIT_XCHK_RELAX;
+ if (flags & XFARRAY_SORT_KILLABLE)
+ si->relax.interruptible = false;
trace_xfarray_sort(si, nr_bytes);
*infop = si;
@@ -503,10 +506,7 @@ xfarray_sort_terminated(
* few seconds so that we don't run afoul of the soft lockup watchdog
* or RCU stall detector.
*/
- cond_resched();
-
- if ((si->flags & XFARRAY_SORT_KILLABLE) &&
- fatal_signal_pending(current)) {
+ if (xchk_maybe_relax(&si->relax)) {
if (*error == 0)
*error = -EINTR;
return true;
@@ -1051,3 +1051,20 @@ out_free:
kvfree(si);
return error;
}
+
+/* How many bytes is this array consuming? */
+unsigned long long
+xfarray_bytes(
+ struct xfarray *array)
+{
+ return xfile_bytes(array->xfile);
+}
+
+/* Empty the entire array. */
+void
+xfarray_truncate(
+ struct xfarray *array)
+{
+ xfile_discard(array->xfile, 0, MAX_LFS_FILESIZE);
+ array->nr = 0;
+}
diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h
index acb2f94c56c1..5eeeeed13ae2 100644
--- a/fs/xfs/scrub/xfarray.h
+++ b/fs/xfs/scrub/xfarray.h
@@ -8,6 +8,7 @@
/* xfile array index type, along with cursor initialization */
typedef uint64_t xfarray_idx_t;
+#define XFARRAY_NULLIDX ((__force xfarray_idx_t)-1ULL)
#define XFARRAY_CURSOR_INIT ((__force xfarray_idx_t)0)
/* Iterate each index of an xfile array. */
@@ -44,6 +45,8 @@ int xfarray_unset(struct xfarray *array, xfarray_idx_t idx);
int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr);
int xfarray_store_anywhere(struct xfarray *array, const void *ptr);
bool xfarray_element_is_null(struct xfarray *array, const void *ptr);
+void xfarray_truncate(struct xfarray *array);
+unsigned long long xfarray_bytes(struct xfarray *array);
/*
* Load an array element, but zero the buffer if there's no data because we
@@ -124,6 +127,9 @@ struct xfarray_sortinfo {
/* XFARRAY_SORT_* flags; see below. */
unsigned int flags;
+ /* next time we want to cond_resched() */
+ struct xchk_relax relax;
+
/* Cache a folio here for faster scanning for pivots */
struct folio *folio;
diff --git a/fs/xfs/scrub/xfblob.c b/fs/xfs/scrub/xfblob.c
new file mode 100644
index 000000000000..6ef2a9637f16
--- /dev/null
+++ b/fs/xfs/scrub/xfblob.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "scrub/scrub.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+
+/*
+ * XFS Blob Storage
+ * ================
+ * Stores and retrieves blobs using an xfile. Objects are appended to the file
+ * and the offset is returned as a magic cookie for retrieval.
+ */
+
+#define XB_KEY_MAGIC 0xABAADDAD
+struct xb_key {
+ uint32_t xb_magic; /* XB_KEY_MAGIC */
+ uint32_t xb_size; /* size of the blob, in bytes */
+ loff_t xb_offset; /* byte offset of this key */
+ /* blob comes after here */
+} __packed;
+
+/* Initialize a blob storage object. */
+int
+xfblob_create(
+ const char *description,
+ struct xfblob **blobp)
+{
+ struct xfblob *blob;
+ struct xfile *xfile;
+ int error;
+
+ error = xfile_create(description, 0, &xfile);
+ if (error)
+ return error;
+
+ blob = kmalloc(sizeof(struct xfblob), XCHK_GFP_FLAGS);
+ if (!blob) {
+ error = -ENOMEM;
+ goto out_xfile;
+ }
+
+ blob->xfile = xfile;
+ blob->last_offset = PAGE_SIZE;
+
+ *blobp = blob;
+ return 0;
+
+out_xfile:
+ xfile_destroy(xfile);
+ return error;
+}
+
+/* Destroy a blob storage object. */
+void
+xfblob_destroy(
+ struct xfblob *blob)
+{
+ xfile_destroy(blob->xfile);
+ kfree(blob);
+}
+
+/* Retrieve a blob. */
+int
+xfblob_load(
+ struct xfblob *blob,
+ xfblob_cookie cookie,
+ void *ptr,
+ uint32_t size)
+{
+ struct xb_key key;
+ int error;
+
+ error = xfile_load(blob->xfile, &key, sizeof(key), cookie);
+ if (error)
+ return error;
+
+ if (key.xb_magic != XB_KEY_MAGIC || key.xb_offset != cookie) {
+ ASSERT(0);
+ return -ENODATA;
+ }
+ if (size < key.xb_size) {
+ ASSERT(0);
+ return -EFBIG;
+ }
+
+ return xfile_load(blob->xfile, ptr, key.xb_size,
+ cookie + sizeof(key));
+}
+
+/* Store a blob. */
+int
+xfblob_store(
+ struct xfblob *blob,
+ xfblob_cookie *cookie,
+ const void *ptr,
+ uint32_t size)
+{
+ struct xb_key key = {
+ .xb_offset = blob->last_offset,
+ .xb_magic = XB_KEY_MAGIC,
+ .xb_size = size,
+ };
+ loff_t pos = blob->last_offset;
+ int error;
+
+ error = xfile_store(blob->xfile, &key, sizeof(key), pos);
+ if (error)
+ return error;
+
+ pos += sizeof(key);
+ error = xfile_store(blob->xfile, ptr, size, pos);
+ if (error)
+ goto out_err;
+
+ *cookie = blob->last_offset;
+ blob->last_offset += sizeof(key) + size;
+ return 0;
+out_err:
+ xfile_discard(blob->xfile, blob->last_offset, sizeof(key));
+ return error;
+}
+
+/* Free a blob. */
+int
+xfblob_free(
+ struct xfblob *blob,
+ xfblob_cookie cookie)
+{
+ struct xb_key key;
+ int error;
+
+ error = xfile_load(blob->xfile, &key, sizeof(key), cookie);
+ if (error)
+ return error;
+
+ if (key.xb_magic != XB_KEY_MAGIC || key.xb_offset != cookie) {
+ ASSERT(0);
+ return -ENODATA;
+ }
+
+ xfile_discard(blob->xfile, cookie, sizeof(key) + key.xb_size);
+ return 0;
+}
+
+/* How many bytes is this blob storage object consuming? */
+unsigned long long
+xfblob_bytes(
+ struct xfblob *blob)
+{
+ return xfile_bytes(blob->xfile);
+}
+
+/* Drop all the blobs. */
+void
+xfblob_truncate(
+ struct xfblob *blob)
+{
+ xfile_discard(blob->xfile, PAGE_SIZE, MAX_LFS_FILESIZE - PAGE_SIZE);
+ blob->last_offset = PAGE_SIZE;
+}
diff --git a/fs/xfs/scrub/xfblob.h b/fs/xfs/scrub/xfblob.h
new file mode 100644
index 000000000000..ae78322613ca
--- /dev/null
+++ b/fs/xfs/scrub/xfblob.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_XFBLOB_H__
+#define __XFS_SCRUB_XFBLOB_H__
+
+struct xfblob {
+ struct xfile *xfile;
+ loff_t last_offset;
+};
+
+typedef loff_t xfblob_cookie;
+
+int xfblob_create(const char *descr, struct xfblob **blobp);
+void xfblob_destroy(struct xfblob *blob);
+int xfblob_load(struct xfblob *blob, xfblob_cookie cookie, void *ptr,
+ uint32_t size);
+int xfblob_store(struct xfblob *blob, xfblob_cookie *cookie, const void *ptr,
+ uint32_t size);
+int xfblob_free(struct xfblob *blob, xfblob_cookie cookie);
+unsigned long long xfblob_bytes(struct xfblob *blob);
+void xfblob_truncate(struct xfblob *blob);
+
+static inline int
+xfblob_storename(
+ struct xfblob *blob,
+ xfblob_cookie *cookie,
+ const struct xfs_name *xname)
+{
+ return xfblob_store(blob, cookie, xname->name, xname->len);
+}
+
+static inline int
+xfblob_loadname(
+ struct xfblob *blob,
+ xfblob_cookie cookie,
+ struct xfs_name *xname,
+ uint32_t size)
+{
+ int ret = xfblob_load(blob, cookie, (void *)xname->name, size);
+ if (ret)
+ return ret;
+
+ xname->len = size;
+ return 0;
+}
+
+#endif /* __XFS_SCRUB_XFBLOB_H__ */
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index 8cdd863db585..d848222f802b 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -10,9 +10,9 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
-#include "scrub/scrub.h"
#include "scrub/trace.h"
#include <linux/shmem_fs.h>
@@ -310,3 +310,15 @@ xfile_put_folio(
folio_unlock(folio);
folio_put(folio);
}
+
+/* Discard the page cache that's backing a range of the xfile. */
+void
+xfile_discard(
+ struct xfile *xf,
+ loff_t pos,
+ u64 count)
+{
+ trace_xfile_discard(xf, pos, count);
+
+ shmem_truncate_range(file_inode(xf->file), pos, pos + count - 1);
+}
diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h
index 76d78dba7e34..cc2cc1714cd4 100644
--- a/fs/xfs/scrub/xfile.h
+++ b/fs/xfs/scrub/xfile.h
@@ -17,6 +17,7 @@ int xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos);
int xfile_store(struct xfile *xf, const void *buf, size_t count,
loff_t pos);
+void xfile_discard(struct xfile *xf, loff_t pos, u64 count);
loff_t xfile_seek_data(struct xfile *xf, loff_t pos);
#define XFILE_MAX_FOLIO_SIZE (PAGE_SIZE << MAX_PAGECACHE_ORDER)
@@ -26,4 +27,9 @@ struct folio *xfile_get_folio(struct xfile *xf, loff_t offset, size_t len,
unsigned int flags);
void xfile_put_folio(struct xfile *xf, struct folio *folio);
+static inline unsigned long long xfile_bytes(struct xfile *xf)
+{
+ return file_inode(xf->file)->i_blocks << SECTOR_SHIFT;
+}
+
#endif /* __XFS_SCRUB_XFILE_H__ */
diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h
index a39befa743ce..f17173b83e6f 100644
--- a/fs/xfs/scrub/xfs_scrub.h
+++ b/fs/xfs/scrub/xfs_scrub.h
@@ -7,9 +7,11 @@
#define __XFS_SCRUB_H__
#ifndef CONFIG_XFS_ONLINE_SCRUB
-# define xfs_scrub_metadata(file, sm) (-ENOTTY)
+# define xfs_ioc_scrub_metadata(f, a) (-ENOTTY)
+# define xfs_ioc_scrubv_metadata(f, a) (-ENOTTY)
#else
-int xfs_scrub_metadata(struct file *file, struct xfs_scrub_metadata *sm);
+int xfs_ioc_scrub_metadata(struct file *file, void __user *arg);
+int xfs_ioc_scrubv_metadata(struct file *file, void __user *arg);
#endif /* CONFIG_XFS_ONLINE_SCRUB */
#endif /* __XFS_SCRUB_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 4bf69c9c088e..c7c3dcfa2718 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -201,16 +201,17 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
if (!args.value)
return -ENOMEM;
xfs_acl_to_disk(args.value, acl);
+ error = xfs_attr_change(&args, XFS_ATTRUPDATE_UPSERT);
+ kvfree(args.value);
+ } else {
+ error = xfs_attr_change(&args, XFS_ATTRUPDATE_REMOVE);
+ /*
+ * If the attribute didn't exist to start with that's fine.
+ */
+ if (error == -ENOATTR)
+ error = 0;
}
- error = xfs_attr_change(&args);
- kvfree(args.value);
-
- /*
- * If the attribute didn't exist to start with that's fine.
- */
- if (!acl && error == -ENOATTR)
- error = 0;
if (!error)
set_cached_acl(inode, type, acl);
return error;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3f428620ebf2..6dead20338e2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -233,45 +233,6 @@ xfs_imap_valid(
return true;
}
-/*
- * Pass in a dellalloc extent and convert it to real extents, return the real
- * extent that maps offset_fsb in wpc->iomap.
- *
- * The current page is held locked so nothing could have removed the block
- * backing offset_fsb, although it could have moved from the COW to the data
- * fork by another thread.
- */
-static int
-xfs_convert_blocks(
- struct iomap_writepage_ctx *wpc,
- struct xfs_inode *ip,
- int whichfork,
- loff_t offset)
-{
- int error;
- unsigned *seq;
-
- if (whichfork == XFS_COW_FORK)
- seq = &XFS_WPC(wpc)->cow_seq;
- else
- seq = &XFS_WPC(wpc)->data_seq;
-
- /*
- * Attempt to allocate whatever delalloc extent currently backs offset
- * and put the result into wpc->iomap. Allocate in a loop because it
- * may take several attempts to allocate real blocks for a contiguous
- * delalloc extent if free space is sufficiently fragmented.
- */
- do {
- error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
- &wpc->iomap, seq);
- if (error)
- return error;
- } while (wpc->iomap.offset + wpc->iomap.length <= offset);
-
- return 0;
-}
-
static int
xfs_map_blocks(
struct iomap_writepage_ctx *wpc,
@@ -290,6 +251,7 @@ xfs_map_blocks(
struct xfs_iext_cursor icur;
int retries = 0;
int error = 0;
+ unsigned int *seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -387,7 +349,19 @@ retry:
trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
return 0;
allocate_blocks:
- error = xfs_convert_blocks(wpc, ip, whichfork, offset);
+ /*
+ * Convert a dellalloc extent to a real one. The current page is held
+ * locked so nothing could have removed the block backing offset_fsb,
+ * although it could have moved from the COW to the data fork by another
+ * thread.
+ */
+ if (whichfork == XFS_COW_FORK)
+ seq = &XFS_WPC(wpc)->cow_seq;
+ else
+ seq = &XFS_WPC(wpc)->data_seq;
+
+ error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
+ &wpc->iomap, seq);
if (error) {
/*
* If we failed to find the extent in the COW fork we might have
@@ -469,7 +443,6 @@ xfs_discard_folio(
{
struct xfs_inode *ip = XFS_I(folio->mapping->host);
struct xfs_mount *mp = ip->i_mount;
- int error;
if (xfs_is_shutdown(mp))
return;
@@ -483,11 +456,8 @@ xfs_discard_folio(
* byte of the next folio. Hence the end offset is only dependent on the
* folio itself and not the start offset that is passed in.
*/
- error = xfs_bmap_punch_delalloc_range(ip, pos,
+ xfs_bmap_punch_delalloc_range(ip, pos,
folio_pos(folio) + folio_size(folio));
-
- if (error && !xfs_is_shutdown(mp))
- xfs_alert(mp, "page discard unable to remove delalloc mapping.");
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 9b4c61e1c22e..2b10ac4c5fce 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -27,6 +27,7 @@
#include "xfs_error.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
+#include "xfs_parent.h"
struct kmem_cache *xfs_attri_cache;
struct kmem_cache *xfs_attrd_cache;
@@ -73,8 +74,12 @@ static inline struct xfs_attri_log_nameval *
xfs_attri_log_nameval_alloc(
const void *name,
unsigned int name_len,
+ const void *new_name,
+ unsigned int new_name_len,
const void *value,
- unsigned int value_len)
+ unsigned int value_len,
+ const void *new_value,
+ unsigned int new_value_len)
{
struct xfs_attri_log_nameval *nv;
@@ -83,15 +88,26 @@ xfs_attri_log_nameval_alloc(
* this. But kvmalloc() utterly sucks, so we use our own version.
*/
nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) +
- name_len + value_len);
+ name_len + new_name_len + value_len +
+ new_value_len);
nv->name.i_addr = nv + 1;
nv->name.i_len = name_len;
nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME;
memcpy(nv->name.i_addr, name, name_len);
+ if (new_name_len) {
+ nv->new_name.i_addr = nv->name.i_addr + name_len;
+ nv->new_name.i_len = new_name_len;
+ memcpy(nv->new_name.i_addr, new_name, new_name_len);
+ } else {
+ nv->new_name.i_addr = NULL;
+ nv->new_name.i_len = 0;
+ }
+ nv->new_name.i_type = XLOG_REG_TYPE_ATTR_NEWNAME;
+
if (value_len) {
- nv->value.i_addr = nv->name.i_addr + name_len;
+ nv->value.i_addr = nv->name.i_addr + name_len + new_name_len;
nv->value.i_len = value_len;
memcpy(nv->value.i_addr, value, value_len);
} else {
@@ -100,6 +116,17 @@ xfs_attri_log_nameval_alloc(
}
nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE;
+ if (new_value_len) {
+ nv->new_value.i_addr = nv->name.i_addr + name_len +
+ new_name_len + value_len;
+ nv->new_value.i_len = new_value_len;
+ memcpy(nv->new_value.i_addr, new_value, new_value_len);
+ } else {
+ nv->new_value.i_addr = NULL;
+ nv->new_value.i_len = 0;
+ }
+ nv->new_value.i_type = XLOG_REG_TYPE_ATTR_NEWVALUE;
+
refcount_set(&nv->refcount, 1);
return nv;
}
@@ -145,11 +172,20 @@ xfs_attri_item_size(
*nbytes += sizeof(struct xfs_attri_log_format) +
xlog_calc_iovec_len(nv->name.i_len);
- if (!nv->value.i_len)
- return;
+ if (nv->new_name.i_len) {
+ *nvecs += 1;
+ *nbytes += xlog_calc_iovec_len(nv->new_name.i_len);
+ }
- *nvecs += 1;
- *nbytes += xlog_calc_iovec_len(nv->value.i_len);
+ if (nv->value.i_len) {
+ *nvecs += 1;
+ *nbytes += xlog_calc_iovec_len(nv->value.i_len);
+ }
+
+ if (nv->new_value.i_len) {
+ *nvecs += 1;
+ *nbytes += xlog_calc_iovec_len(nv->new_value.i_len);
+ }
}
/*
@@ -179,15 +215,28 @@ xfs_attri_item_format(
ASSERT(nv->name.i_len > 0);
attrip->attri_format.alfi_size++;
+ if (nv->new_name.i_len > 0)
+ attrip->attri_format.alfi_size++;
+
if (nv->value.i_len > 0)
attrip->attri_format.alfi_size++;
+ if (nv->new_value.i_len > 0)
+ attrip->attri_format.alfi_size++;
+
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT,
&attrip->attri_format,
sizeof(struct xfs_attri_log_format));
xlog_copy_from_iovec(lv, &vecp, &nv->name);
+
+ if (nv->new_name.i_len > 0)
+ xlog_copy_from_iovec(lv, &vecp, &nv->new_name);
+
if (nv->value.i_len > 0)
xlog_copy_from_iovec(lv, &vecp, &nv->value);
+
+ if (nv->new_value.i_len > 0)
+ xlog_copy_from_iovec(lv, &vecp, &nv->new_value);
}
/*
@@ -308,6 +357,12 @@ xfs_attrd_item_intent(
return &ATTRD_ITEM(lip)->attrd_attrip->attri_item;
}
+static inline unsigned int
+xfs_attr_log_item_op(const struct xfs_attri_log_format *attrp)
+{
+ return attrp->alfi_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+}
+
/* Log an attr to the intent item. */
STATIC void
xfs_attr_log_item(
@@ -316,6 +371,8 @@ xfs_attr_log_item(
const struct xfs_attr_intent *attr)
{
struct xfs_attri_log_format *attrp;
+ struct xfs_attri_log_nameval *nv = attr->xattri_nameval;
+ struct xfs_da_args *args = attr->xattri_da_args;
/*
* At this point the xfs_attr_intent has been constructed, and we've
@@ -323,13 +380,30 @@ xfs_attr_log_item(
* structure with fields from this xfs_attr_intent
*/
attrp = &attrip->attri_format;
- attrp->alfi_ino = attr->xattri_da_args->dp->i_ino;
+ attrp->alfi_ino = args->dp->i_ino;
ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK));
attrp->alfi_op_flags = attr->xattri_op_flags;
- attrp->alfi_value_len = attr->xattri_nameval->value.i_len;
- attrp->alfi_name_len = attr->xattri_nameval->name.i_len;
- ASSERT(!(attr->xattri_da_args->attr_filter & ~XFS_ATTRI_FILTER_MASK));
- attrp->alfi_attr_filter = attr->xattri_da_args->attr_filter;
+ attrp->alfi_value_len = nv->value.i_len;
+
+ switch (xfs_attr_log_item_op(attrp)) {
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
+ ASSERT(nv->value.i_len == nv->new_value.i_len);
+
+ attrp->alfi_igen = VFS_I(args->dp)->i_generation;
+ attrp->alfi_old_name_len = nv->name.i_len;
+ attrp->alfi_new_name_len = nv->new_name.i_len;
+ break;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
+ case XFS_ATTRI_OP_FLAGS_PPTR_SET:
+ attrp->alfi_igen = VFS_I(args->dp)->i_generation;
+ fallthrough;
+ default:
+ attrp->alfi_name_len = nv->name.i_len;
+ break;
+ }
+
+ ASSERT(!(args->attr_filter & ~XFS_ATTRI_FILTER_MASK));
+ attrp->alfi_attr_filter = args->attr_filter;
}
/* Get an ATTRI. */
@@ -368,8 +442,11 @@ xfs_attr_create_intent(
* Transfer our reference to the name/value buffer to the
* deferred work state structure.
*/
- attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name,
- args->namelen, args->value, args->valuelen);
+ attr->xattri_nameval = xfs_attri_log_nameval_alloc(
+ args->name, args->namelen,
+ args->new_name, args->new_namelen,
+ args->value, args->valuelen,
+ args->new_value, args->new_valuelen);
}
attrip = xfs_attri_init(mp, attr->xattri_nameval);
@@ -460,17 +537,19 @@ xfs_attri_item_match(
return ATTRI_ITEM(lip)->attri_format.alfi_id == intent_id;
}
+static inline bool
+xfs_attri_validate_namelen(unsigned int namelen)
+{
+ return namelen > 0 && namelen <= XATTR_NAME_MAX;
+}
+
/* Is this recovered ATTRI format ok? */
static inline bool
xfs_attri_validate(
struct xfs_mount *mp,
struct xfs_attri_log_format *attrp)
{
- unsigned int op = attrp->alfi_op_flags &
- XFS_ATTRI_OP_FLAGS_TYPE_MASK;
-
- if (attrp->__pad != 0)
- return false;
+ unsigned int op = xfs_attr_log_item_op(attrp);
if (attrp->alfi_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)
return false;
@@ -478,24 +557,75 @@ xfs_attri_validate(
if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK)
return false;
- /* alfi_op_flags should be either a set or remove */
+ if (!xfs_attr_check_namespace(attrp->alfi_attr_filter &
+ XFS_ATTR_NSP_ONDISK_MASK))
+ return false;
+
switch (op) {
+ case XFS_ATTRI_OP_FLAGS_PPTR_SET:
+ case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
+ if (!xfs_has_parent(mp))
+ return false;
+ if (attrp->alfi_value_len != sizeof(struct xfs_parent_rec))
+ return false;
+ if (!xfs_attri_validate_namelen(attrp->alfi_name_len))
+ return false;
+ if (!(attrp->alfi_attr_filter & XFS_ATTR_PARENT))
+ return false;
+ break;
case XFS_ATTRI_OP_FLAGS_SET:
case XFS_ATTRI_OP_FLAGS_REPLACE:
+ if (!xfs_is_using_logged_xattrs(mp))
+ return false;
+ if (attrp->alfi_value_len > XATTR_SIZE_MAX)
+ return false;
+ if (!xfs_attri_validate_namelen(attrp->alfi_name_len))
+ return false;
+ break;
case XFS_ATTRI_OP_FLAGS_REMOVE:
+ if (!xfs_is_using_logged_xattrs(mp))
+ return false;
+ if (attrp->alfi_value_len != 0)
+ return false;
+ if (!xfs_attri_validate_namelen(attrp->alfi_name_len))
+ return false;
+ break;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
+ if (!xfs_has_parent(mp))
+ return false;
+ if (!xfs_attri_validate_namelen(attrp->alfi_old_name_len))
+ return false;
+ if (!xfs_attri_validate_namelen(attrp->alfi_new_name_len))
+ return false;
+ if (attrp->alfi_value_len != sizeof(struct xfs_parent_rec))
+ return false;
+ if (!(attrp->alfi_attr_filter & XFS_ATTR_PARENT))
+ return false;
break;
default:
return false;
}
- if (attrp->alfi_value_len > XATTR_SIZE_MAX)
- return false;
+ return xfs_verify_ino(mp, attrp->alfi_ino);
+}
- if ((attrp->alfi_name_len > XATTR_NAME_MAX) ||
- (attrp->alfi_name_len == 0))
- return false;
+static int
+xfs_attri_iread_extents(
+ struct xfs_inode *ip)
+{
+ struct xfs_trans *tp;
+ int error;
- return xfs_verify_ino(mp, attrp->alfi_ino);
+ error = xfs_trans_alloc_empty(ip->i_mount, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_cancel(tp);
+
+ return error;
}
static inline struct xfs_attr_intent *
@@ -508,20 +638,46 @@ xfs_attri_recover_work(
{
struct xfs_attr_intent *attr;
struct xfs_da_args *args;
+ struct xfs_inode *ip;
int local;
int error;
- error = xlog_recover_iget(mp, attrp->alfi_ino, ipp);
- if (error)
- return ERR_PTR(error);
+ /*
+ * Parent pointer attr items record the generation but regular logged
+ * xattrs do not; select the right iget function.
+ */
+ switch (xfs_attr_log_item_op(attrp)) {
+ case XFS_ATTRI_OP_FLAGS_PPTR_SET:
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
+ case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
+ error = xlog_recover_iget_handle(mp, attrp->alfi_ino,
+ attrp->alfi_igen, &ip);
+ break;
+ default:
+ error = xlog_recover_iget(mp, attrp->alfi_ino, &ip);
+ break;
+ }
+ if (error) {
+ xfs_irele(ip);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attrp,
+ sizeof(*attrp));
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+
+ if (xfs_inode_has_attr_fork(ip)) {
+ error = xfs_attri_iread_extents(ip);
+ if (error) {
+ xfs_irele(ip);
+ return ERR_PTR(error);
+ }
+ }
attr = kzalloc(sizeof(struct xfs_attr_intent) +
sizeof(struct xfs_da_args), GFP_KERNEL | __GFP_NOFAIL);
args = (struct xfs_da_args *)(attr + 1);
attr->xattri_da_args = args;
- attr->xattri_op_flags = attrp->alfi_op_flags &
- XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+ attr->xattri_op_flags = xfs_attr_log_item_op(attrp);
/*
* We're reconstructing the deferred work state structure from the
@@ -531,35 +687,42 @@ xfs_attri_recover_work(
attr->xattri_nameval = xfs_attri_log_nameval_get(nv);
ASSERT(attr->xattri_nameval);
- args->dp = *ipp;
+ args->dp = ip;
args->geo = mp->m_attr_geo;
args->whichfork = XFS_ATTR_FORK;
args->name = nv->name.i_addr;
args->namelen = nv->name.i_len;
- args->hashval = xfs_da_hashname(args->name, args->namelen);
+ args->new_name = nv->new_name.i_addr;
+ args->new_namelen = nv->new_name.i_len;
+ args->value = nv->value.i_addr;
+ args->valuelen = nv->value.i_len;
+ args->new_value = nv->new_value.i_addr;
+ args->new_valuelen = nv->new_value.i_len;
args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK;
args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT |
XFS_DA_OP_LOGGED;
+ args->owner = args->dp->i_ino;
+ xfs_attr_sethash(args);
- ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb));
-
- switch (attr->xattri_op_flags) {
+ switch (xfs_attr_intent_op(attr)) {
+ case XFS_ATTRI_OP_FLAGS_PPTR_SET:
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
case XFS_ATTRI_OP_FLAGS_SET:
case XFS_ATTRI_OP_FLAGS_REPLACE:
- args->value = nv->value.i_addr;
- args->valuelen = nv->value.i_len;
args->total = xfs_attr_calc_size(args, &local);
if (xfs_inode_hasattr(args->dp))
attr->xattri_dela_state = xfs_attr_init_replace_state(args);
else
attr->xattri_dela_state = xfs_attr_init_add_state(args);
break;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
case XFS_ATTRI_OP_FLAGS_REMOVE:
attr->xattri_dela_state = xfs_attr_init_remove_state(args);
break;
}
xfs_defer_add_item(dfp, &attr->xattri_list);
+ *ipp = ip;
return attr;
}
@@ -591,7 +754,8 @@ xfs_attr_recover_work(
*/
attrp = &attrip->attri_format;
if (!xfs_attri_validate(mp, attrp) ||
- !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len))
+ !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.i_addr,
+ nv->name.i_len))
return -EFSCORRUPTED;
attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv);
@@ -614,16 +778,17 @@ xfs_attr_recover_work(
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
&attrip->attri_format,
sizeof(attrip->attri_format));
- if (error) {
- xfs_trans_cancel(tp);
- goto out_unlock;
- }
+ if (error)
+ goto out_cancel;
error = xfs_defer_ops_capture_and_commit(tp, capture_list);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
return error;
+out_cancel:
+ xfs_trans_cancel(tp);
+ goto out_unlock;
}
/* Re-log an intent item to push the log tail forward. */
@@ -649,9 +814,20 @@ xfs_attr_relog_intent(
new_attrp = &new_attrip->attri_format;
new_attrp->alfi_ino = old_attrp->alfi_ino;
+ new_attrp->alfi_igen = old_attrp->alfi_igen;
new_attrp->alfi_op_flags = old_attrp->alfi_op_flags;
new_attrp->alfi_value_len = old_attrp->alfi_value_len;
- new_attrp->alfi_name_len = old_attrp->alfi_name_len;
+
+ switch (xfs_attr_log_item_op(old_attrp)) {
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
+ new_attrp->alfi_new_name_len = old_attrp->alfi_new_name_len;
+ new_attrp->alfi_old_name_len = old_attrp->alfi_old_name_len;
+ break;
+ default:
+ new_attrp->alfi_name_len = old_attrp->alfi_name_len;
+ break;
+ }
+
new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter;
return &new_attrip->attri_item;
@@ -679,6 +855,75 @@ xfs_attr_create_done(
return &attrdp->attrd_item;
}
+void
+xfs_attr_defer_add(
+ struct xfs_da_args *args,
+ enum xfs_attr_defer_op op)
+{
+ struct xfs_attr_intent *new;
+ unsigned int log_op = 0;
+ bool is_pptr = args->attr_filter & XFS_ATTR_PARENT;
+
+ if (is_pptr) {
+ ASSERT(xfs_has_parent(args->dp->i_mount));
+ ASSERT((args->attr_filter & ~XFS_ATTR_PARENT) == 0);
+ ASSERT(args->op_flags & XFS_DA_OP_LOGGED);
+ ASSERT(args->valuelen == sizeof(struct xfs_parent_rec));
+ }
+
+ new = kmem_cache_zalloc(xfs_attr_intent_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ new->xattri_da_args = args;
+
+ /* Compute log operation from the higher level op and namespace. */
+ switch (op) {
+ case XFS_ATTR_DEFER_SET:
+ if (is_pptr)
+ log_op = XFS_ATTRI_OP_FLAGS_PPTR_SET;
+ else
+ log_op = XFS_ATTRI_OP_FLAGS_SET;
+ break;
+ case XFS_ATTR_DEFER_REPLACE:
+ if (is_pptr)
+ log_op = XFS_ATTRI_OP_FLAGS_PPTR_REPLACE;
+ else
+ log_op = XFS_ATTRI_OP_FLAGS_REPLACE;
+ break;
+ case XFS_ATTR_DEFER_REMOVE:
+ if (is_pptr)
+ log_op = XFS_ATTRI_OP_FLAGS_PPTR_REMOVE;
+ else
+ log_op = XFS_ATTRI_OP_FLAGS_REMOVE;
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+ new->xattri_op_flags = log_op;
+
+ /* Set up initial attr operation state. */
+ switch (log_op) {
+ case XFS_ATTRI_OP_FLAGS_PPTR_SET:
+ case XFS_ATTRI_OP_FLAGS_SET:
+ new->xattri_dela_state = xfs_attr_init_add_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
+ ASSERT(args->new_valuelen == args->valuelen);
+ new->xattri_dela_state = xfs_attr_init_replace_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ new->xattri_dela_state = xfs_attr_init_replace_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ new->xattri_dela_state = xfs_attr_init_remove_state(args);
+ break;
+ }
+
+ xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type);
+ trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
+}
+
const struct xfs_defer_op_type xfs_attr_defer_type = {
.name = "attr",
.max_items = 1,
@@ -691,6 +936,56 @@ const struct xfs_defer_op_type xfs_attr_defer_type = {
.relog_intent = xfs_attr_relog_intent,
};
+static inline void *
+xfs_attri_validate_name_iovec(
+ struct xfs_mount *mp,
+ struct xfs_attri_log_format *attri_formatp,
+ const struct xfs_log_iovec *iovec,
+ unsigned int name_len)
+{
+ if (iovec->i_len != xlog_calc_iovec_len(name_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, sizeof(*attri_formatp));
+ return NULL;
+ }
+
+ if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, iovec->i_addr,
+ name_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, sizeof(*attri_formatp));
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ iovec->i_addr, iovec->i_len);
+ return NULL;
+ }
+
+ return iovec->i_addr;
+}
+
+static inline void *
+xfs_attri_validate_value_iovec(
+ struct xfs_mount *mp,
+ struct xfs_attri_log_format *attri_formatp,
+ const struct xfs_log_iovec *iovec,
+ unsigned int value_len)
+{
+ if (iovec->i_len != xlog_calc_iovec_len(value_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, sizeof(*attri_formatp));
+ return NULL;
+ }
+
+ if ((attri_formatp->alfi_attr_filter & XFS_ATTR_PARENT) &&
+ !xfs_parent_valuecheck(mp, iovec->i_addr, value_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, sizeof(*attri_formatp));
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ iovec->i_addr, iovec->i_len);
+ return NULL;
+ }
+
+ return iovec->i_addr;
+}
+
STATIC int
xlog_recover_attri_commit_pass2(
struct xlog *log,
@@ -702,51 +997,177 @@ xlog_recover_attri_commit_pass2(
struct xfs_attri_log_item *attrip;
struct xfs_attri_log_format *attri_formatp;
struct xfs_attri_log_nameval *nv;
- const void *attr_value = NULL;
const void *attr_name;
+ const void *attr_value = NULL;
+ const void *attr_new_name = NULL;
+ const void *attr_new_value = NULL;
size_t len;
-
- attri_formatp = item->ri_buf[0].i_addr;
- attr_name = item->ri_buf[1].i_addr;
+ unsigned int name_len = 0;
+ unsigned int value_len = 0;
+ unsigned int new_name_len = 0;
+ unsigned int new_value_len = 0;
+ unsigned int op, i = 0;
/* Validate xfs_attri_log_format before the large memory allocation */
len = sizeof(struct xfs_attri_log_format);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[i].i_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
return -EFSCORRUPTED;
}
+ attri_formatp = item->ri_buf[i].i_addr;
if (!xfs_attri_validate(mp, attri_formatp)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ attri_formatp, len);
return -EFSCORRUPTED;
}
- /* Validate the attr name */
- if (item->ri_buf[1].i_len !=
- xlog_calc_iovec_len(attri_formatp->alfi_name_len)) {
+ /* Check the number of log iovecs makes sense for the op code. */
+ op = xfs_attr_log_item_op(attri_formatp);
+ switch (op) {
+ case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
+ case XFS_ATTRI_OP_FLAGS_PPTR_SET:
+ /* Log item, attr name, attr value */
+ if (item->ri_total != 3) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ name_len = attri_formatp->alfi_name_len;
+ value_len = attri_formatp->alfi_value_len;
+ break;
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ /* Log item, attr name, attr value */
+ if (item->ri_total != 3) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ name_len = attri_formatp->alfi_name_len;
+ value_len = attri_formatp->alfi_value_len;
+ break;
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ /* Log item, attr name */
+ if (item->ri_total != 2) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ name_len = attri_formatp->alfi_name_len;
+ break;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
+ /*
+ * Log item, attr name, new attr name, attr value, new attr
+ * value
+ */
+ if (item->ri_total != 5) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ name_len = attri_formatp->alfi_old_name_len;
+ new_name_len = attri_formatp->alfi_new_name_len;
+ new_value_len = value_len = attri_formatp->alfi_value_len;
+ break;
+ default:
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ attri_formatp, len);
return -EFSCORRUPTED;
}
+ i++;
- if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[1].i_addr, item->ri_buf[1].i_len);
+ /* Validate the attr name */
+ attr_name = xfs_attri_validate_name_iovec(mp, attri_formatp,
+ &item->ri_buf[i], name_len);
+ if (!attr_name)
return -EFSCORRUPTED;
+ i++;
+
+ /* Validate the new attr name */
+ if (new_name_len > 0) {
+ attr_new_name = xfs_attri_validate_name_iovec(mp,
+ attri_formatp, &item->ri_buf[i],
+ new_name_len);
+ if (!attr_new_name)
+ return -EFSCORRUPTED;
+ i++;
}
/* Validate the attr value, if present */
- if (attri_formatp->alfi_value_len != 0) {
- if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) {
+ if (value_len != 0) {
+ attr_value = xfs_attri_validate_value_iovec(mp, attri_formatp,
+ &item->ri_buf[i], value_len);
+ if (!attr_value)
+ return -EFSCORRUPTED;
+ i++;
+ }
+
+ /* Validate the new attr value, if present */
+ if (new_value_len != 0) {
+ attr_new_value = xfs_attri_validate_value_iovec(mp,
+ attri_formatp, &item->ri_buf[i],
+ new_value_len);
+ if (!attr_new_value)
+ return -EFSCORRUPTED;
+ i++;
+ }
+
+ /*
+ * Make sure we got the correct number of buffers for the operation
+ * that we just loaded.
+ */
+ if (i != item->ri_total) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+
+ switch (op) {
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ /* Regular remove operations operate only on names. */
+ if (attr_value != NULL || value_len != 0) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr,
- item->ri_buf[0].i_len);
+ attri_formatp, len);
return -EFSCORRUPTED;
}
-
- attr_value = item->ri_buf[2].i_addr;
+ fallthrough;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
+ case XFS_ATTRI_OP_FLAGS_PPTR_SET:
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ /*
+ * Regular xattr set/remove/replace operations require a name
+ * and do not take a newname. Values are optional for set and
+ * replace.
+ *
+ * Name-value set/remove operations must have a name, do not
+ * take a newname, and can take a value.
+ */
+ if (attr_name == NULL || name_len == 0) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ break;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
+ /*
+ * Name-value replace operations require the caller to
+ * specify the old and new names and values explicitly.
+ * Values are optional.
+ */
+ if (attr_name == NULL || name_len == 0) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ if (attr_new_name == NULL || new_name_len == 0) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ break;
}
/*
@@ -754,9 +1175,10 @@ xlog_recover_attri_commit_pass2(
* name/value buffer to the recovered incore log item and drop our
* reference.
*/
- nv = xfs_attri_log_nameval_alloc(attr_name,
- attri_formatp->alfi_name_len, attr_value,
- attri_formatp->alfi_value_len);
+ nv = xfs_attri_log_nameval_alloc(attr_name, name_len,
+ attr_new_name, new_name_len,
+ attr_value, value_len,
+ attr_new_value, new_value_len);
attrip = xfs_attri_init(mp, nv);
memcpy(&attrip->attri_format, attri_formatp, len);
diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h
index 3280a7930287..e74128cbb722 100644
--- a/fs/xfs/xfs_attr_item.h
+++ b/fs/xfs/xfs_attr_item.h
@@ -13,7 +13,9 @@ struct kmem_zone;
struct xfs_attri_log_nameval {
struct xfs_log_iovec name;
+ struct xfs_log_iovec new_name; /* PPTR_REPLACE only */
struct xfs_log_iovec value;
+ struct xfs_log_iovec new_value; /* PPTR_REPLACE only */
refcount_t refcount;
/* name and value follow the end of this struct */
@@ -51,4 +53,12 @@ struct xfs_attrd_log_item {
extern struct kmem_cache *xfs_attri_cache;
extern struct kmem_cache *xfs_attrd_cache;
+enum xfs_attr_defer_op {
+ XFS_ATTR_DEFER_SET,
+ XFS_ATTR_DEFER_REMOVE,
+ XFS_ATTR_DEFER_REPLACE,
+};
+
+void xfs_attr_defer_add(struct xfs_da_args *args, enum xfs_attr_defer_op op);
+
#endif /* __XFS_ATTR_ITEM_H__ */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index a6819a642cc0..5c947e5ce8b8 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -82,7 +82,8 @@ xfs_attr_shortform_list(
(dp->i_af.if_bytes + sf->count * 16) < context->bufsize)) {
for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) {
if (XFS_IS_CORRUPT(context->dp->i_mount,
- !xfs_attr_namecheck(sfe->nameval,
+ !xfs_attr_namecheck(sfe->flags,
+ sfe->nameval,
sfe->namelen))) {
xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
@@ -91,6 +92,7 @@ xfs_attr_shortform_list(
sfe->flags,
sfe->nameval,
(int)sfe->namelen,
+ &sfe->nameval[sfe->namelen],
(int)sfe->valuelen);
/*
* Either search callback finished early or
@@ -122,7 +124,8 @@ xfs_attr_shortform_list(
for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) {
if (unlikely(
((char *)sfe < (char *)sf) ||
- ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) {
+ ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)) ||
+ !xfs_attr_check_namespace(sfe->flags))) {
XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
XFS_ERRLEVEL_LOW,
context->dp->i_mount, sfe,
@@ -133,12 +136,16 @@ xfs_attr_shortform_list(
}
sbp->entno = i;
- sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
sbp->name = sfe->nameval;
sbp->namelen = sfe->namelen;
/* These are bytes, and both on-disk, don't endian-flip */
+ sbp->value = &sfe->nameval[sfe->namelen],
sbp->valuelen = sfe->valuelen;
sbp->flags = sfe->flags;
+ sbp->hash = xfs_attr_hashval(dp->i_mount, sfe->flags,
+ sfe->nameval, sfe->namelen,
+ sfe->nameval + sfe->namelen,
+ sfe->valuelen);
sfe = xfs_attr_sf_nextentry(sfe);
sbp++;
nsbuf++;
@@ -177,7 +184,7 @@ xfs_attr_shortform_list(
cursor->offset = 0;
}
if (XFS_IS_CORRUPT(context->dp->i_mount,
- !xfs_attr_namecheck(sbp->name,
+ !xfs_attr_namecheck(sbp->flags, sbp->name,
sbp->namelen))) {
xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
error = -EFSCORRUPTED;
@@ -187,6 +194,7 @@ xfs_attr_shortform_list(
sbp->flags,
sbp->name,
sbp->namelen,
+ sbp->value,
sbp->valuelen);
if (context->seen_enough)
break;
@@ -214,6 +222,7 @@ xfs_attr_node_list_lookup(
struct xfs_mount *mp = dp->i_mount;
struct xfs_trans *tp = context->tp;
struct xfs_buf *bp;
+ xfs_failaddr_t fa;
int i;
int error = 0;
unsigned int expected_level = 0;
@@ -238,6 +247,10 @@ xfs_attr_node_list_lookup(
goto out_corruptbuf;
}
+ fa = xfs_da3_node_header_check(bp, dp->i_ino);
+ if (fa)
+ goto out_corruptbuf;
+
xfs_da3_node_hdr_from_disk(mp, &nodehdr, node);
/* Tree taller than we can handle; bail out! */
@@ -273,6 +286,12 @@ xfs_attr_node_list_lookup(
}
}
+ fa = xfs_attr3_leaf_header_check(bp, dp->i_ino);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ goto out_releasebuf;
+ }
+
if (expected_level != 0)
goto out_corruptbuf;
@@ -281,6 +300,7 @@ xfs_attr_node_list_lookup(
out_corruptbuf:
xfs_buf_mark_corrupt(bp);
+out_releasebuf:
xfs_trans_brelse(tp, bp);
xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
@@ -297,6 +317,7 @@ xfs_attr_node_list(
struct xfs_buf *bp;
struct xfs_inode *dp = context->dp;
struct xfs_mount *mp = dp->i_mount;
+ xfs_failaddr_t fa;
int error = 0;
trace_xfs_attr_node_list(context);
@@ -310,46 +331,60 @@ xfs_attr_node_list(
*/
bp = NULL;
if (cursor->blkno > 0) {
+ struct xfs_attr_leaf_entry *entries;
+
error = xfs_da3_node_read(context->tp, dp, cursor->blkno, &bp,
XFS_ATTR_FORK);
if (xfs_metadata_is_sick(error))
xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
- if ((error != 0) && (error != -EFSCORRUPTED))
+ if (error != 0 && error != -EFSCORRUPTED)
return error;
- if (bp) {
- struct xfs_attr_leaf_entry *entries;
+ if (!bp)
+ goto need_lookup;
- node = bp->b_addr;
- switch (be16_to_cpu(node->hdr.info.magic)) {
- case XFS_DA_NODE_MAGIC:
- case XFS_DA3_NODE_MAGIC:
- trace_xfs_attr_list_wrong_blk(context);
+ node = bp->b_addr;
+ switch (be16_to_cpu(node->hdr.info.magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ trace_xfs_attr_list_wrong_blk(context);
+ fa = xfs_da3_node_header_check(bp, dp->i_ino);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
+ }
+ xfs_trans_brelse(context->tp, bp);
+ bp = NULL;
+ break;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ leaf = bp->b_addr;
+ fa = xfs_attr3_leaf_header_check(bp, dp->i_ino);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
xfs_trans_brelse(context->tp, bp);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
bp = NULL;
break;
- case XFS_ATTR_LEAF_MAGIC:
- case XFS_ATTR3_LEAF_MAGIC:
- leaf = bp->b_addr;
- xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo,
- &leafhdr, leaf);
- entries = xfs_attr3_leaf_entryp(leaf);
- if (cursor->hashval > be32_to_cpu(
- entries[leafhdr.count - 1].hashval)) {
- trace_xfs_attr_list_wrong_blk(context);
- xfs_trans_brelse(context->tp, bp);
- bp = NULL;
- } else if (cursor->hashval <= be32_to_cpu(
- entries[0].hashval)) {
- trace_xfs_attr_list_wrong_blk(context);
- xfs_trans_brelse(context->tp, bp);
- bp = NULL;
- }
- break;
- default:
+ }
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo,
+ &leafhdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if (cursor->hashval > be32_to_cpu(
+ entries[leafhdr.count - 1].hashval)) {
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(context->tp, bp);
+ bp = NULL;
+ } else if (cursor->hashval <= be32_to_cpu(
+ entries[0].hashval)) {
trace_xfs_attr_list_wrong_blk(context);
xfs_trans_brelse(context->tp, bp);
bp = NULL;
}
+ break;
+ default:
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(context->tp, bp);
+ bp = NULL;
}
}
@@ -359,6 +394,7 @@ xfs_attr_node_list(
* Note that start of node block is same as start of leaf block.
*/
if (bp == NULL) {
+need_lookup:
error = xfs_attr_node_list_lookup(context, cursor, &bp);
if (error || !bp)
return error;
@@ -380,8 +416,8 @@ xfs_attr_node_list(
break;
cursor->blkno = leafhdr.forw;
xfs_trans_brelse(context->tp, bp);
- error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno,
- &bp);
+ error = xfs_attr3_leaf_read(context->tp, dp, dp->i_ino,
+ cursor->blkno, &bp);
if (error)
return error;
}
@@ -446,6 +482,7 @@ xfs_attr3_leaf_list_int(
*/
for (; i < ichdr.count; entry++, i++) {
char *name;
+ void *value;
int namelen, valuelen;
if (be32_to_cpu(entry->hashval) != cursor->hashval) {
@@ -463,6 +500,7 @@ xfs_attr3_leaf_list_int(
name_loc = xfs_attr3_leaf_name_local(leaf, i);
name = name_loc->nameval;
namelen = name_loc->namelen;
+ value = &name_loc->nameval[name_loc->namelen];
valuelen = be16_to_cpu(name_loc->valuelen);
} else {
xfs_attr_leaf_name_remote_t *name_rmt;
@@ -470,16 +508,18 @@ xfs_attr3_leaf_list_int(
name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
name = name_rmt->name;
namelen = name_rmt->namelen;
+ value = NULL;
valuelen = be32_to_cpu(name_rmt->valuelen);
}
if (XFS_IS_CORRUPT(context->dp->i_mount,
- !xfs_attr_namecheck(name, namelen))) {
+ !xfs_attr_namecheck(entry->flags, name,
+ namelen))) {
xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
context->put_listent(context, entry->flags,
- name, namelen, valuelen);
+ name, namelen, value, valuelen);
if (context->seen_enough)
break;
cursor->offset++;
@@ -501,7 +541,8 @@ xfs_attr_leaf_list(
trace_xfs_attr_leaf_list(context);
context->cursor.blkno = 0;
- error = xfs_attr3_leaf_read(context->tp, context->dp, 0, &bp);
+ error = xfs_attr3_leaf_read(context->tp, context->dp,
+ context->dp->i_ino, 0, &bp);
if (error)
return error;
@@ -515,6 +556,7 @@ xfs_attr_list_ilocked(
struct xfs_attr_list_context *context)
{
struct xfs_inode *dp = context->dp;
+ int error;
xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
@@ -525,6 +567,12 @@ xfs_attr_list_ilocked(
return 0;
if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
return xfs_attr_shortform_list(context);
+
+ /* Prerequisite for xfs_attr_is_leaf */
+ error = xfs_iread_extents(NULL, dp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
if (xfs_attr_is_leaf(dp))
return xfs_attr_leaf_list(context);
return xfs_attr_node_list(context);
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index d27859a684aa..a19d62e78aa1 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -524,9 +524,7 @@ xfs_bmap_recover_work(
else
iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
- error = xfs_iext_count_may_overflow(ip, work->bi_whichfork, iext_delta);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip, iext_delta);
+ error = xfs_iext_count_extend(tp, ip, work->bi_whichfork, iext_delta);
if (error)
goto err_cancel;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 19e11d1da660..ac2e77ebb54c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -440,7 +440,7 @@ out_unlock_iolock:
* if the ranges only partially overlap them, so it is up to the caller to
* ensure that partial blocks are not passed in.
*/
-int
+void
xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip,
xfs_off_t start_byte,
@@ -452,7 +452,6 @@ xfs_bmap_punch_delalloc_range(
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte);
struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
- int error = 0;
ASSERT(!xfs_need_iread_extents(ifp));
@@ -476,15 +475,13 @@ xfs_bmap_punch_delalloc_range(
continue;
}
- error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur,
- &got, &del);
- if (error || !xfs_iext_get_extent(ifp, &icur, &got))
+ xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, &got, &del);
+ if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
}
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
}
/*
@@ -542,7 +539,7 @@ xfs_can_free_eofblocks(
* forever.
*/
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
- if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1)
+ if (xfs_inode_has_bigrtalloc(ip))
end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
if (last_fsb <= end_fsb)
@@ -713,41 +710,37 @@ xfs_alloc_file_space(
if (error)
break;
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip,
- XFS_IEXT_ADD_NOSPLIT_CNT);
- if (error)
- goto error;
-
- error = xfs_bmapi_write(tp, ip, startoffset_fsb,
- allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
- &nimaps);
if (error)
goto error;
- ip->i_diflags |= XFS_DIFLAG_PREALLOC;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- break;
-
/*
* If the allocator cannot find a single free extent large
* enough to cover the start block of the requested range,
- * xfs_bmapi_write will return 0 but leave *nimaps set to 0.
+ * xfs_bmapi_write will return -ENOSR.
*
* In that case we simply need to keep looping with the same
* startoffset_fsb so that one of the following allocations
* will eventually reach the requested range.
*/
- if (nimaps) {
+ error = xfs_bmapi_write(tp, ip, startoffset_fsb,
+ allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
+ &nimaps);
+ if (error) {
+ if (error != -ENOSR)
+ goto error;
+ error = 0;
+ } else {
startoffset_fsb += imapp->br_blockcount;
allocatesize_fsb -= imapp->br_blockcount;
}
+
+ ip->i_diflags |= XFS_DIFLAG_PREALLOC;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
return error;
@@ -775,10 +768,8 @@ xfs_unmap_extent(
if (error)
return error;
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_PUNCH_HOLE_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
@@ -843,7 +834,7 @@ xfs_free_file_space(
endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
/* We can only free complete realtime extents. */
- if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) {
+ if (xfs_inode_has_bigrtalloc(ip)) {
startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb);
endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb);
}
@@ -1054,10 +1045,8 @@ xfs_insert_file_space(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_PUNCH_HOLE_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
@@ -1283,23 +1272,17 @@ xfs_swap_extent_rmap(
trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
if (xfs_bmap_is_real_extent(&uirec)) {
- error = xfs_iext_count_may_overflow(ip,
+ error = xfs_iext_count_extend(tp, ip,
XFS_DATA_FORK,
XFS_IEXT_SWAP_RMAP_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip,
- XFS_IEXT_SWAP_RMAP_CNT);
if (error)
goto out;
}
if (xfs_bmap_is_real_extent(&irec)) {
- error = xfs_iext_count_may_overflow(tip,
+ error = xfs_iext_count_extend(tp, tip,
XFS_DATA_FORK,
XFS_IEXT_SWAP_RMAP_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip,
- XFS_IEXT_SWAP_RMAP_CNT);
if (error)
goto out;
}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 77ecbb753ef2..51f84d8ff372 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -30,7 +30,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
}
#endif /* CONFIG_XFS_RT */
-int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
+void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
xfs_off_t start_byte, xfs_off_t end_byte);
struct kgetbmap {
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f0fa02264eda..aa4dbda7b536 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -494,6 +494,9 @@ _xfs_buf_obj_cmp(
* it stale has not yet committed. i.e. we are
* reallocating a busy extent. Skip this buffer and
* continue searching for an exact match.
+ *
+ * Note: If we're scanning for incore buffers to stale, don't
+ * complain if we find non-stale buffers.
*/
if (!(map->bm_flags & XBM_LIVESCAN))
ASSERT(bp->b_flags & XBF_STALE);
@@ -2043,7 +2046,7 @@ xfs_setsize_buftarg(
btp->bt_meta_sectorsize = sectorsize;
btp->bt_meta_sectormask = sectorsize - 1;
- if (set_blocksize(btp->bt_bdev, sectorsize)) {
+ if (set_blocksize(btp->bt_bdev_file, sectorsize)) {
xfs_warn(btp->bt_mount,
"Cannot set_blocksize to %u on device %pg",
sectorsize, btp->bt_bdev);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index cf9296b7e06f..06ac5a7de60a 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -157,7 +157,7 @@ xfs_dir2_block_getdents(
if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
return 0;
- error = xfs_dir3_block_read(args->trans, dp, &bp);
+ error = xfs_dir3_block_read(args->trans, dp, args->owner, &bp);
if (error)
return error;
@@ -282,7 +282,8 @@ xfs_dir2_leaf_readbuf(
new_off = xfs_dir2_da_to_byte(geo, map.br_startoff);
if (new_off > *cur_off)
*cur_off = new_off;
- error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, 0, &bp);
+ error = xfs_dir3_data_read(args->trans, dp, args->owner,
+ map.br_startoff, 0, &bp);
if (error)
goto out;
@@ -515,7 +516,6 @@ xfs_readdir(
{
struct xfs_da_args args = { NULL };
unsigned int lock_mode;
- bool isblock;
int error;
trace_xfs_readdir(dp);
@@ -532,23 +532,24 @@ xfs_readdir(
args.dp = dp;
args.geo = dp->i_mount->m_dir_geo;
args.trans = tp;
+ args.owner = dp->i_ino;
if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
return xfs_dir2_sf_getdents(&args, ctx);
lock_mode = xfs_ilock_data_map_shared(dp);
- error = xfs_dir2_isblock(&args, &isblock);
- if (error)
- goto out_unlock;
-
- if (isblock) {
+ switch (xfs_dir2_format(&args, &error)) {
+ case XFS_DIR2_FMT_BLOCK:
error = xfs_dir2_block_getdents(&args, ctx, &lock_mode);
- goto out_unlock;
+ break;
+ case XFS_DIR2_FMT_LEAF:
+ case XFS_DIR2_FMT_NODE:
+ error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode);
+ break;
+ default:
+ break;
}
- error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode);
-
-out_unlock:
if (lock_mode)
xfs_iunlock(dp, lock_mode);
return error;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 268bb734dc0a..25fe3b932b5a 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -145,14 +145,18 @@ xfs_discard_extents(
return error;
}
+struct xfs_trim_cur {
+ xfs_agblock_t start;
+ xfs_extlen_t count;
+ xfs_agblock_t end;
+ xfs_extlen_t minlen;
+ bool by_bno;
+};
static int
xfs_trim_gather_extents(
struct xfs_perag *pag,
- xfs_daddr_t start,
- xfs_daddr_t end,
- xfs_daddr_t minlen,
- struct xfs_alloc_rec_incore *tcur,
+ struct xfs_trim_cur *tcur,
struct xfs_busy_extents *extents,
uint64_t *blocks_trimmed)
{
@@ -179,21 +183,26 @@ xfs_trim_gather_extents(
if (error)
goto out_trans_cancel;
- cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
-
- /*
- * Look up the extent length requested in the AGF and start with it.
- */
- if (tcur->ar_startblock == NULLAGBLOCK)
- error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i);
- else
- error = xfs_alloc_lookup_le(cur, tcur->ar_startblock,
- tcur->ar_blockcount, &i);
+ if (tcur->by_bno) {
+ /* sub-AG discard request always starts at tcur->start */
+ cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
+ error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i);
+ if (!error && !i)
+ error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i);
+ } else if (tcur->start == 0) {
+ /* first time through a by-len starts with max length */
+ cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
+ error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i);
+ } else {
+ /* nth time through a by-len starts where we left off */
+ cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
+ error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i);
+ }
if (error)
goto out_del_cursor;
if (i == 0) {
/* nothing of that length left in the AG, we are done */
- tcur->ar_blockcount = 0;
+ tcur->count = 0;
goto out_del_cursor;
}
@@ -204,8 +213,6 @@ xfs_trim_gather_extents(
while (i) {
xfs_agblock_t fbno;
xfs_extlen_t flen;
- xfs_daddr_t dbno;
- xfs_extlen_t dlen;
error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
if (error)
@@ -221,38 +228,46 @@ xfs_trim_gather_extents(
* Update the cursor to point at this extent so we
* restart the next batch from this extent.
*/
- tcur->ar_startblock = fbno;
- tcur->ar_blockcount = flen;
- break;
- }
-
- /*
- * use daddr format for all range/len calculations as that is
- * the format the range/len variables are supplied in by
- * userspace.
- */
- dbno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, fbno);
- dlen = XFS_FSB_TO_BB(mp, flen);
-
- /*
- * Too small? Give up.
- */
- if (dlen < minlen) {
- trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen);
- tcur->ar_blockcount = 0;
+ tcur->start = fbno;
+ tcur->count = flen;
break;
}
/*
* If the extent is entirely outside of the range we are
- * supposed to discard skip it. Do not bother to trim
- * down partially overlapping ranges for now.
+ * supposed to skip it. Do not bother to trim down partially
+ * overlapping ranges for now.
*/
- if (dbno + dlen < start || dbno > end) {
+ if (fbno + flen < tcur->start) {
+ trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen);
+ goto next_extent;
+ }
+ if (fbno > tcur->end) {
trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen);
+ if (tcur->by_bno) {
+ tcur->count = 0;
+ break;
+ }
goto next_extent;
}
+ /* Trim the extent returned to the range we want. */
+ if (fbno < tcur->start) {
+ flen -= tcur->start - fbno;
+ fbno = tcur->start;
+ }
+ if (fbno + flen > tcur->end + 1)
+ flen = tcur->end - fbno + 1;
+
+ /* Too small? Give up. */
+ if (flen < tcur->minlen) {
+ trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen);
+ if (tcur->by_bno)
+ goto next_extent;
+ tcur->count = 0;
+ break;
+ }
+
/*
* If any blocks in the range are still busy, skip the
* discard and try again the next time.
@@ -266,7 +281,10 @@ xfs_trim_gather_extents(
&extents->extent_list);
*blocks_trimmed += flen;
next_extent:
- error = xfs_btree_decrement(cur, 0, &i);
+ if (tcur->by_bno)
+ error = xfs_btree_increment(cur, 0, &i);
+ else
+ error = xfs_btree_decrement(cur, 0, &i);
if (error)
break;
@@ -276,7 +294,7 @@ next_extent:
* is no more extents to search.
*/
if (i == 0)
- tcur->ar_blockcount = 0;
+ tcur->count = 0;
}
/*
@@ -306,17 +324,22 @@ xfs_trim_should_stop(void)
static int
xfs_trim_extents(
struct xfs_perag *pag,
- xfs_daddr_t start,
- xfs_daddr_t end,
- xfs_daddr_t minlen,
+ xfs_agblock_t start,
+ xfs_agblock_t end,
+ xfs_extlen_t minlen,
uint64_t *blocks_trimmed)
{
- struct xfs_alloc_rec_incore tcur = {
- .ar_blockcount = pag->pagf_longest,
- .ar_startblock = NULLAGBLOCK,
+ struct xfs_trim_cur tcur = {
+ .start = start,
+ .count = pag->pagf_longest,
+ .end = end,
+ .minlen = minlen,
};
int error = 0;
+ if (start != 0 || end != pag->block_count)
+ tcur.by_bno = true;
+
do {
struct xfs_busy_extents *extents;
@@ -330,8 +353,8 @@ xfs_trim_extents(
extents->owner = extents;
INIT_LIST_HEAD(&extents->extent_list);
- error = xfs_trim_gather_extents(pag, start, end, minlen,
- &tcur, extents, blocks_trimmed);
+ error = xfs_trim_gather_extents(pag, &tcur, extents,
+ blocks_trimmed);
if (error) {
kfree(extents);
break;
@@ -354,7 +377,7 @@ xfs_trim_extents(
if (xfs_trim_should_stop())
break;
- } while (tcur.ar_blockcount != 0);
+ } while (tcur.count != 0);
return error;
@@ -378,8 +401,10 @@ xfs_ioc_trim(
unsigned int granularity =
bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
struct fstrim_range range;
- xfs_daddr_t start, end, minlen;
- xfs_agnumber_t agno;
+ xfs_daddr_t start, end;
+ xfs_extlen_t minlen;
+ xfs_agnumber_t start_agno, end_agno;
+ xfs_agblock_t start_agbno, end_agbno;
uint64_t blocks_trimmed = 0;
int error, last_error = 0;
@@ -399,7 +424,8 @@ xfs_ioc_trim(
return -EFAULT;
range.minlen = max_t(u64, granularity, range.minlen);
- minlen = BTOBB(range.minlen);
+ minlen = XFS_B_TO_FSB(mp, range.minlen);
+
/*
* Truncating down the len isn't actually quite correct, but using
* BBTOB would mean we trivially get overflows for values
@@ -413,15 +439,21 @@ xfs_ioc_trim(
return -EINVAL;
start = BTOBB(range.start);
- end = start + BTOBBT(range.len) - 1;
+ end = min_t(xfs_daddr_t, start + BTOBBT(range.len),
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) - 1;
+
+ start_agno = xfs_daddr_to_agno(mp, start);
+ start_agbno = xfs_daddr_to_agbno(mp, start);
+ end_agno = xfs_daddr_to_agno(mp, end);
+ end_agbno = xfs_daddr_to_agbno(mp, end);
- if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
- end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1;
+ for_each_perag_range(mp, start_agno, end_agno, pag) {
+ xfs_agblock_t agend = pag->block_count;
- agno = xfs_daddr_to_agno(mp, start);
- for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) {
- error = xfs_trim_extents(pag, start, end, minlen,
- &blocks_trimmed);
+ if (start_agno == end_agno)
+ agend = end_agbno;
+ error = xfs_trim_extents(pag, start_agbno, agend, minlen,
+ &blocks_trimmed);
if (error)
last_error = error;
@@ -429,6 +461,7 @@ xfs_ioc_trim(
xfs_perag_rele(pag);
break;
}
+ start_agbno = 0;
}
if (last_error)
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index c98cb468c357..c1b211c260a9 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -341,11 +341,8 @@ xfs_dquot_disk_alloc(
goto err_cancel;
}
- error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK,
+ error = xfs_iext_count_extend(tp, quotip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, quotip,
- XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto err_cancel;
@@ -357,7 +354,6 @@ xfs_dquot_disk_alloc(
goto err_cancel;
ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
- ASSERT(nmaps == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
(map.br_startblock != HOLESTARTBLOCK));
@@ -1371,6 +1367,47 @@ xfs_dqlock2(
}
}
+static int
+xfs_dqtrx_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_dqtrx *qa = a;
+ const struct xfs_dqtrx *qb = b;
+
+ if (qa->qt_dquot->q_id > qb->qt_dquot->q_id)
+ return 1;
+ if (qa->qt_dquot->q_id < qb->qt_dquot->q_id)
+ return -1;
+ return 0;
+}
+
+void
+xfs_dqlockn(
+ struct xfs_dqtrx *q)
+{
+ unsigned int i;
+
+ BUILD_BUG_ON(XFS_QM_TRANS_MAXDQS > MAX_LOCKDEP_SUBCLASSES);
+
+ /* Sort in order of dquot id, do not allow duplicates */
+ for (i = 0; i < XFS_QM_TRANS_MAXDQS && q[i].qt_dquot != NULL; i++) {
+ unsigned int j;
+
+ for (j = 0; j < i; j++)
+ ASSERT(q[i].qt_dquot != q[j].qt_dquot);
+ }
+ if (i == 0)
+ return;
+
+ sort(q, i, sizeof(struct xfs_dqtrx), xfs_dqtrx_cmp, NULL);
+
+ mutex_lock(&q[0].qt_dquot->q_qlock);
+ for (i = 1; i < XFS_QM_TRANS_MAXDQS && q[i].qt_dquot != NULL; i++)
+ mutex_lock_nested(&q[i].qt_dquot->q_qlock,
+ XFS_QLOCK_NESTED + i - 1);
+}
+
int __init
xfs_qm_init(void)
{
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 956272d9b302..677bb2dc9ac9 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -223,6 +223,7 @@ int xfs_qm_dqget_uncached(struct xfs_mount *mp,
void xfs_qm_dqput(struct xfs_dquot *dqp);
void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
+void xfs_dqlockn(struct xfs_dqtrx *q);
void xfs_dquot_set_prealloc_limits(struct xfs_dquot *);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 7ad0e92c6b5b..78cdc5064a8c 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -62,6 +62,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_ATTR_LEAF_TO_NODE,
XFS_RANDOM_WB_DELAY_MS,
XFS_RANDOM_WRITE_DELAY_MS,
+ XFS_RANDOM_EXCHMAPS_FINISH_ONE,
};
struct xfs_errortag_attr {
@@ -179,6 +180,7 @@ XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT);
XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS);
XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS);
+XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -224,6 +226,7 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
XFS_ERRORTAG_ATTR_LIST(wb_delay_ms),
XFS_ERRORTAG_ATTR_LIST(write_delay_ms),
+ XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one),
NULL,
};
ATTRIBUTE_GROUPS(xfs_errortag);
diff --git a/fs/xfs/xfs_exchmaps_item.c b/fs/xfs/xfs_exchmaps_item.c
new file mode 100644
index 000000000000..264a121c5e16
--- /dev/null
+++ b/fs/xfs/xfs_exchmaps_item.c
@@ -0,0 +1,614 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_shared.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_exchmaps_item.h"
+#include "xfs_exchmaps.h"
+#include "xfs_log.h"
+#include "xfs_bmap.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+#include "xfs_exchrange.h"
+#include "xfs_trace.h"
+
+struct kmem_cache *xfs_xmi_cache;
+struct kmem_cache *xfs_xmd_cache;
+
+static const struct xfs_item_ops xfs_xmi_item_ops;
+
+static inline struct xfs_xmi_log_item *XMI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_xmi_log_item, xmi_item);
+}
+
+STATIC void
+xfs_xmi_item_free(
+ struct xfs_xmi_log_item *xmi_lip)
+{
+ kvfree(xmi_lip->xmi_item.li_lv_shadow);
+ kmem_cache_free(xfs_xmi_cache, xmi_lip);
+}
+
+/*
+ * Freeing the XMI requires that we remove it from the AIL if it has already
+ * been placed there. However, the XMI may not yet have been placed in the AIL
+ * when called by xfs_xmi_release() from XMD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the XMI.
+ */
+STATIC void
+xfs_xmi_release(
+ struct xfs_xmi_log_item *xmi_lip)
+{
+ ASSERT(atomic_read(&xmi_lip->xmi_refcount) > 0);
+ if (atomic_dec_and_test(&xmi_lip->xmi_refcount)) {
+ xfs_trans_ail_delete(&xmi_lip->xmi_item, 0);
+ xfs_xmi_item_free(xmi_lip);
+ }
+}
+
+
+STATIC void
+xfs_xmi_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_xmi_log_format);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the given xmi log
+ * item. We use only 1 iovec, and we point that at the xmi_log_format structure
+ * embedded in the xmi item.
+ */
+STATIC void
+xfs_xmi_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_xmi_log_item *xmi_lip = XMI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ xmi_lip->xmi_format.xmi_type = XFS_LI_XMI;
+ xmi_lip->xmi_format.xmi_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_XMI_FORMAT,
+ &xmi_lip->xmi_format,
+ sizeof(struct xfs_xmi_log_format));
+}
+
+/*
+ * The unpin operation is the last place an XMI is manipulated in the log. It
+ * is either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the XMI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the XMI to either construct
+ * and commit the XMD or drop the XMD's reference in the event of error. Simply
+ * drop the log's XMI reference now that the log is done with it.
+ */
+STATIC void
+xfs_xmi_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_xmi_log_item *xmi_lip = XMI_ITEM(lip);
+
+ xfs_xmi_release(xmi_lip);
+}
+
+/*
+ * The XMI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, an XMD isn't going to be
+ * constructed and thus we free the XMI here directly.
+ */
+STATIC void
+xfs_xmi_item_release(
+ struct xfs_log_item *lip)
+{
+ xfs_xmi_release(XMI_ITEM(lip));
+}
+
+/* Allocate and initialize an xmi item. */
+STATIC struct xfs_xmi_log_item *
+xfs_xmi_init(
+ struct xfs_mount *mp)
+
+{
+ struct xfs_xmi_log_item *xmi_lip;
+
+ xmi_lip = kmem_cache_zalloc(xfs_xmi_cache, GFP_KERNEL | __GFP_NOFAIL);
+
+ xfs_log_item_init(mp, &xmi_lip->xmi_item, XFS_LI_XMI, &xfs_xmi_item_ops);
+ xmi_lip->xmi_format.xmi_id = (uintptr_t)(void *)xmi_lip;
+ atomic_set(&xmi_lip->xmi_refcount, 2);
+
+ return xmi_lip;
+}
+
+static inline struct xfs_xmd_log_item *XMD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_xmd_log_item, xmd_item);
+}
+
+STATIC void
+xfs_xmd_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_xmd_log_format);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the given xmd log
+ * item. We use only 1 iovec, and we point that at the xmd_log_format structure
+ * embedded in the xmd item.
+ */
+STATIC void
+xfs_xmd_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_xmd_log_item *xmd_lip = XMD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ xmd_lip->xmd_format.xmd_type = XFS_LI_XMD;
+ xmd_lip->xmd_format.xmd_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_XMD_FORMAT, &xmd_lip->xmd_format,
+ sizeof(struct xfs_xmd_log_format));
+}
+
+/*
+ * The XMD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the XMI and free the
+ * XMD.
+ */
+STATIC void
+xfs_xmd_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_xmd_log_item *xmd_lip = XMD_ITEM(lip);
+
+ xfs_xmi_release(xmd_lip->xmd_intent_log_item);
+ kvfree(xmd_lip->xmd_item.li_lv_shadow);
+ kmem_cache_free(xfs_xmd_cache, xmd_lip);
+}
+
+static struct xfs_log_item *
+xfs_xmd_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &XMD_ITEM(lip)->xmd_intent_log_item->xmi_item;
+}
+
+static const struct xfs_item_ops xfs_xmd_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
+ .iop_size = xfs_xmd_item_size,
+ .iop_format = xfs_xmd_item_format,
+ .iop_release = xfs_xmd_item_release,
+ .iop_intent = xfs_xmd_item_intent,
+};
+
+/* Log file mapping exchange information in the intent item. */
+STATIC struct xfs_log_item *
+xfs_exchmaps_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_xmi_log_item *xmi_lip;
+ struct xfs_exchmaps_intent *xmi;
+ struct xfs_xmi_log_format *xlf;
+
+ ASSERT(count == 1);
+
+ xmi = list_first_entry_or_null(items, struct xfs_exchmaps_intent,
+ xmi_list);
+
+ xmi_lip = xfs_xmi_init(tp->t_mountp);
+ xlf = &xmi_lip->xmi_format;
+
+ xlf->xmi_inode1 = xmi->xmi_ip1->i_ino;
+ xlf->xmi_igen1 = VFS_I(xmi->xmi_ip1)->i_generation;
+ xlf->xmi_inode2 = xmi->xmi_ip2->i_ino;
+ xlf->xmi_igen2 = VFS_I(xmi->xmi_ip2)->i_generation;
+ xlf->xmi_startoff1 = xmi->xmi_startoff1;
+ xlf->xmi_startoff2 = xmi->xmi_startoff2;
+ xlf->xmi_blockcount = xmi->xmi_blockcount;
+ xlf->xmi_isize1 = xmi->xmi_isize1;
+ xlf->xmi_isize2 = xmi->xmi_isize2;
+ xlf->xmi_flags = xmi->xmi_flags & XFS_EXCHMAPS_LOGGED_FLAGS;
+
+ return &xmi_lip->xmi_item;
+}
+
+STATIC struct xfs_log_item *
+xfs_exchmaps_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ struct xfs_xmi_log_item *xmi_lip = XMI_ITEM(intent);
+ struct xfs_xmd_log_item *xmd_lip;
+
+ xmd_lip = kmem_cache_zalloc(xfs_xmd_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(tp->t_mountp, &xmd_lip->xmd_item, XFS_LI_XMD,
+ &xfs_xmd_item_ops);
+ xmd_lip->xmd_intent_log_item = xmi_lip;
+ xmd_lip->xmd_format.xmd_xmi_id = xmi_lip->xmi_format.xmi_id;
+
+ return &xmd_lip->xmd_item;
+}
+
+/* Add this deferred XMI to the transaction. */
+void
+xfs_exchmaps_defer_add(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi)
+{
+ trace_xfs_exchmaps_defer(tp->t_mountp, xmi);
+
+ xfs_defer_add(tp, &xmi->xmi_list, &xfs_exchmaps_defer_type);
+}
+
+static inline struct xfs_exchmaps_intent *xmi_entry(const struct list_head *e)
+{
+ return list_entry(e, struct xfs_exchmaps_intent, xmi_list);
+}
+
+/* Cancel a deferred file mapping exchange. */
+STATIC void
+xfs_exchmaps_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_exchmaps_intent *xmi = xmi_entry(item);
+
+ kmem_cache_free(xfs_exchmaps_intent_cache, xmi);
+}
+
+/* Process a deferred file mapping exchange. */
+STATIC int
+xfs_exchmaps_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_exchmaps_intent *xmi = xmi_entry(item);
+ int error;
+
+ /*
+ * Exchange one more mappings between two files. If there's still more
+ * work to do, we want to requeue ourselves after all other pending
+ * deferred operations have finished. This includes all of the dfops
+ * that we queued directly as well as any new ones created in the
+ * process of finishing the others. Doing so prevents us from queuing
+ * a large number of XMI log items in kernel memory, which in turn
+ * prevents us from pinning the tail of the log (while logging those
+ * new XMI items) until the first XMI items can be processed.
+ */
+ error = xfs_exchmaps_finish_one(tp, xmi);
+ if (error != -EAGAIN)
+ xfs_exchmaps_cancel_item(item);
+ return error;
+}
+
+/* Abort all pending XMIs. */
+STATIC void
+xfs_exchmaps_abort_intent(
+ struct xfs_log_item *intent)
+{
+ xfs_xmi_release(XMI_ITEM(intent));
+}
+
+/* Is this recovered XMI ok? */
+static inline bool
+xfs_xmi_validate(
+ struct xfs_mount *mp,
+ struct xfs_xmi_log_item *xmi_lip)
+{
+ struct xfs_xmi_log_format *xlf = &xmi_lip->xmi_format;
+
+ if (!xfs_has_exchange_range(mp))
+ return false;
+
+ if (xmi_lip->xmi_format.__pad != 0)
+ return false;
+
+ if (xlf->xmi_flags & ~XFS_EXCHMAPS_LOGGED_FLAGS)
+ return false;
+
+ if (!xfs_verify_ino(mp, xlf->xmi_inode1) ||
+ !xfs_verify_ino(mp, xlf->xmi_inode2))
+ return false;
+
+ if (!xfs_verify_fileext(mp, xlf->xmi_startoff1, xlf->xmi_blockcount))
+ return false;
+
+ return xfs_verify_fileext(mp, xlf->xmi_startoff2, xlf->xmi_blockcount);
+}
+
+/*
+ * Use the recovered log state to create a new request, estimate resource
+ * requirements, and create a new incore intent state.
+ */
+STATIC struct xfs_exchmaps_intent *
+xfs_xmi_item_recover_intent(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ const struct xfs_xmi_log_format *xlf,
+ struct xfs_exchmaps_req *req,
+ struct xfs_inode **ipp1,
+ struct xfs_inode **ipp2)
+{
+ struct xfs_inode *ip1, *ip2;
+ struct xfs_exchmaps_intent *xmi;
+ int error;
+
+ /*
+ * Grab both inodes and set IRECOVERY to prevent trimming of post-eof
+ * mappings and freeing of unlinked inodes until we're totally done
+ * processing files. The ondisk format of this new log item contains
+ * file handle information, which is why recovery for other items do
+ * not check the inode generation number.
+ */
+ error = xlog_recover_iget_handle(mp, xlf->xmi_inode1, xlf->xmi_igen1,
+ &ip1);
+ if (error) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, xlf,
+ sizeof(*xlf));
+ return ERR_PTR(error);
+ }
+
+ error = xlog_recover_iget_handle(mp, xlf->xmi_inode2, xlf->xmi_igen2,
+ &ip2);
+ if (error) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, xlf,
+ sizeof(*xlf));
+ goto err_rele1;
+ }
+
+ req->ip1 = ip1;
+ req->ip2 = ip2;
+ req->startoff1 = xlf->xmi_startoff1;
+ req->startoff2 = xlf->xmi_startoff2;
+ req->blockcount = xlf->xmi_blockcount;
+ req->flags = xlf->xmi_flags & XFS_EXCHMAPS_PARAMS;
+
+ xfs_exchrange_ilock(NULL, ip1, ip2);
+ error = xfs_exchmaps_estimate(req);
+ xfs_exchrange_iunlock(ip1, ip2);
+ if (error)
+ goto err_rele2;
+
+ *ipp1 = ip1;
+ *ipp2 = ip2;
+ xmi = xfs_exchmaps_init_intent(req);
+ xfs_defer_add_item(dfp, &xmi->xmi_list);
+ return xmi;
+
+err_rele2:
+ xfs_irele(ip2);
+err_rele1:
+ xfs_irele(ip1);
+ req->ip2 = req->ip1 = NULL;
+ return ERR_PTR(error);
+}
+
+/* Process a file mapping exchange item that was recovered from the log. */
+STATIC int
+xfs_exchmaps_recover_work(
+ struct xfs_defer_pending *dfp,
+ struct list_head *capture_list)
+{
+ struct xfs_exchmaps_req req = { .flags = 0 };
+ struct xfs_trans_res resv;
+ struct xfs_exchmaps_intent *xmi;
+ struct xfs_log_item *lip = dfp->dfp_intent;
+ struct xfs_xmi_log_item *xmi_lip = XMI_ITEM(lip);
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ struct xfs_trans *tp;
+ struct xfs_inode *ip1, *ip2;
+ int error = 0;
+
+ if (!xfs_xmi_validate(mp, xmi_lip)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &xmi_lip->xmi_format,
+ sizeof(xmi_lip->xmi_format));
+ return -EFSCORRUPTED;
+ }
+
+ xmi = xfs_xmi_item_recover_intent(mp, dfp, &xmi_lip->xmi_format, &req,
+ &ip1, &ip2);
+ if (IS_ERR(xmi))
+ return PTR_ERR(xmi);
+
+ trace_xfs_exchmaps_recover(mp, xmi);
+
+ resv = xlog_recover_resv(&M_RES(mp)->tr_write);
+ error = xfs_trans_alloc(mp, &resv, req.resblks, 0, 0, &tp);
+ if (error)
+ goto err_rele;
+
+ xfs_exchrange_ilock(tp, ip1, ip2);
+
+ xfs_exchmaps_ensure_reflink(tp, xmi);
+ xfs_exchmaps_upgrade_extent_counts(tp, xmi);
+ error = xlog_recover_finish_intent(tp, dfp);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &xmi_lip->xmi_format,
+ sizeof(xmi_lip->xmi_format));
+ if (error)
+ goto err_cancel;
+
+ /*
+ * Commit transaction, which frees the transaction and saves the inodes
+ * for later replay activities.
+ */
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+ goto err_unlock;
+
+err_cancel:
+ xfs_trans_cancel(tp);
+err_unlock:
+ xfs_exchrange_iunlock(ip1, ip2);
+err_rele:
+ xfs_irele(ip2);
+ xfs_irele(ip1);
+ return error;
+}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_exchmaps_relog_intent(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ struct xfs_log_item *done_item)
+{
+ struct xfs_xmi_log_item *xmi_lip;
+ struct xfs_xmi_log_format *old_xlf, *new_xlf;
+
+ old_xlf = &XMI_ITEM(intent)->xmi_format;
+
+ xmi_lip = xfs_xmi_init(tp->t_mountp);
+ new_xlf = &xmi_lip->xmi_format;
+
+ new_xlf->xmi_inode1 = old_xlf->xmi_inode1;
+ new_xlf->xmi_inode2 = old_xlf->xmi_inode2;
+ new_xlf->xmi_igen1 = old_xlf->xmi_igen1;
+ new_xlf->xmi_igen2 = old_xlf->xmi_igen2;
+ new_xlf->xmi_startoff1 = old_xlf->xmi_startoff1;
+ new_xlf->xmi_startoff2 = old_xlf->xmi_startoff2;
+ new_xlf->xmi_blockcount = old_xlf->xmi_blockcount;
+ new_xlf->xmi_flags = old_xlf->xmi_flags;
+ new_xlf->xmi_isize1 = old_xlf->xmi_isize1;
+ new_xlf->xmi_isize2 = old_xlf->xmi_isize2;
+
+ return &xmi_lip->xmi_item;
+}
+
+const struct xfs_defer_op_type xfs_exchmaps_defer_type = {
+ .name = "exchmaps",
+ .max_items = 1,
+ .create_intent = xfs_exchmaps_create_intent,
+ .abort_intent = xfs_exchmaps_abort_intent,
+ .create_done = xfs_exchmaps_create_done,
+ .finish_item = xfs_exchmaps_finish_item,
+ .cancel_item = xfs_exchmaps_cancel_item,
+ .recover_work = xfs_exchmaps_recover_work,
+ .relog_intent = xfs_exchmaps_relog_intent,
+};
+
+STATIC bool
+xfs_xmi_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return XMI_ITEM(lip)->xmi_format.xmi_id == intent_id;
+}
+
+static const struct xfs_item_ops xfs_xmi_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_xmi_item_size,
+ .iop_format = xfs_xmi_item_format,
+ .iop_unpin = xfs_xmi_item_unpin,
+ .iop_release = xfs_xmi_item_release,
+ .iop_match = xfs_xmi_item_match,
+};
+
+/*
+ * This routine is called to create an in-core file mapping exchange item from
+ * the xmi format structure which was logged on disk. It allocates an in-core
+ * xmi, copies the exchange information from the format structure into it, and
+ * adds the xmi to the AIL with the given LSN.
+ */
+STATIC int
+xlog_recover_xmi_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_xmi_log_item *xmi_lip;
+ struct xfs_xmi_log_format *xmi_formatp;
+ size_t len;
+
+ len = sizeof(struct xfs_xmi_log_format);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+ return -EFSCORRUPTED;
+ }
+
+ xmi_formatp = item->ri_buf[0].i_addr;
+ if (xmi_formatp->__pad != 0) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+ return -EFSCORRUPTED;
+ }
+
+ xmi_lip = xfs_xmi_init(mp);
+ memcpy(&xmi_lip->xmi_format, xmi_formatp, len);
+
+ xlog_recover_intent_item(log, &xmi_lip->xmi_item, lsn,
+ &xfs_exchmaps_defer_type);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_xmi_item_ops = {
+ .item_type = XFS_LI_XMI,
+ .commit_pass2 = xlog_recover_xmi_commit_pass2,
+};
+
+/*
+ * This routine is called when an XMD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding XMI if it
+ * was still in the log. To do this it searches the AIL for the XMI with an id
+ * equal to that in the XMD format structure. If we find it we drop the XMD
+ * reference, which removes the XMI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_xmd_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_xmd_log_format *xmd_formatp;
+
+ xmd_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_xmd_log_format)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_XMI, xmd_formatp->xmd_xmi_id);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_xmd_item_ops = {
+ .item_type = XFS_LI_XMD,
+ .commit_pass2 = xlog_recover_xmd_commit_pass2,
+};
diff --git a/fs/xfs/xfs_exchmaps_item.h b/fs/xfs/xfs_exchmaps_item.h
new file mode 100644
index 000000000000..efa368d25d09
--- /dev/null
+++ b/fs/xfs/xfs_exchmaps_item.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_EXCHMAPS_ITEM_H__
+#define __XFS_EXCHMAPS_ITEM_H__
+
+/*
+ * The file mapping exchange intent item helps us exchange multiple file
+ * mappings between two inode forks. It does this by tracking the range of
+ * file block offsets that still need to be exchanged, and relogs as progress
+ * happens.
+ *
+ * *I items should be recorded in the *first* of a series of rolled
+ * transactions, and the *D items should be recorded in the same transaction
+ * that records the associated bmbt updates.
+ *
+ * Should the system crash after the commit of the first transaction but
+ * before the commit of the final transaction in a series, log recovery will
+ * use the redo information recorded by the intent items to replay the
+ * rest of the mapping exchanges.
+ */
+
+/* kernel only XMI/XMD definitions */
+
+struct xfs_mount;
+struct kmem_cache;
+
+/*
+ * This is the incore file mapping exchange intent log item. It is used to log
+ * the fact that we are exchanging mappings between two files. It is used in
+ * conjunction with the incore file mapping exchange done log item described
+ * below.
+ *
+ * These log items follow the same rules as struct xfs_efi_log_item; see the
+ * comments about that structure (in xfs_extfree_item.h) for more details.
+ */
+struct xfs_xmi_log_item {
+ struct xfs_log_item xmi_item;
+ atomic_t xmi_refcount;
+ struct xfs_xmi_log_format xmi_format;
+};
+
+/*
+ * This is the incore file mapping exchange done log item. It is used to log
+ * the fact that an exchange mentioned in an earlier xmi item have been
+ * performed.
+ */
+struct xfs_xmd_log_item {
+ struct xfs_log_item xmd_item;
+ struct xfs_xmi_log_item *xmd_intent_log_item;
+ struct xfs_xmd_log_format xmd_format;
+};
+
+extern struct kmem_cache *xfs_xmi_cache;
+extern struct kmem_cache *xfs_xmd_cache;
+
+struct xfs_exchmaps_intent;
+
+void xfs_exchmaps_defer_add(struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi);
+
+#endif /* __XFS_EXCHMAPS_ITEM_H__ */
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
new file mode 100644
index 000000000000..c8a655c92c92
--- /dev/null
+++ b/fs/xfs/xfs_exchrange.c
@@ -0,0 +1,804 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_quota.h"
+#include "xfs_bmap_util.h"
+#include "xfs_reflink.h"
+#include "xfs_trace.h"
+#include "xfs_exchrange.h"
+#include "xfs_exchmaps.h"
+#include "xfs_sb.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
+#include "xfs_rtbitmap.h"
+#include <linux/fsnotify.h>
+
+/* Lock (and optionally join) two inodes for a file range exchange. */
+void
+xfs_exchrange_ilock(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ if (ip1 != ip2)
+ xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
+ ip2, XFS_ILOCK_EXCL);
+ else
+ xfs_ilock(ip1, XFS_ILOCK_EXCL);
+ if (tp) {
+ xfs_trans_ijoin(tp, ip1, 0);
+ if (ip2 != ip1)
+ xfs_trans_ijoin(tp, ip2, 0);
+ }
+
+}
+
+/* Unlock two inodes after a file range exchange operation. */
+void
+xfs_exchrange_iunlock(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ if (ip2 != ip1)
+ xfs_iunlock(ip2, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip1, XFS_ILOCK_EXCL);
+}
+
+/*
+ * Estimate the resource requirements to exchange file contents between the two
+ * files. The caller is required to hold the IOLOCK and the MMAPLOCK and to
+ * have flushed both inodes' pagecache and active direct-ios.
+ */
+int
+xfs_exchrange_estimate(
+ struct xfs_exchmaps_req *req)
+{
+ int error;
+
+ xfs_exchrange_ilock(NULL, req->ip1, req->ip2);
+ error = xfs_exchmaps_estimate(req);
+ xfs_exchrange_iunlock(req->ip1, req->ip2);
+ return error;
+}
+
+#define QRETRY_IP1 (0x1)
+#define QRETRY_IP2 (0x2)
+
+/*
+ * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip
+ * this if quota enforcement is disabled or if both inodes' dquots are the
+ * same. The qretry structure must be initialized to zeroes before the first
+ * call to this function.
+ */
+STATIC int
+xfs_exchrange_reserve_quota(
+ struct xfs_trans *tp,
+ const struct xfs_exchmaps_req *req,
+ unsigned int *qretry)
+{
+ int64_t ddelta, rdelta;
+ int ip1_error = 0;
+ int error;
+
+ /*
+ * Don't bother with a quota reservation if we're not enforcing them
+ * or the two inodes have the same dquots.
+ */
+ if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
+ (req->ip1->i_udquot == req->ip2->i_udquot &&
+ req->ip1->i_gdquot == req->ip2->i_gdquot &&
+ req->ip1->i_pdquot == req->ip2->i_pdquot))
+ return 0;
+
+ *qretry = 0;
+
+ /*
+ * For each file, compute the net gain in the number of regular blocks
+ * that will be mapped into that file and reserve that much quota. The
+ * quota counts must be able to absorb at least that much space.
+ */
+ ddelta = req->ip2_bcount - req->ip1_bcount;
+ rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
+ if (ddelta > 0 || rdelta > 0) {
+ error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
+ ddelta > 0 ? ddelta : 0,
+ rdelta > 0 ? rdelta : 0,
+ false);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ /*
+ * Save this error and see what happens if we try to
+ * reserve quota for ip2. Then report both.
+ */
+ *qretry |= QRETRY_IP1;
+ ip1_error = error;
+ error = 0;
+ }
+ if (error)
+ return error;
+ }
+ if (ddelta < 0 || rdelta < 0) {
+ error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
+ ddelta < 0 ? -ddelta : 0,
+ rdelta < 0 ? -rdelta : 0,
+ false);
+ if (error == -EDQUOT || error == -ENOSPC)
+ *qretry |= QRETRY_IP2;
+ if (error)
+ return error;
+ }
+ if (ip1_error)
+ return ip1_error;
+
+ /*
+ * For each file, forcibly reserve the gross gain in mapped blocks so
+ * that we don't trip over any quota block reservation assertions.
+ * We must reserve the gross gain because the quota code subtracts from
+ * bcount the number of blocks that we unmap; it does not add that
+ * quantity back to the quota block reservation.
+ */
+ error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
+ req->ip1_rtbcount, true);
+ if (error)
+ return error;
+
+ return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
+ req->ip2_rtbcount, true);
+}
+
+/* Exchange the mappings (and hence the contents) of two files' forks. */
+STATIC int
+xfs_exchrange_mappings(
+ const struct xfs_exchrange *fxr,
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ struct xfs_mount *mp = ip1->i_mount;
+ struct xfs_exchmaps_req req = {
+ .ip1 = ip1,
+ .ip2 = ip2,
+ .startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset),
+ .startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset),
+ .blockcount = XFS_B_TO_FSB(mp, fxr->length),
+ };
+ struct xfs_trans *tp;
+ unsigned int qretry;
+ bool retried = false;
+ int error;
+
+ trace_xfs_exchrange_mappings(fxr, ip1, ip2);
+
+ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
+ req.flags |= XFS_EXCHMAPS_SET_SIZES;
+ if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
+ req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;
+
+ /*
+ * Round the request length up to the nearest file allocation unit.
+ * The prep function already checked that the request offsets and
+ * length in @fxr are safe to round up.
+ */
+ if (xfs_inode_has_bigrtalloc(ip2))
+ req.blockcount = xfs_rtb_roundup_rtx(mp, req.blockcount);
+
+ error = xfs_exchrange_estimate(&req);
+ if (error)
+ return error;
+
+retry:
+ /* Allocate the transaction, lock the inodes, and join them. */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
+ XFS_TRANS_RES_FDBLKS, &tp);
+ if (error)
+ return error;
+
+ xfs_exchrange_ilock(tp, ip1, ip2);
+
+ trace_xfs_exchrange_before(ip2, 2);
+ trace_xfs_exchrange_before(ip1, 1);
+
+ error = xfs_exchmaps_check_forks(mp, &req);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * Reserve ourselves some quota if any of them are in enforcing mode.
+ * In theory we only need enough to satisfy the change in the number
+ * of blocks between the two ranges being remapped.
+ */
+ error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_exchrange_iunlock(ip1, ip2);
+ if (qretry & QRETRY_IP1)
+ xfs_blockgc_free_quota(ip1, 0);
+ if (qretry & QRETRY_IP2)
+ xfs_blockgc_free_quota(ip2, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error)
+ goto out_trans_cancel;
+
+ /* If we got this far on a dry run, all parameters are ok. */
+ if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
+ goto out_trans_cancel;
+
+ /* Update the mtime and ctime of both files. */
+ if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
+ xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
+ xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ xfs_exchange_mappings(tp, &req);
+
+ /*
+ * Force the log to persist metadata updates if the caller or the
+ * administrator requires this. The generic prep function already
+ * flushed the relevant parts of the page cache.
+ */
+ if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
+ xfs_trans_set_sync(tp);
+
+ error = xfs_trans_commit(tp);
+
+ trace_xfs_exchrange_after(ip2, 2);
+ trace_xfs_exchrange_after(ip1, 1);
+
+ if (error)
+ goto out_unlock;
+
+ /*
+ * If the caller wanted us to exchange the contents of two complete
+ * files of unequal length, exchange the incore sizes now. This should
+ * be safe because we flushed both files' page caches, exchanged all
+ * the mappings, and updated the ondisk sizes.
+ */
+ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
+ loff_t temp;
+
+ temp = i_size_read(VFS_I(ip2));
+ i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
+ i_size_write(VFS_I(ip1), temp);
+ }
+
+out_unlock:
+ xfs_exchrange_iunlock(ip1, ip2);
+ return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+}
+
+/*
+ * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
+ * This part deals with struct file objects and byte ranges and does not deal
+ * with XFS-specific data structures such as xfs_inodes and block ranges. This
+ * separation may some day facilitate porting to another filesystem.
+ *
+ * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
+ * file1 with the same number of bytes starting at fxr.file2_offset in file2.
+ * Implementations must call xfs_exchange_range_prep to prepare the two
+ * files prior to taking locks; and they must update the inode change and mod
+ * times of both files as part of the metadata update. The timestamp update
+ * and freshness checks must be done atomically as part of the data exchange
+ * operation to ensure correctness of the freshness check.
+ * xfs_exchange_range_finish must be called after the operation completes
+ * successfully but before locks are dropped.
+ */
+
+/* Verify that we have security clearance to perform this operation. */
+static int
+xfs_exchange_range_verify_area(
+ struct xfs_exchrange *fxr)
+{
+ int ret;
+
+ ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
+ true);
+ if (ret)
+ return ret;
+
+ return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
+ true);
+}
+
+/*
+ * Performs necessary checks before doing a range exchange, having stabilized
+ * mutable inode attributes via i_rwsem.
+ */
+static inline int
+xfs_exchange_range_checks(
+ struct xfs_exchrange *fxr,
+ unsigned int alloc_unit)
+{
+ struct inode *inode1 = file_inode(fxr->file1);
+ struct inode *inode2 = file_inode(fxr->file2);
+ uint64_t allocmask = alloc_unit - 1;
+ int64_t test_len;
+ uint64_t blen;
+ loff_t size1, size2, tmp;
+ int error;
+
+ /* Don't touch certain kinds of inodes */
+ if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
+ return -EPERM;
+ if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
+ return -ETXTBSY;
+
+ size1 = i_size_read(inode1);
+ size2 = i_size_read(inode2);
+
+ /* Ranges cannot start after EOF. */
+ if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
+ return -EINVAL;
+
+ /*
+ * If the caller said to exchange to EOF, we set the length of the
+ * request large enough to cover everything to the end of both files.
+ */
+ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
+ fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
+ size2 - fxr->file2_offset);
+
+ error = xfs_exchange_range_verify_area(fxr);
+ if (error)
+ return error;
+ }
+
+ /*
+ * The start of both ranges must be aligned to the file allocation
+ * unit.
+ */
+ if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
+ !IS_ALIGNED(fxr->file2_offset, alloc_unit))
+ return -EINVAL;
+
+ /* Ensure offsets don't wrap. */
+ if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
+ check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
+ return -EINVAL;
+
+ /*
+ * We require both ranges to end within EOF, unless we're exchanging
+ * to EOF.
+ */
+ if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
+ (fxr->file1_offset + fxr->length > size1 ||
+ fxr->file2_offset + fxr->length > size2))
+ return -EINVAL;
+
+ /*
+ * Make sure we don't hit any file size limits. If we hit any size
+ * limits such that test_length was adjusted, we abort the whole
+ * operation.
+ */
+ test_len = fxr->length;
+ error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
+ &test_len);
+ if (error)
+ return error;
+ error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
+ &test_len);
+ if (error)
+ return error;
+ if (test_len != fxr->length)
+ return -EINVAL;
+
+ /*
+ * If the user wanted us to exchange up to the infile's EOF, round up
+ * to the next allocation unit boundary for this check. Do the same
+ * for the outfile.
+ *
+ * Otherwise, reject the range length if it's not aligned to an
+ * allocation unit.
+ */
+ if (fxr->file1_offset + fxr->length == size1)
+ blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
+ else if (fxr->file2_offset + fxr->length == size2)
+ blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
+ else if (!IS_ALIGNED(fxr->length, alloc_unit))
+ return -EINVAL;
+ else
+ blen = fxr->length;
+
+ /* Don't allow overlapped exchanges within the same file. */
+ if (inode1 == inode2 &&
+ fxr->file2_offset + blen > fxr->file1_offset &&
+ fxr->file1_offset + blen > fxr->file2_offset)
+ return -EINVAL;
+
+ /*
+ * Ensure that we don't exchange a partial EOF block into the middle of
+ * another file.
+ */
+ if ((fxr->length & allocmask) == 0)
+ return 0;
+
+ blen = fxr->length;
+ if (fxr->file2_offset + blen < size2)
+ blen &= ~allocmask;
+
+ if (fxr->file1_offset + blen < size1)
+ blen &= ~allocmask;
+
+ return blen == fxr->length ? 0 : -EINVAL;
+}
+
+/*
+ * Check that the two inodes are eligible for range exchanges, the ranges make
+ * sense, and then flush all dirty data. Caller must ensure that the inodes
+ * have been locked against any other modifications.
+ */
+static inline int
+xfs_exchange_range_prep(
+ struct xfs_exchrange *fxr,
+ unsigned int alloc_unit)
+{
+ struct inode *inode1 = file_inode(fxr->file1);
+ struct inode *inode2 = file_inode(fxr->file2);
+ bool same_inode = (inode1 == inode2);
+ int error;
+
+ /* Check that we don't violate system file offset limits. */
+ error = xfs_exchange_range_checks(fxr, alloc_unit);
+ if (error || fxr->length == 0)
+ return error;
+
+ /* Wait for the completion of any pending IOs on both files */
+ inode_dio_wait(inode1);
+ if (!same_inode)
+ inode_dio_wait(inode2);
+
+ error = filemap_write_and_wait_range(inode1->i_mapping,
+ fxr->file1_offset,
+ fxr->file1_offset + fxr->length - 1);
+ if (error)
+ return error;
+
+ error = filemap_write_and_wait_range(inode2->i_mapping,
+ fxr->file2_offset,
+ fxr->file2_offset + fxr->length - 1);
+ if (error)
+ return error;
+
+ /*
+ * If the files or inodes involved require synchronous writes, amend
+ * the request to force the filesystem to flush all data and metadata
+ * to disk after the operation completes.
+ */
+ if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
+ IS_SYNC(inode1) || IS_SYNC(inode2))
+ fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
+
+ return 0;
+}
+
+/*
+ * Finish a range exchange operation, if it was successful. Caller must ensure
+ * that the inodes are still locked against any other modifications.
+ */
+static inline int
+xfs_exchange_range_finish(
+ struct xfs_exchrange *fxr)
+{
+ int error;
+
+ error = file_remove_privs(fxr->file1);
+ if (error)
+ return error;
+ if (file_inode(fxr->file1) == file_inode(fxr->file2))
+ return 0;
+
+ return file_remove_privs(fxr->file2);
+}
+
+/*
+ * Check the alignment of an exchange request when the allocation unit size
+ * isn't a power of two. The generic file-level helpers use (fast)
+ * bitmask-based alignment checks, but here we have to use slow long division.
+ */
+static int
+xfs_exchrange_check_rtalign(
+ const struct xfs_exchrange *fxr,
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2,
+ unsigned int alloc_unit)
+{
+ uint64_t length = fxr->length;
+ uint64_t blen;
+ loff_t size1, size2;
+
+ size1 = i_size_read(VFS_I(ip1));
+ size2 = i_size_read(VFS_I(ip2));
+
+ /* The start of both ranges must be aligned to a rt extent. */
+ if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
+ !isaligned_64(fxr->file2_offset, alloc_unit))
+ return -EINVAL;
+
+ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
+ length = max_t(int64_t, size1 - fxr->file1_offset,
+ size2 - fxr->file2_offset);
+
+ /*
+ * If the user wanted us to exchange up to the infile's EOF, round up
+ * to the next rt extent boundary for this check. Do the same for the
+ * outfile.
+ *
+ * Otherwise, reject the range length if it's not rt extent aligned.
+ * We already confirmed the starting offsets' rt extent block
+ * alignment.
+ */
+ if (fxr->file1_offset + length == size1)
+ blen = roundup_64(size1, alloc_unit) - fxr->file1_offset;
+ else if (fxr->file2_offset + length == size2)
+ blen = roundup_64(size2, alloc_unit) - fxr->file2_offset;
+ else if (!isaligned_64(length, alloc_unit))
+ return -EINVAL;
+ else
+ blen = length;
+
+ /* Don't allow overlapped exchanges within the same file. */
+ if (ip1 == ip2 &&
+ fxr->file2_offset + blen > fxr->file1_offset &&
+ fxr->file1_offset + blen > fxr->file2_offset)
+ return -EINVAL;
+
+ /*
+ * Ensure that we don't exchange a partial EOF rt extent into the
+ * middle of another file.
+ */
+ if (isaligned_64(length, alloc_unit))
+ return 0;
+
+ blen = length;
+ if (fxr->file2_offset + length < size2)
+ blen = rounddown_64(blen, alloc_unit);
+
+ if (fxr->file1_offset + blen < size1)
+ blen = rounddown_64(blen, alloc_unit);
+
+ return blen == length ? 0 : -EINVAL;
+}
+
+/* Prepare two files to have their data exchanged. */
+STATIC int
+xfs_exchrange_prep(
+ struct xfs_exchrange *fxr,
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ struct xfs_mount *mp = ip2->i_mount;
+ unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip2);
+ int error;
+
+ trace_xfs_exchrange_prep(fxr, ip1, ip2);
+
+ /* Verify both files are either real-time or non-realtime */
+ if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
+ return -EINVAL;
+
+ /* Check non-power of two alignment issues, if necessary. */
+ if (!is_power_of_2(alloc_unit)) {
+ error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit);
+ if (error)
+ return error;
+
+ /*
+ * Do the generic file-level checks with the regular block
+ * alignment.
+ */
+ alloc_unit = mp->m_sb.sb_blocksize;
+ }
+
+ error = xfs_exchange_range_prep(fxr, alloc_unit);
+ if (error || fxr->length == 0)
+ return error;
+
+ /* Attach dquots to both inodes before changing block maps. */
+ error = xfs_qm_dqattach(ip2);
+ if (error)
+ return error;
+ error = xfs_qm_dqattach(ip1);
+ if (error)
+ return error;
+
+ trace_xfs_exchrange_flush(fxr, ip1, ip2);
+
+ /* Flush the relevant ranges of both files. */
+ error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
+ if (error)
+ return error;
+ error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
+ if (error)
+ return error;
+
+ /*
+ * Cancel CoW fork preallocations for the ranges of both files. The
+ * prep function should have flushed all the dirty data, so the only
+ * CoW mappings remaining should be speculative.
+ */
+ if (xfs_inode_has_cow_data(ip1)) {
+ error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
+ fxr->length, true);
+ if (error)
+ return error;
+ }
+
+ if (xfs_inode_has_cow_data(ip2)) {
+ error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
+ fxr->length, true);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Exchange contents of files. This is the binding between the generic
+ * file-level concepts and the XFS inode-specific implementation.
+ */
+STATIC int
+xfs_exchrange_contents(
+ struct xfs_exchrange *fxr)
+{
+ struct inode *inode1 = file_inode(fxr->file1);
+ struct inode *inode2 = file_inode(fxr->file2);
+ struct xfs_inode *ip1 = XFS_I(inode1);
+ struct xfs_inode *ip2 = XFS_I(inode2);
+ struct xfs_mount *mp = ip1->i_mount;
+ int error;
+
+ if (!xfs_has_exchange_range(mp))
+ return -EOPNOTSUPP;
+
+ if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
+ XFS_EXCHANGE_RANGE_PRIV_FLAGS))
+ return -EINVAL;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ /* Lock both files against IO */
+ error = xfs_ilock2_io_mmap(ip1, ip2);
+ if (error)
+ goto out_err;
+
+ /* Prepare and then exchange file contents. */
+ error = xfs_exchrange_prep(fxr, ip1, ip2);
+ if (error)
+ goto out_unlock;
+
+ error = xfs_exchrange_mappings(fxr, ip1, ip2);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Finish the exchange by removing special file privileges like any
+ * other file write would do. This may involve turning on support for
+ * logged xattrs if either file has security capabilities.
+ */
+ error = xfs_exchange_range_finish(fxr);
+ if (error)
+ goto out_unlock;
+
+out_unlock:
+ xfs_iunlock2_io_mmap(ip1, ip2);
+out_err:
+ if (error)
+ trace_xfs_exchrange_error(ip2, error, _RET_IP_);
+ return error;
+}
+
+/* Exchange parts of two files. */
+static int
+xfs_exchange_range(
+ struct xfs_exchrange *fxr)
+{
+ struct inode *inode1 = file_inode(fxr->file1);
+ struct inode *inode2 = file_inode(fxr->file2);
+ int ret;
+
+ BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
+ XFS_EXCHANGE_RANGE_PRIV_FLAGS);
+
+ /* Both files must be on the same mount/filesystem. */
+ if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
+ return -EXDEV;
+
+ if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+ return -EINVAL;
+
+ /* Userspace requests only honored for regular files. */
+ if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
+ return -EISDIR;
+ if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
+ return -EINVAL;
+
+ /* Both files must be opened for read and write. */
+ if (!(fxr->file1->f_mode & FMODE_READ) ||
+ !(fxr->file1->f_mode & FMODE_WRITE) ||
+ !(fxr->file2->f_mode & FMODE_READ) ||
+ !(fxr->file2->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ /* Neither file can be opened append-only. */
+ if ((fxr->file1->f_flags & O_APPEND) ||
+ (fxr->file2->f_flags & O_APPEND))
+ return -EBADF;
+
+ /*
+ * If we're not exchanging to EOF, we can check the areas before
+ * stabilizing both files' i_size.
+ */
+ if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
+ ret = xfs_exchange_range_verify_area(fxr);
+ if (ret)
+ return ret;
+ }
+
+ /* Update cmtime if the fd/inode don't forbid it. */
+ if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
+ fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
+ if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
+ fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
+
+ file_start_write(fxr->file2);
+ ret = xfs_exchrange_contents(fxr);
+ file_end_write(fxr->file2);
+ if (ret)
+ return ret;
+
+ fsnotify_modify(fxr->file1);
+ if (fxr->file2 != fxr->file1)
+ fsnotify_modify(fxr->file2);
+ return 0;
+}
+
+/* Collect exchange-range arguments from userspace. */
+long
+xfs_ioc_exchange_range(
+ struct file *file,
+ struct xfs_exchange_range __user *argp)
+{
+ struct xfs_exchrange fxr = {
+ .file2 = file,
+ };
+ struct xfs_exchange_range args;
+ struct fd file1;
+ int error;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
+ return -EINVAL;
+ if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+ return -EINVAL;
+
+ fxr.file1_offset = args.file1_offset;
+ fxr.file2_offset = args.file2_offset;
+ fxr.length = args.length;
+ fxr.flags = args.flags;
+
+ file1 = fdget(args.file1_fd);
+ if (!file1.file)
+ return -EBADF;
+ fxr.file1 = file1.file;
+
+ error = xfs_exchange_range(&fxr);
+ fdput(file1);
+ return error;
+}
diff --git a/fs/xfs/xfs_exchrange.h b/fs/xfs/xfs_exchrange.h
new file mode 100644
index 000000000000..039abcca546e
--- /dev/null
+++ b/fs/xfs/xfs_exchrange.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_EXCHRANGE_H__
+#define __XFS_EXCHRANGE_H__
+
+/* Update the mtime/cmtime of file1 and file2 */
+#define __XFS_EXCHANGE_RANGE_UPD_CMTIME1 (1ULL << 63)
+#define __XFS_EXCHANGE_RANGE_UPD_CMTIME2 (1ULL << 62)
+
+#define XFS_EXCHANGE_RANGE_PRIV_FLAGS (__XFS_EXCHANGE_RANGE_UPD_CMTIME1 | \
+ __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
+
+struct xfs_exchrange {
+ struct file *file1;
+ struct file *file2;
+
+ loff_t file1_offset;
+ loff_t file2_offset;
+ u64 length;
+
+ u64 flags; /* XFS_EXCHANGE_RANGE flags */
+};
+
+long xfs_ioc_exchange_range(struct file *file,
+ struct xfs_exchange_range __user *argp);
+
+struct xfs_exchmaps_req;
+
+void xfs_exchrange_ilock(struct xfs_trans *tp, struct xfs_inode *ip1,
+ struct xfs_inode *ip2);
+void xfs_exchrange_iunlock(struct xfs_inode *ip1, struct xfs_inode *ip2);
+
+int xfs_exchrange_estimate(struct xfs_exchmaps_req *req);
+
+#endif /* __XFS_EXCHRANGE_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 7cd09c3a82cb..201489d3de08 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -102,7 +102,7 @@ xfs_fs_encode_fh(
return fileid_type;
}
-STATIC struct inode *
+struct inode *
xfs_nfs_get_inode(
struct super_block *sb,
u64 ino,
@@ -160,7 +160,7 @@ xfs_nfs_get_inode(
}
}
- if (VFS_I(ip)->i_generation != generation) {
+ if (VFS_I(ip)->i_generation != generation || IS_PRIVATE(VFS_I(ip))) {
xfs_irele(ip);
return ERR_PTR(-ESTALE);
}
diff --git a/fs/xfs/xfs_export.h b/fs/xfs/xfs_export.h
index 64471a3ddb04..3cd85e8901a5 100644
--- a/fs/xfs/xfs_export.h
+++ b/fs/xfs/xfs_export.h
@@ -57,4 +57,6 @@ struct xfs_fid64 {
/* This flag goes on the wire. Don't play with it. */
#define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */
+struct inode *xfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 gen);
+
#endif /* __XFS_EXPORT_H__ */
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 56cfa1498571..a73e7c73b664 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -518,35 +518,26 @@ fail:
goto out;
}
-STATIC void
+static bool
xfs_extent_busy_clear_one(
- struct xfs_mount *mp,
struct xfs_perag *pag,
- struct xfs_extent_busy *busyp)
+ struct xfs_extent_busy *busyp,
+ bool do_discard)
{
if (busyp->length) {
- trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
- busyp->length);
+ if (do_discard &&
+ !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
+ busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
+ return false;
+ }
+ trace_xfs_extent_busy_clear(pag->pag_mount, busyp->agno,
+ busyp->bno, busyp->length);
rb_erase(&busyp->rb_node, &pag->pagb_tree);
}
list_del_init(&busyp->list);
kfree(busyp);
-}
-
-static void
-xfs_extent_busy_put_pag(
- struct xfs_perag *pag,
- bool wakeup)
- __releases(pag->pagb_lock)
-{
- if (wakeup) {
- pag->pagb_gen++;
- wake_up_all(&pag->pagb_wait);
- }
-
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
+ return true;
}
/*
@@ -560,32 +551,33 @@ xfs_extent_busy_clear(
struct list_head *list,
bool do_discard)
{
- struct xfs_extent_busy *busyp, *n;
- struct xfs_perag *pag = NULL;
- xfs_agnumber_t agno = NULLAGNUMBER;
- bool wakeup = false;
-
- list_for_each_entry_safe(busyp, n, list, list) {
- if (busyp->agno != agno) {
- if (pag)
- xfs_extent_busy_put_pag(pag, wakeup);
- agno = busyp->agno;
- pag = xfs_perag_get(mp, agno);
- spin_lock(&pag->pagb_lock);
- wakeup = false;
- }
+ struct xfs_extent_busy *busyp, *next;
- if (do_discard && busyp->length &&
- !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
- busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
- } else {
- xfs_extent_busy_clear_one(mp, pag, busyp);
- wakeup = true;
- }
- }
+ busyp = list_first_entry_or_null(list, typeof(*busyp), list);
+ if (!busyp)
+ return;
- if (pag)
- xfs_extent_busy_put_pag(pag, wakeup);
+ do {
+ bool wakeup = false;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, busyp->agno);
+ spin_lock(&pag->pagb_lock);
+ do {
+ next = list_next_entry(busyp, list);
+ if (xfs_extent_busy_clear_one(pag, busyp, do_discard))
+ wakeup = true;
+ busyp = next;
+ } while (!list_entry_is_head(busyp, list, list) &&
+ busyp->agno == pag->pag_agno);
+
+ if (wakeup) {
+ pag->pagb_gen++;
+ wake_up_all(&pag->pagb_wait);
+ }
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ } while (!list_entry_is_head(busyp, list, list));
}
/*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 2ce302b4885f..b240ea5241dc 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -24,6 +24,7 @@
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
+#include "xfs_file.h"
#include <linux/dax.h>
#include <linux/falloc.h>
@@ -38,33 +39,19 @@ static const struct vm_operations_struct xfs_file_vm_ops;
* Decide if the given file range is aligned to the size of the fundamental
* allocation unit for the file.
*/
-static bool
+bool
xfs_is_falloc_aligned(
struct xfs_inode *ip,
loff_t pos,
long long int len)
{
- struct xfs_mount *mp = ip->i_mount;
- uint64_t mask;
-
- if (XFS_IS_REALTIME_INODE(ip)) {
- if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
- u64 rextbytes;
- u32 mod;
-
- rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
- div_u64_rem(pos, rextbytes, &mod);
- if (mod)
- return false;
- div_u64_rem(len, rextbytes, &mod);
- return mod == 0;
- }
- mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
- } else {
- mask = mp->m_sb.sb_blocksize - 1;
- }
+ unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip);
+
+ if (!is_power_of_2(alloc_unit))
+ return isaligned_64(pos, alloc_unit) &&
+ isaligned_64(len, alloc_unit);
- return !((pos | len) & mask);
+ return !((pos | len) & (alloc_unit - 1));
}
/*
@@ -861,67 +848,6 @@ xfs_file_write_iter(
return xfs_file_buffered_write(iocb, from);
}
-static void
-xfs_wait_dax_page(
- struct inode *inode)
-{
- struct xfs_inode *ip = XFS_I(inode);
-
- xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
- schedule();
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-}
-
-int
-xfs_break_dax_layouts(
- struct inode *inode,
- bool *retry)
-{
- struct page *page;
-
- xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL);
-
- page = dax_layout_busy_page(inode->i_mapping);
- if (!page)
- return 0;
-
- *retry = true;
- return ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
- 0, 0, xfs_wait_dax_page(inode));
-}
-
-int
-xfs_break_layouts(
- struct inode *inode,
- uint *iolock,
- enum layout_break_reason reason)
-{
- bool retry;
- int error;
-
- xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL);
-
- do {
- retry = false;
- switch (reason) {
- case BREAK_UNMAP:
- error = xfs_break_dax_layouts(inode, &retry);
- if (error || retry)
- break;
- fallthrough;
- case BREAK_WRITE:
- error = xfs_break_leased_layouts(inode, iolock, &retry);
- break;
- default:
- WARN_ON_ONCE(1);
- error = -EINVAL;
- }
- } while (error == 0 && retry);
-
- return error;
-}
-
/* Does this file, inode, or mount want synchronous writes? */
static inline bool xfs_file_sync_writes(struct file *filp)
{
diff --git a/fs/xfs/xfs_file.h b/fs/xfs/xfs_file.h
new file mode 100644
index 000000000000..2ad91f755caf
--- /dev/null
+++ b/fs/xfs/xfs_file.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_FILE_H__
+#define __XFS_FILE_H__
+
+extern const struct file_operations xfs_file_operations;
+extern const struct file_operations xfs_dir_file_operations;
+
+bool xfs_is_falloc_aligned(struct xfs_inode *ip, loff_t pos,
+ long long int len);
+
+#endif /* __XFS_FILE_H__ */
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index de59eec74765..85dbb46452ca 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -533,7 +533,7 @@ xfs_getfsmap_rtdev_rtbitmap(
trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb);
trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb);
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
/*
* Set up query parameters to return free rtextents covering the range
@@ -557,7 +557,7 @@ xfs_getfsmap_rtdev_rtbitmap(
if (error)
goto err;
err:
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
return error;
}
#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 83f708f62ed9..c211ea2b63c4 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -213,10 +213,8 @@ xfs_growfs_data_private(
struct xfs_perag *pag;
pag = xfs_perag_get(mp, id.agno);
- error = xfs_ag_resv_free(pag);
+ xfs_ag_resv_free(pag);
xfs_perag_put(pag);
- if (error)
- return error;
}
/*
* Reserve AG metadata blocks. ENOSPC here does not mean there
@@ -385,14 +383,14 @@ xfs_reserve_blocks(
*/
if (mp->m_resblks > request) {
lcounter = mp->m_resblks_avail - request;
- if (lcounter > 0) { /* release unused blocks */
+ if (lcounter > 0) { /* release unused blocks */
fdblks_delta = lcounter;
mp->m_resblks_avail -= lcounter;
}
mp->m_resblks = request;
if (fdblks_delta) {
spin_unlock(&mp->m_sb_lock);
- error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
+ xfs_add_fdblocks(mp, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}
@@ -428,9 +426,9 @@ xfs_reserve_blocks(
*/
fdblks_delta = min(free, delta);
spin_unlock(&mp->m_sb_lock);
- error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
+ error = xfs_dec_fdblocks(mp, fdblks_delta, 0);
if (!error)
- xfs_mod_fdblocks(mp, fdblks_delta, 0);
+ xfs_add_fdblocks(mp, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}
out:
@@ -556,24 +554,13 @@ xfs_fs_reserve_ag_blocks(
/*
* Free space reserved for per-AG metadata.
*/
-int
+void
xfs_fs_unreserve_ag_blocks(
struct xfs_mount *mp)
{
xfs_agnumber_t agno;
struct xfs_perag *pag;
- int error = 0;
- int err2;
- for_each_perag(mp, agno, pag) {
- err2 = xfs_ag_resv_free(pag);
- if (err2 && !error)
- error = err2;
- }
-
- if (error)
- xfs_warn(mp,
- "Error %d freeing per-AG metadata reserve pool.", error);
-
- return error;
+ for_each_perag(mp, agno, pag)
+ xfs_ag_resv_free(pag);
}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 44457b0a0593..3e2f73bcf831 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -12,6 +12,6 @@ int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request);
int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags);
int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
-int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
+void xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c
new file mode 100644
index 000000000000..c8785ed59543
--- /dev/null
+++ b/fs/xfs/xfs_handle.c
@@ -0,0 +1,952 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2022-2024 Oracle.
+ * All rights reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_ioctl.h"
+#include "xfs_parent.h"
+#include "xfs_da_btree.h"
+#include "xfs_handle.h"
+#include "xfs_health.h"
+#include "xfs_icache.h"
+#include "xfs_export.h"
+#include "xfs_xattr.h"
+#include "xfs_acl.h"
+
+#include <linux/namei.h>
+
+static inline size_t
+xfs_filehandle_fid_len(void)
+{
+ struct xfs_handle *handle = NULL;
+
+ return sizeof(struct xfs_fid) - sizeof(handle->ha_fid.fid_len);
+}
+
+static inline size_t
+xfs_filehandle_init(
+ struct xfs_mount *mp,
+ xfs_ino_t ino,
+ uint32_t gen,
+ struct xfs_handle *handle)
+{
+ memcpy(&handle->ha_fsid, mp->m_fixedfsid, sizeof(struct xfs_fsid));
+
+ handle->ha_fid.fid_len = xfs_filehandle_fid_len();
+ handle->ha_fid.fid_pad = 0;
+ handle->ha_fid.fid_gen = gen;
+ handle->ha_fid.fid_ino = ino;
+
+ return sizeof(struct xfs_handle);
+}
+
+static inline size_t
+xfs_fshandle_init(
+ struct xfs_mount *mp,
+ struct xfs_handle *handle)
+{
+ memcpy(&handle->ha_fsid, mp->m_fixedfsid, sizeof(struct xfs_fsid));
+ memset(&handle->ha_fid, 0, sizeof(handle->ha_fid));
+
+ return sizeof(struct xfs_fsid);
+}
+
+/*
+ * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
+ * a file or fs handle.
+ *
+ * XFS_IOC_PATH_TO_FSHANDLE
+ * returns fs handle for a mount point or path within that mount point
+ * XFS_IOC_FD_TO_HANDLE
+ * returns full handle for a FD opened in user space
+ * XFS_IOC_PATH_TO_HANDLE
+ * returns full handle for a path
+ */
+int
+xfs_find_handle(
+ unsigned int cmd,
+ xfs_fsop_handlereq_t *hreq)
+{
+ int hsize;
+ xfs_handle_t handle;
+ struct inode *inode;
+ struct fd f = {NULL};
+ struct path path;
+ int error;
+ struct xfs_inode *ip;
+
+ if (cmd == XFS_IOC_FD_TO_HANDLE) {
+ f = fdget(hreq->fd);
+ if (!f.file)
+ return -EBADF;
+ inode = file_inode(f.file);
+ } else {
+ error = user_path_at(AT_FDCWD, hreq->path, 0, &path);
+ if (error)
+ return error;
+ inode = d_inode(path.dentry);
+ }
+ ip = XFS_I(inode);
+
+ /*
+ * We can only generate handles for inodes residing on a XFS filesystem,
+ * and only for regular files, directories or symbolic links.
+ */
+ error = -EINVAL;
+ if (inode->i_sb->s_magic != XFS_SB_MAGIC)
+ goto out_put;
+
+ error = -EBADF;
+ if (!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
+ goto out_put;
+
+
+ memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
+
+ if (cmd == XFS_IOC_PATH_TO_FSHANDLE)
+ hsize = xfs_fshandle_init(ip->i_mount, &handle);
+ else
+ hsize = xfs_filehandle_init(ip->i_mount, ip->i_ino,
+ inode->i_generation, &handle);
+
+ error = -EFAULT;
+ if (copy_to_user(hreq->ohandle, &handle, hsize) ||
+ copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
+ goto out_put;
+
+ error = 0;
+
+ out_put:
+ if (cmd == XFS_IOC_FD_TO_HANDLE)
+ fdput(f);
+ else
+ path_put(&path);
+ return error;
+}
+
+/*
+ * No need to do permission checks on the various pathname components
+ * as the handle operations are privileged.
+ */
+STATIC int
+xfs_handle_acceptable(
+ void *context,
+ struct dentry *dentry)
+{
+ return 1;
+}
+
+/* Convert handle already copied to kernel space into a dentry. */
+static struct dentry *
+xfs_khandle_to_dentry(
+ struct file *file,
+ struct xfs_handle *handle)
+{
+ struct xfs_fid64 fid = {
+ .ino = handle->ha_fid.fid_ino,
+ .gen = handle->ha_fid.fid_gen,
+ };
+
+ /*
+ * Only allow handle opens under a directory.
+ */
+ if (!S_ISDIR(file_inode(file)->i_mode))
+ return ERR_PTR(-ENOTDIR);
+
+ if (handle->ha_fid.fid_len != xfs_filehandle_fid_len())
+ return ERR_PTR(-EINVAL);
+
+ return exportfs_decode_fh(file->f_path.mnt, (struct fid *)&fid, 3,
+ FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG,
+ xfs_handle_acceptable, NULL);
+}
+
+/* Convert handle already copied to kernel space into an xfs_inode. */
+static struct xfs_inode *
+xfs_khandle_to_inode(
+ struct file *file,
+ struct xfs_handle *handle)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode;
+
+ if (!S_ISDIR(VFS_I(ip)->i_mode))
+ return ERR_PTR(-ENOTDIR);
+
+ if (handle->ha_fid.fid_len != xfs_filehandle_fid_len())
+ return ERR_PTR(-EINVAL);
+
+ inode = xfs_nfs_get_inode(mp->m_super, handle->ha_fid.fid_ino,
+ handle->ha_fid.fid_gen);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return XFS_I(inode);
+}
+
+/*
+ * Convert userspace handle data into a dentry.
+ */
+struct dentry *
+xfs_handle_to_dentry(
+ struct file *parfilp,
+ void __user *uhandle,
+ u32 hlen)
+{
+ xfs_handle_t handle;
+
+ if (hlen != sizeof(xfs_handle_t))
+ return ERR_PTR(-EINVAL);
+ if (copy_from_user(&handle, uhandle, hlen))
+ return ERR_PTR(-EFAULT);
+
+ return xfs_khandle_to_dentry(parfilp, &handle);
+}
+
+STATIC struct dentry *
+xfs_handlereq_to_dentry(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq)
+{
+ return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen);
+}
+
+int
+xfs_open_by_handle(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq)
+{
+ const struct cred *cred = current_cred();
+ int error;
+ int fd;
+ int permflag;
+ struct file *filp;
+ struct inode *inode;
+ struct dentry *dentry;
+ fmode_t fmode;
+ struct path path;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ inode = d_inode(dentry);
+
+ /* Restrict xfs_open_by_handle to directories & regular files. */
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
+ error = -EPERM;
+ goto out_dput;
+ }
+
+#if BITS_PER_LONG != 32
+ hreq->oflags |= O_LARGEFILE;
+#endif
+
+ permflag = hreq->oflags;
+ fmode = OPEN_FMODE(permflag);
+ if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
+ (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
+ error = -EPERM;
+ goto out_dput;
+ }
+
+ if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
+ error = -EPERM;
+ goto out_dput;
+ }
+
+ /* Can't write directories. */
+ if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
+ error = -EISDIR;
+ goto out_dput;
+ }
+
+ fd = get_unused_fd_flags(0);
+ if (fd < 0) {
+ error = fd;
+ goto out_dput;
+ }
+
+ path.mnt = parfilp->f_path.mnt;
+ path.dentry = dentry;
+ filp = dentry_open(&path, hreq->oflags, cred);
+ dput(dentry);
+ if (IS_ERR(filp)) {
+ put_unused_fd(fd);
+ return PTR_ERR(filp);
+ }
+
+ if (S_ISREG(inode->i_mode)) {
+ filp->f_flags |= O_NOATIME;
+ filp->f_mode |= FMODE_NOCMTIME;
+ }
+
+ fd_install(fd, filp);
+ return fd;
+
+ out_dput:
+ dput(dentry);
+ return error;
+}
+
+int
+xfs_readlink_by_handle(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq)
+{
+ struct dentry *dentry;
+ __u32 olen;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ /* Restrict this handle operation to symlinks only. */
+ if (!d_is_symlink(dentry)) {
+ error = -EINVAL;
+ goto out_dput;
+ }
+
+ if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
+ error = -EFAULT;
+ goto out_dput;
+ }
+
+ error = vfs_readlink(dentry, hreq->ohandle, olen);
+
+ out_dput:
+ dput(dentry);
+ return error;
+}
+
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later,
+ * we may be reading them directly out of a user buffer.
+ */
+static void
+xfs_ioc_attr_put_listent(
+ struct xfs_attr_list_context *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ void *value,
+ int valuelen)
+{
+ struct xfs_attrlist *alist = context->buffer;
+ struct xfs_attrlist_ent *aep;
+ int arraytop;
+
+ ASSERT(!context->seen_enough);
+ ASSERT(context->count >= 0);
+ ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+ ASSERT(context->firstu >= sizeof(*alist));
+ ASSERT(context->firstu <= context->bufsize);
+
+ /*
+ * Only list entries in the right namespace.
+ */
+ if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK))
+ return;
+
+ arraytop = sizeof(*alist) +
+ context->count * sizeof(alist->al_offset[0]);
+
+ /* decrement by the actual bytes used by the attr */
+ context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) +
+ namelen + 1, sizeof(uint32_t));
+ if (context->firstu < arraytop) {
+ trace_xfs_attr_list_full(context);
+ alist->al_more = 1;
+ context->seen_enough = 1;
+ return;
+ }
+
+ aep = context->buffer + context->firstu;
+ aep->a_valuelen = valuelen;
+ memcpy(aep->a_name, name, namelen);
+ aep->a_name[namelen] = 0;
+ alist->al_offset[context->count++] = context->firstu;
+ alist->al_count = context->count;
+ trace_xfs_attr_list_add(context);
+}
+
+static unsigned int
+xfs_attr_filter(
+ u32 ioc_flags)
+{
+ if (ioc_flags & XFS_IOC_ATTR_ROOT)
+ return XFS_ATTR_ROOT;
+ if (ioc_flags & XFS_IOC_ATTR_SECURE)
+ return XFS_ATTR_SECURE;
+ return 0;
+}
+
+static inline enum xfs_attr_update
+xfs_xattr_flags(
+ u32 ioc_flags,
+ void *value)
+{
+ if (!value)
+ return XFS_ATTRUPDATE_REMOVE;
+ if (ioc_flags & XFS_IOC_ATTR_CREATE)
+ return XFS_ATTRUPDATE_CREATE;
+ if (ioc_flags & XFS_IOC_ATTR_REPLACE)
+ return XFS_ATTRUPDATE_REPLACE;
+ return XFS_ATTRUPDATE_UPSERT;
+}
+
+int
+xfs_ioc_attr_list(
+ struct xfs_inode *dp,
+ void __user *ubuf,
+ size_t bufsize,
+ int flags,
+ struct xfs_attrlist_cursor __user *ucursor)
+{
+ struct xfs_attr_list_context context = { };
+ struct xfs_attrlist *alist;
+ void *buffer;
+ int error;
+
+ if (bufsize < sizeof(struct xfs_attrlist) ||
+ bufsize > XFS_XATTR_LIST_MAX)
+ return -EINVAL;
+
+ /*
+ * Reject flags, only allow namespaces.
+ */
+ if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+ if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+
+ /*
+ * Validate the cursor.
+ */
+ if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor)))
+ return -EFAULT;
+ if (context.cursor.pad1 || context.cursor.pad2)
+ return -EINVAL;
+ if (!context.cursor.initted &&
+ (context.cursor.hashval || context.cursor.blkno ||
+ context.cursor.offset))
+ return -EINVAL;
+
+ buffer = kvzalloc(bufsize, GFP_KERNEL);
+ if (!buffer)
+ return -ENOMEM;
+
+ /*
+ * Initialize the output buffer.
+ */
+ context.dp = dp;
+ context.resynch = 1;
+ context.attr_filter = xfs_attr_filter(flags);
+ context.buffer = buffer;
+ context.bufsize = round_down(bufsize, sizeof(uint32_t));
+ context.firstu = context.bufsize;
+ context.put_listent = xfs_ioc_attr_put_listent;
+
+ alist = context.buffer;
+ alist->al_count = 0;
+ alist->al_more = 0;
+ alist->al_offset[0] = context.bufsize;
+
+ error = xfs_attr_list(&context);
+ if (error)
+ goto out_free;
+
+ if (copy_to_user(ubuf, buffer, bufsize) ||
+ copy_to_user(ucursor, &context.cursor, sizeof(context.cursor)))
+ error = -EFAULT;
+out_free:
+ kvfree(buffer);
+ return error;
+}
+
+int
+xfs_attrlist_by_handle(
+ struct file *parfilp,
+ struct xfs_fsop_attrlist_handlereq __user *p)
+{
+ struct xfs_fsop_attrlist_handlereq al_hreq;
+ struct dentry *dentry;
+ int error = -ENOMEM;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&al_hreq, p, sizeof(al_hreq)))
+ return -EFAULT;
+
+ dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer,
+ al_hreq.buflen, al_hreq.flags, &p->pos);
+ dput(dentry);
+ return error;
+}
+
+static int
+xfs_attrmulti_attr_get(
+ struct inode *inode,
+ unsigned char *name,
+ unsigned char __user *ubuf,
+ uint32_t *len,
+ uint32_t flags)
+{
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = xfs_attr_filter(flags),
+ .name = name,
+ .namelen = strlen(name),
+ .valuelen = *len,
+ };
+ int error;
+
+ if (*len > XFS_XATTR_SIZE_MAX)
+ return -EINVAL;
+
+ error = xfs_attr_get(&args);
+ if (error)
+ goto out_kfree;
+
+ *len = args.valuelen;
+ if (copy_to_user(ubuf, args.value, args.valuelen))
+ error = -EFAULT;
+
+out_kfree:
+ kvfree(args.value);
+ return error;
+}
+
+static int
+xfs_attrmulti_attr_set(
+ struct inode *inode,
+ unsigned char *name,
+ const unsigned char __user *ubuf,
+ uint32_t len,
+ uint32_t flags)
+{
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = xfs_attr_filter(flags),
+ .name = name,
+ .namelen = strlen(name),
+ };
+ int error;
+
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ return -EPERM;
+
+ if (ubuf) {
+ if (len > XFS_XATTR_SIZE_MAX)
+ return -EINVAL;
+ args.value = memdup_user(ubuf, len);
+ if (IS_ERR(args.value))
+ return PTR_ERR(args.value);
+ args.valuelen = len;
+ }
+
+ error = xfs_attr_change(&args, xfs_xattr_flags(flags, args.value));
+ if (!error && (flags & XFS_IOC_ATTR_ROOT))
+ xfs_forget_acl(inode, name);
+ kfree(args.value);
+ return error;
+}
+
+int
+xfs_ioc_attrmulti_one(
+ struct file *parfilp,
+ struct inode *inode,
+ uint32_t opcode,
+ void __user *uname,
+ void __user *value,
+ uint32_t *len,
+ uint32_t flags)
+{
+ unsigned char *name;
+ int error;
+
+ if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+
+ name = strndup_user(uname, MAXNAMELEN);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ switch (opcode) {
+ case ATTR_OP_GET:
+ error = xfs_attrmulti_attr_get(inode, name, value, len, flags);
+ break;
+ case ATTR_OP_REMOVE:
+ value = NULL;
+ *len = 0;
+ fallthrough;
+ case ATTR_OP_SET:
+ error = mnt_want_write_file(parfilp);
+ if (error)
+ break;
+ error = xfs_attrmulti_attr_set(inode, name, value, *len, flags);
+ mnt_drop_write_file(parfilp);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ kfree(name);
+ return error;
+}
+
+int
+xfs_attrmulti_by_handle(
+ struct file *parfilp,
+ void __user *arg)
+{
+ int error;
+ xfs_attr_multiop_t *ops;
+ xfs_fsop_attrmulti_handlereq_t am_hreq;
+ struct dentry *dentry;
+ unsigned int i, size;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
+ return -EFAULT;
+
+ /* overflow check */
+ if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
+ return -E2BIG;
+
+ dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ error = -E2BIG;
+ size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
+ if (!size || size > 16 * PAGE_SIZE)
+ goto out_dput;
+
+ ops = memdup_user(am_hreq.ops, size);
+ if (IS_ERR(ops)) {
+ error = PTR_ERR(ops);
+ goto out_dput;
+ }
+
+ error = 0;
+ for (i = 0; i < am_hreq.opcount; i++) {
+ ops[i].am_error = xfs_ioc_attrmulti_one(parfilp,
+ d_inode(dentry), ops[i].am_opcode,
+ ops[i].am_attrname, ops[i].am_attrvalue,
+ &ops[i].am_length, ops[i].am_flags);
+ }
+
+ if (copy_to_user(am_hreq.ops, ops, size))
+ error = -EFAULT;
+
+ kfree(ops);
+ out_dput:
+ dput(dentry);
+ return error;
+}
+
+struct xfs_getparents_ctx {
+ struct xfs_attr_list_context context;
+ struct xfs_getparents_by_handle gph;
+
+ /* File to target */
+ struct xfs_inode *ip;
+
+ /* Internal buffer where we format records */
+ void *krecords;
+
+ /* Last record filled out */
+ struct xfs_getparents_rec *lastrec;
+
+ unsigned int count;
+};
+
+static inline unsigned int
+xfs_getparents_rec_sizeof(
+ unsigned int namelen)
+{
+ return round_up(sizeof(struct xfs_getparents_rec) + namelen + 1,
+ sizeof(uint64_t));
+}
+
+static void
+xfs_getparents_put_listent(
+ struct xfs_attr_list_context *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ void *value,
+ int valuelen)
+{
+ struct xfs_getparents_ctx *gpx =
+ container_of(context, struct xfs_getparents_ctx, context);
+ struct xfs_inode *ip = context->dp;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_getparents *gp = &gpx->gph.gph_request;
+ struct xfs_getparents_rec *gpr = gpx->krecords + context->firstu;
+ unsigned short reclen =
+ xfs_getparents_rec_sizeof(namelen);
+ xfs_ino_t ino;
+ uint32_t gen;
+ int error;
+
+ if (!(flags & XFS_ATTR_PARENT))
+ return;
+
+ error = xfs_parent_from_attr(mp, flags, name, namelen, value, valuelen,
+ &ino, &gen);
+ if (error) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_PARENT);
+ context->seen_enough = -EFSCORRUPTED;
+ return;
+ }
+
+ /*
+ * We found a parent pointer, but we've filled up the buffer. Signal
+ * to the caller that we did /not/ reach the end of the parent pointer
+ * recordset.
+ */
+ if (context->firstu > context->bufsize - reclen) {
+ context->seen_enough = 1;
+ return;
+ }
+
+ /* Format the parent pointer directly into the caller buffer. */
+ gpr->gpr_reclen = reclen;
+ xfs_filehandle_init(mp, ino, gen, &gpr->gpr_parent);
+ memcpy(gpr->gpr_name, name, namelen);
+ gpr->gpr_name[namelen] = 0;
+
+ trace_xfs_getparents_put_listent(ip, gp, context, gpr);
+
+ context->firstu += reclen;
+ gpx->count++;
+ gpx->lastrec = gpr;
+}
+
+/* Expand the last record to fill the rest of the caller's buffer. */
+static inline void
+xfs_getparents_expand_lastrec(
+ struct xfs_getparents_ctx *gpx)
+{
+ struct xfs_getparents *gp = &gpx->gph.gph_request;
+ struct xfs_getparents_rec *gpr = gpx->lastrec;
+
+ if (!gpx->lastrec)
+ gpr = gpx->krecords;
+
+ gpr->gpr_reclen = gp->gp_bufsize - ((void *)gpr - gpx->krecords);
+
+ trace_xfs_getparents_expand_lastrec(gpx->ip, gp, &gpx->context, gpr);
+}
+
+static inline void __user *u64_to_uptr(u64 val)
+{
+ return (void __user *)(uintptr_t)val;
+}
+
+/* Retrieve the parent pointers for a given inode. */
+STATIC int
+xfs_getparents(
+ struct xfs_getparents_ctx *gpx)
+{
+ struct xfs_getparents *gp = &gpx->gph.gph_request;
+ struct xfs_inode *ip = gpx->ip;
+ struct xfs_mount *mp = ip->i_mount;
+ size_t bufsize;
+ int error;
+
+ /* Check size of buffer requested by user */
+ if (gp->gp_bufsize > XFS_XATTR_LIST_MAX)
+ return -ENOMEM;
+ if (gp->gp_bufsize < xfs_getparents_rec_sizeof(1))
+ return -EINVAL;
+
+ if (gp->gp_iflags & ~XFS_GETPARENTS_IFLAGS_ALL)
+ return -EINVAL;
+ if (gp->gp_reserved)
+ return -EINVAL;
+
+ bufsize = round_down(gp->gp_bufsize, sizeof(uint64_t));
+ gpx->krecords = kvzalloc(bufsize, GFP_KERNEL);
+ if (!gpx->krecords) {
+ bufsize = min(bufsize, PAGE_SIZE);
+ gpx->krecords = kvzalloc(bufsize, GFP_KERNEL);
+ if (!gpx->krecords)
+ return -ENOMEM;
+ }
+
+ gpx->context.dp = ip;
+ gpx->context.resynch = 1;
+ gpx->context.put_listent = xfs_getparents_put_listent;
+ gpx->context.bufsize = bufsize;
+ /* firstu is used to track the bytes filled in the buffer */
+ gpx->context.firstu = 0;
+
+ /* Copy the cursor provided by caller */
+ memcpy(&gpx->context.cursor, &gp->gp_cursor,
+ sizeof(struct xfs_attrlist_cursor));
+ gpx->count = 0;
+ gp->gp_oflags = 0;
+
+ trace_xfs_getparents_begin(ip, gp, &gpx->context.cursor);
+
+ error = xfs_attr_list(&gpx->context);
+ if (error)
+ goto out_free_buf;
+ if (gpx->context.seen_enough < 0) {
+ error = gpx->context.seen_enough;
+ goto out_free_buf;
+ }
+ xfs_getparents_expand_lastrec(gpx);
+
+ /* Update the caller with the current cursor position */
+ memcpy(&gp->gp_cursor, &gpx->context.cursor,
+ sizeof(struct xfs_attrlist_cursor));
+
+ /* Is this the root directory? */
+ if (ip->i_ino == mp->m_sb.sb_rootino)
+ gp->gp_oflags |= XFS_GETPARENTS_OFLAG_ROOT;
+
+ if (gpx->context.seen_enough == 0) {
+ /*
+ * If we did not run out of buffer space, then we reached the
+ * end of the pptr recordset, so set the DONE flag.
+ */
+ gp->gp_oflags |= XFS_GETPARENTS_OFLAG_DONE;
+ } else if (gpx->count == 0) {
+ /*
+ * If we ran out of buffer space before copying any parent
+ * pointers at all, the caller's buffer was too short. Tell
+ * userspace that, erm, the message is too long.
+ */
+ error = -EMSGSIZE;
+ goto out_free_buf;
+ }
+
+ trace_xfs_getparents_end(ip, gp, &gpx->context.cursor);
+
+ ASSERT(gpx->context.firstu <= gpx->gph.gph_request.gp_bufsize);
+
+ /* Copy the records to userspace. */
+ if (copy_to_user(u64_to_uptr(gpx->gph.gph_request.gp_buffer),
+ gpx->krecords, gpx->context.firstu))
+ error = -EFAULT;
+
+out_free_buf:
+ kvfree(gpx->krecords);
+ gpx->krecords = NULL;
+ return error;
+}
+
+/* Retrieve the parents of this file and pass them back to userspace. */
+int
+xfs_ioc_getparents(
+ struct file *file,
+ struct xfs_getparents __user *ureq)
+{
+ struct xfs_getparents_ctx gpx = {
+ .ip = XFS_I(file_inode(file)),
+ };
+ struct xfs_getparents *kreq = &gpx.gph.gph_request;
+ struct xfs_mount *mp = gpx.ip->i_mount;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (!xfs_has_parent(mp))
+ return -EOPNOTSUPP;
+ if (copy_from_user(kreq, ureq, sizeof(*kreq)))
+ return -EFAULT;
+
+ error = xfs_getparents(&gpx);
+ if (error)
+ return error;
+
+ if (copy_to_user(ureq, kreq, sizeof(*kreq)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* Retrieve the parents of this file handle and pass them back to userspace. */
+int
+xfs_ioc_getparents_by_handle(
+ struct file *file,
+ struct xfs_getparents_by_handle __user *ureq)
+{
+ struct xfs_getparents_ctx gpx = { };
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_getparents_by_handle *kreq = &gpx.gph;
+ struct xfs_handle *handle = &kreq->gph_handle;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (!xfs_has_parent(mp))
+ return -EOPNOTSUPP;
+ if (copy_from_user(kreq, ureq, sizeof(*kreq)))
+ return -EFAULT;
+
+ /*
+ * We don't use exportfs_decode_fh because it does too much work here.
+ * If the handle refers to a directory, the exportfs code will walk
+ * upwards through the directory tree to connect the dentries to the
+ * root directory dentry. For GETPARENTS we don't care about that
+ * because we're not actually going to open a file descriptor; we only
+ * want to open an inode and read its parent pointers.
+ *
+ * Note that xfs_scrub uses GETPARENTS to log that it will try to fix a
+ * corrupted file's metadata. For this usecase we would really rather
+ * userspace single-step the path reconstruction to avoid loops or
+ * other strange things if the directory tree is corrupt.
+ */
+ gpx.ip = xfs_khandle_to_inode(file, handle);
+ if (IS_ERR(gpx.ip))
+ return PTR_ERR(gpx.ip);
+
+ error = xfs_getparents(&gpx);
+ if (error)
+ goto out_rele;
+
+ if (copy_to_user(ureq, kreq, sizeof(*kreq)))
+ error = -EFAULT;
+
+out_rele:
+ xfs_irele(gpx.ip);
+ return error;
+}
diff --git a/fs/xfs/xfs_handle.h b/fs/xfs/xfs_handle.h
new file mode 100644
index 000000000000..6799a86d8565
--- /dev/null
+++ b/fs/xfs/xfs_handle.h
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2022-2024 Oracle.
+ * All rights reserved.
+ */
+#ifndef __XFS_HANDLE_H__
+#define __XFS_HANDLE_H__
+
+int xfs_attrlist_by_handle(struct file *parfilp,
+ struct xfs_fsop_attrlist_handlereq __user *p);
+int xfs_attrmulti_by_handle(struct file *parfilp, void __user *arg);
+
+int xfs_find_handle(unsigned int cmd, struct xfs_fsop_handlereq *hreq);
+int xfs_open_by_handle(struct file *parfilp, struct xfs_fsop_handlereq *hreq);
+int xfs_readlink_by_handle(struct file *parfilp,
+ struct xfs_fsop_handlereq *hreq);
+
+int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode,
+ uint32_t opcode, void __user *uname, void __user *value,
+ uint32_t *len, uint32_t flags);
+int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf,
+ size_t bufsize, int flags,
+ struct xfs_attrlist_cursor __user *ucursor);
+
+struct dentry *xfs_handle_to_dentry(struct file *parfilp, void __user *uhandle,
+ u32 hlen);
+
+int xfs_ioc_getparents(struct file *file, struct xfs_getparents __user *arg);
+int xfs_ioc_getparents_by_handle(struct file *file,
+ struct xfs_getparents_by_handle __user *arg);
+
+#endif /* __XFS_HANDLE_H__ */
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index b39f959146bc..10f116d093a2 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -470,6 +470,7 @@ static const struct ioctl_sick_map ino_map[] = {
{ XFS_SICK_INO_BMBTA_ZAPPED, XFS_BS_SICK_BMBTA },
{ XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR },
{ XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK },
+ { XFS_SICK_INO_DIRTREE, XFS_BS_SICK_DIRTREE },
{ 0, 0 },
};
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 74f1812b03cb..0953163a2d84 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -613,7 +613,6 @@ xfs_iget_cache_miss(
struct xfs_inode *ip;
int error;
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
- int iflags;
ip = xfs_inode_alloc(mp, ino);
if (!ip)
@@ -693,13 +692,12 @@ xfs_iget_cache_miss(
* memory barrier that ensures this detection works correctly at lookup
* time.
*/
- iflags = XFS_INEW;
if (flags & XFS_IGET_DONTCACHE)
d_mark_dontcache(VFS_I(ip));
ip->i_udquot = NULL;
ip->i_gdquot = NULL;
ip->i_pdquot = NULL;
- xfs_iflags_set(ip, iflags);
+ xfs_iflags_set(ip, XFS_INEW);
/* insert the new inode */
spin_lock(&pag->pag_ici_lock);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d55b42b2480d..58fb7a5062e1 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -16,6 +16,7 @@
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_attr.h"
+#include "xfs_bit.h"
#include "xfs_trans_space.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
@@ -38,13 +39,12 @@
#include "xfs_ag.h"
#include "xfs_log_priv.h"
#include "xfs_health.h"
+#include "xfs_pnfs.h"
+#include "xfs_parent.h"
+#include "xfs_xattr.h"
struct kmem_cache *xfs_inode_cache;
-STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
-STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
- struct xfs_inode *);
-
/*
* helper function to extract extent size hint from inode
*/
@@ -60,7 +60,8 @@ xfs_get_extsz_hint(
return 0;
if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
return ip->i_extsize;
- if (XFS_IS_REALTIME_INODE(ip))
+ if (XFS_IS_REALTIME_INODE(ip) &&
+ ip->i_mount->m_sb.sb_rextsize > 1)
return ip->i_mount->m_sb.sb_rextsize;
return 0;
}
@@ -420,7 +421,7 @@ xfs_lock_inumorder(
* lock more than one at a time, lockdep will report false positives saying we
* have violated locking orders.
*/
-static void
+void
xfs_lock_inodes(
struct xfs_inode **ips,
int inodes,
@@ -749,6 +750,8 @@ xfs_inode_inherit_flags2(
/*
* Initialise a newly allocated inode and return the in-core inode to the
* caller locked exclusively.
+ *
+ * Caller is responsible for unlocking the inode manually upon return
*/
int
xfs_init_new_inode(
@@ -875,7 +878,7 @@ xfs_init_new_inode(
/*
* Log the new values stuffed into the inode.
*/
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, flags);
/* now that we have an i_mode we can setup the inode structure */
@@ -890,24 +893,27 @@ xfs_init_new_inode(
* link count to go to zero, move the inode to AGI unlinked list so that it can
* be freed when the last active reference goes away via xfs_inactive().
*/
-static int /* error */
+int
xfs_droplink(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- if (VFS_I(ip)->i_nlink == 0) {
- xfs_alert(ip->i_mount,
- "%s: Attempt to drop inode (%llu) with nlink zero.",
- __func__, ip->i_ino);
- return -EFSCORRUPTED;
- }
+ struct inode *inode = VFS_I(ip);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
- drop_nlink(VFS_I(ip));
+ if (inode->i_nlink == 0) {
+ xfs_info_ratelimited(tp->t_mountp,
+ "Inode 0x%llx link count dropped below zero. Pinning link count.",
+ ip->i_ino);
+ set_nlink(inode, XFS_NLINK_PINNED);
+ }
+ if (inode->i_nlink != XFS_NLINK_PINNED)
+ drop_nlink(inode);
+
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (VFS_I(ip)->i_nlink)
+ if (inode->i_nlink)
return 0;
return xfs_iunlink(tp, ip);
@@ -916,14 +922,22 @@ xfs_droplink(
/*
* Increment the link count on an inode & log the change.
*/
-static void
+void
xfs_bumplink(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
+ struct inode *inode = VFS_I(ip);
+
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
- inc_nlink(VFS_I(ip));
+ if (inode->i_nlink == XFS_NLINK_PINNED - 1)
+ xfs_info_ratelimited(tp->t_mountp,
+ "Inode 0x%llx link count exceeded maximum. Pinning link count.",
+ ip->i_ino);
+ if (inode->i_nlink != XFS_NLINK_PINNED)
+ inc_nlink(inode);
+
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -1005,7 +1019,7 @@ xfs_dir_hook_setup(
int
xfs_create(
struct mnt_idmap *idmap,
- xfs_inode_t *dp,
+ struct xfs_inode *dp,
struct xfs_name *name,
umode_t mode,
dev_t rdev,
@@ -1017,7 +1031,7 @@ xfs_create(
struct xfs_inode *ip = NULL;
struct xfs_trans *tp = NULL;
int error;
- bool unlock_dp_on_error = false;
+ bool unlock_dp_on_error = false;
prid_t prid;
struct xfs_dquot *udqp = NULL;
struct xfs_dquot *gdqp = NULL;
@@ -1025,6 +1039,7 @@ xfs_create(
struct xfs_trans_res *tres;
uint resblks;
xfs_ino_t ino;
+ struct xfs_parent_args *ppargs;
trace_xfs_create(dp, name);
@@ -1046,13 +1061,17 @@ xfs_create(
return error;
if (is_dir) {
- resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+ resblks = xfs_mkdir_space_res(mp, name->len);
tres = &M_RES(mp)->tr_mkdir;
} else {
- resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+ resblks = xfs_create_space_res(mp, name->len);
tres = &M_RES(mp)->tr_create;
}
+ error = xfs_parent_start(mp, &ppargs);
+ if (error)
+ goto out_release_dquots;
+
/*
* Initially assume that the file does not exist and
* reserve the resources for that case. If that is not
@@ -1068,7 +1087,7 @@ xfs_create(
resblks, &tp);
}
if (error)
- goto out_release_dquots;
+ goto out_parent;
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
@@ -1092,8 +1111,7 @@ xfs_create(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- unlock_dp_on_error = false;
+ xfs_trans_ijoin(tp, dp, 0);
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
resblks - XFS_IALLOC_SPACE_RES(mp));
@@ -1113,6 +1131,16 @@ xfs_create(
}
/*
+ * If we have parent pointers, we need to add the attribute containing
+ * the parent information now.
+ */
+ if (ppargs) {
+ error = xfs_parent_addname(tp, ppargs, dp, name, ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
* Create ip with a reference from dp, and add '.' and '..' references
* if it's a directory.
*/
@@ -1142,6 +1170,9 @@ xfs_create(
xfs_qm_dqrele(pdqp);
*ipp = ip;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ xfs_parent_finish(mp, ppargs);
return 0;
out_trans_cancel:
@@ -1153,9 +1184,12 @@ xfs_create(
* transactions and deadlocks from xfs_inactive.
*/
if (ip) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
+ out_parent:
+ xfs_parent_finish(mp, ppargs);
out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
@@ -1171,6 +1205,7 @@ xfs_create_tmpfile(
struct mnt_idmap *idmap,
struct xfs_inode *dp,
umode_t mode,
+ bool init_xattrs,
struct xfs_inode **ipp)
{
struct xfs_mount *mp = dp->i_mount;
@@ -1211,7 +1246,7 @@ xfs_create_tmpfile(
error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
if (!error)
error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
- 0, 0, prid, false, &ip);
+ 0, 0, prid, init_xattrs, &ip);
if (error)
goto out_trans_cancel;
@@ -1238,6 +1273,7 @@ xfs_create_tmpfile(
xfs_qm_dqrele(pdqp);
*ipp = ip;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return 0;
out_trans_cancel:
@@ -1249,6 +1285,7 @@ xfs_create_tmpfile(
* transactions and deadlocks from xfs_inactive.
*/
if (ip) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
@@ -1262,14 +1299,15 @@ xfs_create_tmpfile(
int
xfs_link(
- xfs_inode_t *tdp,
- xfs_inode_t *sip,
+ struct xfs_inode *tdp,
+ struct xfs_inode *sip,
struct xfs_name *target_name)
{
- xfs_mount_t *mp = tdp->i_mount;
- xfs_trans_t *tp;
+ struct xfs_mount *mp = tdp->i_mount;
+ struct xfs_trans *tp;
int error, nospace_error = 0;
int resblks;
+ struct xfs_parent_args *ppargs;
trace_xfs_link(tdp, target_name);
@@ -1288,11 +1326,25 @@ xfs_link(
if (error)
goto std_return;
- resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
+ error = xfs_parent_start(mp, &ppargs);
+ if (error)
+ goto std_return;
+
+ resblks = xfs_link_space_res(mp, target_name->len);
error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
&tp, &nospace_error);
if (error)
- goto std_return;
+ goto out_parent;
+
+ /*
+ * We don't allow reservationless or quotaless hardlinking when parent
+ * pointers are enabled because we can't back out if the xattrs must
+ * grow.
+ */
+ if (ppargs && nospace_error) {
+ error = nospace_error;
+ goto error_return;
+ }
/*
* If we are using project inheritance, we only allow hard link
@@ -1343,6 +1395,19 @@ xfs_link(
xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
xfs_bumplink(tp, sip);
+
+ /*
+ * If we have parent pointers, we now need to add the parent record to
+ * the attribute fork of the inode. If this is the initial parent
+ * attribute, we need to create it correctly, otherwise we can just add
+ * the parent to the inode.
+ */
+ if (ppargs) {
+ error = xfs_parent_addname(tp, ppargs, tdp, target_name, sip);
+ if (error)
+ goto error_return;
+ }
+
xfs_dir_update_hook(tdp, sip, 1, target_name);
/*
@@ -1353,10 +1418,18 @@ xfs_link(
if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(tdp, XFS_ILOCK_EXCL);
+ xfs_iunlock(sip, XFS_ILOCK_EXCL);
+ xfs_parent_finish(mp, ppargs);
+ return error;
error_return:
xfs_trans_cancel(tp);
+ xfs_iunlock(tdp, XFS_ILOCK_EXCL);
+ xfs_iunlock(sip, XFS_ILOCK_EXCL);
+ out_parent:
+ xfs_parent_finish(mp, ppargs);
std_return:
if (error == -ENOSPC && nospace_error)
error = nospace_error;
@@ -1555,6 +1628,51 @@ out_unlock:
}
/*
+ * Mark all the buffers attached to this directory stale. In theory we should
+ * never be freeing a directory with any blocks at all, but this covers the
+ * case where we've recovered a directory swap with a "temporary" directory
+ * created by online repair and now need to dump it.
+ */
+STATIC void
+xfs_inactive_dir(
+ struct xfs_inode *dp)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
+ xfs_fileoff_t off;
+
+ /*
+ * Invalidate each directory block. All directory blocks are of
+ * fsbcount length and alignment, so we only need to walk those same
+ * offsets. We hold the only reference to this inode, so we must wait
+ * for the buffer locks.
+ */
+ for_each_xfs_iext(ifp, &icur, &got) {
+ for (off = round_up(got.br_startoff, geo->fsbcount);
+ off < got.br_startoff + got.br_blockcount;
+ off += geo->fsbcount) {
+ struct xfs_buf *bp = NULL;
+ xfs_fsblock_t fsbno;
+ int error;
+
+ fsbno = (off - got.br_startoff) + got.br_startblock;
+ error = xfs_buf_incore(mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, fsbno),
+ XFS_FSB_TO_BB(mp, geo->fsbcount),
+ XBF_LIVESCAN, &bp);
+ if (error)
+ continue;
+
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
+ }
+ }
+}
+
+/*
* xfs_inactive_truncate
*
* Called to perform a truncate when an inode becomes unlinked.
@@ -1864,6 +1982,11 @@ xfs_inactive(
goto out;
}
+ if (S_ISDIR(VFS_I(ip)->i_mode) && ip->i_df.if_nextents > 0) {
+ xfs_inactive_dir(ip);
+ truncate = 1;
+ }
+
if (S_ISLNK(VFS_I(ip)->i_mode))
error = xfs_inactive_symlink(ip);
else if (truncate)
@@ -1937,7 +2060,7 @@ out:
* only unlinked, referenced inodes can be on the unlinked inode list. If we
* don't find the inode in cache, then let the caller handle the situation.
*/
-static struct xfs_inode *
+struct xfs_inode *
xfs_iunlink_lookup(
struct xfs_perag *pag,
xfs_agino_t agino)
@@ -2150,7 +2273,7 @@ xfs_iunlink_insert_inode(
* We place the on-disk inode on a list in the AGI. It will be pulled from this
* list when the inode is freed.
*/
-STATIC int
+int
xfs_iunlink(
struct xfs_trans *tp,
struct xfs_inode *ip)
@@ -2167,7 +2290,7 @@ xfs_iunlink(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
/* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(pag, tp, &agibp);
+ error = xfs_read_agi(pag, tp, 0, &agibp);
if (error)
goto out;
@@ -2252,7 +2375,7 @@ xfs_iunlink_remove_inode(
/*
* Pull the on-disk inode from the AGI unlinked list.
*/
-STATIC int
+int
xfs_iunlink_remove(
struct xfs_trans *tp,
struct xfs_perag *pag,
@@ -2264,7 +2387,7 @@ xfs_iunlink_remove(
trace_xfs_iunlink_remove(ip);
/* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(pag, tp, &agibp);
+ error = xfs_read_agi(pag, tp, 0, &agibp);
if (error)
return error;
@@ -2598,16 +2721,17 @@ xfs_iunpin_wait(
*/
int
xfs_remove(
- xfs_inode_t *dp,
+ struct xfs_inode *dp,
struct xfs_name *name,
- xfs_inode_t *ip)
+ struct xfs_inode *ip)
{
- xfs_mount_t *mp = dp->i_mount;
- xfs_trans_t *tp = NULL;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_trans *tp = NULL;
int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
int dontcare;
int error = 0;
uint resblks;
+ struct xfs_parent_args *ppargs;
trace_xfs_remove(dp, name);
@@ -2624,6 +2748,10 @@ xfs_remove(
if (error)
goto std_return;
+ error = xfs_parent_start(mp, &ppargs);
+ if (error)
+ goto std_return;
+
/*
* We try to get the real space reservation first, allowing for
* directory btree deletion(s) implying possible bmap insert(s). If we
@@ -2635,12 +2763,12 @@ xfs_remove(
* the directory code can handle a reservationless update and we don't
* want to prevent a user from trying to free space by deleting things.
*/
- resblks = XFS_REMOVE_SPACE_RES(mp);
+ resblks = xfs_remove_space_res(mp, name->len);
error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
&tp, &dontcare);
if (error) {
ASSERT(error != -ENOSPC);
- goto std_return;
+ goto out_parent;
}
/*
@@ -2700,6 +2828,13 @@ xfs_remove(
goto out_trans_cancel;
}
+ /* Remove parent pointer. */
+ if (ppargs) {
+ error = xfs_parent_removename(tp, ppargs, dp, name, ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
/*
* Drop the link from dp to ip, and if ip was a directory, remove the
* '.' and '..' references since we freed the directory.
@@ -2716,19 +2851,42 @@ xfs_remove(
error = xfs_trans_commit(tp);
if (error)
- goto std_return;
+ goto out_unlock;
if (is_dir && xfs_inode_is_filestream(ip))
xfs_filestream_deassociate(ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ xfs_parent_finish(mp, ppargs);
return 0;
out_trans_cancel:
xfs_trans_cancel(tp);
+ out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ out_parent:
+ xfs_parent_finish(mp, ppargs);
std_return:
return error;
}
+static inline void
+xfs_iunlock_rename(
+ struct xfs_inode **i_tab,
+ int num_inodes)
+{
+ int i;
+
+ for (i = num_inodes - 1; i >= 0; i--) {
+ /* Skip duplicate inodes if src and target dps are the same */
+ if (!i_tab[i] || (i > 0 && i_tab[i] == i_tab[i - 1]))
+ continue;
+ xfs_iunlock(i_tab[i], XFS_ILOCK_EXCL);
+ }
+}
+
/*
* Enter all inodes for a rename transaction into a sorted array.
*/
@@ -2743,7 +2901,7 @@ xfs_sort_for_rename(
struct xfs_inode **i_tab,/* out: sorted array of inodes */
int *num_inodes) /* in/out: inodes in array */
{
- int i, j;
+ int i;
ASSERT(*num_inodes == __XFS_SORT_INODES);
memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
@@ -2765,17 +2923,26 @@ xfs_sort_for_rename(
i_tab[i++] = wip;
*num_inodes = i;
+ xfs_sort_inodes(i_tab, *num_inodes);
+}
+
+void
+xfs_sort_inodes(
+ struct xfs_inode **i_tab,
+ unsigned int num_inodes)
+{
+ int i, j;
+
+ ASSERT(num_inodes <= __XFS_SORT_INODES);
+
/*
* Sort the elements via bubble sort. (Remember, there are at
* most 5 elements to sort, so this is adequate.)
*/
- for (i = 0; i < *num_inodes; i++) {
- for (j = 1; j < *num_inodes; j++) {
- if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
- struct xfs_inode *temp = i_tab[j];
- i_tab[j] = i_tab[j-1];
- i_tab[j-1] = temp;
- }
+ for (i = 0; i < num_inodes; i++) {
+ for (j = 1; j < num_inodes; j++) {
+ if (i_tab[j]->i_ino < i_tab[j-1]->i_ino)
+ swap(i_tab[j], i_tab[j - 1]);
}
}
}
@@ -2805,15 +2972,17 @@ xfs_cross_rename(
struct xfs_inode *dp1,
struct xfs_name *name1,
struct xfs_inode *ip1,
+ struct xfs_parent_args *ip1_ppargs,
struct xfs_inode *dp2,
struct xfs_name *name2,
struct xfs_inode *ip2,
+ struct xfs_parent_args *ip2_ppargs,
int spaceres)
{
- int error = 0;
- int ip1_flags = 0;
- int ip2_flags = 0;
- int dp2_flags = 0;
+ int error = 0;
+ int ip1_flags = 0;
+ int ip2_flags = 0;
+ int dp2_flags = 0;
/* Swap inode number for dirent in first parent */
error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
@@ -2882,6 +3051,21 @@ xfs_cross_rename(
}
}
+ /* Schedule parent pointer replacements */
+ if (ip1_ppargs) {
+ error = xfs_parent_replacename(tp, ip1_ppargs, dp1, name1, dp2,
+ name2, ip1);
+ if (error)
+ goto out_trans_abort;
+ }
+
+ if (ip2_ppargs) {
+ error = xfs_parent_replacename(tp, ip2_ppargs, dp2, name2, dp1,
+ name1, ip2);
+ if (error)
+ goto out_trans_abort;
+ }
+
if (ip1_flags) {
xfs_trans_ichgtime(tp, ip1, ip1_flags);
xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
@@ -2937,7 +3121,7 @@ xfs_rename_alloc_whiteout(
int error;
error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE,
- &tmpfile);
+ xfs_has_parent(dp->i_mount), &tmpfile);
if (error)
return error;
@@ -2981,6 +3165,9 @@ xfs_rename(
struct xfs_trans *tp;
struct xfs_inode *wip = NULL; /* whiteout inode */
struct xfs_inode *inodes[__XFS_SORT_INODES];
+ struct xfs_parent_args *src_ppargs = NULL;
+ struct xfs_parent_args *tgt_ppargs = NULL;
+ struct xfs_parent_args *wip_ppargs = NULL;
int i;
int num_inodes = __XFS_SORT_INODES;
bool new_parent = (src_dp != target_dp);
@@ -3012,9 +3199,26 @@ xfs_rename(
xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
inodes, &num_inodes);
+ error = xfs_parent_start(mp, &src_ppargs);
+ if (error)
+ goto out_release_wip;
+
+ if (wip) {
+ error = xfs_parent_start(mp, &wip_ppargs);
+ if (error)
+ goto out_src_ppargs;
+ }
+
+ if (target_ip) {
+ error = xfs_parent_start(mp, &tgt_ppargs);
+ if (error)
+ goto out_wip_ppargs;
+ }
+
retry:
nospace_error = 0;
- spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
+ spaceres = xfs_rename_space_res(mp, src_name->len, target_ip != NULL,
+ target_name->len, wip != NULL);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
if (error == -ENOSPC) {
nospace_error = error;
@@ -3023,14 +3227,26 @@ retry:
&tp);
}
if (error)
- goto out_release_wip;
+ goto out_tgt_ppargs;
+
+ /*
+ * We don't allow reservationless renaming when parent pointers are
+ * enabled because we can't back out if the xattrs must grow.
+ */
+ if (src_ppargs && nospace_error) {
+ error = nospace_error;
+ xfs_trans_cancel(tp);
+ goto out_tgt_ppargs;
+ }
/*
* Attach the dquots to the inodes
*/
error = xfs_qm_vop_rename_dqattach(inodes);
- if (error)
- goto out_trans_cancel;
+ if (error) {
+ xfs_trans_cancel(tp);
+ goto out_tgt_ppargs;
+ }
/*
* Lock all the participating inodes. Depending upon whether
@@ -3041,18 +3257,16 @@ retry:
xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
/*
- * Join all the inodes to the transaction. From this point on,
- * we can rely on either trans_commit or trans_cancel to unlock
- * them.
+ * Join all the inodes to the transaction.
*/
- xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, src_dp, 0);
if (new_parent)
- xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, target_dp, 0);
+ xfs_trans_ijoin(tp, src_ip, 0);
if (target_ip)
- xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, target_ip, 0);
if (wip)
- xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, wip, 0);
/*
* If we are using project inheritance, we only allow renames
@@ -3066,10 +3280,13 @@ retry:
}
/* RENAME_EXCHANGE is unique from here on. */
- if (flags & RENAME_EXCHANGE)
- return xfs_cross_rename(tp, src_dp, src_name, src_ip,
- target_dp, target_name, target_ip,
- spaceres);
+ if (flags & RENAME_EXCHANGE) {
+ error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
+ src_ppargs, target_dp, target_name, target_ip,
+ tgt_ppargs, spaceres);
+ nospace_error = 0;
+ goto out_unlock;
+ }
/*
* Try to reserve quota to handle an expansion of the target directory.
@@ -3083,6 +3300,7 @@ retry:
if (error == -EDQUOT || error == -ENOSPC) {
if (!retried) {
xfs_trans_cancel(tp);
+ xfs_iunlock_rename(inodes, num_inodes);
xfs_blockgc_free_quota(target_dp, 0);
retried = true;
goto retry;
@@ -3097,6 +3315,15 @@ retry:
}
/*
+ * We don't allow quotaless renaming when parent pointers are enabled
+ * because we can't back out if the xattrs must grow.
+ */
+ if (src_ppargs && nospace_error) {
+ error = nospace_error;
+ goto out_trans_cancel;
+ }
+
+ /*
* Check for expected errors before we dirty the transaction
* so we can return an error without a transaction abort.
*/
@@ -3142,7 +3369,7 @@ retry:
pag = xfs_perag_get(mp,
XFS_INO_TO_AGNO(mp, inodes[i]->i_ino));
- error = xfs_read_agi(pag, tp, &bp);
+ error = xfs_read_agi(pag, tp, 0, &bp);
xfs_perag_put(pag);
if (error)
goto out_trans_cancel;
@@ -3288,6 +3515,28 @@ retry:
if (error)
goto out_trans_cancel;
+ /* Schedule parent pointer updates. */
+ if (wip_ppargs) {
+ error = xfs_parent_addname(tp, wip_ppargs, src_dp, src_name,
+ wip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ if (src_ppargs) {
+ error = xfs_parent_replacename(tp, src_ppargs, src_dp,
+ src_name, target_dp, target_name, src_ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ if (tgt_ppargs) {
+ error = xfs_parent_removename(tp, tgt_ppargs, target_dp,
+ target_name, target_ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
if (new_parent)
@@ -3309,12 +3558,19 @@ retry:
xfs_dir_update_hook(src_dp, wip, 1, src_name);
error = xfs_finish_rename(tp);
- if (wip)
- xfs_irele(wip);
- return error;
+ nospace_error = 0;
+ goto out_unlock;
out_trans_cancel:
xfs_trans_cancel(tp);
+out_unlock:
+ xfs_iunlock_rename(inodes, num_inodes);
+out_tgt_ppargs:
+ xfs_parent_finish(mp, tgt_ppargs);
+out_wip_ppargs:
+ xfs_parent_finish(mp, wip_ppargs);
+out_src_ppargs:
+ xfs_parent_finish(mp, src_ppargs);
out_release_wip:
if (wip)
xfs_irele(wip);
@@ -3814,7 +4070,7 @@ xfs_inode_reload_unlinked_bucket(
/* Grab the first inode in the list */
pag = xfs_perag_get(mp, agno);
- error = xfs_ialloc_read_agi(pag, tp, &agibp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, &agibp);
xfs_perag_put(pag);
if (error)
return error;
@@ -3946,3 +4202,77 @@ xfs_inode_count_blocks(
xfs_bmap_count_leaves(ifp, rblocks);
*dblocks = ip->i_nblocks - *rblocks;
}
+
+static void
+xfs_wait_dax_page(
+ struct inode *inode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+ schedule();
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+}
+
+int
+xfs_break_dax_layouts(
+ struct inode *inode,
+ bool *retry)
+{
+ struct page *page;
+
+ xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL);
+
+ page = dax_layout_busy_page(inode->i_mapping);
+ if (!page)
+ return 0;
+
+ *retry = true;
+ return ___wait_var_event(&page->_refcount,
+ atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
+ 0, 0, xfs_wait_dax_page(inode));
+}
+
+int
+xfs_break_layouts(
+ struct inode *inode,
+ uint *iolock,
+ enum layout_break_reason reason)
+{
+ bool retry;
+ int error;
+
+ xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL);
+
+ do {
+ retry = false;
+ switch (reason) {
+ case BREAK_UNMAP:
+ error = xfs_break_dax_layouts(inode, &retry);
+ if (error || retry)
+ break;
+ fallthrough;
+ case BREAK_WRITE:
+ error = xfs_break_leased_layouts(inode, iolock, &retry);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ error = -EINVAL;
+ }
+ } while (error == 0 && retry);
+
+ return error;
+}
+
+/* Returns the size of fundamental allocation unit for a file, in bytes. */
+unsigned int
+xfs_inode_alloc_unitsize(
+ struct xfs_inode *ip)
+{
+ unsigned int blocks = 1;
+
+ if (XFS_IS_REALTIME_INODE(ip))
+ blocks = ip->i_mount->m_sb.sb_rextsize;
+
+ return XFS_FSB_TO_B(ip->i_mount, blocks);
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ab46ffb3ac19..292b90b5f2ac 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -207,13 +207,13 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
* i_flags helper functions
*/
static inline void
-__xfs_iflags_set(xfs_inode_t *ip, unsigned short flags)
+__xfs_iflags_set(xfs_inode_t *ip, unsigned long flags)
{
ip->i_flags |= flags;
}
static inline void
-xfs_iflags_set(xfs_inode_t *ip, unsigned short flags)
+xfs_iflags_set(xfs_inode_t *ip, unsigned long flags)
{
spin_lock(&ip->i_flags_lock);
__xfs_iflags_set(ip, flags);
@@ -221,7 +221,7 @@ xfs_iflags_set(xfs_inode_t *ip, unsigned short flags)
}
static inline void
-xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags)
+xfs_iflags_clear(xfs_inode_t *ip, unsigned long flags)
{
spin_lock(&ip->i_flags_lock);
ip->i_flags &= ~flags;
@@ -229,13 +229,13 @@ xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags)
}
static inline int
-__xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
+__xfs_iflags_test(xfs_inode_t *ip, unsigned long flags)
{
return (ip->i_flags & flags);
}
static inline int
-xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
+xfs_iflags_test(xfs_inode_t *ip, unsigned long flags)
{
int ret;
spin_lock(&ip->i_flags_lock);
@@ -245,7 +245,7 @@ xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
}
static inline int
-xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
+xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned long flags)
{
int ret;
@@ -258,7 +258,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
}
static inline int
-xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
+xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned long flags)
{
int ret;
@@ -312,6 +312,15 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
}
/*
+ * Decide if this file is a realtime file whose data allocation unit is larger
+ * than a single filesystem block.
+ */
+static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip)
+{
+ return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1;
+}
+
+/*
* Return the buftarg used for data allocations on a given inode.
*/
#define xfs_inode_buftarg(ip) \
@@ -513,7 +522,7 @@ int xfs_create(struct mnt_idmap *idmap,
umode_t mode, dev_t rdev, bool need_xattr,
struct xfs_inode **ipp);
int xfs_create_tmpfile(struct mnt_idmap *idmap,
- struct xfs_inode *dp, umode_t mode,
+ struct xfs_inode *dp, umode_t mode, bool init_xattrs,
struct xfs_inode **ipp);
int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode *ip);
@@ -565,16 +574,10 @@ xfs_itruncate_extents(
return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0);
}
-/* from xfs_file.c */
int xfs_break_dax_layouts(struct inode *inode, bool *retry);
int xfs_break_layouts(struct inode *inode, uint *iolock,
enum layout_break_reason reason);
-/* from xfs_iops.c */
-extern void xfs_setup_inode(struct xfs_inode *ip);
-extern void xfs_setup_iops(struct xfs_inode *ip);
-extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
-
static inline void xfs_update_stable_writes(struct xfs_inode *ip)
{
if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev))
@@ -613,11 +616,20 @@ extern struct kmem_cache *xfs_inode_cache;
bool xfs_inode_needs_inactive(struct xfs_inode *ip);
+int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip);
+int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
+ struct xfs_inode *ip);
+struct xfs_inode *xfs_iunlink_lookup(struct xfs_perag *pag, xfs_agino_t agino);
+
void xfs_end_io(struct work_struct *work);
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2);
+int xfs_droplink(struct xfs_trans *tp, struct xfs_inode *ip);
+void xfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip);
+void xfs_lock_inodes(struct xfs_inode **ips, int inodes, uint lock_mode);
+void xfs_sort_inodes(struct xfs_inode **i_tab, unsigned int num_inodes);
static inline bool
xfs_inode_unlinked_incomplete(
@@ -631,6 +643,7 @@ int xfs_inode_reload_unlinked(struct xfs_inode *ip);
bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork);
void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_filblks_t *dblocks, xfs_filblks_t *rblocks);
+unsigned int xfs_inode_alloc_unitsize(struct xfs_inode *ip);
struct xfs_dir_update_params {
const struct xfs_inode *dp;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d0e2cec6210d..f0117188f302 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -23,11 +23,9 @@
#include "xfs_fsops.h"
#include "xfs_discard.h"
#include "xfs_quota.h"
-#include "xfs_export.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_trans.h"
-#include "xfs_acl.h"
#include "xfs_btree.h"
#include <linux/fsmap.h>
#include "xfs_fsmap.h"
@@ -39,596 +37,13 @@
#include "xfs_ioctl.h"
#include "xfs_xattr.h"
#include "xfs_rtbitmap.h"
+#include "xfs_file.h"
+#include "xfs_exchrange.h"
+#include "xfs_handle.h"
#include <linux/mount.h>
-#include <linux/namei.h>
#include <linux/fileattr.h>
-/*
- * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
- * a file or fs handle.
- *
- * XFS_IOC_PATH_TO_FSHANDLE
- * returns fs handle for a mount point or path within that mount point
- * XFS_IOC_FD_TO_HANDLE
- * returns full handle for a FD opened in user space
- * XFS_IOC_PATH_TO_HANDLE
- * returns full handle for a path
- */
-int
-xfs_find_handle(
- unsigned int cmd,
- xfs_fsop_handlereq_t *hreq)
-{
- int hsize;
- xfs_handle_t handle;
- struct inode *inode;
- struct fd f = {NULL};
- struct path path;
- int error;
- struct xfs_inode *ip;
-
- if (cmd == XFS_IOC_FD_TO_HANDLE) {
- f = fdget(hreq->fd);
- if (!f.file)
- return -EBADF;
- inode = file_inode(f.file);
- } else {
- error = user_path_at(AT_FDCWD, hreq->path, 0, &path);
- if (error)
- return error;
- inode = d_inode(path.dentry);
- }
- ip = XFS_I(inode);
-
- /*
- * We can only generate handles for inodes residing on a XFS filesystem,
- * and only for regular files, directories or symbolic links.
- */
- error = -EINVAL;
- if (inode->i_sb->s_magic != XFS_SB_MAGIC)
- goto out_put;
-
- error = -EBADF;
- if (!S_ISREG(inode->i_mode) &&
- !S_ISDIR(inode->i_mode) &&
- !S_ISLNK(inode->i_mode))
- goto out_put;
-
-
- memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
-
- if (cmd == XFS_IOC_PATH_TO_FSHANDLE) {
- /*
- * This handle only contains an fsid, zero the rest.
- */
- memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
- hsize = sizeof(xfs_fsid_t);
- } else {
- handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
- sizeof(handle.ha_fid.fid_len);
- handle.ha_fid.fid_pad = 0;
- handle.ha_fid.fid_gen = inode->i_generation;
- handle.ha_fid.fid_ino = ip->i_ino;
- hsize = sizeof(xfs_handle_t);
- }
-
- error = -EFAULT;
- if (copy_to_user(hreq->ohandle, &handle, hsize) ||
- copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
- goto out_put;
-
- error = 0;
-
- out_put:
- if (cmd == XFS_IOC_FD_TO_HANDLE)
- fdput(f);
- else
- path_put(&path);
- return error;
-}
-
-/*
- * No need to do permission checks on the various pathname components
- * as the handle operations are privileged.
- */
-STATIC int
-xfs_handle_acceptable(
- void *context,
- struct dentry *dentry)
-{
- return 1;
-}
-
-/*
- * Convert userspace handle data into a dentry.
- */
-struct dentry *
-xfs_handle_to_dentry(
- struct file *parfilp,
- void __user *uhandle,
- u32 hlen)
-{
- xfs_handle_t handle;
- struct xfs_fid64 fid;
-
- /*
- * Only allow handle opens under a directory.
- */
- if (!S_ISDIR(file_inode(parfilp)->i_mode))
- return ERR_PTR(-ENOTDIR);
-
- if (hlen != sizeof(xfs_handle_t))
- return ERR_PTR(-EINVAL);
- if (copy_from_user(&handle, uhandle, hlen))
- return ERR_PTR(-EFAULT);
- if (handle.ha_fid.fid_len !=
- sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len))
- return ERR_PTR(-EINVAL);
-
- memset(&fid, 0, sizeof(struct fid));
- fid.ino = handle.ha_fid.fid_ino;
- fid.gen = handle.ha_fid.fid_gen;
-
- return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3,
- FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG,
- xfs_handle_acceptable, NULL);
-}
-
-STATIC struct dentry *
-xfs_handlereq_to_dentry(
- struct file *parfilp,
- xfs_fsop_handlereq_t *hreq)
-{
- return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen);
-}
-
-int
-xfs_open_by_handle(
- struct file *parfilp,
- xfs_fsop_handlereq_t *hreq)
-{
- const struct cred *cred = current_cred();
- int error;
- int fd;
- int permflag;
- struct file *filp;
- struct inode *inode;
- struct dentry *dentry;
- fmode_t fmode;
- struct path path;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- dentry = xfs_handlereq_to_dentry(parfilp, hreq);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- inode = d_inode(dentry);
-
- /* Restrict xfs_open_by_handle to directories & regular files. */
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
- error = -EPERM;
- goto out_dput;
- }
-
-#if BITS_PER_LONG != 32
- hreq->oflags |= O_LARGEFILE;
-#endif
-
- permflag = hreq->oflags;
- fmode = OPEN_FMODE(permflag);
- if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
- (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
- error = -EPERM;
- goto out_dput;
- }
-
- if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
- error = -EPERM;
- goto out_dput;
- }
-
- /* Can't write directories. */
- if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
- error = -EISDIR;
- goto out_dput;
- }
-
- fd = get_unused_fd_flags(0);
- if (fd < 0) {
- error = fd;
- goto out_dput;
- }
-
- path.mnt = parfilp->f_path.mnt;
- path.dentry = dentry;
- filp = dentry_open(&path, hreq->oflags, cred);
- dput(dentry);
- if (IS_ERR(filp)) {
- put_unused_fd(fd);
- return PTR_ERR(filp);
- }
-
- if (S_ISREG(inode->i_mode)) {
- filp->f_flags |= O_NOATIME;
- filp->f_mode |= FMODE_NOCMTIME;
- }
-
- fd_install(fd, filp);
- return fd;
-
- out_dput:
- dput(dentry);
- return error;
-}
-
-int
-xfs_readlink_by_handle(
- struct file *parfilp,
- xfs_fsop_handlereq_t *hreq)
-{
- struct dentry *dentry;
- __u32 olen;
- int error;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- dentry = xfs_handlereq_to_dentry(parfilp, hreq);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
-
- /* Restrict this handle operation to symlinks only. */
- if (!d_is_symlink(dentry)) {
- error = -EINVAL;
- goto out_dput;
- }
-
- if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
- error = -EFAULT;
- goto out_dput;
- }
-
- error = vfs_readlink(dentry, hreq->ohandle, olen);
-
- out_dput:
- dput(dentry);
- return error;
-}
-
-/*
- * Format an attribute and copy it out to the user's buffer.
- * Take care to check values and protect against them changing later,
- * we may be reading them directly out of a user buffer.
- */
-static void
-xfs_ioc_attr_put_listent(
- struct xfs_attr_list_context *context,
- int flags,
- unsigned char *name,
- int namelen,
- int valuelen)
-{
- struct xfs_attrlist *alist = context->buffer;
- struct xfs_attrlist_ent *aep;
- int arraytop;
-
- ASSERT(!context->seen_enough);
- ASSERT(context->count >= 0);
- ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
- ASSERT(context->firstu >= sizeof(*alist));
- ASSERT(context->firstu <= context->bufsize);
-
- /*
- * Only list entries in the right namespace.
- */
- if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK))
- return;
-
- arraytop = sizeof(*alist) +
- context->count * sizeof(alist->al_offset[0]);
-
- /* decrement by the actual bytes used by the attr */
- context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) +
- namelen + 1, sizeof(uint32_t));
- if (context->firstu < arraytop) {
- trace_xfs_attr_list_full(context);
- alist->al_more = 1;
- context->seen_enough = 1;
- return;
- }
-
- aep = context->buffer + context->firstu;
- aep->a_valuelen = valuelen;
- memcpy(aep->a_name, name, namelen);
- aep->a_name[namelen] = 0;
- alist->al_offset[context->count++] = context->firstu;
- alist->al_count = context->count;
- trace_xfs_attr_list_add(context);
-}
-
-static unsigned int
-xfs_attr_filter(
- u32 ioc_flags)
-{
- if (ioc_flags & XFS_IOC_ATTR_ROOT)
- return XFS_ATTR_ROOT;
- if (ioc_flags & XFS_IOC_ATTR_SECURE)
- return XFS_ATTR_SECURE;
- return 0;
-}
-
-static unsigned int
-xfs_attr_flags(
- u32 ioc_flags)
-{
- if (ioc_flags & XFS_IOC_ATTR_CREATE)
- return XATTR_CREATE;
- if (ioc_flags & XFS_IOC_ATTR_REPLACE)
- return XATTR_REPLACE;
- return 0;
-}
-
-int
-xfs_ioc_attr_list(
- struct xfs_inode *dp,
- void __user *ubuf,
- size_t bufsize,
- int flags,
- struct xfs_attrlist_cursor __user *ucursor)
-{
- struct xfs_attr_list_context context = { };
- struct xfs_attrlist *alist;
- void *buffer;
- int error;
-
- if (bufsize < sizeof(struct xfs_attrlist) ||
- bufsize > XFS_XATTR_LIST_MAX)
- return -EINVAL;
-
- /*
- * Reject flags, only allow namespaces.
- */
- if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
- return -EINVAL;
- if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
- return -EINVAL;
-
- /*
- * Validate the cursor.
- */
- if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor)))
- return -EFAULT;
- if (context.cursor.pad1 || context.cursor.pad2)
- return -EINVAL;
- if (!context.cursor.initted &&
- (context.cursor.hashval || context.cursor.blkno ||
- context.cursor.offset))
- return -EINVAL;
-
- buffer = kvzalloc(bufsize, GFP_KERNEL);
- if (!buffer)
- return -ENOMEM;
-
- /*
- * Initialize the output buffer.
- */
- context.dp = dp;
- context.resynch = 1;
- context.attr_filter = xfs_attr_filter(flags);
- context.buffer = buffer;
- context.bufsize = round_down(bufsize, sizeof(uint32_t));
- context.firstu = context.bufsize;
- context.put_listent = xfs_ioc_attr_put_listent;
-
- alist = context.buffer;
- alist->al_count = 0;
- alist->al_more = 0;
- alist->al_offset[0] = context.bufsize;
-
- error = xfs_attr_list(&context);
- if (error)
- goto out_free;
-
- if (copy_to_user(ubuf, buffer, bufsize) ||
- copy_to_user(ucursor, &context.cursor, sizeof(context.cursor)))
- error = -EFAULT;
-out_free:
- kvfree(buffer);
- return error;
-}
-
-STATIC int
-xfs_attrlist_by_handle(
- struct file *parfilp,
- struct xfs_fsop_attrlist_handlereq __user *p)
-{
- struct xfs_fsop_attrlist_handlereq al_hreq;
- struct dentry *dentry;
- int error = -ENOMEM;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (copy_from_user(&al_hreq, p, sizeof(al_hreq)))
- return -EFAULT;
-
- dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
-
- error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer,
- al_hreq.buflen, al_hreq.flags, &p->pos);
- dput(dentry);
- return error;
-}
-
-static int
-xfs_attrmulti_attr_get(
- struct inode *inode,
- unsigned char *name,
- unsigned char __user *ubuf,
- uint32_t *len,
- uint32_t flags)
-{
- struct xfs_da_args args = {
- .dp = XFS_I(inode),
- .attr_filter = xfs_attr_filter(flags),
- .attr_flags = xfs_attr_flags(flags),
- .name = name,
- .namelen = strlen(name),
- .valuelen = *len,
- };
- int error;
-
- if (*len > XFS_XATTR_SIZE_MAX)
- return -EINVAL;
-
- error = xfs_attr_get(&args);
- if (error)
- goto out_kfree;
-
- *len = args.valuelen;
- if (copy_to_user(ubuf, args.value, args.valuelen))
- error = -EFAULT;
-
-out_kfree:
- kvfree(args.value);
- return error;
-}
-
-static int
-xfs_attrmulti_attr_set(
- struct inode *inode,
- unsigned char *name,
- const unsigned char __user *ubuf,
- uint32_t len,
- uint32_t flags)
-{
- struct xfs_da_args args = {
- .dp = XFS_I(inode),
- .attr_filter = xfs_attr_filter(flags),
- .attr_flags = xfs_attr_flags(flags),
- .name = name,
- .namelen = strlen(name),
- };
- int error;
-
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- return -EPERM;
-
- if (ubuf) {
- if (len > XFS_XATTR_SIZE_MAX)
- return -EINVAL;
- args.value = memdup_user(ubuf, len);
- if (IS_ERR(args.value))
- return PTR_ERR(args.value);
- args.valuelen = len;
- }
-
- error = xfs_attr_change(&args);
- if (!error && (flags & XFS_IOC_ATTR_ROOT))
- xfs_forget_acl(inode, name);
- kfree(args.value);
- return error;
-}
-
-int
-xfs_ioc_attrmulti_one(
- struct file *parfilp,
- struct inode *inode,
- uint32_t opcode,
- void __user *uname,
- void __user *value,
- uint32_t *len,
- uint32_t flags)
-{
- unsigned char *name;
- int error;
-
- if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE))
- return -EINVAL;
-
- name = strndup_user(uname, MAXNAMELEN);
- if (IS_ERR(name))
- return PTR_ERR(name);
-
- switch (opcode) {
- case ATTR_OP_GET:
- error = xfs_attrmulti_attr_get(inode, name, value, len, flags);
- break;
- case ATTR_OP_REMOVE:
- value = NULL;
- *len = 0;
- fallthrough;
- case ATTR_OP_SET:
- error = mnt_want_write_file(parfilp);
- if (error)
- break;
- error = xfs_attrmulti_attr_set(inode, name, value, *len, flags);
- mnt_drop_write_file(parfilp);
- break;
- default:
- error = -EINVAL;
- break;
- }
-
- kfree(name);
- return error;
-}
-
-STATIC int
-xfs_attrmulti_by_handle(
- struct file *parfilp,
- void __user *arg)
-{
- int error;
- xfs_attr_multiop_t *ops;
- xfs_fsop_attrmulti_handlereq_t am_hreq;
- struct dentry *dentry;
- unsigned int i, size;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
- return -EFAULT;
-
- /* overflow check */
- if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
- return -E2BIG;
-
- dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
-
- error = -E2BIG;
- size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
- if (!size || size > 16 * PAGE_SIZE)
- goto out_dput;
-
- ops = memdup_user(am_hreq.ops, size);
- if (IS_ERR(ops)) {
- error = PTR_ERR(ops);
- goto out_dput;
- }
-
- error = 0;
- for (i = 0; i < am_hreq.opcount; i++) {
- ops[i].am_error = xfs_ioc_attrmulti_one(parfilp,
- d_inode(dentry), ops[i].am_opcode,
- ops[i].am_attrname, ops[i].am_attrvalue,
- &ops[i].am_length, ops[i].am_flags);
- }
-
- if (copy_to_user(am_hreq.ops, ops, size))
- error = -EFAULT;
-
- kfree(ops);
- out_dput:
- dput(dentry);
- return error;
-}
-
/* Return 0 on success or positive error */
int
xfs_fsbulkstat_one_fmt(
@@ -1640,30 +1055,6 @@ out_free:
return error;
}
-STATIC int
-xfs_ioc_scrub_metadata(
- struct file *file,
- void __user *arg)
-{
- struct xfs_scrub_metadata scrub;
- int error;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(&scrub, arg, sizeof(scrub)))
- return -EFAULT;
-
- error = xfs_scrub_metadata(file, &scrub);
- if (error)
- return error;
-
- if (copy_to_user(arg, &scrub, sizeof(scrub)))
- return -EFAULT;
-
- return 0;
-}
-
int
xfs_ioc_swapext(
xfs_swapext_t *sxp)
@@ -2010,7 +1401,10 @@ xfs_file_ioctl(
case XFS_IOC_FSGETXATTRA:
return xfs_ioc_fsgetxattra(ip, arg);
-
+ case XFS_IOC_GETPARENTS:
+ return xfs_ioc_getparents(filp, arg);
+ case XFS_IOC_GETPARENTS_BY_HANDLE:
+ return xfs_ioc_getparents_by_handle(filp, arg);
case XFS_IOC_GETBMAP:
case XFS_IOC_GETBMAPA:
case XFS_IOC_GETBMAPX:
@@ -2019,6 +1413,8 @@ xfs_file_ioctl(
case FS_IOC_GETFSMAP:
return xfs_ioc_getfsmap(ip, arg);
+ case XFS_IOC_SCRUBV_METADATA:
+ return xfs_ioc_scrubv_metadata(filp, arg);
case XFS_IOC_SCRUB_METADATA:
return xfs_ioc_scrub_metadata(filp, arg);
@@ -2169,6 +1565,9 @@ xfs_file_ioctl(
return error;
}
+ case XFS_IOC_EXCHANGE_RANGE:
+ return xfs_ioc_exchange_range(filp, arg);
+
default:
return -ENOTTY;
}
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 38be600b5e1e..12124946f347 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -15,34 +15,6 @@ xfs_ioc_swapext(
xfs_swapext_t *sxp);
extern int
-xfs_find_handle(
- unsigned int cmd,
- xfs_fsop_handlereq_t *hreq);
-
-extern int
-xfs_open_by_handle(
- struct file *parfilp,
- xfs_fsop_handlereq_t *hreq);
-
-extern int
-xfs_readlink_by_handle(
- struct file *parfilp,
- xfs_fsop_handlereq_t *hreq);
-
-int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode,
- uint32_t opcode, void __user *uname, void __user *value,
- uint32_t *len, uint32_t flags);
-int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf,
- size_t bufsize, int flags,
- struct xfs_attrlist_cursor __user *ucursor);
-
-extern struct dentry *
-xfs_handle_to_dentry(
- struct file *parfilp,
- void __user *uhandle,
- u32 hlen);
-
-extern int
xfs_fileattr_get(
struct dentry *dentry,
struct fileattr *fa);
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index ee35eea1ecce..b64785dc4354 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -24,6 +24,7 @@
#include "xfs_ioctl32.h"
#include "xfs_trace.h"
#include "xfs_sb.h"
+#include "xfs_handle.h"
#define _NATIVE_IOC(cmd, type) \
_IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 4087af7f3c9f..378342673925 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -28,6 +28,7 @@
#include "xfs_dquot.h"
#include "xfs_reflink.h"
#include "xfs_health.h"
+#include "xfs_rtbitmap.h"
#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -298,9 +299,7 @@ xfs_iomap_write_direct(
if (error)
return error;
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip, nr_exts);
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, nr_exts);
if (error)
goto out_trans_cancel;
@@ -321,14 +320,6 @@ xfs_iomap_write_direct(
if (error)
goto out_unlock;
- /*
- * Copy any maps to caller's array and return any error.
- */
- if (nimaps == 0) {
- error = -ENOSPC;
- goto out_unlock;
- }
-
if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) {
xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = xfs_alert_fsblock_zero(ip, imap);
@@ -404,6 +395,29 @@ xfs_quota_calc_throttle(
}
}
+static int64_t
+xfs_iomap_freesp(
+ struct percpu_counter *counter,
+ uint64_t low_space[XFS_LOWSP_MAX],
+ int *shift)
+{
+ int64_t freesp;
+
+ freesp = percpu_counter_read_positive(counter);
+ if (freesp < low_space[XFS_LOWSP_5_PCNT]) {
+ *shift = 2;
+ if (freesp < low_space[XFS_LOWSP_4_PCNT])
+ (*shift)++;
+ if (freesp < low_space[XFS_LOWSP_3_PCNT])
+ (*shift)++;
+ if (freesp < low_space[XFS_LOWSP_2_PCNT])
+ (*shift)++;
+ if (freesp < low_space[XFS_LOWSP_1_PCNT])
+ (*shift)++;
+ }
+ return freesp;
+}
+
/*
* If we don't have a user specified preallocation size, dynamically increase
* the preallocation size as the size of the file grows. Cap the maximum size
@@ -486,18 +500,13 @@ xfs_iomap_prealloc_size(
alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN),
alloc_blocks);
- freesp = percpu_counter_read_positive(&mp->m_fdblocks);
- if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
- shift = 2;
- if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
- shift++;
- if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
- shift++;
- if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
- shift++;
- if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
- shift++;
- }
+ if (unlikely(XFS_IS_REALTIME_INODE(ip)))
+ freesp = xfs_rtx_to_rtb(mp,
+ xfs_iomap_freesp(&mp->m_frextents,
+ mp->m_low_rtexts, &shift));
+ else
+ freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space,
+ &shift);
/*
* Check each quota to cap the prealloc size, provide a shift value to
@@ -606,11 +615,8 @@ xfs_iomap_write_unwritten(
if (error)
return error;
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_WRITE_UNWRITTEN_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip,
- XFS_IEXT_WRITE_UNWRITTEN_CNT);
if (error)
goto error_on_bmapi_transaction;
@@ -982,8 +988,6 @@ xfs_buffered_write_iomap_begin(
return xfs_direct_write_iomap_begin(inode, offset, count,
flags, iomap, srcmap);
- ASSERT(!XFS_IS_REALTIME_INODE(ip));
-
error = xfs_qm_dqattach(ip);
if (error)
return error;
@@ -1023,6 +1027,24 @@ xfs_buffered_write_iomap_begin(
}
/*
+ * For zeroing, trim a delalloc extent that extends beyond the EOF
+ * block. If it starts beyond the EOF block, convert it to an
+ * unwritten extent.
+ */
+ if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb &&
+ isnullstartblock(imap.br_startblock)) {
+ xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+
+ if (offset_fsb >= eof_fsb)
+ goto convert_delay;
+ if (end_fsb > eof_fsb) {
+ end_fsb = eof_fsb;
+ xfs_trim_extent(&imap, offset_fsb,
+ end_fsb - offset_fsb);
+ }
+ }
+
+ /*
* Search the COW fork extent list even if we did not find a data fork
* extent. This serves two purposes: first this implements the
* speculative preallocation using cowextsize, so that we also unshare
@@ -1158,15 +1180,26 @@ retry:
* them out if the write happens to fail.
*/
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
found_imap:
seq = xfs_iomap_inode_sequence(ip, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
+convert_delay:
+ xfs_iunlock(ip, lockmode);
+ truncate_pagecache(inode, offset);
+ error = xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK, offset,
+ iomap, NULL);
+ if (error)
+ return error;
+
+ trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &imap);
+ return 0;
+
found_cow:
seq = xfs_iomap_inode_sequence(ip, 0);
if (imap.br_startoff <= offset_fsb) {
@@ -1174,17 +1207,17 @@ found_cow:
if (error)
goto out_unlock;
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
IOMAP_F_SHARED, seq);
}
xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);
out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return error;
}
@@ -1194,8 +1227,8 @@ xfs_buffered_write_delalloc_punch(
loff_t offset,
loff_t length)
{
- return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset,
- offset + length);
+ xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, offset + length);
+ return 0;
}
static int
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 66f8c47642e8..ff222827e550 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -25,6 +25,7 @@
#include "xfs_error.h"
#include "xfs_ioctl.h"
#include "xfs_xattr.h"
+#include "xfs_file.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
@@ -62,7 +63,7 @@ xfs_initxattrs(
.value = xattr->value,
.valuelen = xattr->value_len,
};
- error = xfs_attr_change(&args);
+ error = xfs_attr_change(&args, XFS_ATTRUPDATE_UPSERT);
if (error < 0)
break;
}
@@ -156,6 +157,8 @@ xfs_create_need_xattr(
if (dir->i_sb->s_security)
return true;
#endif
+ if (xfs_has_parent(XFS_I(dir)->i_mount))
+ return true;
return false;
}
@@ -200,7 +203,18 @@ xfs_generic_create(
xfs_create_need_xattr(dir, default_acl, acl),
&ip);
} else {
- error = xfs_create_tmpfile(idmap, XFS_I(dir), mode, &ip);
+ bool init_xattrs = false;
+
+ /*
+ * If this temporary file will be linkable, set up the file
+ * with an attr fork to receive a parent pointer.
+ */
+ if (!(tmpfile->f_flags & O_EXCL) &&
+ xfs_has_parent(XFS_I(dir)->i_mount))
+ init_xattrs = true;
+
+ error = xfs_create_tmpfile(idmap, XFS_I(dir), mode,
+ init_xattrs, &ip);
}
if (unlikely(error))
goto out_free_acl;
@@ -364,6 +378,9 @@ xfs_vn_link(
if (unlikely(error))
return error;
+ if (IS_PRIVATE(inode))
+ return -EPERM;
+
error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
if (unlikely(error))
return error;
@@ -521,7 +538,7 @@ xfs_stat_blksize(
* always return the realtime extent size.
*/
if (XFS_IS_REALTIME_INODE(ip))
- return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip));
+ return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip) ? : 1);
/*
* Allow large block sizes to be reported to userspace programs if the
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 7f84a0843b24..3c1a2605ffd2 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -8,9 +8,6 @@
struct xfs_inode;
-extern const struct file_operations xfs_file_operations;
-extern const struct file_operations xfs_dir_file_operations;
-
extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
int xfs_vn_setattr_size(struct mnt_idmap *idmap,
@@ -19,4 +16,8 @@ int xfs_vn_setattr_size(struct mnt_idmap *idmap,
int xfs_inode_init_security(struct inode *inode, struct inode *dir,
const struct qstr *qstr);
+extern void xfs_setup_inode(struct xfs_inode *ip);
+extern void xfs_setup_iops(struct xfs_inode *ip);
+extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
+
#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 95fc31b9f87d..c0757ab99495 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -97,6 +97,14 @@ xfs_bulkstat_one_int(
vfsuid = i_uid_into_vfsuid(idmap, inode);
vfsgid = i_gid_into_vfsgid(idmap, inode);
+ /* If this is a private inode, don't leak its details to userspace. */
+ if (IS_PRIVATE(inode)) {
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ xfs_irele(ip);
+ error = -EINVAL;
+ goto out_advance;
+ }
+
/* xfs_iget returns the following without needing
* further change.
*/
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 01b55f03a102..730c8d48da28 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -268,7 +268,7 @@ xfs_iwalk_ag_start(
/* Set up a fresh cursor and empty the inobt cache. */
iwag->nr_recs = 0;
- error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp);
if (error)
return error;
*curpp = xfs_inobt_init_cursor(pag, tp, *agi_bpp);
@@ -386,7 +386,7 @@ xfs_iwalk_run_callbacks(
}
/* ...and recreate the cursor just past where we left off. */
- error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, agi_bpp);
+ error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, 0, agi_bpp);
if (error)
return error;
*curpp = xfs_inobt_init_cursor(iwag->pag, iwag->tp, *agi_bpp);
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 8f07c9f6157f..ac355328121a 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -198,6 +198,11 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
return x;
}
+static inline bool isaligned_64(uint64_t x, uint32_t y)
+{
+ return do_div(x, y) == 0;
+}
+
/* If @b is a power of 2, return log2(b). Else return -1. */
static inline int8_t log2_if_power2(unsigned long b)
{
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 5004f23d344e..416c15494983 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1448,7 +1448,7 @@ xfs_log_work_queue(
* Clear the log incompat flags if we have the opportunity.
*
* This only happens if we're about to log the second dummy transaction as part
- * of covering the log and we can get the log incompat feature usage lock.
+ * of covering the log.
*/
static inline void
xlog_clear_incompat(
@@ -1463,11 +1463,7 @@ xlog_clear_incompat(
if (log->l_covered_state != XLOG_STATE_COVER_DONE2)
return;
- if (!down_write_trylock(&log->l_incompat_users))
- return;
-
xfs_clear_incompat_log_features(mp);
- up_write(&log->l_incompat_users);
}
/*
@@ -1585,8 +1581,6 @@ xlog_alloc_log(
}
log->l_sectBBsize = 1 << log2_size;
- init_rwsem(&log->l_incompat_users);
-
xlog_get_iclog_buffer_size(mp, log);
spin_lock_init(&log->l_icloglock);
@@ -3871,23 +3865,3 @@ xfs_log_check_lsn(
return valid;
}
-
-/*
- * Notify the log that we're about to start using a feature that is protected
- * by a log incompat feature flag. This will prevent log covering from
- * clearing those flags.
- */
-void
-xlog_use_incompat_feat(
- struct xlog *log)
-{
- down_read(&log->l_incompat_users);
-}
-
-/* Notify the log that we've finished using log incompat features. */
-void
-xlog_drop_incompat_feat(
- struct xlog *log)
-{
- up_read(&log->l_incompat_users);
-}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 2728886c2963..d69acf881153 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -159,8 +159,6 @@ bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags);
-void xlog_use_incompat_feat(struct xlog *log);
-void xlog_drop_incompat_feat(struct xlog *log);
int xfs_attr_use_log_assist(struct xfs_mount *mp);
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 73f5b7f628f4..f51cbc6405c1 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -1378,7 +1378,7 @@ out_abort_free_ticket:
*/
static void
xlog_cil_push_background(
- struct xlog *log) __releases(cil->xc_ctx_lock)
+ struct xlog *log)
{
struct xfs_cil *cil = log->l_cilp;
int space_used = atomic_read(&cil->xc_ctx->space_used);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index e30c06ec20e3..40e22ec0fbe6 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -450,9 +450,6 @@ struct xlog {
xfs_lsn_t l_recovery_lsn;
uint32_t l_iclog_roundoff;/* padding roundoff */
-
- /* Users of log incompat features should take a read lock. */
- struct rw_semaphore l_incompat_users;
};
/*
@@ -623,7 +620,8 @@ xlog_wait(
remove_wait_queue(wq, &wait);
}
-int xlog_wait_on_iclog(struct xlog_in_core *iclog);
+int xlog_wait_on_iclog(struct xlog_in_core *iclog)
+ __releases(iclog->ic_log->l_icloglock);
/*
* The LSN is valid so long as it is behind the current LSN. If it isn't, this
@@ -683,7 +681,7 @@ xlog_valid_lsn(
* flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc()
* will do direct reclaim and compaction in the slow path, both of which are
* horrendously expensive. We just want kmalloc to fail fast and fall back to
- * vmalloc if it can't get somethign straight away from the free lists or
+ * vmalloc if it can't get something straight away from the free lists or
* buddy allocator. Hence we have to open code kvmalloc outselves here.
*
* This assumes that the caller uses memalloc_nofs_save task context here, so
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 13f1d2e91540..4fe627991e86 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1767,6 +1767,37 @@ xlog_recover_iget(
return 0;
}
+/*
+ * Get an inode so that we can recover a log operation.
+ *
+ * Log intent items that target inodes effectively contain a file handle.
+ * Check that the generation number matches the intent item like we do for
+ * other file handles. Log intent items defined after this validation weakness
+ * was identified must use this function.
+ */
+int
+xlog_recover_iget_handle(
+ struct xfs_mount *mp,
+ xfs_ino_t ino,
+ uint32_t gen,
+ struct xfs_inode **ipp)
+{
+ struct xfs_inode *ip;
+ int error;
+
+ error = xlog_recover_iget(mp, ino, &ip);
+ if (error)
+ return error;
+
+ if (VFS_I(ip)->i_generation != gen) {
+ xfs_irele(ip);
+ return -EFSCORRUPTED;
+ }
+
+ *ipp = ip;
+ return 0;
+}
+
/******************************************************************************
*
* Log recover routines
@@ -1789,6 +1820,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
&xlog_bud_item_ops,
&xlog_attri_item_ops,
&xlog_attrd_item_ops,
+ &xlog_xmi_item_ops,
+ &xlog_xmd_item_ops,
};
static const struct xlog_recover_item_ops *
@@ -2656,7 +2689,7 @@ xlog_recover_clear_agi_bucket(
if (error)
goto out_error;
- error = xfs_read_agi(pag, tp, &agibp);
+ error = xfs_read_agi(pag, tp, 0, &agibp);
if (error)
goto out_abort;
@@ -2772,7 +2805,7 @@ xlog_recover_iunlink_ag(
int bucket;
int error;
- error = xfs_read_agi(pag, NULL, &agibp);
+ error = xfs_read_agi(pag, NULL, 0, &agibp);
if (error) {
/*
* AGI is b0rked. Don't process it.
@@ -2966,7 +2999,7 @@ xlog_do_recovery_pass(
int error = 0, h_size, h_len;
int error2 = 0;
int bblks, split_bblks;
- int hblks, split_hblks, wrapped_hblks;
+ int hblks = 1, split_hblks, wrapped_hblks;
int i;
struct hlist_head rhash[XLOG_RHASH_SIZE];
LIST_HEAD (buffer_list);
@@ -2977,6 +3010,10 @@ xlog_do_recovery_pass(
for (i = 0; i < XLOG_RHASH_SIZE; i++)
INIT_HLIST_HEAD(&rhash[i]);
+ hbp = xlog_alloc_buffer(log, hblks);
+ if (!hbp)
+ return -ENOMEM;
+
/*
* Read the header of the tail block and get the iclog buffer size from
* h_size. Use this to tell how many sectors make up the log header.
@@ -2987,10 +3024,6 @@ xlog_do_recovery_pass(
* iclog header and extract the header size from it. Get a
* new hbp that is the correct size.
*/
- hbp = xlog_alloc_buffer(log, 1);
- if (!hbp)
- return -ENOMEM;
-
error = xlog_bread(log, tail_blk, 1, hbp, &offset);
if (error)
goto bread_err1;
@@ -3022,20 +3055,27 @@ xlog_do_recovery_pass(
if (error)
goto bread_err1;
- hblks = xlog_logrec_hblks(log, rhead);
- if (hblks != 1) {
- kvfree(hbp);
- hbp = xlog_alloc_buffer(log, hblks);
+ /*
+ * This open codes xlog_logrec_hblks so that we can reuse the
+ * fixed up h_size value calculated above. Without that we'd
+ * still allocate the buffer based on the incorrect on-disk
+ * size.
+ */
+ if (h_size > XLOG_HEADER_CYCLE_SIZE &&
+ (rhead->h_version & cpu_to_be32(XLOG_VERSION_2))) {
+ hblks = DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
+ if (hblks > 1) {
+ kvfree(hbp);
+ hbp = xlog_alloc_buffer(log, hblks);
+ if (!hbp)
+ return -ENOMEM;
+ }
}
} else {
ASSERT(log->l_sectBBsize == 1);
- hblks = 1;
- hbp = xlog_alloc_buffer(log, 1);
h_size = XLOG_BIG_RECORD_BSIZE;
}
- if (!hbp)
- return -ENOMEM;
dbp = xlog_alloc_buffer(log, BTOBB(h_size));
if (!dbp) {
kvfree(hbp);
@@ -3496,21 +3536,6 @@ xlog_recover_finish(
*/
xfs_log_force(log->l_mp, XFS_LOG_SYNC);
- /*
- * Now that we've recovered the log and all the intents, we can clear
- * the log incompat feature bits in the superblock because there's no
- * longer anything to protect. We rely on the AIL push to write out the
- * updated superblock after everything else.
- */
- if (xfs_clear_incompat_log_features(log->l_mp)) {
- error = xfs_sync_sb(log->l_mp, false);
- if (error < 0) {
- xfs_alert(log->l_mp,
- "Failed to clear log incompat features on recovery");
- goto out_error;
- }
- }
-
xlog_recover_process_iunlinks(log);
/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index df370eb5dc15..09eef1721ef4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -34,6 +34,7 @@
#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
#include "scrub/stats.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -230,6 +231,13 @@ reread:
mp->m_features |= xfs_sb_version_to_features(sbp);
xfs_reinit_percpu_counters(mp);
+ /*
+ * If logged xattrs are enabled after log recovery finishes, then set
+ * the opstate so that log recovery will work properly.
+ */
+ if (xfs_sb_version_haslogxattrs(&mp->m_sb))
+ xfs_set_using_logged_xattrs(mp);
+
/* no need to be quiet anymore, so reset the buf ops */
bp->b_ops = &xfs_sb_buf_ops;
@@ -828,6 +836,15 @@ xfs_mountfs(
goto out_inodegc_shrinker;
}
+ /*
+ * If logged xattrs are still enabled after log recovery finishes, then
+ * they'll be available until unmount. Otherwise, turn them off.
+ */
+ if (xfs_sb_version_haslogxattrs(&mp->m_sb))
+ xfs_set_using_logged_xattrs(mp);
+ else
+ xfs_clear_using_logged_xattrs(mp);
+
/* Enable background inode inactivation workers. */
xfs_inodegc_start(mp);
xfs_blockgc_start(mp);
@@ -1095,6 +1112,11 @@ xfs_unmountfs(
"Freespace may not be correct on next mount.");
xfs_unmount_check(mp);
+ /*
+ * Indicate that it's ok to clear log incompat bits before cleaning
+ * the log and writing the unmount record.
+ */
+ xfs_set_done_with_log_incompat(mp);
xfs_log_unmount(mp);
xfs_da_unmount(mp);
xfs_uuid_unmount(mp);
@@ -1131,16 +1153,44 @@ xfs_fs_writable(
return true;
}
-/* Adjust m_fdblocks or m_frextents. */
+void
+xfs_add_freecounter(
+ struct xfs_mount *mp,
+ struct percpu_counter *counter,
+ uint64_t delta)
+{
+ bool has_resv_pool = (counter == &mp->m_fdblocks);
+ uint64_t res_used;
+
+ /*
+ * If the reserve pool is depleted, put blocks back into it first.
+ * Most of the time the pool is full.
+ */
+ if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) {
+ percpu_counter_add(counter, delta);
+ return;
+ }
+
+ spin_lock(&mp->m_sb_lock);
+ res_used = mp->m_resblks - mp->m_resblks_avail;
+ if (res_used > delta) {
+ mp->m_resblks_avail += delta;
+ } else {
+ delta -= res_used;
+ mp->m_resblks_avail = mp->m_resblks;
+ percpu_counter_add(counter, delta);
+ }
+ spin_unlock(&mp->m_sb_lock);
+}
+
int
-xfs_mod_freecounter(
+xfs_dec_freecounter(
struct xfs_mount *mp,
struct percpu_counter *counter,
- int64_t delta,
+ uint64_t delta,
bool rsvd)
{
int64_t lcounter;
- long long res_used;
uint64_t set_aside = 0;
s32 batch;
bool has_resv_pool;
@@ -1150,31 +1200,6 @@ xfs_mod_freecounter(
if (rsvd)
ASSERT(has_resv_pool);
- if (delta > 0) {
- /*
- * If the reserve pool is depleted, put blocks back into it
- * first. Most of the time the pool is full.
- */
- if (likely(!has_resv_pool ||
- mp->m_resblks == mp->m_resblks_avail)) {
- percpu_counter_add(counter, delta);
- return 0;
- }
-
- spin_lock(&mp->m_sb_lock);
- res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
-
- if (res_used > delta) {
- mp->m_resblks_avail += delta;
- } else {
- delta -= res_used;
- mp->m_resblks_avail = mp->m_resblks;
- percpu_counter_add(counter, delta);
- }
- spin_unlock(&mp->m_sb_lock);
- return 0;
- }
-
/*
* Taking blocks away, need to be more accurate the closer we
* are to zero.
@@ -1202,7 +1227,7 @@ xfs_mod_freecounter(
*/
if (has_resv_pool)
set_aside = xfs_fdblocks_unavailable(mp);
- percpu_counter_add_batch(counter, delta, batch);
+ percpu_counter_add_batch(counter, -((int64_t)delta), batch);
if (__percpu_counter_compare(counter, set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
/* we had space! */
@@ -1214,11 +1239,11 @@ xfs_mod_freecounter(
* that took us to ENOSPC.
*/
spin_lock(&mp->m_sb_lock);
- percpu_counter_add(counter, -delta);
+ percpu_counter_add(counter, delta);
if (!has_resv_pool || !rsvd)
goto fdblocks_enospc;
- lcounter = (long long)mp->m_resblks_avail + delta;
+ lcounter = (long long)mp->m_resblks_avail - delta;
if (lcounter >= 0) {
mp->m_resblks_avail = lcounter;
spin_unlock(&mp->m_sb_lock);
@@ -1364,7 +1389,8 @@ xfs_clear_incompat_log_features(
if (!xfs_has_crc(mp) ||
!xfs_sb_has_incompat_log_feature(&mp->m_sb,
XFS_SB_FEAT_INCOMPAT_LOG_ALL) ||
- xfs_is_shutdown(mp))
+ xfs_is_shutdown(mp) ||
+ !xfs_is_done_with_log_incompat(mp))
return false;
/*
@@ -1399,9 +1425,20 @@ xfs_clear_incompat_log_features(
#define XFS_DELALLOC_BATCH (4096)
void
xfs_mod_delalloc(
- struct xfs_mount *mp,
- int64_t delta)
+ struct xfs_inode *ip,
+ int64_t data_delta,
+ int64_t ind_delta)
{
- percpu_counter_add_batch(&mp->m_delalloc_blks, delta,
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ percpu_counter_add_batch(&mp->m_delalloc_rtextents,
+ xfs_rtb_to_rtx(mp, data_delta),
+ XFS_DELALLOC_BATCH);
+ if (!ind_delta)
+ return;
+ data_delta = 0;
+ }
+ percpu_counter_add_batch(&mp->m_delalloc_blks, data_delta + ind_delta,
XFS_DELALLOC_BATCH);
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e880aa48de68..d0567dfbc036 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -195,6 +195,12 @@ typedef struct xfs_mount {
* extents or anything related to the rt device.
*/
struct percpu_counter m_delalloc_blks;
+
+ /*
+ * RT version of the above.
+ */
+ struct percpu_counter m_delalloc_rtextents;
+
/*
* Global count of allocation btree blocks in use across all AGs. Only
* used when perag reservation is enabled. Helps prevent block
@@ -292,6 +298,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */
#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
+#define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */
/* Mount features */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
@@ -331,19 +338,10 @@ static inline void xfs_add_ ## name (struct xfs_mount *mp) \
__XFS_ADD_FEAT(attr, ATTR)
__XFS_HAS_FEAT(nlink, NLINK)
__XFS_ADD_FEAT(quota, QUOTA)
-__XFS_HAS_FEAT(align, ALIGN)
__XFS_HAS_FEAT(dalign, DALIGN)
-__XFS_HAS_FEAT(logv2, LOGV2)
__XFS_HAS_FEAT(sector, SECTOR)
-__XFS_HAS_FEAT(extflg, EXTFLG)
__XFS_HAS_FEAT(asciici, ASCIICI)
-__XFS_HAS_FEAT(lazysbcount, LAZYSBCOUNT)
-__XFS_ADD_FEAT(attr2, ATTR2)
__XFS_HAS_FEAT(parent, PARENT)
-__XFS_ADD_FEAT(projid32, PROJID32)
-__XFS_HAS_FEAT(crc, CRC)
-__XFS_HAS_FEAT(v3inodes, V3INODES)
-__XFS_HAS_FEAT(pquotino, PQUOTINO)
__XFS_HAS_FEAT(ftype, FTYPE)
__XFS_HAS_FEAT(finobt, FINOBT)
__XFS_HAS_FEAT(rmapbt, RMAPBT)
@@ -355,6 +353,38 @@ __XFS_HAS_FEAT(inobtcounts, INOBTCNT)
__XFS_HAS_FEAT(bigtime, BIGTIME)
__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
__XFS_HAS_FEAT(large_extent_counts, NREXT64)
+__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
+
+/*
+ * Some features are always on for v5 file systems, allow the compiler to
+ * eliminiate dead code when building without v4 support.
+ */
+#define __XFS_HAS_V4_FEAT(name, NAME) \
+static inline bool xfs_has_ ## name (struct xfs_mount *mp) \
+{ \
+ return !IS_ENABLED(CONFIG_XFS_SUPPORT_V4) || \
+ (mp->m_features & XFS_FEAT_ ## NAME); \
+}
+
+#define __XFS_ADD_V4_FEAT(name, NAME) \
+ __XFS_HAS_V4_FEAT(name, NAME); \
+static inline void xfs_add_ ## name (struct xfs_mount *mp) \
+{ \
+ if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) { \
+ mp->m_features |= XFS_FEAT_ ## NAME; \
+ xfs_sb_version_add ## name(&mp->m_sb); \
+ } \
+}
+
+__XFS_HAS_V4_FEAT(align, ALIGN)
+__XFS_HAS_V4_FEAT(logv2, LOGV2)
+__XFS_HAS_V4_FEAT(extflg, EXTFLG)
+__XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT)
+__XFS_ADD_V4_FEAT(attr2, ATTR2)
+__XFS_ADD_V4_FEAT(projid32, PROJID32)
+__XFS_HAS_V4_FEAT(v3inodes, V3INODES)
+__XFS_HAS_V4_FEAT(crc, CRC)
+__XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
/*
* Mount features
@@ -412,6 +442,10 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_WARNED_LARP 9
/* Mount time quotacheck is running */
#define XFS_OPSTATE_QUOTACHECK_RUNNING 10
+/* Do we want to clear log incompat flags? */
+#define XFS_OPSTATE_UNSET_LOG_INCOMPAT 11
+/* Filesystem can use logged extended attributes */
+#define XFS_OPSTATE_USE_LARP 12
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -439,6 +473,8 @@ __XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING)
#else
# define xfs_is_quotacheck_running(mp) (false)
#endif
+__XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
+__XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
static inline bool
xfs_should_warn(struct xfs_mount *mp, long nr)
@@ -457,7 +493,9 @@ xfs_should_warn(struct xfs_mount *mp, long nr)
{ (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \
{ (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \
{ (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \
- { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }
+ { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }, \
+ { (1UL << XFS_OPSTATE_UNSET_LOG_INCOMPAT), "unset_log_incompat" }, \
+ { (1UL << XFS_OPSTATE_USE_LARP), "logged_xattrs" }
/*
* Max and min values for mount-option defined I/O
@@ -534,19 +572,30 @@ xfs_fdblocks_unavailable(
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
}
-int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
- int64_t delta, bool rsvd);
+int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+ uint64_t delta, bool rsvd);
+void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+ uint64_t delta);
+
+static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta,
+ bool reserved)
+{
+ return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved);
+}
+
+static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta)
+{
+ xfs_add_freecounter(mp, &mp->m_fdblocks, delta);
+}
-static inline int
-xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved)
+static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta)
{
- return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved);
+ return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false);
}
-static inline int
-xfs_mod_frextents(struct xfs_mount *mp, int64_t delta)
+static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta)
{
- return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false);
+ xfs_add_freecounter(mp, &mp->m_frextents, delta);
}
extern int xfs_readsb(xfs_mount_t *, int);
@@ -566,6 +615,7 @@ struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
void xfs_force_summary_recalc(struct xfs_mount *mp);
int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature);
bool xfs_clear_incompat_log_features(struct xfs_mount *mp);
-void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta);
+void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta,
+ int64_t ind_delta);
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 0f4cf4170c35..47120b745c47 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -836,8 +836,10 @@ xfs_qm_qino_alloc(
ASSERT(xfs_is_shutdown(mp));
xfs_alert(mp, "%s failed (error %d)!", __func__, error);
}
- if (need_alloc)
+ if (need_alloc) {
+ xfs_iunlock(*ipp, XFS_ILOCK_EXCL);
xfs_finish_inode_setup(*ipp);
+ }
return error;
}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index f5993012bf98..6e09dfcd13e2 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -136,7 +136,7 @@ enum {
XFS_QM_TRANS_PRJ,
XFS_QM_TRANS_DQTYPES
};
-#define XFS_QM_TRANS_MAXDQS 2
+#define XFS_QM_TRANS_MAXDQS 5
struct xfs_dquot_acct {
struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS];
};
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 85a4ae1a17f6..23d71a55bbc0 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -123,12 +123,6 @@ extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *);
extern void xfs_qm_mount_quotas(struct xfs_mount *);
extern void xfs_qm_unmount(struct xfs_mount *);
extern void xfs_qm_unmount_quotas(struct xfs_mount *);
-
-static inline int
-xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
-{
- return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
-}
bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type);
# ifdef CONFIG_XFS_LIVE_HOOKS
@@ -188,12 +182,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
}
static inline int
-xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
-{
- return 0;
-}
-
-static inline int
xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, int64_t dblocks)
{
@@ -222,9 +210,16 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
#endif /* CONFIG_XFS_QUOTA */
static inline int
-xfs_quota_unreserve_blkres(struct xfs_inode *ip, int64_t blocks)
+xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
+{
+ return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
+}
+
+static inline void
+xfs_quota_unreserve_blkres(struct xfs_inode *ip, uint64_t blocks)
{
- return xfs_quota_reserve_blkres(ip, -blocks);
+ /* don't return an error as unreserving quotas can't fail */
+ xfs_quota_reserve_blkres(ip, -(int64_t)blocks);
}
extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 7da0e8f961d3..063a2e00d169 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -430,13 +430,6 @@ xfs_reflink_fill_cow_hole(
if (error)
return error;
- /*
- * Allocation succeeded but the requested range was not even partially
- * satisfied? Bail out!
- */
- if (nimaps == 0)
- return -ENOSPC;
-
convert:
return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
@@ -499,13 +492,6 @@ xfs_reflink_fill_delalloc(
error = xfs_trans_commit(tp);
if (error)
return error;
-
- /*
- * Allocation succeeded but the requested range was not even
- * partially satisfied? Bail out!
- */
- if (nimaps == 0)
- return -ENOSPC;
} while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
@@ -606,10 +592,8 @@ xfs_reflink_cancel_cow_blocks(
trace_xfs_reflink_cancel_cow(ip, &del);
if (isnullstartblock(del.br_startblock)) {
- error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
- &icur, &got, &del);
- if (error)
- break;
+ xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got,
+ &del);
} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
@@ -632,10 +616,7 @@ xfs_reflink_cancel_cow_blocks(
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
/* Remove the quota reservation */
- error = xfs_quota_unreserve_blkres(ip,
- del.br_blockcount);
- if (error)
- break;
+ xfs_quota_unreserve_blkres(ip, del.br_blockcount);
} else {
/* Didn't do anything, push cursor back. */
xfs_iext_prev(ifp, &icur);
@@ -731,12 +712,6 @@ xfs_reflink_end_cow_extent(
int nmaps;
int error;
- /* No COW extents? That's easy! */
- if (ifp->if_bytes == 0) {
- *offset_fsb = end_fsb;
- return 0;
- }
-
resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
XFS_TRANS_RESERVE, &tp);
@@ -751,14 +726,6 @@ xfs_reflink_end_cow_extent(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
- XFS_IEXT_REFLINK_END_COW_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip,
- XFS_IEXT_REFLINK_END_COW_CNT);
- if (error)
- goto out_cancel;
-
/*
* In case of racing, overlapping AIO writes no COW extents might be
* left by the time I/O completes for the loser of the race. In that
@@ -787,6 +754,11 @@ xfs_reflink_end_cow_extent(
del = got;
xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb);
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
+ XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error)
+ goto out_cancel;
+
/* Grab the corresponding mapping in the data fork. */
nmaps = 1;
error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
@@ -1283,9 +1255,7 @@ xfs_reflink_remap_extent(
if (dmap_written)
++iext_delta;
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip, iext_delta);
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, iext_delta);
if (error)
goto out_cancel;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index e66f9bd5de5c..5a7ddfed1bb8 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -695,11 +695,8 @@ xfs_growfs_rt_alloc(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip,
- XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;
@@ -709,8 +706,6 @@ xfs_growfs_rt_alloc(
nmap = 1;
error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
XFS_BMAPI_METADATA, 0, &map, &nmap);
- if (!error && nmap < 1)
- error = -ENOSPC;
if (error)
goto out_trans_cancel;
/*
@@ -957,10 +952,10 @@ xfs_growfs_rt(
nargs.tp = tp;
/*
- * Lock out other callers by grabbing the bitmap inode lock.
+ * Lock out other callers by grabbing the bitmap and summary
+ * inode locks and joining them to the transaction.
*/
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
- xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_rtbitmap_lock(tp, mp);
/*
* Update the bitmap inode's size ondisk and incore. We need
* to update the incore size so that inode inactivation won't
@@ -971,11 +966,6 @@ xfs_growfs_rt(
i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_disk_size);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
/*
- * Get the summary inode into the transaction.
- */
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
- xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
- /*
* Update the summary inode's size. We need to update the
* incore size so that inode inactivation won't punch what it
* thinks are "posteof" blocks.
@@ -1142,10 +1132,10 @@ xfs_rtalloc_reinit_frextents(
uint64_t val = 0;
int error;
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent,
&val);
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
if (error)
return error;
@@ -1346,6 +1336,8 @@ xfs_bmap_rtalloc(
int error;
align = xfs_get_extsz_hint(ap->ip);
+ if (!align)
+ align = 1;
retry:
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 1, ap->eof, 0,
@@ -1382,10 +1374,7 @@ retry:
* Lock out modifications to both the RT bitmap and summary inodes
*/
if (!rtlocked) {
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
- xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
- xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ xfs_rtbitmap_lock(ap->tp, mp);
rtlocked = true;
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bce020374c5e..27e9f749c4c7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -43,6 +43,8 @@
#include "xfs_iunlink_item.h"
#include "xfs_dahash_test.h"
#include "xfs_rtbitmap.h"
+#include "xfs_exchmaps_item.h"
+#include "xfs_parent.h"
#include "scrub/stats.h"
#include "scrub/rcbag_btree.h"
@@ -1051,12 +1053,18 @@ xfs_init_percpu_counters(
if (error)
goto free_fdblocks;
- error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
+ error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL);
if (error)
goto free_delalloc;
+ error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
+ if (error)
+ goto free_delalloc_rt;
+
return 0;
+free_delalloc_rt:
+ percpu_counter_destroy(&mp->m_delalloc_rtextents);
free_delalloc:
percpu_counter_destroy(&mp->m_delalloc_blks);
free_fdblocks:
@@ -1086,6 +1094,9 @@ xfs_destroy_percpu_counters(
percpu_counter_destroy(&mp->m_ifree);
percpu_counter_destroy(&mp->m_fdblocks);
ASSERT(xfs_is_shutdown(mp) ||
+ percpu_counter_sum(&mp->m_delalloc_rtextents) == 0);
+ percpu_counter_destroy(&mp->m_delalloc_rtextents);
+ ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
percpu_counter_destroy(&mp->m_delalloc_blks);
percpu_counter_destroy(&mp->m_frextents);
@@ -1579,17 +1590,21 @@ xfs_fs_fill_super(
if (error)
goto out_free_sb;
- /* V4 support is undergoing deprecation. */
- if (!xfs_has_crc(mp)) {
-#ifdef CONFIG_XFS_SUPPORT_V4
+ /*
+ * V4 support is undergoing deprecation.
+ *
+ * Note: this has to use an open coded m_features check as xfs_has_crc
+ * always returns false for !CONFIG_XFS_SUPPORT_V4.
+ */
+ if (!(mp->m_features & XFS_FEAT_CRC)) {
+ if (!IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) {
+ xfs_warn(mp,
+ "Deprecated V4 format (crc=0) not supported by kernel.");
+ error = -EINVAL;
+ goto out_free_sb;
+ }
xfs_warn_once(mp,
"Deprecated V4 format (crc=0) will not be supported after September 2030.");
-#else
- xfs_warn(mp,
- "Deprecated V4 format (crc=0) not supported by kernel.");
- error = -EINVAL;
- goto out_free_sb;
-#endif
}
/* ASCII case insensitivity is undergoing deprecation. */
@@ -1727,6 +1742,14 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
+ if (xfs_has_exchange_range(mp))
+ xfs_warn(mp,
+ "EXPERIMENTAL exchange-range feature enabled. Use at your own risk!");
+
+ if (xfs_has_parent(mp))
+ xfs_warn(mp,
+ "EXPERIMENTAL parent pointer feature enabled. Use at your own risk!");
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -1873,11 +1896,7 @@ xfs_remount_ro(
xfs_inodegc_stop(mp);
/* Free the per-AG metadata reservation pool. */
- error = xfs_fs_unreserve_ag_blocks(mp);
- if (error) {
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return error;
- }
+ xfs_fs_unreserve_ag_blocks(mp);
/*
* Before we sync the metadata, we need to free up the reserve block
@@ -2185,8 +2204,32 @@ xfs_init_caches(void)
if (!xfs_iunlink_cache)
goto out_destroy_attri_cache;
+ xfs_xmd_cache = kmem_cache_create("xfs_xmd_item",
+ sizeof(struct xfs_xmd_log_item),
+ 0, 0, NULL);
+ if (!xfs_xmd_cache)
+ goto out_destroy_iul_cache;
+
+ xfs_xmi_cache = kmem_cache_create("xfs_xmi_item",
+ sizeof(struct xfs_xmi_log_item),
+ 0, 0, NULL);
+ if (!xfs_xmi_cache)
+ goto out_destroy_xmd_cache;
+
+ xfs_parent_args_cache = kmem_cache_create("xfs_parent_args",
+ sizeof(struct xfs_parent_args),
+ 0, 0, NULL);
+ if (!xfs_parent_args_cache)
+ goto out_destroy_xmi_cache;
+
return 0;
+ out_destroy_xmi_cache:
+ kmem_cache_destroy(xfs_xmi_cache);
+ out_destroy_xmd_cache:
+ kmem_cache_destroy(xfs_xmd_cache);
+ out_destroy_iul_cache:
+ kmem_cache_destroy(xfs_iunlink_cache);
out_destroy_attri_cache:
kmem_cache_destroy(xfs_attri_cache);
out_destroy_attrd_cache:
@@ -2243,6 +2286,9 @@ xfs_destroy_caches(void)
* destroy caches.
*/
rcu_barrier();
+ kmem_cache_destroy(xfs_parent_args_cache);
+ kmem_cache_destroy(xfs_xmd_cache);
+ kmem_cache_destroy(xfs_xmi_cache);
kmem_cache_destroy(xfs_iunlink_cache);
kmem_cache_destroy(xfs_attri_cache);
kmem_cache_destroy(xfs_attrd_cache);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 3e376d24c7c1..17aee806ec2e 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -25,6 +25,8 @@
#include "xfs_error.h"
#include "xfs_health.h"
#include "xfs_symlink_remote.h"
+#include "xfs_parent.h"
+#include "xfs_defer.h"
int
xfs_readlink(
@@ -100,6 +102,7 @@ xfs_symlink(
struct xfs_dquot *pdqp = NULL;
uint resblks;
xfs_ino_t ino;
+ struct xfs_parent_args *ppargs;
*ipp = NULL;
@@ -130,18 +133,24 @@ xfs_symlink(
/*
* The symlink will fit into the inode data fork?
- * There can't be any attributes so we get the whole variable part.
+ * If there are no parent pointers, then there wont't be any attributes.
+ * So we get the whole variable part, and do not need to reserve extra
+ * blocks. Otherwise, we need to reserve the blocks.
*/
- if (pathlen <= XFS_LITINO(mp))
+ if (pathlen <= XFS_LITINO(mp) && !xfs_has_parent(mp))
fs_blocks = 0;
else
fs_blocks = xfs_symlink_blocks(mp, pathlen);
- resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
+ resblks = xfs_symlink_space_res(mp, link_name->len, fs_blocks);
+
+ error = xfs_parent_start(mp, &ppargs);
+ if (error)
+ goto out_release_dquots;
error = xfs_trans_alloc_icreate(mp, &M_RES(mp)->tr_symlink, udqp, gdqp,
pdqp, resblks, &tp);
if (error)
- goto out_release_dquots;
+ goto out_parent;
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
@@ -161,7 +170,7 @@ xfs_symlink(
if (!error)
error = xfs_init_new_inode(idmap, tp, dp, ino,
S_IFLNK | (mode & ~S_IFMT), 1, 0, prid,
- false, &ip);
+ xfs_has_parent(mp), &ip);
if (error)
goto out_trans_cancel;
@@ -172,8 +181,7 @@ xfs_symlink(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- unlock_dp_on_error = false;
+ xfs_trans_ijoin(tp, dp, 0);
/*
* Also attach the dquot(s) to it, if applicable.
@@ -181,8 +189,8 @@ xfs_symlink(
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
resblks -= XFS_IALLOC_SPACE_RES(mp);
- error = xfs_symlink_write_target(tp, ip, target_path, pathlen,
- fs_blocks, resblks);
+ error = xfs_symlink_write_target(tp, ip, ip->i_ino, target_path,
+ pathlen, fs_blocks, resblks);
if (error)
goto out_trans_cancel;
resblks -= fs_blocks;
@@ -196,6 +204,14 @@ xfs_symlink(
goto out_trans_cancel;
xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+ /* Add parent pointer for the new symlink. */
+ if (ppargs) {
+ error = xfs_parent_addname(tp, ppargs, dp, link_name, ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
xfs_dir_update_hook(dp, ip, 1, link_name);
/*
@@ -215,6 +231,9 @@ xfs_symlink(
xfs_qm_dqrele(pdqp);
*ipp = ip;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ xfs_parent_finish(mp, ppargs);
return 0;
out_trans_cancel:
@@ -226,9 +245,12 @@ out_release_inode:
* transactions and deadlocks from xfs_inactive.
*/
if (ip) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
+out_parent:
+ xfs_parent_finish(mp, ppargs);
out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
@@ -250,19 +272,12 @@ out_release_dquots:
*/
STATIC int
xfs_inactive_symlink_rmt(
- struct xfs_inode *ip)
+ struct xfs_inode *ip)
{
- struct xfs_buf *bp;
- int done;
- int error;
- int i;
- xfs_mount_t *mp;
- xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS];
- int nmaps;
- int size;
- xfs_trans_t *tp;
-
- mp = ip->i_mount;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
ASSERT(!xfs_need_iread_extents(&ip->i_df));
/*
* We're freeing a symlink that has some
@@ -286,44 +301,14 @@ xfs_inactive_symlink_rmt(
* locked for the second transaction. In the error paths we need it
* held so the cancel won't rele it, see below.
*/
- size = (int)ip->i_disk_size;
ip->i_disk_size = 0;
VFS_I(ip)->i_mode = (VFS_I(ip)->i_mode & ~S_IFMT) | S_IFREG;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- /*
- * Find the block(s) so we can inval and unmap them.
- */
- done = 0;
- nmaps = ARRAY_SIZE(mval);
- error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size),
- mval, &nmaps, 0);
- if (error)
- goto error_trans_cancel;
- /*
- * Invalidate the block(s). No validation is done.
- */
- for (i = 0; i < nmaps; i++) {
- error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
- XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0,
- &bp);
- if (error)
- goto error_trans_cancel;
- xfs_trans_binval(tp, bp);
- }
- /*
- * Unmap the dead block(s) to the dfops.
- */
- error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps, &done);
+
+ error = xfs_symlink_remote_truncate(tp, ip);
if (error)
goto error_trans_cancel;
- ASSERT(done);
- /*
- * Commit the transaction. This first logs the EFI and the inode, then
- * rolls and commits the transaction that frees the extents.
- */
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_trans_commit(tp);
if (error) {
ASSERT(xfs_is_shutdown(mp));
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 1a963382e5e9..9c7fbaae2717 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -39,6 +39,9 @@
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"
#include "xfs_bmap.h"
+#include "xfs_exchmaps.h"
+#include "xfs_exchrange.h"
+#include "xfs_parent.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index aea97fc074f8..25ff6fe1eb6c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -31,6 +31,8 @@
* pos: file offset, in bytes
* bytecount: number of bytes
*
+ * dablk: directory or xattr block offset, in filesystem blocks
+ *
* disize: ondisk file size, in bytes
* isize: incore file size, in bytes
*
@@ -82,11 +84,18 @@ struct xfs_perag;
struct xfbtree;
struct xfs_btree_ops;
struct xfs_bmap_intent;
+struct xfs_exchmaps_intent;
+struct xfs_exchmaps_req;
+struct xfs_exchrange;
+struct xfs_getparents;
+struct xfs_parent_irec;
+struct xfs_attrlist_cursor_kern;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
{ XFS_ATTR_SECURE, "SECURE" }, \
- { XFS_ATTR_INCOMPLETE, "INCOMPLETE" }
+ { XFS_ATTR_INCOMPLETE, "INCOMPLETE" }, \
+ { XFS_ATTR_PARENT, "PARENT" }
DECLARE_EVENT_CLASS(xfs_attr_list_class,
TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -159,7 +168,7 @@ TRACE_EVENT(xlog_intent_recovery_failed,
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
- __assign_str(name, ops->name);
+ __assign_str(name);
__entry->error = error;
),
TP_printk("dev %d:%d optype %s error %d",
@@ -1654,7 +1663,6 @@ DEFINE_EVENT(xfs_extent_busy_class, name, \
xfs_agblock_t agbno, xfs_extlen_t len), \
TP_ARGS(mp, agno, agbno, len))
DEFINE_BUSY_EVENT(xfs_extent_busy);
-DEFINE_BUSY_EVENT(xfs_extent_busy_enomem);
DEFINE_BUSY_EVENT(xfs_extent_busy_force);
DEFINE_BUSY_EVENT(xfs_extent_busy_reuse);
DEFINE_BUSY_EVENT(xfs_extent_busy_clear);
@@ -1905,7 +1913,7 @@ TRACE_EVENT(xfs_alloc_cur_check,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->bno = bno;
__entry->len = len;
__entry->diff = diff;
@@ -1928,6 +1936,7 @@ DECLARE_EVENT_CLASS(xfs_da_class,
__field(xfs_dahash_t, hashval)
__field(xfs_ino_t, inumber)
__field(uint32_t, op_flags)
+ __field(xfs_ino_t, owner)
),
TP_fast_assign(
__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
@@ -1938,9 +1947,10 @@ DECLARE_EVENT_CLASS(xfs_da_class,
__entry->hashval = args->hashval;
__entry->inumber = args->inumber;
__entry->op_flags = args->op_flags;
+ __entry->owner = args->owner;
),
TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x "
- "inumber 0x%llx op_flags %s",
+ "inumber 0x%llx op_flags %s owner 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->namelen,
@@ -1948,7 +1958,8 @@ DECLARE_EVENT_CLASS(xfs_da_class,
__entry->namelen,
__entry->hashval,
__entry->inumber,
- __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
+ __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
+ __entry->owner)
)
#define DEFINE_DIR2_EVENT(name) \
@@ -1992,7 +2003,6 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__field(int, valuelen)
__field(xfs_dahash_t, hashval)
__field(unsigned int, attr_filter)
- __field(unsigned int, attr_flags)
__field(uint32_t, op_flags)
),
TP_fast_assign(
@@ -2004,11 +2014,10 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__entry->valuelen = args->valuelen;
__entry->hashval = args->hashval;
__entry->attr_filter = args->attr_filter;
- __entry->attr_flags = args->attr_flags;
__entry->op_flags = args->op_flags;
),
TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
- "hashval 0x%x filter %s flags %s op_flags %s",
+ "hashval 0x%x filter %s op_flags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->namelen,
@@ -2018,9 +2027,6 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__entry->hashval,
__print_flags(__entry->attr_filter, "|",
XFS_ATTR_FILTER_FLAGS),
- __print_flags(__entry->attr_flags, "|",
- { XATTR_CREATE, "CREATE" },
- { XATTR_REPLACE, "REPLACE" }),
__print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
)
@@ -2467,7 +2473,7 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->ptr = cur->bc_levels[level].ptr;
@@ -2517,7 +2523,7 @@ TRACE_EVENT(xfs_btree_alloc_block,
__entry->ino = 0;
break;
}
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->error = error;
if (!error && stat) {
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
@@ -2561,7 +2567,7 @@ TRACE_EVENT(xfs_btree_free_block,
__entry->ino = cur->bc_ino.ip->i_ino;
else
__entry->ino = 0;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->agbno = xfs_daddr_to_agbno(cur->bc_mp,
xfs_buf_daddr(bp));
),
@@ -2637,7 +2643,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
- __assign_str(name, dfp->dfp_ops->name);
+ __assign_str(name);
__entry->intent = dfp->dfp_intent;
__entry->flags = dfp->dfp_flags;
__entry->committed = dfp->dfp_done != NULL;
@@ -2726,7 +2732,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
- __assign_str(name, dfp->dfp_ops->name);
+ __assign_str(name);
__entry->intent = dfp->dfp_intent;
__entry->item = item;
__entry->committed = dfp->dfp_done != NULL;
@@ -3062,7 +3068,6 @@ DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent);
DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical);
DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed);
-DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error);
DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
/* refcount tracepoint classes */
@@ -4239,7 +4244,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->agno = cur->bc_ag.pag->pag_agno;
__entry->agbno = cur->bc_ag.afake->af_root;
__entry->levels = cur->bc_ag.afake->af_levels;
@@ -4268,7 +4273,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->agno = XFS_INO_TO_AGNO(cur->bc_mp,
cur->bc_ino.ip->i_ino);
__entry->agino = XFS_INO_TO_AGINO(cur->bc_mp,
@@ -4307,7 +4312,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->nr_this_level = nr_this_level;
@@ -4345,7 +4350,7 @@ TRACE_EVENT(xfs_btree_bload_block,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->level = level;
__entry->block_idx = block_idx;
__entry->nr_blocks = nr_blocks;
@@ -4568,7 +4573,7 @@ TRACE_EVENT(xfs_force_shutdown,
__entry->dev = mp->m_super->s_dev;
__entry->ptag = ptag;
__entry->flags = flags;
- __assign_str(fname, fname);
+ __assign_str(fname);
__entry->line_num = line_num;
),
TP_printk("dev %d:%d tag %s flags %s file %s line_num %d",
@@ -4750,7 +4755,7 @@ DECLARE_EVENT_CLASS(xfbtree_freesp_class,
),
TP_fast_assign(
__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
- __assign_str(btname, cur->bc_ops->name);
+ __assign_str(btname);
__entry->nlevels = cur->bc_nlevels;
__entry->fileoff = fileoff;
),
@@ -4770,6 +4775,419 @@ DEFINE_XFBTREE_FREESP_EVENT(xfbtree_alloc_block);
DEFINE_XFBTREE_FREESP_EVENT(xfbtree_free_block);
#endif /* CONFIG_XFS_BTREE_IN_MEM */
+/* exchmaps tracepoints */
+#define XFS_EXCHMAPS_STRINGS \
+ { XFS_EXCHMAPS_ATTR_FORK, "ATTRFORK" }, \
+ { XFS_EXCHMAPS_SET_SIZES, "SETSIZES" }, \
+ { XFS_EXCHMAPS_INO1_WRITTEN, "INO1_WRITTEN" }, \
+ { XFS_EXCHMAPS_CLEAR_INO1_REFLINK, "CLEAR_INO1_REFLINK" }, \
+ { XFS_EXCHMAPS_CLEAR_INO2_REFLINK, "CLEAR_INO2_REFLINK" }, \
+ { __XFS_EXCHMAPS_INO2_SHORTFORM, "INO2_SF" }
+
+DEFINE_INODE_IREC_EVENT(xfs_exchmaps_mapping1_skip);
+DEFINE_INODE_IREC_EVENT(xfs_exchmaps_mapping1);
+DEFINE_INODE_IREC_EVENT(xfs_exchmaps_mapping2);
+DEFINE_ITRUNC_EVENT(xfs_exchmaps_update_inode_size);
+
+#define XFS_EXCHRANGE_INODES \
+ { 1, "file1" }, \
+ { 2, "file2" }
+
+DECLARE_EVENT_CLASS(xfs_exchrange_inode_class,
+ TP_PROTO(struct xfs_inode *ip, int whichfile),
+ TP_ARGS(ip, whichfile),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, whichfile)
+ __field(xfs_ino_t, ino)
+ __field(int, format)
+ __field(xfs_extnum_t, nex)
+ __field(int, broot_size)
+ __field(int, fork_off)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->whichfile = whichfile;
+ __entry->ino = ip->i_ino;
+ __entry->format = ip->i_df.if_format;
+ __entry->nex = ip->i_df.if_nextents;
+ __entry->fork_off = xfs_inode_fork_boff(ip);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx whichfile %s format %s num_extents %llu forkoff 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->whichfile, XFS_EXCHRANGE_INODES),
+ __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
+ __entry->nex,
+ __entry->fork_off)
+)
+
+#define DEFINE_EXCHRANGE_INODE_EVENT(name) \
+DEFINE_EVENT(xfs_exchrange_inode_class, name, \
+ TP_PROTO(struct xfs_inode *ip, int whichfile), \
+ TP_ARGS(ip, whichfile))
+
+DEFINE_EXCHRANGE_INODE_EVENT(xfs_exchrange_before);
+DEFINE_EXCHRANGE_INODE_EVENT(xfs_exchrange_after);
+DEFINE_INODE_ERROR_EVENT(xfs_exchrange_error);
+
+#define XFS_EXCHANGE_RANGE_FLAGS_STRS \
+ { XFS_EXCHANGE_RANGE_TO_EOF, "TO_EOF" }, \
+ { XFS_EXCHANGE_RANGE_DSYNC , "DSYNC" }, \
+ { XFS_EXCHANGE_RANGE_DRY_RUN, "DRY_RUN" }, \
+ { XFS_EXCHANGE_RANGE_FILE1_WRITTEN, "F1_WRITTEN" }, \
+ { __XFS_EXCHANGE_RANGE_UPD_CMTIME1, "CMTIME1" }, \
+ { __XFS_EXCHANGE_RANGE_UPD_CMTIME2, "CMTIME2" }
+
+/* file exchange-range tracepoint class */
+DECLARE_EVENT_CLASS(xfs_exchrange_class,
+ TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip1,
+ struct xfs_inode *ip2),
+ TP_ARGS(fxr, ip1, ip2),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ip1_ino)
+ __field(loff_t, ip1_isize)
+ __field(loff_t, ip1_disize)
+ __field(xfs_ino_t, ip2_ino)
+ __field(loff_t, ip2_isize)
+ __field(loff_t, ip2_disize)
+
+ __field(loff_t, file1_offset)
+ __field(loff_t, file2_offset)
+ __field(unsigned long long, length)
+ __field(unsigned long long, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip1)->i_sb->s_dev;
+ __entry->ip1_ino = ip1->i_ino;
+ __entry->ip1_isize = VFS_I(ip1)->i_size;
+ __entry->ip1_disize = ip1->i_disk_size;
+ __entry->ip2_ino = ip2->i_ino;
+ __entry->ip2_isize = VFS_I(ip2)->i_size;
+ __entry->ip2_disize = ip2->i_disk_size;
+
+ __entry->file1_offset = fxr->file1_offset;
+ __entry->file2_offset = fxr->file2_offset;
+ __entry->length = fxr->length;
+ __entry->flags = fxr->flags;
+ ),
+ TP_printk("dev %d:%d flags %s bytecount 0x%llx "
+ "ino1 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx -> "
+ "ino2 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_flags_u64(__entry->flags, "|", XFS_EXCHANGE_RANGE_FLAGS_STRS),
+ __entry->length,
+ __entry->ip1_ino,
+ __entry->ip1_isize,
+ __entry->ip1_disize,
+ __entry->file1_offset,
+ __entry->ip2_ino,
+ __entry->ip2_isize,
+ __entry->ip2_disize,
+ __entry->file2_offset)
+)
+
+#define DEFINE_EXCHRANGE_EVENT(name) \
+DEFINE_EVENT(xfs_exchrange_class, name, \
+ TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip1, \
+ struct xfs_inode *ip2), \
+ TP_ARGS(fxr, ip1, ip2))
+DEFINE_EXCHRANGE_EVENT(xfs_exchrange_prep);
+DEFINE_EXCHRANGE_EVENT(xfs_exchrange_flush);
+DEFINE_EXCHRANGE_EVENT(xfs_exchrange_mappings);
+
+TRACE_EVENT(xfs_exchmaps_overhead,
+ TP_PROTO(struct xfs_mount *mp, unsigned long long bmbt_blocks,
+ unsigned long long rmapbt_blocks),
+ TP_ARGS(mp, bmbt_blocks, rmapbt_blocks),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long long, bmbt_blocks)
+ __field(unsigned long long, rmapbt_blocks)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->bmbt_blocks = bmbt_blocks;
+ __entry->rmapbt_blocks = rmapbt_blocks;
+ ),
+ TP_printk("dev %d:%d bmbt_blocks 0x%llx rmapbt_blocks 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->bmbt_blocks,
+ __entry->rmapbt_blocks)
+);
+
+DECLARE_EVENT_CLASS(xfs_exchmaps_estimate_class,
+ TP_PROTO(const struct xfs_exchmaps_req *req),
+ TP_ARGS(req),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino1)
+ __field(xfs_ino_t, ino2)
+ __field(xfs_fileoff_t, startoff1)
+ __field(xfs_fileoff_t, startoff2)
+ __field(xfs_filblks_t, blockcount)
+ __field(uint64_t, flags)
+ __field(xfs_filblks_t, ip1_bcount)
+ __field(xfs_filblks_t, ip2_bcount)
+ __field(xfs_filblks_t, ip1_rtbcount)
+ __field(xfs_filblks_t, ip2_rtbcount)
+ __field(unsigned long long, resblks)
+ __field(unsigned long long, nr_exchanges)
+ ),
+ TP_fast_assign(
+ __entry->dev = req->ip1->i_mount->m_super->s_dev;
+ __entry->ino1 = req->ip1->i_ino;
+ __entry->ino2 = req->ip2->i_ino;
+ __entry->startoff1 = req->startoff1;
+ __entry->startoff2 = req->startoff2;
+ __entry->blockcount = req->blockcount;
+ __entry->flags = req->flags;
+ __entry->ip1_bcount = req->ip1_bcount;
+ __entry->ip2_bcount = req->ip2_bcount;
+ __entry->ip1_rtbcount = req->ip1_rtbcount;
+ __entry->ip2_rtbcount = req->ip2_rtbcount;
+ __entry->resblks = req->resblks;
+ __entry->nr_exchanges = req->nr_exchanges;
+ ),
+ TP_printk("dev %d:%d ino1 0x%llx fileoff1 0x%llx ino2 0x%llx fileoff2 0x%llx fsbcount 0x%llx flags (%s) bcount1 0x%llx rtbcount1 0x%llx bcount2 0x%llx rtbcount2 0x%llx resblks 0x%llx nr_exchanges %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino1, __entry->startoff1,
+ __entry->ino2, __entry->startoff2,
+ __entry->blockcount,
+ __print_flags_u64(__entry->flags, "|", XFS_EXCHMAPS_STRINGS),
+ __entry->ip1_bcount,
+ __entry->ip1_rtbcount,
+ __entry->ip2_bcount,
+ __entry->ip2_rtbcount,
+ __entry->resblks,
+ __entry->nr_exchanges)
+);
+
+#define DEFINE_EXCHMAPS_ESTIMATE_EVENT(name) \
+DEFINE_EVENT(xfs_exchmaps_estimate_class, name, \
+ TP_PROTO(const struct xfs_exchmaps_req *req), \
+ TP_ARGS(req))
+DEFINE_EXCHMAPS_ESTIMATE_EVENT(xfs_exchmaps_initial_estimate);
+DEFINE_EXCHMAPS_ESTIMATE_EVENT(xfs_exchmaps_final_estimate);
+
+DECLARE_EVENT_CLASS(xfs_exchmaps_intent_class,
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_exchmaps_intent *xmi),
+ TP_ARGS(mp, xmi),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino1)
+ __field(xfs_ino_t, ino2)
+ __field(uint64_t, flags)
+ __field(xfs_fileoff_t, startoff1)
+ __field(xfs_fileoff_t, startoff2)
+ __field(xfs_filblks_t, blockcount)
+ __field(xfs_fsize_t, isize1)
+ __field(xfs_fsize_t, isize2)
+ __field(xfs_fsize_t, new_isize1)
+ __field(xfs_fsize_t, new_isize2)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino1 = xmi->xmi_ip1->i_ino;
+ __entry->ino2 = xmi->xmi_ip2->i_ino;
+ __entry->flags = xmi->xmi_flags;
+ __entry->startoff1 = xmi->xmi_startoff1;
+ __entry->startoff2 = xmi->xmi_startoff2;
+ __entry->blockcount = xmi->xmi_blockcount;
+ __entry->isize1 = xmi->xmi_ip1->i_disk_size;
+ __entry->isize2 = xmi->xmi_ip2->i_disk_size;
+ __entry->new_isize1 = xmi->xmi_isize1;
+ __entry->new_isize2 = xmi->xmi_isize2;
+ ),
+ TP_printk("dev %d:%d ino1 0x%llx fileoff1 0x%llx ino2 0x%llx fileoff2 0x%llx fsbcount 0x%llx flags (%s) isize1 0x%llx newisize1 0x%llx isize2 0x%llx newisize2 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino1, __entry->startoff1,
+ __entry->ino2, __entry->startoff2,
+ __entry->blockcount,
+ __print_flags_u64(__entry->flags, "|", XFS_EXCHMAPS_STRINGS),
+ __entry->isize1, __entry->new_isize1,
+ __entry->isize2, __entry->new_isize2)
+);
+
+#define DEFINE_EXCHMAPS_INTENT_EVENT(name) \
+DEFINE_EVENT(xfs_exchmaps_intent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_exchmaps_intent *xmi), \
+ TP_ARGS(mp, xmi))
+DEFINE_EXCHMAPS_INTENT_EVENT(xfs_exchmaps_defer);
+DEFINE_EXCHMAPS_INTENT_EVENT(xfs_exchmaps_recover);
+
+TRACE_EVENT(xfs_exchmaps_delta_nextents_step,
+ TP_PROTO(struct xfs_mount *mp,
+ const struct xfs_bmbt_irec *left,
+ const struct xfs_bmbt_irec *curr,
+ const struct xfs_bmbt_irec *new,
+ const struct xfs_bmbt_irec *right,
+ int delta, unsigned int state),
+ TP_ARGS(mp, left, curr, new, right, delta, state),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_fileoff_t, loff)
+ __field(xfs_fsblock_t, lstart)
+ __field(xfs_filblks_t, lcount)
+ __field(xfs_fileoff_t, coff)
+ __field(xfs_fsblock_t, cstart)
+ __field(xfs_filblks_t, ccount)
+ __field(xfs_fileoff_t, noff)
+ __field(xfs_fsblock_t, nstart)
+ __field(xfs_filblks_t, ncount)
+ __field(xfs_fileoff_t, roff)
+ __field(xfs_fsblock_t, rstart)
+ __field(xfs_filblks_t, rcount)
+ __field(int, delta)
+ __field(unsigned int, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->loff = left->br_startoff;
+ __entry->lstart = left->br_startblock;
+ __entry->lcount = left->br_blockcount;
+ __entry->coff = curr->br_startoff;
+ __entry->cstart = curr->br_startblock;
+ __entry->ccount = curr->br_blockcount;
+ __entry->noff = new->br_startoff;
+ __entry->nstart = new->br_startblock;
+ __entry->ncount = new->br_blockcount;
+ __entry->roff = right->br_startoff;
+ __entry->rstart = right->br_startblock;
+ __entry->rcount = right->br_blockcount;
+ __entry->delta = delta;
+ __entry->state = state;
+ ),
+ TP_printk("dev %d:%d left 0x%llx:0x%llx:0x%llx; curr 0x%llx:0x%llx:0x%llx <- new 0x%llx:0x%llx:0x%llx; right 0x%llx:0x%llx:0x%llx delta %d state 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->loff, __entry->lstart, __entry->lcount,
+ __entry->coff, __entry->cstart, __entry->ccount,
+ __entry->noff, __entry->nstart, __entry->ncount,
+ __entry->roff, __entry->rstart, __entry->rcount,
+ __entry->delta, __entry->state)
+);
+
+TRACE_EVENT(xfs_exchmaps_delta_nextents,
+ TP_PROTO(const struct xfs_exchmaps_req *req, int64_t d_nexts1,
+ int64_t d_nexts2),
+ TP_ARGS(req, d_nexts1, d_nexts2),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino1)
+ __field(xfs_ino_t, ino2)
+ __field(xfs_extnum_t, nexts1)
+ __field(xfs_extnum_t, nexts2)
+ __field(int64_t, d_nexts1)
+ __field(int64_t, d_nexts2)
+ ),
+ TP_fast_assign(
+ int whichfork = xfs_exchmaps_reqfork(req);
+
+ __entry->dev = req->ip1->i_mount->m_super->s_dev;
+ __entry->ino1 = req->ip1->i_ino;
+ __entry->ino2 = req->ip2->i_ino;
+ __entry->nexts1 = xfs_ifork_ptr(req->ip1, whichfork)->if_nextents;
+ __entry->nexts2 = xfs_ifork_ptr(req->ip2, whichfork)->if_nextents;
+ __entry->d_nexts1 = d_nexts1;
+ __entry->d_nexts2 = d_nexts2;
+ ),
+ TP_printk("dev %d:%d ino1 0x%llx nexts %llu ino2 0x%llx nexts %llu delta1 %lld delta2 %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino1, __entry->nexts1,
+ __entry->ino2, __entry->nexts2,
+ __entry->d_nexts1, __entry->d_nexts2)
+);
+
+DECLARE_EVENT_CLASS(xfs_getparents_rec_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi,
+ const struct xfs_attr_list_context *context,
+ const struct xfs_getparents_rec *pptr),
+ TP_ARGS(ip, ppi, context, pptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, firstu)
+ __field(unsigned short, reclen)
+ __field(unsigned int, bufsize)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __string(name, pptr->gpr_name)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->firstu = context->firstu;
+ __entry->reclen = pptr->gpr_reclen;
+ __entry->bufsize = ppi->gp_bufsize;
+ __entry->parent_ino = pptr->gpr_parent.ha_fid.fid_ino;
+ __entry->parent_gen = pptr->gpr_parent.ha_fid.fid_gen;
+ __assign_str(name);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx firstu %u reclen %u bufsize %u parent_ino 0x%llx parent_gen 0x%x name '%s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->firstu,
+ __entry->reclen,
+ __entry->bufsize,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __get_str(name))
+)
+#define DEFINE_XFS_GETPARENTS_REC_EVENT(name) \
+DEFINE_EVENT(xfs_getparents_rec_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, \
+ const struct xfs_attr_list_context *context, \
+ const struct xfs_getparents_rec *pptr), \
+ TP_ARGS(ip, ppi, context, pptr))
+DEFINE_XFS_GETPARENTS_REC_EVENT(xfs_getparents_put_listent);
+DEFINE_XFS_GETPARENTS_REC_EVENT(xfs_getparents_expand_lastrec);
+
+DECLARE_EVENT_CLASS(xfs_getparents_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi,
+ const struct xfs_attrlist_cursor_kern *cur),
+ TP_ARGS(ip, ppi, cur),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned short, iflags)
+ __field(unsigned short, oflags)
+ __field(unsigned int, bufsize)
+ __field(unsigned int, hashval)
+ __field(unsigned int, blkno)
+ __field(unsigned int, offset)
+ __field(int, initted)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->iflags = ppi->gp_iflags;
+ __entry->oflags = ppi->gp_oflags;
+ __entry->bufsize = ppi->gp_bufsize;
+ __entry->hashval = cur->hashval;
+ __entry->blkno = cur->blkno;
+ __entry->offset = cur->offset;
+ __entry->initted = cur->initted;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx iflags 0x%x oflags 0x%x bufsize %u cur_init? %d hashval 0x%x blkno %u offset %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->iflags,
+ __entry->oflags,
+ __entry->bufsize,
+ __entry->initted,
+ __entry->hashval,
+ __entry->blkno,
+ __entry->offset)
+)
+#define DEFINE_XFS_GETPARENTS_EVENT(name) \
+DEFINE_EVENT(xfs_getparents_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, \
+ const struct xfs_attrlist_cursor_kern *cur), \
+ TP_ARGS(ip, ppi, cur))
+DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_begin);
+DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_end);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 7350640059cc..828da4ac4316 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -163,7 +163,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (blocks > 0) {
- error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
+ error = xfs_dec_fdblocks(mp, blocks, rsvd);
if (error != 0)
return -ENOSPC;
tp->t_blk_res += blocks;
@@ -210,7 +210,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (rtextents > 0) {
- error = xfs_mod_frextents(mp, -((int64_t)rtextents));
+ error = xfs_dec_frextents(mp, rtextents);
if (error) {
error = -ENOSPC;
goto undo_log;
@@ -234,7 +234,7 @@ undo_log:
undo_blocks:
if (blocks > 0) {
- xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
+ xfs_add_fdblocks(mp, blocks);
tp->t_blk_res = 0;
}
return error;
@@ -593,38 +593,44 @@ xfs_trans_unreserve_and_mod_sb(
struct xfs_trans *tp)
{
struct xfs_mount *mp = tp->t_mountp;
- bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
- int64_t blkdelta = 0;
- int64_t rtxdelta = 0;
+ int64_t blkdelta = tp->t_blk_res;
+ int64_t rtxdelta = tp->t_rtx_res;
int64_t idelta = 0;
int64_t ifreedelta = 0;
- int error;
- /* calculate deltas */
- if (tp->t_blk_res > 0)
- blkdelta = tp->t_blk_res;
- if ((tp->t_fdblocks_delta != 0) &&
- (xfs_has_lazysbcount(mp) ||
- (tp->t_flags & XFS_TRANS_SB_DIRTY)))
+ /*
+ * Calculate the deltas.
+ *
+ * t_fdblocks_delta and t_frextents_delta can be positive or negative:
+ *
+ * - positive values indicate blocks freed in the transaction.
+ * - negative values indicate blocks allocated in the transaction
+ *
+ * Negative values can only happen if the transaction has a block
+ * reservation that covers the allocated block. The end result is
+ * that the calculated delta values must always be positive and we
+ * can only put back previous allocated or reserved blocks here.
+ */
+ ASSERT(tp->t_blk_res || tp->t_fdblocks_delta >= 0);
+ if (xfs_has_lazysbcount(mp) || (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
blkdelta += tp->t_fdblocks_delta;
+ ASSERT(blkdelta >= 0);
+ }
- if (tp->t_rtx_res > 0)
- rtxdelta = tp->t_rtx_res;
- if ((tp->t_frextents_delta != 0) &&
- (tp->t_flags & XFS_TRANS_SB_DIRTY))
+ ASSERT(tp->t_rtx_res || tp->t_frextents_delta >= 0);
+ if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
rtxdelta += tp->t_frextents_delta;
+ ASSERT(rtxdelta >= 0);
+ }
- if (xfs_has_lazysbcount(mp) ||
- (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
+ if (xfs_has_lazysbcount(mp) || (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
idelta = tp->t_icount_delta;
ifreedelta = tp->t_ifree_delta;
}
/* apply the per-cpu counters */
- if (blkdelta) {
- error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
- ASSERT(!error);
- }
+ if (blkdelta)
+ xfs_add_fdblocks(mp, blkdelta);
if (idelta)
percpu_counter_add_batch(&mp->m_icount, idelta,
@@ -633,10 +639,8 @@ xfs_trans_unreserve_and_mod_sb(
if (ifreedelta)
percpu_counter_add(&mp->m_ifree, ifreedelta);
- if (rtxdelta) {
- error = xfs_mod_frextents(mp, rtxdelta);
- ASSERT(!error);
- }
+ if (rtxdelta)
+ xfs_add_frextents(mp, rtxdelta);
if (!(tp->t_flags & XFS_TRANS_SB_DIRTY))
return;
@@ -672,7 +676,6 @@ xfs_trans_unreserve_and_mod_sb(
*/
ASSERT(mp->m_sb.sb_imax_pct >= 0);
ASSERT(mp->m_sb.sb_rextslog >= 0);
- return;
}
/* Add the given log item to the transaction's list of log items. */
@@ -1291,9 +1294,9 @@ xfs_trans_reserve_more_inode(
return 0;
/* Quota failed, give back the new reservation. */
- xfs_mod_fdblocks(mp, dblocks, tp->t_flags & XFS_TRANS_RESERVE);
+ xfs_add_fdblocks(mp, dblocks);
tp->t_blk_res -= dblocks;
- xfs_mod_frextents(mp, rtx);
+ xfs_add_frextents(mp, rtx);
tp->t_rtx_res -= rtx;
return error;
}
@@ -1430,6 +1433,8 @@ out_cancel:
* The caller must ensure that the on-disk dquots attached to this inode have
* already been allocated and initialized. The ILOCKs will be dropped when the
* transaction is committed or cancelled.
+ *
+ * Caller is responsible for unlocking the inodes manually upon return
*/
int
xfs_trans_alloc_dir(
@@ -1460,8 +1465,8 @@ retry:
xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, 0);
+ xfs_trans_ijoin(tp, ip, 0);
error = xfs_qm_dqattach_locked(dp, false);
if (error) {
@@ -1484,6 +1489,9 @@ retry:
if (error == -EDQUOT || error == -ENOSPC) {
if (!retried) {
xfs_trans_cancel(tp);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ if (dp != ip)
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_blockgc_free_quota(dp, 0);
retried = true;
goto retry;
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 577b535a595c..b368e13424c4 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -379,24 +379,29 @@ xfs_trans_mod_dquot(
/*
* Given an array of dqtrx structures, lock all the dquots associated and join
- * them to the transaction, provided they have been modified. We know that the
- * highest number of dquots of one type - usr, grp and prj - involved in a
- * transaction is 3 so we don't need to make this very generic.
+ * them to the transaction, provided they have been modified.
*/
STATIC void
xfs_trans_dqlockedjoin(
struct xfs_trans *tp,
struct xfs_dqtrx *q)
{
+ unsigned int i;
ASSERT(q[0].qt_dquot != NULL);
if (q[1].qt_dquot == NULL) {
xfs_dqlock(q[0].qt_dquot);
xfs_trans_dqjoin(tp, q[0].qt_dquot);
- } else {
- ASSERT(XFS_QM_TRANS_MAXDQS == 2);
+ } else if (q[2].qt_dquot == NULL) {
xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot);
xfs_trans_dqjoin(tp, q[0].qt_dquot);
xfs_trans_dqjoin(tp, q[1].qt_dquot);
+ } else {
+ xfs_dqlockn(q);
+ for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+ if (q[i].qt_dquot == NULL)
+ break;
+ xfs_trans_dqjoin(tp, q[i].qt_dquot);
+ }
}
}
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 364104e1b38a..ab3d22f662f2 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -17,15 +17,13 @@
#include "xfs_acl.h"
#include "xfs_log.h"
#include "xfs_xattr.h"
+#include "xfs_quota.h"
#include <linux/posix_acl_xattr.h>
/*
* Get permission to use log-assisted atomic exchange of file extents.
- *
- * Callers must not be running any transactions or hold any inode locks, and
- * they must release the permission by calling xlog_drop_incompat_feat
- * when they're done.
+ * Callers must not be running any transactions or hold any ILOCKs.
*/
static inline int
xfs_attr_grab_log_assist(
@@ -33,17 +31,8 @@ xfs_attr_grab_log_assist(
{
int error = 0;
- /*
- * Protect ourselves from an idle log clearing the logged xattrs log
- * incompat feature bit.
- */
- xlog_use_incompat_feat(mp->m_log);
-
- /*
- * If log-assisted xattrs are already enabled, the caller can use the
- * log assisted swap functions with the log-incompat reference we got.
- */
- if (xfs_sb_version_haslogxattrs(&mp->m_sb))
+ /* xattr update log intent items are already enabled */
+ if (xfs_is_using_logged_xattrs(mp))
return 0;
/*
@@ -52,31 +41,20 @@ xfs_attr_grab_log_assist(
* a V5 filesystem for the superblock field, but we'll require rmap
* or reflink to avoid having to deal with really old kernels.
*/
- if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp)) {
- error = -EOPNOTSUPP;
- goto drop_incompat;
- }
+ if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
/* Enable log-assisted xattrs. */
error = xfs_add_incompat_log_feature(mp,
XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
if (error)
- goto drop_incompat;
+ return error;
+ xfs_set_using_logged_xattrs(mp);
xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP,
"EXPERIMENTAL logged extended attributes feature in use. Use at your own risk!");
return 0;
-drop_incompat:
- xlog_drop_incompat_feat(mp->m_log);
- return error;
-}
-
-static inline void
-xfs_attr_rele_log_assist(
- struct xfs_mount *mp)
-{
- xlog_drop_incompat_feat(mp->m_log);
}
static inline bool
@@ -93,17 +71,31 @@ xfs_attr_want_log_assist(
/*
* Set or remove an xattr, having grabbed the appropriate logging resources
- * prior to calling libxfs.
+ * prior to calling libxfs. Callers of this function are only required to
+ * initialize the inode, attr_filter, name, namelen, value, and valuelen fields
+ * of @args.
*/
int
xfs_attr_change(
- struct xfs_da_args *args)
+ struct xfs_da_args *args,
+ enum xfs_attr_update op)
{
struct xfs_mount *mp = args->dp->i_mount;
- bool use_logging = false;
int error;
- ASSERT(!(args->op_flags & XFS_DA_OP_LOGGED));
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ error = xfs_qm_dqattach(args->dp);
+ if (error)
+ return error;
+
+ /*
+ * We have no control over the attribute names that userspace passes us
+ * to remove, so we have to allow the name lookup prior to attribute
+ * removal to fail as well.
+ */
+ args->op_flags = XFS_DA_OP_OKNOENT;
if (xfs_attr_want_log_assist(mp)) {
error = xfs_attr_grab_log_assist(mp);
@@ -111,14 +103,14 @@ xfs_attr_change(
return error;
args->op_flags |= XFS_DA_OP_LOGGED;
- use_logging = true;
}
- error = xfs_attr_set(args);
+ args->owner = args->dp->i_ino;
+ args->geo = mp->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ xfs_attr_sethash(args);
- if (use_logging)
- xfs_attr_rele_log_assist(mp);
- return error;
+ return xfs_attr_set(args, op, args->attr_filter & XFS_ATTR_ROOT);
}
@@ -145,6 +137,20 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
return args.valuelen;
}
+static inline enum xfs_attr_update
+xfs_xattr_flags_to_op(
+ int flags,
+ const void *value)
+{
+ if (!value)
+ return XFS_ATTRUPDATE_REMOVE;
+ if (flags & XATTR_CREATE)
+ return XFS_ATTRUPDATE_CREATE;
+ if (flags & XATTR_REPLACE)
+ return XFS_ATTRUPDATE_REPLACE;
+ return XFS_ATTRUPDATE_UPSERT;
+}
+
static int
xfs_xattr_set(const struct xattr_handler *handler,
struct mnt_idmap *idmap, struct dentry *unused,
@@ -154,7 +160,6 @@ xfs_xattr_set(const struct xattr_handler *handler,
struct xfs_da_args args = {
.dp = XFS_I(inode),
.attr_filter = handler->flags,
- .attr_flags = flags,
.name = name,
.namelen = strlen(name),
.value = (void *)value,
@@ -162,7 +167,7 @@ xfs_xattr_set(const struct xattr_handler *handler,
};
int error;
- error = xfs_attr_change(&args);
+ error = xfs_attr_change(&args, xfs_xattr_flags_to_op(flags, value));
if (!error && (handler->flags & XFS_ATTR_ROOT))
xfs_forget_acl(inode, name);
return error;
@@ -237,6 +242,7 @@ xfs_xattr_put_listent(
int flags,
unsigned char *name,
int namelen,
+ void *value,
int valuelen)
{
char *prefix;
@@ -244,6 +250,10 @@ xfs_xattr_put_listent(
ASSERT(context->count >= 0);
+ /* Don't expose private xattr namespaces. */
+ if (flags & XFS_ATTR_PRIVATE_NSP_MASK)
+ return;
+
if (flags & XFS_ATTR_ROOT) {
#ifdef CONFIG_XFS_POSIX_ACL
if (namelen == SGI_ACL_FILE_SIZE &&
diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h
index cec766cad26c..c3eb858fb59e 100644
--- a/fs/xfs/xfs_xattr.h
+++ b/fs/xfs/xfs_xattr.h
@@ -6,7 +6,8 @@
#ifndef __XFS_XATTR_H__
#define __XFS_XATTR_H__
-int xfs_attr_change(struct xfs_da_args *args);
+enum xfs_attr_update;
+int xfs_attr_change(struct xfs_da_args *args, enum xfs_attr_update op);
extern const struct xattr_handler * const xfs_xattr_handlers[];