summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c1
-rw-r--r--fs/9p/v9fs_vfs.h1
-rw-r--r--fs/9p/xattr.c5
-rw-r--r--fs/affs/namei.c1
-rw-r--r--fs/anon_inodes.c4
-rw-r--r--fs/bcachefs/Kconfig4
-rw-r--r--fs/bcachefs/Makefile1
-rw-r--r--fs/bcachefs/alloc_background.c185
-rw-r--r--fs/bcachefs/alloc_background.h11
-rw-r--r--fs/bcachefs/alloc_foreground.c38
-rw-r--r--fs/bcachefs/backpointers.c124
-rw-r--r--fs/bcachefs/backpointers.h11
-rw-r--r--fs/bcachefs/bbpos.h17
-rw-r--r--fs/bcachefs/bbpos_types.h18
-rw-r--r--fs/bcachefs/bcachefs.h25
-rw-r--r--fs/bcachefs/bcachefs_format.h122
-rw-r--r--fs/bcachefs/bkey.h22
-rw-r--r--fs/bcachefs/bkey_methods.c169
-rw-r--r--fs/bcachefs/bkey_methods.h15
-rw-r--r--fs/bcachefs/bkey_sort.c6
-rw-r--r--fs/bcachefs/btree_cache.c43
-rw-r--r--fs/bcachefs/btree_cache.h5
-rw-r--r--fs/bcachefs/btree_gc.c148
-rw-r--r--fs/bcachefs/btree_io.c221
-rw-r--r--fs/bcachefs/btree_iter.c73
-rw-r--r--fs/bcachefs/btree_iter.h8
-rw-r--r--fs/bcachefs/btree_key_cache.c64
-rw-r--r--fs/bcachefs/btree_key_cache_types.h34
-rw-r--r--fs/bcachefs/btree_locking.c44
-rw-r--r--fs/bcachefs/btree_locking.h18
-rw-r--r--fs/bcachefs/btree_trans_commit.c202
-rw-r--r--fs/bcachefs/btree_types.h86
-rw-r--r--fs/bcachefs/btree_update_interior.c50
-rw-r--r--fs/bcachefs/btree_update_interior.h7
-rw-r--r--fs/bcachefs/buckets.c182
-rw-r--r--fs/bcachefs/buckets.h15
-rw-r--r--fs/bcachefs/chardev.c4
-rw-r--r--fs/bcachefs/compress.c26
-rw-r--r--fs/bcachefs/compress.h36
-rw-r--r--fs/bcachefs/darray.h6
-rw-r--r--fs/bcachefs/data_update.c61
-rw-r--r--fs/bcachefs/data_update.h1
-rw-r--r--fs/bcachefs/debug.c8
-rw-r--r--fs/bcachefs/dirent.c76
-rw-r--r--fs/bcachefs/dirent.h2
-rw-r--r--fs/bcachefs/disk_groups.c148
-rw-r--r--fs/bcachefs/disk_groups.h7
-rw-r--r--fs/bcachefs/disk_groups_types.h18
-rw-r--r--fs/bcachefs/ec.c83
-rw-r--r--fs/bcachefs/ec.h4
-rw-r--r--fs/bcachefs/errcode.h4
-rw-r--r--fs/bcachefs/error.c32
-rw-r--r--fs/bcachefs/error.h90
-rw-r--r--fs/bcachefs/extents.c409
-rw-r--r--fs/bcachefs/extents.h51
-rw-r--r--fs/bcachefs/fs-common.c2
-rw-r--r--fs/bcachefs/fs-io-buffered.c19
-rw-r--r--fs/bcachefs/fs-io-direct.c1
-rw-r--r--fs/bcachefs/fs-io-pagecache.c2
-rw-r--r--fs/bcachefs/fs-io-pagecache.h2
-rw-r--r--fs/bcachefs/fs-ioctl.c4
-rw-r--r--fs/bcachefs/fs-ioctl.h28
-rw-r--r--fs/bcachefs/fs.c19
-rw-r--r--fs/bcachefs/fsck.c185
-rw-r--r--fs/bcachefs/fsck.h1
-rw-r--r--fs/bcachefs/inode.c267
-rw-r--r--fs/bcachefs/inode.h28
-rw-r--r--fs/bcachefs/io_misc.c15
-rw-r--r--fs/bcachefs/io_misc.h2
-rw-r--r--fs/bcachefs/io_read.c6
-rw-r--r--fs/bcachefs/io_write.c42
-rw-r--r--fs/bcachefs/journal.c50
-rw-r--r--fs/bcachefs/journal.h99
-rw-r--r--fs/bcachefs/journal_io.c302
-rw-r--r--fs/bcachefs/journal_reclaim.c42
-rw-r--r--fs/bcachefs/journal_types.h26
-rw-r--r--fs/bcachefs/lru.c18
-rw-r--r--fs/bcachefs/lru.h2
-rw-r--r--fs/bcachefs/move.c407
-rw-r--r--fs/bcachefs/move.h63
-rw-r--r--fs/bcachefs/move_types.h8
-rw-r--r--fs/bcachefs/movinggc.c69
-rw-r--r--fs/bcachefs/opts.c15
-rw-r--r--fs/bcachefs/opts.h4
-rw-r--r--fs/bcachefs/printbuf.c4
-rw-r--r--fs/bcachefs/quota.c15
-rw-r--r--fs/bcachefs/quota.h2
-rw-r--r--fs/bcachefs/rebalance.c558
-rw-r--r--fs/bcachefs/rebalance.h9
-rw-r--r--fs/bcachefs/rebalance_types.h31
-rw-r--r--fs/bcachefs/recovery.c52
-rw-r--r--fs/bcachefs/recovery_types.h6
-rw-r--r--fs/bcachefs/reflink.c61
-rw-r--r--fs/bcachefs/reflink.h6
-rw-r--r--fs/bcachefs/replicas.c18
-rw-r--r--fs/bcachefs/sb-clean.c5
-rw-r--r--fs/bcachefs/sb-errors.c172
-rw-r--r--fs/bcachefs/sb-errors.h270
-rw-r--r--fs/bcachefs/sb-errors_types.h16
-rw-r--r--fs/bcachefs/sb-members.c159
-rw-r--r--fs/bcachefs/sb-members.h49
-rw-r--r--fs/bcachefs/six.c15
-rw-r--r--fs/bcachefs/snapshot.c202
-rw-r--r--fs/bcachefs/snapshot.h6
-rw-r--r--fs/bcachefs/subvolume.c37
-rw-r--r--fs/bcachefs/subvolume.h2
-rw-r--r--fs/bcachefs/subvolume_types.h2
-rw-r--r--fs/bcachefs/super-io.c20
-rw-r--r--fs/bcachefs/super-io.h40
-rw-r--r--fs/bcachefs/super.c127
-rw-r--r--fs/bcachefs/super_types.h12
-rw-r--r--fs/bcachefs/sysfs.c47
-rw-r--r--fs/bcachefs/trace.c1
-rw-r--r--fs/bcachefs/trace.h101
-rw-r--r--fs/bcachefs/util.c18
-rw-r--r--fs/bcachefs/util.h21
-rw-r--r--fs/bcachefs/xattr.c69
-rw-r--r--fs/bcachefs/xattr.h2
-rw-r--r--fs/befs/linuxvfs.c1
-rw-r--r--fs/btrfs/ctree.c2
-rw-r--r--fs/btrfs/delayed-ref.c4
-rw-r--r--fs/btrfs/extent-tree.c25
-rw-r--r--fs/btrfs/extent-tree.h3
-rw-r--r--fs/btrfs/inode.c7
-rw-r--r--fs/btrfs/ioctl.c10
-rw-r--r--fs/btrfs/qgroup.c10
-rw-r--r--fs/btrfs/raid-stripe-tree.c2
-rw-r--r--fs/btrfs/scrub.c10
-rw-r--r--fs/btrfs/super.c2
-rw-r--r--fs/btrfs/volumes.c6
-rw-r--r--fs/btrfs/zoned.c7
-rw-r--r--fs/buffer.c129
-rw-r--r--fs/ceph/acl.c12
-rw-r--r--fs/ceph/addr.c299
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c763
-rw-r--r--fs/ceph/crypto.c43
-rw-r--r--fs/ceph/debugfs.c10
-rw-r--r--fs/ceph/dir.c242
-rw-r--r--fs/ceph/export.c49
-rw-r--r--fs/ceph/file.c282
-rw-r--r--fs/ceph/inode.c511
-rw-r--r--fs/ceph/ioctl.c21
-rw-r--r--fs/ceph/locks.c57
-rw-r--r--fs/ceph/mds_client.c678
-rw-r--r--fs/ceph/mds_client.h13
-rw-r--r--fs/ceph/mdsmap.c29
-rw-r--r--fs/ceph/mdsmap.h75
-rw-r--r--fs/ceph/metric.c5
-rw-r--r--fs/ceph/quota.c29
-rw-r--r--fs/ceph/snap.c192
-rw-r--r--fs/ceph/super.c99
-rw-r--r--fs/ceph/super.h23
-rw-r--r--fs/ceph/xattr.c108
-rw-r--r--fs/char_dev.c2
-rw-r--r--fs/crypto/keysetup_v1.c2
-rw-r--r--fs/dax.c24
-rw-r--r--fs/dcache.c8
-rw-r--r--fs/debugfs/file.c2
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/efivarfs/inode.c4
-rw-r--r--fs/efivarfs/internal.h9
-rw-r--r--fs/efivarfs/super.c70
-rw-r--r--fs/efs/super.c1
-rw-r--r--fs/erofs/Kconfig2
-rw-r--r--fs/erofs/data.c5
-rw-r--r--fs/erofs/inode.c98
-rw-r--r--fs/erofs/super.c1
-rw-r--r--fs/erofs/utils.c19
-rw-r--r--fs/eventpoll.c6
-rw-r--r--fs/exec.c4
-rw-r--r--fs/exfat/file.c1
-rw-r--r--fs/exfat/inode.c4
-rw-r--r--fs/exportfs/expfs.c57
-rw-r--r--fs/ext2/super.c1
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/extents_status.c23
-rw-r--r--fs/ext4/inode.c14
-rw-r--r--fs/ext4/move_extent.c11
-rw-r--r--fs/ext4/readpage.c14
-rw-r--r--fs/ext4/super.c14
-rw-r--r--fs/f2fs/compress.c63
-rw-r--r--fs/f2fs/data.c35
-rw-r--r--fs/f2fs/extent_cache.c53
-rw-r--r--fs/f2fs/file.c17
-rw-r--r--fs/f2fs/inode.c2
-rw-r--r--fs/f2fs/node.c20
-rw-r--r--fs/f2fs/segment.c92
-rw-r--r--fs/f2fs/segment.h4
-rw-r--r--fs/f2fs/super.c130
-rw-r--r--fs/f2fs/xattr.c20
-rw-r--r--fs/fat/nfs.c1
-rw-r--r--fs/fcntl.c2
-rw-r--r--fs/fhandle.c6
-rw-r--r--fs/file_table.c2
-rw-r--r--fs/freevxfs/vxfs_super.c2
-rw-r--r--fs/fuse/inode.c7
-rw-r--r--fs/gfs2/acl.h8
-rw-r--r--fs/gfs2/aops.c74
-rw-r--r--fs/gfs2/aops.h6
-rw-r--r--fs/gfs2/bmap.c65
-rw-r--r--fs/gfs2/bmap.h38
-rw-r--r--fs/gfs2/dir.c2
-rw-r--r--fs/gfs2/dir.h38
-rw-r--r--fs/gfs2/file.c18
-rw-r--r--fs/gfs2/glock.c20
-rw-r--r--fs/gfs2/glock.h113
-rw-r--r--fs/gfs2/glops.c13
-rw-r--r--fs/gfs2/glops.h4
-rw-r--r--fs/gfs2/incore.h2
-rw-r--r--fs/gfs2/inode.c33
-rw-r--r--fs/gfs2/inode.h60
-rw-r--r--fs/gfs2/log.h46
-rw-r--r--fs/gfs2/lops.h22
-rw-r--r--fs/gfs2/main.c6
-rw-r--r--fs/gfs2/meta_io.c61
-rw-r--r--fs/gfs2/meta_io.h20
-rw-r--r--fs/gfs2/ops_fstype.c37
-rw-r--r--fs/gfs2/quota.c93
-rw-r--r--fs/gfs2/quota.h41
-rw-r--r--fs/gfs2/recovery.h18
-rw-r--r--fs/gfs2/rgrp.c12
-rw-r--r--fs/gfs2/rgrp.h85
-rw-r--r--fs/gfs2/super.c29
-rw-r--r--fs/gfs2/super.h50
-rw-r--r--fs/gfs2/trans.h24
-rw-r--r--fs/gfs2/util.h8
-rw-r--r--fs/gfs2/xattr.c6
-rw-r--r--fs/gfs2/xattr.h12
-rw-r--r--fs/hugetlbfs/inode.c86
-rw-r--r--fs/inode.c8
-rw-r--r--fs/iomap/buffered-io.c57
-rw-r--r--fs/jbd2/journal.c29
-rw-r--r--fs/jffs2/super.c1
-rw-r--r--fs/jfs/super.c1
-rw-r--r--fs/kernfs/file.c78
-rw-r--r--fs/kernfs/mount.c7
-rw-r--r--fs/libfs.c44
-rw-r--r--fs/locks.c4
-rw-r--r--fs/mbcache.c22
-rw-r--r--fs/mnt_idmapping.c2
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/namespace.c16
-rw-r--r--fs/nfs/Kconfig2
-rw-r--r--fs/nfs/delegation.c7
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c29
-rw-r--r--fs/nfs/nfs3proc.c3
-rw-r--r--fs/nfs/nfs42xattr.c87
-rw-r--r--fs/nfs/nfs4_fs.h2
-rw-r--r--fs/nfs/nfs4proc.c62
-rw-r--r--fs/nfs/pnfs.c8
-rw-r--r--fs/nfs/pnfs.h5
-rw-r--r--fs/nfs/proc.c3
-rw-r--r--fs/nfs/super.c29
-rw-r--r--fs/nfs/write.c2
-rw-r--r--fs/nfsd/cache.h4
-rw-r--r--fs/nfsd/export.c3
-rw-r--r--fs/nfsd/filecache.c23
-rw-r--r--fs/nfsd/netns.h4
-rw-r--r--fs/nfsd/nfs4state.c21
-rw-r--r--fs/nfsd/nfscache.c118
-rw-r--r--fs/nfsd/nfssvc.c14
-rw-r--r--fs/nilfs2/mdt.c66
-rw-r--r--fs/nilfs2/page.c76
-rw-r--r--fs/nilfs2/page.h11
-rw-r--r--fs/nilfs2/segment.c7
-rw-r--r--fs/notify/dnotify/dnotify.c6
-rw-r--r--fs/notify/fanotify/fanotify_user.c12
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/ntfs/aops.c255
-rw-r--r--fs/ntfs/file.c89
-rw-r--r--fs/ntfs/namei.c1
-rw-r--r--fs/ntfs3/file.c31
-rw-r--r--fs/ntfs3/super.c1
-rw-r--r--fs/ocfs2/alloc.c11
-rw-r--r--fs/ocfs2/aops.c19
-rw-r--r--fs/ocfs2/buffer_head_io.c4
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c3
-rw-r--r--fs/ocfs2/journal.c2
-rw-r--r--fs/ocfs2/namei.c8
-rw-r--r--fs/ocfs2/quota_local.c4
-rw-r--r--fs/overlayfs/Makefile2
-rw-r--r--fs/overlayfs/copy_up.c142
-rw-r--r--fs/overlayfs/dir.c64
-rw-r--r--fs/overlayfs/export.c7
-rw-r--r--fs/overlayfs/file.c88
-rw-r--r--fs/overlayfs/inode.c165
-rw-r--r--fs/overlayfs/namei.c52
-rw-r--r--fs/overlayfs/overlayfs.h72
-rw-r--r--fs/overlayfs/params.c333
-rw-r--r--fs/overlayfs/params.h1
-rw-r--r--fs/overlayfs/readdir.c27
-rw-r--r--fs/overlayfs/super.c92
-rw-r--r--fs/overlayfs/util.c119
-rw-r--r--fs/overlayfs/xattrs.c271
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/proc/array.c7
-rw-r--r--fs/proc/base.c52
-rw-r--r--fs/proc/inode.c11
-rw-r--r--fs/proc/kcore.c3
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/task_mmu.c744
-rw-r--r--fs/quota/dquot.c17
-rw-r--r--fs/reiserfs/inode.c80
-rw-r--r--fs/smb/client/cached_dir.c84
-rw-r--r--fs/smb/client/cifs_debug.c33
-rw-r--r--fs/smb/client/cifs_ioctl.h6
-rw-r--r--fs/smb/client/cifs_spnego.c4
-rw-r--r--fs/smb/client/cifsfs.c1
-rw-r--r--fs/smb/client/cifsfs.h4
-rw-r--r--fs/smb/client/cifsglob.h16
-rw-r--r--fs/smb/client/cifspdu.h2
-rw-r--r--fs/smb/client/cifsproto.h14
-rw-r--r--fs/smb/client/connect.c98
-rw-r--r--fs/smb/client/dfs.c18
-rw-r--r--fs/smb/client/export.c11
-rw-r--r--fs/smb/client/fs_context.h1
-rw-r--r--fs/smb/client/inode.c4
-rw-r--r--fs/smb/client/ioctl.c26
-rw-r--r--fs/smb/client/link.c16
-rw-r--r--fs/smb/client/namespace.c17
-rw-r--r--fs/smb/client/ntlmssp.h4
-rw-r--r--fs/smb/client/sess.c276
-rw-r--r--fs/smb/client/smb2misc.c2
-rw-r--r--fs/smb/client/smb2ops.c14
-rw-r--r--fs/smb/client/smb2pdu.c131
-rw-r--r--fs/smb/client/smb2transport.c13
-rw-r--r--fs/smb/client/transport.c13
-rw-r--r--fs/smb/client/xattr.c5
-rw-r--r--fs/smb/common/smb2pdu.h24
-rw-r--r--fs/smb/server/ksmbd_spnego_negtokeninit.asn18
-rw-r--r--fs/smb/server/ksmbd_spnego_negtokentarg.asn17
-rw-r--r--fs/smb/server/smb_common.c11
-rw-r--r--fs/smb/server/smbacl.c29
-rw-r--r--fs/smb/server/vfs.c7
-rw-r--r--fs/squashfs/export.c1
-rw-r--r--fs/super.c36
-rw-r--r--fs/sysfs/file.c13
-rw-r--r--fs/tracefs/event_inode.c1093
-rw-r--r--fs/tracefs/inode.c2
-rw-r--r--fs/tracefs/internal.h54
-rw-r--r--fs/ubifs/auth.c3
-rw-r--r--fs/ubifs/dir.c4
-rw-r--r--fs/ubifs/file.c3
-rw-r--r--fs/ubifs/journal.c1
-rw-r--r--fs/ubifs/replay.c1
-rw-r--r--fs/ubifs/super.c26
-rw-r--r--fs/ubifs/tnc.c1
-rw-r--r--fs/ubifs/ubifs.h2
-rw-r--r--fs/ufs/balloc.c20
-rw-r--r--fs/ufs/inode.c25
-rw-r--r--fs/ufs/super.c1
-rw-r--r--fs/ufs/util.c34
-rw-r--r--fs/ufs/util.h10
-rw-r--r--fs/userfaultfd.c98
-rw-r--r--fs/xfs/Kconfig2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c27
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c45
-rw-r--r--fs/xfs/libxfs/xfs_defer.c28
-rw-r--r--fs/xfs/libxfs/xfs_defer.h2
-rw-r--r--fs/xfs/libxfs/xfs_format.h34
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c3
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c807
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.h383
-rw-r--r--fs/xfs/libxfs/xfs_sb.c2
-rw-r--r--fs/xfs/libxfs/xfs_sb.h2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c10
-rw-r--r--fs/xfs/libxfs/xfs_types.c4
-rw-r--r--fs/xfs/libxfs/xfs_types.h10
-rw-r--r--fs/xfs/scrub/bmap.c2
-rw-r--r--fs/xfs/scrub/fscounters.c2
-rw-r--r--fs/xfs/scrub/inode.c3
-rw-r--r--fs/xfs/scrub/rtbitmap.c28
-rw-r--r--fs/xfs/scrub/rtsummary.c72
-rw-r--r--fs/xfs/scrub/trace.c1
-rw-r--r--fs/xfs/scrub/trace.h15
-rw-r--r--fs/xfs/xfs_bmap_util.c74
-rw-r--r--fs/xfs/xfs_buf.c24
-rw-r--r--fs/xfs/xfs_buf.h2
-rw-r--r--fs/xfs/xfs_file.c63
-rw-r--r--fs/xfs/xfs_fsmap.c15
-rw-r--r--fs/xfs/xfs_icache.c26
-rw-r--r--fs/xfs/xfs_inode.c24
-rw-r--r--fs/xfs/xfs_inode.h9
-rw-r--r--fs/xfs/xfs_inode_item.c3
-rw-r--r--fs/xfs/xfs_inode_item_recover.c46
-rw-r--r--fs/xfs/xfs_ioctl.c5
-rw-r--r--fs/xfs/xfs_linux.h12
-rw-r--r--fs/xfs/xfs_log.c23
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_mount.h10
-rw-r--r--fs/xfs/xfs_ondisk.h4
-rw-r--r--fs/xfs/xfs_qm.c27
-rw-r--r--fs/xfs/xfs_qm.h2
-rw-r--r--fs/xfs/xfs_reflink.c5
-rw-r--r--fs/xfs/xfs_rtalloc.c638
-rw-r--r--fs/xfs/xfs_rtalloc.h94
-rw-r--r--fs/xfs/xfs_super.c3
-rw-r--r--fs/xfs/xfs_trans.c7
401 files changed, 13044 insertions, 8242 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index d525957594b6..61dbe52bb3a3 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -732,4 +732,5 @@ module_exit(exit_v9fs)
MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>");
MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>");
+MODULE_DESCRIPTION("9P Client File System");
MODULE_LICENSE("GPL");
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index cdf441f22e07..731e3d14b67d 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -52,7 +52,6 @@ void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
unsigned int flags);
int v9fs_dir_release(struct inode *inode, struct file *filp);
int v9fs_file_open(struct inode *inode, struct file *file);
-void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
int v9fs_uflags2omode(int uflags, int extended);
void v9fs_blank_wstat(struct p9_wstat *wstat);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 053d1cef6e13..8604e3377ee7 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -68,7 +68,7 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
struct p9_fid *fid;
int ret;
- p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n",
+ p9_debug(P9_DEBUG_VFS, "name = '%s' value_len = %zu\n",
name, buffer_size);
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
@@ -139,7 +139,8 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- return v9fs_xattr_get(dentry, NULL, buffer, buffer_size);
+ /* Txattrwalk with an empty string lists xattrs instead */
+ return v9fs_xattr_get(dentry, "", buffer, buffer_size);
}
static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 2fe4a5832fcf..d6b9758ee23d 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -568,6 +568,7 @@ static struct dentry *affs_fh_to_parent(struct super_block *sb, struct fid *fid,
}
const struct export_operations affs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = affs_fh_to_dentry,
.fh_to_parent = affs_fh_to_parent,
.get_parent = affs_get_parent,
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 24192a7667ed..d26222b7eefe 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -24,8 +24,8 @@
#include <linux/uaccess.h>
-static struct vfsmount *anon_inode_mnt __read_mostly;
-static struct inode *anon_inode_inode;
+static struct vfsmount *anon_inode_mnt __ro_after_init;
+static struct inode *anon_inode_inode __ro_after_init;
/*
* anon_inodefs_dname() is called from d_path().
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index df13a4f9a6e3..c08c2c7d6fbb 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -24,7 +24,6 @@ config BCACHEFS_FS
select XXHASH
select SRCU
select SYMBOLIC_ERRNAME
- select MEAN_AND_VARIANCE
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
@@ -42,7 +41,6 @@ config BCACHEFS_POSIX_ACL
config BCACHEFS_DEBUG_TRANSACTIONS
bool "bcachefs runtime info"
depends on BCACHEFS_FS
- default y
help
This makes the list of running btree transactions available in debugfs.
@@ -78,7 +76,7 @@ config BCACHEFS_NO_LATENCY_ACCT
config MEAN_AND_VARIANCE_UNIT_TEST
tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
depends on KUNIT
- select MEAN_AND_VARIANCE
+ depends on BCACHEFS_FS
default KUNIT_ALL_TESTS
help
This option enables the kunit tests for mean_and_variance module.
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 0749731b9072..45b64f89258c 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -70,6 +70,7 @@ bcachefs-y := \
reflink.o \
replicas.o \
sb-clean.o \
+ sb-errors.o \
sb-members.o \
siphash.o \
six.o \
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 2d516207e223..1fec0e67891f 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -192,123 +192,109 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
return DIV_ROUND_UP(bytes, sizeof(u64));
}
-int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+ int ret = 0;
/* allow for unknown fields */
- if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) {
- prt_printf(err, "incorrect value size (%zu < %u)",
- bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err,
+ alloc_v1_val_size_bad,
+ "incorrect value size (%zu < %u)",
+ bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
+fsck_err:
+ return ret;
}
-int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_alloc_unpacked u;
+ int ret = 0;
- if (bch2_alloc_unpack_v2(&u, k)) {
- prt_printf(err, "unpack error");
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err,
+ alloc_v2_unpack_error,
+ "unpack error");
+fsck_err:
+ return ret;
}
-int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_alloc_unpacked u;
+ int ret = 0;
- if (bch2_alloc_unpack_v3(&u, k)) {
- prt_printf(err, "unpack error");
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err,
+ alloc_v2_unpack_error,
+ "unpack error");
+fsck_err:
+ return ret;
}
-int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err)
{
struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+ int ret = 0;
- if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) {
- prt_printf(err, "bad val size (%u > %zu)",
- alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err,
+ alloc_v4_val_size_bad,
+ "bad val size (%u > %zu)",
+ alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
- if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
- BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) {
- prt_printf(err, "invalid backpointers_start");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
+ alloc_v4_backpointers_start_bad,
+ "invalid backpointers_start");
- if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) {
- prt_printf(err, "invalid data type (got %u should be %u)",
- a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err,
+ alloc_key_data_type_bad,
+ "invalid data type (got %u should be %u)",
+ a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
switch (a.v->data_type) {
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard:
- if (a.v->dirty_sectors ||
- a.v->cached_sectors ||
- a.v->stripe) {
- prt_printf(err, "empty data type free but have data");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(a.v->dirty_sectors ||
+ a.v->cached_sectors ||
+ a.v->stripe, c, err,
+ alloc_key_empty_but_have_data,
+ "empty data type free but have data");
break;
case BCH_DATA_sb:
case BCH_DATA_journal:
case BCH_DATA_btree:
case BCH_DATA_user:
case BCH_DATA_parity:
- if (!a.v->dirty_sectors) {
- prt_printf(err, "data_type %s but dirty_sectors==0",
- bch2_data_types[a.v->data_type]);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(!a.v->dirty_sectors, c, err,
+ alloc_key_dirty_sectors_0,
+ "data_type %s but dirty_sectors==0",
+ bch2_data_types[a.v->data_type]);
break;
case BCH_DATA_cached:
- if (!a.v->cached_sectors ||
- a.v->dirty_sectors ||
- a.v->stripe) {
- prt_printf(err, "data type inconsistency");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (!a.v->io_time[READ] &&
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) {
- prt_printf(err, "cached bucket with read_time == 0");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(!a.v->cached_sectors ||
+ a.v->dirty_sectors ||
+ a.v->stripe, c, err,
+ alloc_key_cached_inconsistency,
+ "data type inconsistency");
+
+ bkey_fsck_err_on(!a.v->io_time[READ] &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
+ c, err,
+ alloc_key_cached_but_read_time_zero,
+ "cached bucket with read_time == 0");
break;
case BCH_DATA_stripe:
break;
}
-
- return 0;
-}
-
-static inline u64 swab40(u64 x)
-{
- return (((x & 0x00000000ffULL) << 32)|
- ((x & 0x000000ff00ULL) << 16)|
- ((x & 0x0000ff0000ULL) >> 0)|
- ((x & 0x00ff000000ULL) >> 16)|
- ((x & 0xff00000000ULL) >> 32));
+fsck_err:
+ return ret;
}
void bch2_alloc_v4_swab(struct bkey_s k)
@@ -324,6 +310,7 @@ void bch2_alloc_v4_swab(struct bkey_s k)
a->io_time[1] = swab64(a->io_time[1]);
a->stripe = swab32(a->stripe);
a->nr_external_backpointers = swab32(a->nr_external_backpointers);
+ a->fragmentation_lru = swab64(a->fragmentation_lru);
bps = alloc_v4_backpointers(a);
for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
@@ -521,17 +508,18 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
: 0;
}
-int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) {
- prt_printf(err, "bad val size (%zu != %zu)",
- bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
- return 0;
+ bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err,
+ bucket_gens_val_size_bad,
+ "bad val size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
+fsck_err:
+ return ret;
}
void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
@@ -727,7 +715,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans,
"incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
" for %s",
set ? "setting" : "clearing",
- bch2_btree_ids[btree],
+ bch2_btree_id_str(btree),
iter.pos.inode,
iter.pos.offset,
bch2_bkey_types[old.k->type],
@@ -986,6 +974,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
int ret;
if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
+ alloc_key_to_missing_dev_bucket,
"alloc key for invalid device:bucket %llu:%llu",
alloc_k.k->p.inode, alloc_k.k->p.offset))
return bch2_btree_delete_at(trans, alloc_iter, 0);
@@ -1005,7 +994,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (k.k->type != discard_key_type &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n"
+ fsck_err(c, need_discard_key_wrong,
+ "incorrect key in need_discard btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
bch2_bkey_types[discard_key_type],
@@ -1035,7 +1025,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (k.k->type != freespace_key_type &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n"
+ fsck_err(c, freespace_key_wrong,
+ "incorrect key in freespace btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
bch2_bkey_types[freespace_key_type],
@@ -1066,7 +1057,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (a->gen != alloc_gen(k, gens_offset) &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "incorrect gen in bucket_gens btree (got %u should be %u)\n"
+ fsck_err(c, bucket_gens_key_wrong,
+ "incorrect gen in bucket_gens btree (got %u should be %u)\n"
" %s",
alloc_gen(k, gens_offset), a->gen,
(printbuf_reset(&buf),
@@ -1124,7 +1116,8 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_set &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "hole in alloc btree missing in freespace btree\n"
+ fsck_err(c, freespace_hole_missing,
+ "hole in alloc btree missing in freespace btree\n"
" device %llu buckets %llu-%llu",
freespace_iter->pos.inode,
freespace_iter->pos.offset,
@@ -1187,6 +1180,7 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
for (i = gens_offset; i < gens_end_offset; i++) {
if (fsck_err_on(g.v.gens[i], c,
+ bucket_gens_hole_wrong,
"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
bucket_gens_pos_to_alloc(k.k->p, i).inode,
bucket_gens_pos_to_alloc(k.k->p, i).offset,
@@ -1244,8 +1238,9 @@ static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_tr
return ret;
if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
+ need_discard_freespace_key_to_invalid_dev_bucket,
"entry in %s btree for nonexistant dev:bucket %llu:%llu",
- bch2_btree_ids[iter->btree_id], pos.inode, pos.offset))
+ bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
goto delete;
a = bch2_alloc_to_v4(alloc_k, &a_convert);
@@ -1253,9 +1248,10 @@ static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_tr
if (fsck_err_on(a->data_type != state ||
(state == BCH_DATA_free &&
genbits != alloc_freespace_genbits(*a)), c,
+ need_discard_freespace_key_bad,
"%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
- bch2_btree_ids[iter->btree_id],
+ bch2_btree_id_str(iter->btree_id),
iter->pos.inode,
iter->pos.offset,
a->data_type == state,
@@ -1320,6 +1316,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
dev_exists = bch2_dev_exists2(c, k.k->p.inode);
if (!dev_exists) {
if (fsck_err_on(!dev_exists, c,
+ bucket_gens_to_invalid_dev,
"bucket_gens key for invalid device:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
@@ -1330,6 +1327,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
ca = bch_dev_bkey_exists(c, k.k->p.inode);
if (fsck_err_on(end <= ca->mi.first_bucket ||
start >= ca->mi.nbuckets, c,
+ bucket_gens_to_invalid_buckets,
"bucket_gens key for invalid buckets:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
@@ -1338,6 +1336,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
for (b = start; b < ca->mi.first_bucket; b++)
if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
+ bucket_gens_nonzero_for_invalid_buckets,
"bucket_gens key has nonzero gen for invalid bucket")) {
g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
need_update = true;
@@ -1345,6 +1344,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
for (b = ca->mi.nbuckets; b < end; b++)
if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
+ bucket_gens_nonzero_for_invalid_buckets,
"bucket_gens key has nonzero gen for invalid bucket")) {
g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
need_update = true;
@@ -1495,11 +1495,13 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
return ret;
if (fsck_err_on(!a->io_time[READ], c,
+ alloc_key_cached_but_read_time_zero,
"cached bucket with read_time 0\n"
" %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
+ alloc_key_to_missing_lru_entry,
"missing lru entry\n"
" %s",
(printbuf_reset(&buf),
@@ -2075,6 +2077,17 @@ void bch2_recalc_capacity(struct bch_fs *c)
closure_wake_up(&c->freelist_wait);
}
+u64 bch2_min_rw_member_capacity(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+ u64 ret = U64_MAX;
+
+ for_each_rw_member(ca, c, i)
+ ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
+ return ret;
+}
+
static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
struct open_bucket *ob;
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 97042067d2a9..73faf99a222a 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -149,13 +149,13 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
-int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
-int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
-int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
-int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_alloc_v4_swab(struct bkey_s);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
@@ -193,7 +193,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
.min_val_size = 48, \
})
-int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
@@ -249,6 +249,7 @@ int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
int bch2_fs_freespace_init(struct bch_fs *);
void bch2_recalc_capacity(struct bch_fs *);
+u64 bch2_min_rw_member_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 3bc4abd3d7d5..b85c7765272f 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -399,12 +399,23 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
struct bucket_alloc_state *s,
struct closure *cl)
{
- struct btree_iter iter;
- struct bkey_s_c k;
+ struct btree_iter iter, citer;
+ struct bkey_s_c k, ck;
struct open_bucket *ob = NULL;
- u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
- u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
+ u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+ u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
+ u64 alloc_cursor = alloc_start;
int ret;
+
+ /*
+ * Scan with an uncached iterator to avoid polluting the key cache. An
+ * uncached iter will return a cached key if one exists, but if not
+ * there is no other underlying protection for the associated key cache
+ * slot. To avoid racing bucket allocations, look up the cached key slot
+ * of any likely allocation candidate before attempting to proceed with
+ * the allocation. This provides proper exclusion on the associated
+ * bucket.
+ */
again:
for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
BTREE_ITER_SLOTS, k, ret) {
@@ -419,25 +430,38 @@ again:
continue;
a = bch2_alloc_to_v4(k, &a_convert);
-
if (a->data_type != BCH_DATA_free)
continue;
+ /* now check the cached key to serialize concurrent allocs of the bucket */
+ ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
+ ret = bkey_err(ck);
+ if (ret)
+ break;
+
+ a = bch2_alloc_to_v4(ck, &a_convert);
+ if (a->data_type != BCH_DATA_free)
+ goto next;
+
s->buckets_seen++;
ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
+next:
+ citer.path->preserve = false;
+ bch2_trans_iter_exit(trans, &citer);
if (ob)
break;
}
bch2_trans_iter_exit(trans, &iter);
+ alloc_cursor = iter.pos.offset;
ca->alloc_cursor = alloc_cursor;
if (!ob && ret)
ob = ERR_PTR(ret);
- if (!ob && alloc_cursor > alloc_start) {
- alloc_cursor = alloc_start;
+ if (!ob && alloc_start > first_bucket) {
+ alloc_cursor = alloc_start = first_bucket;
goto again;
}
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index cc856150a948..23c0834a97a4 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -5,6 +5,7 @@
#include "backpointers.h"
#include "btree_cache.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "error.h"
@@ -37,25 +38,26 @@ static bool extent_matches_bp(struct bch_fs *c,
return false;
}
-int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
+ int ret = 0;
- if (!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) {
- prt_str(err, "backpointer at wrong pos");
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
+ c, err,
+ backpointer_pos_wrong,
+ "backpointer at wrong pos");
+fsck_err:
+ return ret;
}
void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp)
{
prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=",
- bch2_btree_ids[bp->btree_id],
+ bch2_btree_id_str(bp->btree_id),
bp->level,
(u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT),
(u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
@@ -76,7 +78,7 @@ void bch2_backpointer_swab(struct bkey_s k)
{
struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
- bp.v->bucket_offset = swab32(bp.v->bucket_offset);
+ bp.v->bucket_offset = swab40(bp.v->bucket_offset);
bp.v->bucket_len = swab32(bp.v->bucket_len);
bch2_bpos_swab(&bp.v->pos);
}
@@ -219,18 +221,22 @@ out:
static void backpointer_not_found(struct btree_trans *trans,
struct bpos bp_pos,
struct bch_backpointer bp,
- struct bkey_s_c k,
- const char *thing_it_points_to)
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+ /*
+ * If we're using the btree write buffer, the backpointer we were
+ * looking at may have already been deleted - failure to find what it
+ * pointed to is not an error:
+ */
if (likely(!bch2_backpointers_no_use_write_buffer))
return;
prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
- thing_it_points_to);
+ bp.level ? "btree node" : "extent");
prt_printf(&buf, "bucket: ");
bch2_bpos_to_text(&buf, bucket);
prt_printf(&buf, "\n ");
@@ -256,56 +262,37 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
struct bch_backpointer bp,
unsigned iter_flags)
{
- struct bch_fs *c = trans->c;
- struct btree_root *r = bch2_btree_id_root(c, bp.btree_id);
- struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
- struct bkey_s_c k;
-
- bch2_trans_node_iter_init(trans, iter,
- bp.btree_id,
- bp.pos,
- 0,
- min(bp.level, r->level),
- iter_flags);
- k = bch2_btree_iter_peek_slot(iter);
- if (bkey_err(k)) {
- bch2_trans_iter_exit(trans, iter);
- return k;
- }
-
- if (bp.level == r->level + 1)
- k = bkey_i_to_s_c(&r->key);
-
- if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
- return k;
-
- bch2_trans_iter_exit(trans, iter);
+ if (likely(!bp.level)) {
+ struct bch_fs *c = trans->c;
+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+ struct bkey_s_c k;
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.btree_id,
+ bp.pos,
+ 0, 0,
+ iter_flags);
+ k = bch2_btree_iter_peek_slot(iter);
+ if (bkey_err(k)) {
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+ }
- if (unlikely(bch2_backpointers_no_use_write_buffer)) {
- if (bp.level) {
- struct btree *b;
+ if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+ return k;
- /*
- * If a backpointer for a btree node wasn't found, it may be
- * because it was overwritten by a new btree node that hasn't
- * been written out yet - backpointer_get_node() checks for
- * this:
- */
- b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
- if (!IS_ERR_OR_NULL(b))
- return bkey_i_to_s_c(&b->key);
+ bch2_trans_iter_exit(trans, iter);
+ backpointer_not_found(trans, bp_pos, bp, k);
+ return bkey_s_c_null;
+ } else {
+ struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
+ if (IS_ERR_OR_NULL(b)) {
bch2_trans_iter_exit(trans, iter);
-
- if (IS_ERR(b))
- return bkey_s_c_err(PTR_ERR(b));
- return bkey_s_c_null;
+ return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null;
}
-
- backpointer_not_found(trans, bp_pos, bp, k, "extent");
+ return bkey_i_to_s_c(&b->key);
}
-
- return bkey_s_c_null;
}
struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
@@ -326,19 +313,20 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
bp.level - 1,
0);
b = bch2_btree_iter_peek_node(iter);
- if (IS_ERR(b))
+ if (IS_ERR_OR_NULL(b))
goto err;
- if (b && extent_matches_bp(c, bp.btree_id, bp.level,
- bkey_i_to_s_c(&b->key),
- bucket, bp))
+ BUG_ON(b->c.level != bp.level - 1);
+
+ if (extent_matches_bp(c, bp.btree_id, bp.level,
+ bkey_i_to_s_c(&b->key),
+ bucket, bp))
return b;
- if (b && btree_node_will_make_reachable(b)) {
+ if (btree_node_will_make_reachable(b)) {
b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
} else {
- backpointer_not_found(trans, bp_pos, bp,
- bkey_i_to_s_c(&b->key), "btree node");
+ backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));
b = NULL;
}
err:
@@ -356,6 +344,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
int ret = 0;
if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
+ backpointer_to_missing_device,
"backpointer for missing device:\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, bp_iter, 0);
@@ -369,6 +358,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
goto out;
if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
+ backpointer_to_missing_alloc,
"backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
alloc_iter.pos.inode, alloc_iter.pos.offset,
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
@@ -453,14 +443,14 @@ fsck_err:
return ret;
missing:
prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
- bch2_btree_ids[bp.btree_id], bp.level);
+ bch2_btree_id_str(bp.btree_id), bp.level);
bch2_bkey_val_to_text(&buf, c, orig_k);
prt_printf(&buf, "\nbp pos ");
bch2_bpos_to_text(&buf, bp_iter.pos);
if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers ||
c->opts.reconstruct_alloc ||
- fsck_err(c, "%s", buf.buf))
+ fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
goto out;
@@ -793,7 +783,9 @@ static int check_one_backpointer(struct btree_trans *trans,
}
if (fsck_err_on(!k.k, c,
- "backpointer for missing extent\n %s",
+ backpointer_to_missing_ptr,
+ "backpointer for missing %s\n %s",
+ bp.v->level ? "btree node" : "extent",
(bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
goto out;
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 547e0617602a..ab866feeaf66 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -7,7 +7,16 @@
#include "buckets.h"
#include "super.h"
-int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k,
+static inline u64 swab40(u64 x)
+{
+ return (((x & 0x00000000ffULL) << 32)|
+ ((x & 0x000000ff00ULL) << 16)|
+ ((x & 0x0000ff0000ULL) >> 0)|
+ ((x & 0x00ff000000ULL) >> 16)|
+ ((x & 0xff00000000ULL) >> 32));
+}
+
+int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k,
enum bkey_invalid_flags, struct printbuf *);
void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h
index 1fbed1f8378d..be2edced5213 100644
--- a/fs/bcachefs/bbpos.h
+++ b/fs/bcachefs/bbpos.h
@@ -2,20 +2,9 @@
#ifndef _BCACHEFS_BBPOS_H
#define _BCACHEFS_BBPOS_H
+#include "bbpos_types.h"
#include "bkey_methods.h"
-
-struct bbpos {
- enum btree_id btree;
- struct bpos pos;
-};
-
-static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
-{
- return (struct bbpos) { btree, pos };
-}
-
-#define BBPOS_MIN BBPOS(0, POS_MIN)
-#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
+#include "btree_cache.h"
static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
{
@@ -40,7 +29,7 @@ static inline struct bbpos bbpos_successor(struct bbpos pos)
static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
{
- prt_str(out, bch2_btree_ids[pos.btree]);
+ prt_str(out, bch2_btree_id_str(pos.btree));
prt_char(out, ':');
bch2_bpos_to_text(out, pos.pos);
}
diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h
new file mode 100644
index 000000000000..5198e94cf3b8
--- /dev/null
+++ b/fs/bcachefs/bbpos_types.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BBPOS_TYPES_H
+#define _BCACHEFS_BBPOS_TYPES_H
+
+struct bbpos {
+ enum btree_id btree;
+ struct bpos pos;
+};
+
+static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
+{
+ return (struct bbpos) { btree, pos };
+}
+
+#define BBPOS_MIN BBPOS(0, POS_MIN)
+#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
+
+#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 53ffa88cae16..403aa3389fcc 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -209,6 +209,7 @@
#include "nocow_locking_types.h"
#include "opts.h"
#include "recovery_types.h"
+#include "sb-errors_types.h"
#include "seqmutex.h"
#include "util.h"
@@ -418,6 +419,7 @@ enum bch_time_stats {
#include "buckets_types.h"
#include "buckets_waiting_for_journal_types.h"
#include "clock_types.h"
+#include "disk_groups_types.h"
#include "ec_types.h"
#include "journal_types.h"
#include "keylist_types.h"
@@ -463,6 +465,7 @@ enum gc_phase {
GC_PHASE_BTREE_snapshot_trees,
GC_PHASE_BTREE_deleted_inodes,
GC_PHASE_BTREE_logged_ops,
+ GC_PHASE_BTREE_rebalance_work,
GC_PHASE_PENDING_DELETE,
};
@@ -500,6 +503,8 @@ struct bch_dev {
* Committed by bch2_write_super() -> bch_fs_mi_update()
*/
struct bch_member_cpu mi;
+ atomic64_t errors[BCH_MEMBER_ERROR_NR];
+
__uuid_t uuid;
char name[BDEVNAME_SIZE];
@@ -578,7 +583,7 @@ enum {
BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */
BCH_FS_NEED_ANOTHER_GC,
- BCH_FS_HAVE_DELETED_SNAPSHOTS,
+ BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS,
/* errors: */
BCH_FS_ERROR,
@@ -612,7 +617,7 @@ struct journal_seq_blacklist_table {
u64 start;
u64 end;
bool dirty;
- } entries[0];
+ } entries[];
};
struct journal_keys {
@@ -938,9 +943,6 @@ struct bch_fs {
struct list_head moving_context_list;
struct mutex moving_context_lock;
- struct list_head data_progress_list;
- struct mutex data_progress_lock;
-
/* REBALANCE */
struct bch_fs_rebalance rebalance;
@@ -991,11 +993,6 @@ struct bch_fs {
struct bio_set dio_read_bioset;
struct bio_set nocow_flush_bioset;
- /* ERRORS */
- struct list_head fsck_errors;
- struct mutex fsck_error_lock;
- bool fsck_alloc_err;
-
/* QUOTAS */
struct bch_memquota_type quotas[QTYP_NR];
@@ -1044,6 +1041,14 @@ struct bch_fs {
struct bch2_time_stats times[BCH_TIME_STAT_NR];
struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
+
+ /* ERRORS */
+ struct list_head fsck_error_msgs;
+ struct mutex fsck_error_msgs_lock;
+ bool fsck_alloc_msgs_err;
+
+ bch_sb_errors_cpu fsck_error_counts;
+ struct mutex fsck_error_counts_lock;
};
extern struct wait_queue_head bch2_read_only_wait;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 99749f3315fe..0a750953ff92 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -613,31 +613,17 @@ struct bch_extent_stripe_ptr {
#endif
};
-struct bch_extent_reservation {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:6,
- unused:22,
- replicas:4,
- generation:32;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 generation:32,
- replicas:4,
- unused:22,
- type:6;
-#endif
-};
-
struct bch_extent_rebalance {
#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:7,
- unused:33,
- compression:8,
+ __u64 type:6,
+ unused:34,
+ compression:8, /* enum bch_compression_opt */
target:16;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 target:16,
compression:8,
- unused:33,
- type:7;
+ unused:34,
+ type:6;
#endif
};
@@ -838,34 +824,30 @@ enum inode_opt_id {
Inode_opt_nr,
};
-enum {
- /*
- * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL
- * flags)
- */
- __BCH_INODE_SYNC = 0,
- __BCH_INODE_IMMUTABLE = 1,
- __BCH_INODE_APPEND = 2,
- __BCH_INODE_NODUMP = 3,
- __BCH_INODE_NOATIME = 4,
-
- __BCH_INODE_I_SIZE_DIRTY = 5, /* obsolete */
- __BCH_INODE_I_SECTORS_DIRTY = 6, /* obsolete */
- __BCH_INODE_UNLINKED = 7,
- __BCH_INODE_BACKPTR_UNTRUSTED = 8,
-
- /* bits 20+ reserved for packed fields below: */
-};
-
-#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC)
-#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE)
-#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND)
-#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP)
-#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME)
-#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY)
-#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
-#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED)
-#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED)
+#define BCH_INODE_FLAGS() \
+ x(sync, 0) \
+ x(immutable, 1) \
+ x(append, 2) \
+ x(nodump, 3) \
+ x(noatime, 4) \
+ x(i_size_dirty, 5) \
+ x(i_sectors_dirty, 6) \
+ x(unlinked, 7) \
+ x(backptr_untrusted, 8)
+
+/* bits 20+ reserved for packed fields below: */
+
+enum bch_inode_flags {
+#define x(t, n) BCH_INODE_##t = 1U << n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+enum __bch_inode_flags {
+#define x(t, n) __BCH_INODE_##t = n,
+ BCH_INODE_FLAGS()
+#undef x
+};
LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
@@ -1232,7 +1214,8 @@ struct bch_sb_field {
x(journal_seq_blacklist, 8) \
x(journal_v2, 9) \
x(counters, 10) \
- x(members_v2, 11)
+ x(members_v2, 11) \
+ x(errors, 12)
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
@@ -1282,6 +1265,18 @@ enum bch_iops_measurement {
BCH_IOPS_NR
};
+#define BCH_MEMBER_ERROR_TYPES() \
+ x(read, 0) \
+ x(write, 1) \
+ x(checksum, 2)
+
+enum bch_member_error_type {
+#define x(t, n) BCH_MEMBER_ERROR_##t = n,
+ BCH_MEMBER_ERROR_TYPES()
+#undef x
+ BCH_MEMBER_ERROR_NR
+};
+
struct bch_member {
__uuid_t uuid;
__le64 nbuckets; /* device size */
@@ -1292,6 +1287,9 @@ struct bch_member {
__le64 flags;
__le32 iops[4];
+ __le64 errors[BCH_MEMBER_ERROR_NR];
+ __le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
+ __le64 errors_reset_time;
};
#define BCH_MEMBER_V1_BYTES 56
@@ -1615,11 +1613,20 @@ struct journal_seq_blacklist_entry {
struct bch_sb_field_journal_seq_blacklist {
struct bch_sb_field field;
+ struct journal_seq_blacklist_entry start[];
+};
- struct journal_seq_blacklist_entry start[0];
- __u64 _data[];
+struct bch_sb_field_errors {
+ struct bch_sb_field field;
+ struct bch_sb_field_error_entry {
+ __le64 v;
+ __le64 last_error_time;
+ } entries[];
};
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
+
/* Superblock: */
/*
@@ -1682,7 +1689,9 @@ struct bch_sb_field_journal_seq_blacklist {
x(snapshot_skiplists, BCH_VERSION(1, 1), \
BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \
x(deleted_inodes, BCH_VERSION(1, 2), \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes))
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \
+ x(rebalance_work, BCH_VERSION(1, 3), \
+ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -1693,7 +1702,7 @@ enum bcachefs_metadata_version {
};
static const __maybe_unused
-unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor;
+unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
@@ -2247,7 +2256,8 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
enum btree_id_flags {
BTREE_ID_EXTENTS = BIT(0),
BTREE_ID_SNAPSHOTS = BIT(1),
- BTREE_ID_DATA = BIT(2),
+ BTREE_ID_SNAPSHOT_FIELD = BIT(2),
+ BTREE_ID_DATA = BIT(3),
};
#define BCH_BTREE_IDS() \
@@ -2302,11 +2312,13 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_bucket_gens)) \
x(snapshot_trees, 15, 0, \
BIT_ULL(KEY_TYPE_snapshot_tree)) \
- x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \
+ x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_set)) \
x(logged_ops, 17, 0, \
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
- BIT_ULL(KEY_TYPE_logged_op_finsert))
+ BIT_ULL(KEY_TYPE_logged_op_finsert)) \
+ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
+ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 518450209236..831be01809f2 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -92,19 +92,15 @@ enum bkey_lr_packed {
#define bkey_lr_packed(_l, _r) \
((_l)->format + ((_r)->format << 1))
-#define bkey_copy(_dst, _src) \
-do { \
- BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \
- !type_is(_dst, struct bkey_packed *)); \
- BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \
- !type_is(_src, struct bkey_packed *)); \
- EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \
- (u64 *) (_dst) < (u64 *) (_src) + \
- ((struct bkey *) (_src))->u64s); \
- \
- memcpy_u64s_small((_dst), (_src), \
- ((struct bkey *) (_src))->u64s); \
-} while (0)
+static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src)
+{
+ memcpy_u64s_small(dst, src, src->u64s);
+}
+
+static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src)
+{
+ memcpy_u64s_small(dst, src, src->k.u64s);
+}
struct btree;
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index d9fb1fc81f1e..761f5e33b1e6 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "backpointers.h"
#include "bkey_methods.h"
+#include "btree_cache.h"
#include "btree_types.h"
#include "alloc_background.h"
#include "dirent.h"
@@ -25,7 +26,7 @@ const char * const bch2_bkey_types[] = {
NULL
};
-static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
+static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err)
{
return 0;
@@ -39,23 +40,24 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
.key_invalid = deleted_key_invalid, \
})
-static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
+static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err)
{
- if (bkey_val_bytes(k.k)) {
- prt_printf(err, "incorrect value size (%zu != 0)",
- bkey_val_bytes(k.k));
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_bytes(k.k), c, err,
+ bkey_val_size_nonzero,
+ "incorrect value size (%zu != 0)",
+ bkey_val_bytes(k.k));
+fsck_err:
+ return ret;
}
#define bch2_bkey_ops_error ((struct bkey_ops) { \
.key_invalid = empty_val_key_invalid, \
})
-static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
+static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err)
{
return 0;
@@ -70,7 +72,7 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
.key_invalid = empty_val_key_invalid, \
})
-static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
+static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err)
{
return 0;
@@ -91,18 +93,6 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
.val_to_text = key_type_inline_data_to_text, \
})
-static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags, struct printbuf *err)
-{
- if (bkey_val_bytes(k.k)) {
- prt_printf(err, "incorrect value size (%zu != %zu)",
- bkey_val_bytes(k.k), sizeof(struct bch_cookie));
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
-}
-
static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
{
bch2_key_resize(l.k, l.k->size + r.k->size);
@@ -110,7 +100,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_
}
#define bch2_bkey_ops_set ((struct bkey_ops) { \
- .key_invalid = key_type_set_invalid, \
+ .key_invalid = empty_val_key_invalid, \
.key_merge = key_type_set_merge, \
})
@@ -128,84 +118,95 @@ int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
struct printbuf *err)
{
const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+ int ret = 0;
- if (bkey_val_bytes(k.k) < ops->min_val_size) {
- prt_printf(err, "bad val size (%zu < %u)",
- bkey_val_bytes(k.k), ops->min_val_size);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err,
+ bkey_val_size_too_small,
+ "bad val size (%zu < %u)",
+ bkey_val_bytes(k.k), ops->min_val_size);
if (!ops->key_invalid)
return 0;
- return ops->key_invalid(c, k, flags, err);
+ ret = ops->key_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
}
static u64 bch2_key_types_allowed[] = {
-#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
- BCH_BTREE_IDS()
-#undef x
[BKEY_TYPE_btree] =
BIT_ULL(KEY_TYPE_deleted)|
BIT_ULL(KEY_TYPE_btree_ptr)|
BIT_ULL(KEY_TYPE_btree_ptr_v2),
+#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
+ BCH_BTREE_IDS()
+#undef x
};
+const char *bch2_btree_node_type_str(enum btree_node_type type)
+{
+ return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
+}
+
int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- if (k.k->u64s < BKEY_U64s) {
- prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
- if (flags & BKEY_INVALID_COMMIT &&
- !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type))) {
- prt_printf(err, "invalid key type for btree %s (%s)",
- bch2_btree_ids[type], bch2_bkey_types[k.k->type]);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err,
+ bkey_u64s_too_small,
+ "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
- if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
- if (k.k->size == 0) {
- prt_printf(err, "size == 0");
- return -BCH_ERR_invalid_bkey;
- }
+ if (type >= BKEY_TYPE_NR)
+ return 0;
- if (k.k->size > k.k->p.offset) {
- prt_printf(err, "size greater than offset (%u > %llu)",
- k.k->size, k.k->p.offset);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) &&
+ !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
+ bkey_invalid_type_for_btree,
+ "invalid key type for btree %s (%s)",
+ bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]);
+
+ if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+ bkey_fsck_err_on(k.k->size == 0, c, err,
+ bkey_extent_size_zero,
+ "size == 0");
+
+ bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err,
+ bkey_extent_size_greater_than_offset,
+ "size greater than offset (%u > %llu)",
+ k.k->size, k.k->p.offset);
} else {
- if (k.k->size) {
- prt_printf(err, "size != 0");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(k.k->size, c, err,
+ bkey_size_nonzero,
+ "size != 0");
}
if (type != BKEY_TYPE_btree) {
- if (!btree_type_has_snapshots((enum btree_id) type) &&
- k.k->p.snapshot) {
- prt_printf(err, "nonzero snapshot");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (btree_type_has_snapshots((enum btree_id) type) &&
- !k.k->p.snapshot) {
- prt_printf(err, "snapshot == 0");
- return -BCH_ERR_invalid_bkey;
+ enum btree_id btree = type - 1;
+
+ if (btree_type_has_snapshots(btree)) {
+ bkey_fsck_err_on(!k.k->p.snapshot, c, err,
+ bkey_snapshot_zero,
+ "snapshot == 0");
+ } else if (!btree_type_has_snapshot_field(btree)) {
+ bkey_fsck_err_on(k.k->p.snapshot, c, err,
+ bkey_snapshot_nonzero,
+ "nonzero snapshot");
+ } else {
+ /*
+ * btree uses snapshot field but it's not required to be
+ * nonzero
+ */
}
- if (bkey_eq(k.k->p, POS_MAX)) {
- prt_printf(err, "key at POS_MAX");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err,
+ bkey_at_pos_max,
+ "key at POS_MAX");
}
-
- return 0;
+fsck_err:
+ return ret;
}
int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
@@ -217,20 +218,20 @@ int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
bch2_bkey_val_invalid(c, k, flags, err);
}
-int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k,
- struct printbuf *err)
+int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k, struct printbuf *err)
{
- if (bpos_lt(k.k->p, b->data->min_key)) {
- prt_printf(err, "key before start of btree node");
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
- if (bpos_gt(k.k->p, b->data->max_key)) {
- prt_printf(err, "key past end of btree node");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err,
+ bkey_before_start_of_btree_node,
+ "key before start of btree node");
- return 0;
+ bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err,
+ bkey_after_end_of_btree_node,
+ "key past end of btree node");
+fsck_err:
+ return ret;
}
void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 668f595e2fcf..3a370b7087ac 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -21,7 +21,7 @@ extern const struct bkey_ops bch2_bkey_null_ops;
* being read or written; more aggressive checks can be enabled when rw == WRITE.
*/
struct bkey_ops {
- int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k,
+ int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err);
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
@@ -55,7 +55,8 @@ int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
enum bkey_invalid_flags, struct printbuf *);
int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
enum bkey_invalid_flags, struct printbuf *);
-int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *);
+int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *,
+ struct bkey_s_c, struct printbuf *);
void bch2_bpos_to_text(struct printbuf *, struct bpos);
void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
@@ -119,16 +120,6 @@ enum btree_update_flags {
#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
-#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \
- ((1U << KEY_TYPE_alloc)| \
- (1U << KEY_TYPE_alloc_v2)| \
- (1U << KEY_TYPE_alloc_v3)| \
- (1U << KEY_TYPE_alloc_v4)| \
- (1U << KEY_TYPE_stripe)| \
- (1U << KEY_TYPE_inode)| \
- (1U << KEY_TYPE_inode_v2)| \
- (1U << KEY_TYPE_snapshot))
-
static inline int bch2_trans_mark_key(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_i *new,
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
index b9aa027c881b..bcca9e76a0b4 100644
--- a/fs/bcachefs/bkey_sort.c
+++ b/fs/bcachefs/bkey_sort.c
@@ -106,7 +106,7 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
while ((k = sort_iter_peek(iter))) {
if (!bkey_deleted(k) &&
!should_drop_next_key(iter)) {
- bkey_copy(out, k);
+ bkey_p_copy(out, k);
btree_keys_account_key_add(&nr, 0, out);
out = bkey_p_next(out);
}
@@ -137,7 +137,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
continue;
if (!transform)
- bkey_copy(out, in);
+ bkey_p_copy(out, in);
else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
? in_f : &bch2_bkey_format_current, in))
out->format = KEY_FORMAT_LOCAL_BTREE;
@@ -191,7 +191,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in));
set_bkeyp_val_u64s(f, out, 0);
} else {
- bkey_copy(out, in);
+ bkey_p_copy(out, in);
}
out->needs_whiteout |= needs_whiteout;
out = bkey_p_next(out);
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 82cf243aa288..47e7770d0583 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -285,8 +285,7 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_cache.shrink);
+ struct bch_fs *c = shrink->private_data;
struct btree_cache *bc = &c->btree_cache;
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
@@ -384,8 +383,7 @@ out_nounlock:
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_cache.shrink);
+ struct bch_fs *c = shrink->private_data;
struct btree_cache *bc = &c->btree_cache;
if (bch2_btree_shrinker_disabled)
@@ -400,7 +398,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
struct btree *b;
unsigned i, flags;
- unregister_shrinker(&bc->shrink);
+ shrinker_free(bc->shrink);
/* vfree() can allocate memory: */
flags = memalloc_nofs_save();
@@ -454,6 +452,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
+ struct shrinker *shrink;
unsigned i;
int ret = 0;
@@ -473,12 +472,15 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
mutex_init(&c->verify_lock);
- bc->shrink.count_objects = bch2_btree_cache_count;
- bc->shrink.scan_objects = bch2_btree_cache_scan;
- bc->shrink.seeks = 4;
- ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
- if (ret)
+ shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
+ if (!shrink)
goto err;
+ bc->shrink = shrink;
+ shrink->count_objects = bch2_btree_cache_count;
+ shrink->scan_objects = bch2_btree_cache_scan;
+ shrink->seeks = 4;
+ shrink->private_data = c;
+ shrinker_register(shrink);
return 0;
err:
@@ -783,12 +785,12 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
"btree node header doesn't match ptr\n"
"btree %s level %u\n"
"ptr: ",
- bch2_btree_ids[b->c.btree_id], b->c.level);
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
prt_printf(&buf, "\nheader: btree %s level %llu\n"
"min ",
- bch2_btree_ids[BTREE_NODE_ID(b->data)],
+ bch2_btree_id_str(BTREE_NODE_ID(b->data)),
BTREE_NODE_LEVEL(b->data));
bch2_bpos_to_text(&buf, b->data->min_key);
@@ -1151,8 +1153,21 @@ wait_on_io:
six_unlock_intent(&b->c.lock);
}
-void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
- const struct btree *b)
+const char *bch2_btree_id_str(enum btree_id btree)
+{
+ return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
+}
+
+void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
+{
+ prt_printf(out, "%s level %u/%u\n ",
+ bch2_btree_id_str(b->c.btree_id),
+ b->c.level,
+ bch2_btree_id_root(c, b->c.btree_id)->level);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+}
+
+void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
struct bset_stats stats;
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 1e562b6efa62..cfb80b201d61 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -123,8 +123,9 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
return bch2_btree_id_root(c, b->c.btree_id)->b;
}
-void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
- const struct btree *);
+const char *bch2_btree_id_str(enum btree_id);
+void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
+void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *);
#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 693ed067b1a7..0b5d09c8475d 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -95,15 +95,15 @@ static int bch2_gc_check_topology(struct bch_fs *c,
bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
if (__fsck_err(c,
- FSCK_CAN_FIX|
- FSCK_CAN_IGNORE|
- FSCK_NO_RATELIMIT,
- "btree node with incorrect min_key at btree %s level %u:\n"
- " prev %s\n"
- " cur %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1.buf, buf2.buf) &&
- should_restart_for_topology_repair(c)) {
+ FSCK_CAN_FIX|
+ FSCK_CAN_IGNORE|
+ FSCK_NO_RATELIMIT,
+ btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key at btree %s level %u:\n"
+ " prev %s\n"
+ " cur %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
goto err;
@@ -122,14 +122,12 @@ static int bch2_gc_check_topology(struct bch_fs *c,
bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
bch2_bpos_to_text(&buf2, node_end);
- if (__fsck_err(c,
- FSCK_CAN_FIX|
- FSCK_CAN_IGNORE|
- FSCK_NO_RATELIMIT,
+ if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT,
+ btree_node_topology_bad_max_key,
"btree node with incorrect max_key at btree %s level %u:\n"
" %s\n"
" expected %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf) &&
should_restart_for_topology_repair(c)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
@@ -287,10 +285,11 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
cur->data->min_key), c,
+ btree_node_topology_overwritten_by_next_node,
"btree node overwritten by next node at btree %s level %u:\n"
" node %s\n"
" next %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf)) {
ret = DROP_PREV_NODE;
goto out;
@@ -298,10 +297,11 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
bpos_predecessor(cur->data->min_key)), c,
+ btree_node_topology_bad_max_key,
"btree node with incorrect max_key at btree %s level %u:\n"
" node %s\n"
" next %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf))
ret = set_node_max(c, prev,
bpos_predecessor(cur->data->min_key));
@@ -310,20 +310,22 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
if (mustfix_fsck_err_on(bpos_ge(expected_start,
cur->data->max_key), c,
+ btree_node_topology_overwritten_by_prev_node,
"btree node overwritten by prev node at btree %s level %u:\n"
" prev %s\n"
" node %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf)) {
ret = DROP_THIS_NODE;
goto out;
}
if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
+ btree_node_topology_bad_min_key,
"btree node with incorrect min_key at btree %s level %u:\n"
" prev %s\n"
" node %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf))
ret = set_node_min(c, cur, expected_start);
}
@@ -344,10 +346,11 @@ static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
bch2_bpos_to_text(&buf2, b->key.k.p);
if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
+ btree_node_topology_bad_max_key,
"btree node with incorrect max_key at btree %s level %u:\n"
" %s\n"
" expected %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf)) {
ret = set_node_max(c, child, b->key.k.p);
if (ret)
@@ -396,9 +399,10 @@ again:
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
if (mustfix_fsck_err_on(ret == -EIO, c,
+ btree_node_unreadable,
"Topology repair: unreadable btree node at btree %s level %u:\n"
" %s",
- bch2_btree_ids[b->c.btree_id],
+ bch2_btree_id_str(b->c.btree_id),
b->c.level - 1,
buf.buf)) {
bch2_btree_node_evict(trans, cur_k.k);
@@ -504,9 +508,10 @@ again:
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
if (mustfix_fsck_err_on(!have_child, c,
+ btree_node_topology_interior_node_empty,
"empty interior btree node at btree %s level %u\n"
" %s",
- bch2_btree_ids[b->c.btree_id],
+ bch2_btree_id_str(b->c.btree_id),
b->c.level, buf.buf))
ret = DROP_THIS_NODE;
err:
@@ -582,7 +587,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
if (!g->gen_valid &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ fsck_err(c, ptr_to_missing_alloc_key,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
@@ -599,7 +605,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ fsck_err(c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
@@ -620,7 +627,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ fsck_err(c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
@@ -631,7 +639,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ fsck_err(c, stale_dirty_ptr,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
@@ -645,6 +654,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
if (fsck_err_on(bucket_data_type(g->data_type) &&
bucket_data_type(g->data_type) != data_type, c,
+ ptr_bucket_data_type_mismatch,
"bucket %u:%zu different types of data in same bucket: %s, %s\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
@@ -664,6 +674,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
if (fsck_err_on(!m || !m->alive, c,
+ ptr_to_missing_stripe,
"pointer to nonexistent stripe %llu\n"
"while marking %s",
(u64) p.ec.idx,
@@ -672,6 +683,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
do_update = true;
if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
+ ptr_to_incorrect_stripe,
"pointer does not match stripe %llu\n"
"while marking %s",
(u64) p.ec.idx,
@@ -811,6 +823,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
goto err;
if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
+ bkey_version_in_future,
"key version number higher than recorded: %llu > %llu",
k->k->version.lo,
atomic64_read(&c->key_version)))
@@ -968,9 +981,10 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
FSCK_CAN_FIX|
FSCK_CAN_IGNORE|
FSCK_NO_RATELIMIT,
+ btree_node_read_error,
"Unreadable btree node at btree %s level %u:\n"
" %s",
- bch2_btree_ids[b->c.btree_id],
+ bch2_btree_id_str(b->c.btree_id),
b->c.level - 1,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
@@ -1025,6 +1039,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->min_key);
if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
+ btree_root_bad_min_key,
"btree root with incorrect min_key: %s", buf.buf)) {
bch_err(c, "repair unimplemented");
ret = -BCH_ERR_fsck_repair_unimplemented;
@@ -1034,6 +1049,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->max_key);
if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
+ btree_root_bad_max_key,
"btree root with incorrect max_key: %s", buf.buf)) {
bch_err(c, "repair unimplemented");
ret = -BCH_ERR_fsck_repair_unimplemented;
@@ -1207,16 +1223,16 @@ static int bch2_gc_done(struct bch_fs *c,
percpu_down_write(&c->mark_lock);
-#define copy_field(_f, _msg, ...) \
+#define copy_field(_err, _f, _msg, ...) \
if (dst->_f != src->_f && \
(!verify || \
- fsck_err(c, _msg ": got %llu, should be %llu" \
+ fsck_err(c, _err, _msg ": got %llu, should be %llu" \
, ##__VA_ARGS__, dst->_f, src->_f))) \
dst->_f = src->_f
-#define copy_dev_field(_f, _msg, ...) \
- copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
-#define copy_fs_field(_f, _msg, ...) \
- copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
+#define copy_dev_field(_err, _f, _msg, ...) \
+ copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
+#define copy_fs_field(_err, _f, _msg, ...) \
+ copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
@@ -1227,13 +1243,17 @@ static int bch2_gc_done(struct bch_fs *c,
bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
dev_usage_u64s());
- copy_dev_field(buckets_ec, "buckets_ec");
-
for (i = 0; i < BCH_DATA_NR; i++) {
- copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
- copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
- copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+ copy_dev_field(dev_usage_buckets_wrong,
+ d[i].buckets, "%s buckets", bch2_data_types[i]);
+ copy_dev_field(dev_usage_sectors_wrong,
+ d[i].sectors, "%s sectors", bch2_data_types[i]);
+ copy_dev_field(dev_usage_fragmented_wrong,
+ d[i].fragmented, "%s fragmented", bch2_data_types[i]);
}
+
+ copy_dev_field(dev_usage_buckets_ec_wrong,
+ buckets_ec, "buckets_ec");
}
{
@@ -1242,17 +1262,24 @@ static int bch2_gc_done(struct bch_fs *c,
struct bch_fs_usage *src = (void *)
bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
- copy_fs_field(hidden, "hidden");
- copy_fs_field(btree, "btree");
+ copy_fs_field(fs_usage_hidden_wrong,
+ hidden, "hidden");
+ copy_fs_field(fs_usage_btree_wrong,
+ btree, "btree");
if (!metadata_only) {
- copy_fs_field(data, "data");
- copy_fs_field(cached, "cached");
- copy_fs_field(reserved, "reserved");
- copy_fs_field(nr_inodes,"nr_inodes");
+ copy_fs_field(fs_usage_data_wrong,
+ data, "data");
+ copy_fs_field(fs_usage_cached_wrong,
+ cached, "cached");
+ copy_fs_field(fs_usage_reserved_wrong,
+ reserved, "reserved");
+ copy_fs_field(fs_usage_nr_inodes_wrong,
+ nr_inodes,"nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
- copy_fs_field(persistent_reserved[i],
+ copy_fs_field(fs_usage_persistent_reserved_wrong,
+ persistent_reserved[i],
"persistent_reserved[%i]", i);
}
@@ -1268,7 +1295,8 @@ static int bch2_gc_done(struct bch_fs *c,
printbuf_reset(&buf);
bch2_replicas_entry_to_text(&buf, e);
- copy_fs_field(replicas[i], "%s", buf.buf);
+ copy_fs_field(fs_usage_replicas_wrong,
+ replicas[i], "%s", buf.buf);
}
}
@@ -1404,6 +1432,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
if (c->opts.reconstruct_alloc ||
fsck_err_on(new.data_type != gc.data_type, c,
+ alloc_key_data_type_wrong,
"bucket %llu:%llu gen %u has wrong data_type"
": got %s, should be %s",
iter->pos.inode, iter->pos.offset,
@@ -1412,9 +1441,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
bch2_data_types[gc.data_type]))
new.data_type = gc.data_type;
-#define copy_bucket_field(_f) \
+#define copy_bucket_field(_errtype, _f) \
if (c->opts.reconstruct_alloc || \
- fsck_err_on(new._f != gc._f, c, \
+ fsck_err_on(new._f != gc._f, c, _errtype, \
"bucket %llu:%llu gen %u data type %s has wrong " #_f \
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
@@ -1423,11 +1452,16 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
new._f, gc._f)) \
new._f = gc._f; \
- copy_bucket_field(gen);
- copy_bucket_field(dirty_sectors);
- copy_bucket_field(cached_sectors);
- copy_bucket_field(stripe_redundancy);
- copy_bucket_field(stripe);
+ copy_bucket_field(alloc_key_gen_wrong,
+ gen);
+ copy_bucket_field(alloc_key_dirty_sectors_wrong,
+ dirty_sectors);
+ copy_bucket_field(alloc_key_cached_sectors_wrong,
+ cached_sectors);
+ copy_bucket_field(alloc_key_stripe_wrong,
+ stripe);
+ copy_bucket_field(alloc_key_stripe_redundancy_wrong,
+ stripe_redundancy);
#undef copy_bucket_field
if (!bch2_alloc_v4_cmp(*old, new))
@@ -1584,6 +1618,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
}
if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+ reflink_v_refcount_wrong,
"reflink key has wrong refcount:\n"
" %s\n"
" should be %u",
@@ -1709,7 +1744,8 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans,
if (bad)
bch2_bkey_val_to_text(&buf, c, k);
- if (fsck_err_on(bad, c, "%s", buf.buf)) {
+ if (fsck_err_on(bad, c, stripe_sector_count_wrong,
+ "%s", buf.buf)) {
struct bkey_i_stripe *new;
new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
@@ -1954,19 +1990,17 @@ int bch2_gc_gens(struct bch_fs *c)
trans = bch2_trans_get(c);
for_each_member_device(ca, c, i) {
- struct bucket_gens *gens;
+ struct bucket_gens *gens = bucket_gens(ca);
BUG_ON(ca->oldest_gen);
- ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL);
+ ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
if (!ca->oldest_gen) {
percpu_ref_put(&ca->ref);
ret = -BCH_ERR_ENOMEM_gc_gens;
goto err;
}
- gens = bucket_gens(ca);
-
for (b = gens->first_bucket;
b < gens->nbuckets; b++)
ca->oldest_gen[b] = gens->b[b];
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index a869cf6ac7c6..37d896edb06e 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -184,7 +184,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
k = new_whiteouts;
while (ptrs != ptrs_end) {
- bkey_copy(k, *ptrs);
+ bkey_p_copy(k, *ptrs);
k = bkey_p_next(k);
ptrs++;
}
@@ -260,7 +260,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
n = bkey_p_next(k);
if (!bkey_deleted(k)) {
- bkey_copy(out, k);
+ bkey_p_copy(out, k);
out = bkey_p_next(out);
} else {
BUG_ON(k->needs_whiteout);
@@ -510,16 +510,6 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
bch2_trans_node_reinit_iter(trans, b);
}
-static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
- struct btree *b)
-{
- prt_printf(out, "%s level %u/%u\n ",
- bch2_btree_ids[b->c.btree_id],
- b->c.level,
- bch2_btree_id_root(c, b->c.btree_id)->level);
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
-}
-
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
struct bch_dev *ca,
struct btree *b, struct bset *i,
@@ -532,7 +522,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
if (ca)
prt_printf(out, "on %s ", ca->name);
prt_printf(out, "at btree ");
- btree_pos_to_text(out, c, b);
+ bch2_btree_pos_to_text(out, c, b);
prt_printf(out, "\n node offset %u", b->written);
if (i)
@@ -540,7 +530,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
prt_str(out, ": ");
}
-__printf(8, 9)
+__printf(9, 10)
static int __btree_err(int ret,
struct bch_fs *c,
struct bch_dev *ca,
@@ -548,6 +538,7 @@ static int __btree_err(int ret,
struct bset *i,
int write,
bool have_retry,
+ enum bch_sb_error_id err_type,
const char *fmt, ...)
{
struct printbuf out = PRINTBUF;
@@ -572,9 +563,15 @@ static int __btree_err(int ret,
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
ret = -BCH_ERR_btree_node_read_err_bad_node;
+ if (ret != -BCH_ERR_btree_node_read_err_fixable)
+ bch2_sb_error_count(c, err_type);
+
switch (ret) {
case -BCH_ERR_btree_node_read_err_fixable:
- mustfix_fsck_err(c, "%s", out.buf);
+ ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf);
+ if (ret != -BCH_ERR_fsck_fix &&
+ ret != -BCH_ERR_fsck_ignore)
+ goto fsck_err;
ret = -BCH_ERR_fsck_fix;
break;
case -BCH_ERR_btree_node_read_err_want_retry:
@@ -599,9 +596,11 @@ fsck_err:
return ret;
}
-#define btree_err(type, c, ca, b, i, msg, ...) \
+#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \
({ \
- int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\
+ int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \
+ BCH_FSCK_ERR_##_err_type, \
+ msg, ##__VA_ARGS__); \
\
if (_ret != -BCH_ERR_fsck_fix) { \
ret = _ret; \
@@ -676,13 +675,17 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
int ret = 0;
btree_err_on(!bch2_version_compatible(version),
- -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_incompatible,
+ c, ca, b, i,
+ btree_node_unsupported_version,
"unsupported bset version %u.%u",
BCH_VERSION_MAJOR(version),
BCH_VERSION_MINOR(version));
if (btree_err_on(version < c->sb.version_min,
- -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bset_older_than_sb_min,
"bset version %u older than superblock version_min %u",
version, c->sb.version_min)) {
mutex_lock(&c->sb_lock);
@@ -693,7 +696,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
if (btree_err_on(BCH_VERSION_MAJOR(version) >
BCH_VERSION_MAJOR(c->sb.version),
- -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bset_newer_than_sb,
"bset version %u newer than superblock version %u",
version, c->sb.version)) {
mutex_lock(&c->sb_lock);
@@ -703,11 +708,15 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
}
btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
- -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_incompatible,
+ c, ca, b, i,
+ btree_node_unsupported_version,
"BSET_SEPARATE_WHITEOUTS no longer supported");
if (btree_err_on(offset + sectors > btree_sectors(c),
- -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ bset_past_end_of_btree_node,
"bset past end of btree node")) {
i->u64s = 0;
ret = 0;
@@ -715,12 +724,15 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
}
btree_err_on(offset && !i->u64s,
- -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ bset_empty,
"empty bset");
- btree_err_on(BSET_OFFSET(i) &&
- BSET_OFFSET(i) != offset,
- -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
+ btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_wrong_sector_offset,
"bset at wrong sector offset");
if (!offset) {
@@ -734,16 +746,22 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
/* XXX endianness */
btree_err_on(bp->seq != bn->keys.seq,
- -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ bset_bad_seq,
"incorrect sequence number (wrong btree node)");
}
btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
- -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, i,
+ btree_node_bad_btree,
"incorrect btree id");
btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
- -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, i,
+ btree_node_bad_level,
"incorrect level");
if (!write)
@@ -760,7 +778,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
}
btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
- -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_min_key,
"incorrect min_key: got %s should be %s",
(printbuf_reset(&buf1),
bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
@@ -769,7 +789,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
}
btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
- -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, i,
+ btree_node_bad_max_key,
"incorrect max key %s",
(printbuf_reset(&buf1),
bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
@@ -779,7 +801,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
BSET_BIG_ENDIAN(i), write, bn);
btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
- -BCH_ERR_btree_node_read_err_bad_node, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_bad_node,
+ c, ca, b, i,
+ btree_node_bad_format,
"invalid bkey format: %s\n %s", buf1.buf,
(printbuf_reset(&buf2),
bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
@@ -802,7 +826,7 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b,
struct printbuf *err)
{
return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
- (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?:
+ (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?:
(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
}
@@ -823,14 +847,18 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
struct bkey tmp;
if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
- -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bkey_past_bset_end,
"key extends past end of bset")) {
i->u64s = cpu_to_le16((u64 *) k - i->_data);
break;
}
if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
- -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bkey_bad_format,
"invalid bkey format %u", k->format)) {
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_p_next(k),
@@ -849,12 +877,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
printbuf_reset(&buf);
if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
printbuf_reset(&buf);
- prt_printf(&buf, "invalid bkey: ");
bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
prt_printf(&buf, "\n ");
bch2_bkey_val_to_text(&buf, c, u.s_c);
- btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf);
+ btree_err(-BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bad_bkey,
+ "invalid bkey: %s", buf.buf);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_p_next(k),
@@ -878,7 +908,10 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
bch2_dump_bset(c, b, i, 0);
- if (btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf)) {
+ if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bkey_out_of_order,
+ "%s", buf.buf)) {
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_p_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
@@ -919,47 +952,62 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
if (bch2_meta_read_fault("btree"))
- btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
+ btree_err(-BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_fault_injected,
"dynamic fault");
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
- -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_magic,
"bad magic: want %llx, got %llx",
bset_magic(c), le64_to_cpu(b->data->magic));
- btree_err_on(!b->data->keys.seq,
- -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
- "bad btree header: seq 0");
-
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
struct bch_btree_ptr_v2 *bp =
&bkey_i_to_btree_ptr_v2(&b->key)->v;
btree_err_on(b->data->keys.seq != bp->seq,
- -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_seq,
"got wrong btree node (seq %llx want %llx)",
b->data->keys.seq, bp->seq);
+ } else {
+ btree_err_on(!b->data->keys.seq,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_seq,
+ "bad btree header: seq 0");
}
while (b->written < (ptr_written ?: btree_sectors(c))) {
unsigned sectors;
struct nonce nonce;
- struct bch_csum csum;
bool first = !b->written;
+ bool csum_bad;
if (!b->written) {
i = &b->data->keys;
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
- -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
- "unknown checksum type %llu",
- BSET_CSUM_TYPE(i));
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_unknown_csum,
+ "unknown checksum type %llu", BSET_CSUM_TYPE(i));
nonce = btree_nonce(i, b->written << 9);
- csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
- btree_err_on(bch2_crc_cmp(csum, b->data->csum),
- -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
+ csum_bad = bch2_crc_cmp(b->data->csum,
+ csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data));
+ if (csum_bad)
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+ btree_err_on(csum_bad,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_bad_csum,
"invalid checksum");
ret = bset_encrypt(c, i, b->written << 9);
@@ -969,7 +1017,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
!BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
- -BCH_ERR_btree_node_read_err_incompatible, c, NULL, b, NULL,
+ -BCH_ERR_btree_node_read_err_incompatible,
+ c, NULL, b, NULL,
+ btree_node_unsupported_version,
"btree node does not have NEW_EXTENT_OVERWRITE set");
sectors = vstruct_sectors(b->data, c->block_bits);
@@ -981,15 +1031,21 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
break;
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
- -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
- "unknown checksum type %llu",
- BSET_CSUM_TYPE(i));
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_unknown_csum,
+ "unknown checksum type %llu", BSET_CSUM_TYPE(i));
nonce = btree_nonce(i, b->written << 9);
- csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-
- btree_err_on(bch2_crc_cmp(csum, bne->csum),
- -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
+ csum_bad = bch2_crc_cmp(bne->csum,
+ csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne));
+ if (csum_bad)
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+ btree_err_on(csum_bad,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_bad_csum,
"invalid checksum");
ret = bset_encrypt(c, i, b->written << 9);
@@ -1022,12 +1078,16 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
true);
btree_err_on(blacklisted && first,
- -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ bset_blacklisted_journal_seq,
"first btree node bset has blacklisted journal seq (%llu)",
le64_to_cpu(i->journal_seq));
btree_err_on(blacklisted && ptr_written,
- -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ first_bset_blacklisted_journal_seq,
"found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
le64_to_cpu(i->journal_seq),
b->written, b->written + sectors, ptr_written);
@@ -1044,7 +1104,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
if (ptr_written) {
btree_err_on(b->written < ptr_written,
- -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, NULL,
+ btree_node_data_missing,
"btree node data missing: expected %u sectors, found %u",
ptr_written, b->written);
} else {
@@ -1055,7 +1117,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
!bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(bne->keys.journal_seq),
true),
- -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, NULL,
+ btree_node_bset_after_end,
"found bset signature after last bset");
}
@@ -1097,7 +1161,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
prt_printf(&buf, "\n ");
bch2_bkey_val_to_text(&buf, c, u.s_c);
- btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf);
+ btree_err(-BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bad_bkey,
+ "%s", buf.buf);
btree_keys_account_key_drop(&b->nr, 0, k);
@@ -1177,8 +1244,9 @@ static void btree_node_read_work(struct work_struct *work)
}
start:
printbuf_reset(&buf);
- btree_pos_to_text(&buf, c, b);
- bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ "btree read error %s for %s",
bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
@@ -1213,7 +1281,7 @@ start:
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->key.k.p);
bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
- __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf);
+ __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
bch2_btree_node_rewrite_async(c, b);
}
@@ -1322,14 +1390,20 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
}
written2 = btree_node_sectors_written(c, ra->buf[i]);
- if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL,
+ if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL,
+ btree_node_replicas_sectors_written_mismatch,
"btree node sectors written mismatch: %u != %u",
written, written2) ||
btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
- -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL,
+ btree_node_bset_after_end,
"found bset signature after last bset") ||
btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
- -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL,
+ btree_node_replicas_data_mismatch,
"btree node replicas content mismatch"))
dump_bset_maps = true;
@@ -1524,7 +1598,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
struct printbuf buf = PRINTBUF;
prt_str(&buf, "btree node read error: no device to read from\n at ");
- btree_pos_to_text(&buf, c, b);
+ bch2_btree_pos_to_text(&buf, c, b);
bch_err(c, "%s", buf.buf);
if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
@@ -1759,7 +1833,8 @@ static void btree_node_write_endio(struct bio *bio)
if (wbio->have_ioref)
bch2_latency_acct(ca, wbio->submit_time, WRITE);
- if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ "btree write error: %s",
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 1d79514754d7..6fa90bcd7016 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -257,7 +257,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
- !btree_type_has_snapshots(iter->btree_id));
+ !btree_type_has_snapshot_field(iter->btree_id));
if (iter->update_path)
bch2_btree_path_verify(trans, iter->update_path);
@@ -362,7 +362,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
bch2_bpos_to_text(&buf, pos);
panic("not locked: %s %s%s\n",
- bch2_btree_ids[id], buf.buf,
+ bch2_btree_id_str(id), buf.buf,
key_cache ? " cached" : "");
}
@@ -1109,6 +1109,9 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
if (unlikely(ret))
goto out;
+ if (unlikely(!trans->srcu_held))
+ bch2_trans_srcu_lock(trans);
+
/*
* Ensure we obey path->should_be_locked: if it's set, we can't unlock
* and re-traverse the path without a transaction restart:
@@ -1371,7 +1374,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
struct bkey_s_c old = { &i->old_k, i->old_v };
prt_printf(buf, "update: btree=%s cached=%u %pS",
- bch2_btree_ids[i->btree_id],
+ bch2_btree_id_str(i->btree_id),
i->cached,
(void *) i->ip_allocated);
prt_newline(buf);
@@ -1387,7 +1390,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
trans_for_each_wb_update(trans, wb) {
prt_printf(buf, "update: btree=%s wb=1 %pS",
- bch2_btree_ids[wb->btree],
+ bch2_btree_id_str(wb->btree),
(void *) i->ip_allocated);
prt_newline(buf);
@@ -1416,7 +1419,7 @@ void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
path->idx, path->ref, path->intent_ref,
path->preserve ? 'P' : ' ',
path->should_be_locked ? 'S' : ' ',
- bch2_btree_ids[path->btree_id],
+ bch2_btree_id_str(path->btree_id),
path->level);
bch2_bpos_to_text(out, path->pos);
@@ -1523,6 +1526,7 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
path->ref = 0;
path->intent_ref = 0;
path->nodes_locked = 0;
+ path->alloc_seq++;
btree_path_list_add(trans, pos, path);
trans->paths_sorted = false;
@@ -1598,7 +1602,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
locks_want = min(locks_want, BTREE_MAX_DEPTH);
if (locks_want > path->locks_want)
- bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
+ bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
return path;
}
@@ -2829,18 +2833,36 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
return p;
}
-static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
+static inline void check_srcu_held_too_long(struct btree_trans *trans)
{
- struct bch_fs *c = trans->c;
- struct btree_path *path;
+ WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
+ "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
+ (jiffies - trans->srcu_lock_time) / HZ);
+}
- trans_for_each_path(trans, path)
- if (path->cached && !btree_node_locked(path, 0))
- path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
+void bch2_trans_srcu_unlock(struct btree_trans *trans)
+{
+ if (trans->srcu_held) {
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
- srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
- trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- trans->srcu_lock_time = jiffies;
+ trans_for_each_path(trans, path)
+ if (path->cached && !btree_node_locked(path, 0))
+ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
+
+ check_srcu_held_too_long(trans);
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+ trans->srcu_held = false;
+ }
+}
+
+void bch2_trans_srcu_lock(struct btree_trans *trans)
+{
+ if (!trans->srcu_held) {
+ trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
+ trans->srcu_lock_time = jiffies;
+ trans->srcu_held = true;
+ }
}
/**
@@ -2894,8 +2916,9 @@ u32 bch2_trans_begin(struct btree_trans *trans)
}
trans->last_begin_time = now;
- if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
- bch2_trans_reset_srcu_lock(trans);
+ if (unlikely(trans->srcu_held &&
+ time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
+ bch2_trans_srcu_unlock(trans);
trans->last_begin_ip = _RET_IP_;
if (trans->restarted) {
@@ -2980,8 +3003,9 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
trans->wb_updates_size = s->wb_updates_size;
}
- trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
trans->srcu_lock_time = jiffies;
+ trans->srcu_held = true;
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
struct btree_trans *pos;
@@ -3025,7 +3049,7 @@ leaked:
trans_for_each_path(trans, path)
if (path->ref)
printk(KERN_ERR " btree %s %pS\n",
- bch2_btree_ids[path->btree_id],
+ bch2_btree_id_str(path->btree_id),
(void *) path->ip_allocated);
/* Be noisy about this: */
bch2_fatal_error(c);
@@ -3058,9 +3082,10 @@ void bch2_trans_put(struct btree_trans *trans)
check_btree_paths_leaked(trans);
- srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
-
- bch2_journal_preres_put(&c->journal, &trans->journal_preres);
+ if (trans->srcu_held) {
+ check_srcu_held_too_long(trans);
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+ }
kfree(trans->extra_journal_entries.data);
@@ -3100,7 +3125,7 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
prt_tab(out);
prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
- b->level, bch2_btree_ids[b->btree_id]);
+ b->level, bch2_btree_id_str(b->btree_id));
bch2_bpos_to_text(out, btree_node_pos(b));
prt_tab(out);
@@ -3130,7 +3155,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
path->idx,
path->cached ? 'c' : 'b',
path->level,
- bch2_btree_ids[path->btree_id]);
+ bch2_btree_id_str(path->btree_id));
bch2_bpos_to_text(out, path->pos);
prt_newline(out);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index fbe273453db3..85e7cb52f6b6 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -274,6 +274,7 @@ void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
int bch2_trans_relock(struct btree_trans *);
int bch2_trans_relock_notrace(struct btree_trans *);
void bch2_trans_unlock(struct btree_trans *);
+void bch2_trans_unlock_long(struct btree_trans *);
bool bch2_trans_locked(struct btree_trans *);
static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
@@ -411,11 +412,11 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
- btree_node_type_is_extents(btree_id))
+ btree_id_is_extents(btree_id))
flags |= BTREE_ITER_IS_EXTENTS;
if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
- !btree_type_has_snapshots(btree_id))
+ !btree_type_has_snapshot_field(btree_id))
flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
@@ -579,6 +580,9 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
__bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \
KEY_TYPE_##_type, sizeof(*_val), _val)
+void bch2_trans_srcu_unlock(struct btree_trans *);
+void bch2_trans_srcu_lock(struct btree_trans *);
+
u32 bch2_trans_begin(struct btree_trans *);
/*
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 29a0b566a4fe..37fbf22de8fc 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -89,10 +89,13 @@ static void bkey_cached_free(struct btree_key_cache *bc,
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
- if (ck->c.lock.readers)
+ if (ck->c.lock.readers) {
list_move_tail(&ck->list, &bc->freed_pcpu);
- else
+ bc->nr_freed_pcpu++;
+ } else {
list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ bc->nr_freed_nonpcpu++;
+ }
atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
@@ -109,6 +112,8 @@ static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
{
struct bkey_cached *pos;
+ bc->nr_freed_nonpcpu++;
+
list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
pos->btree_trans_barrier_seq)) {
@@ -158,6 +163,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
#else
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ bc->nr_freed_nonpcpu++;
mutex_unlock(&bc->lock);
#endif
} else {
@@ -217,6 +223,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
f->nr < ARRAY_SIZE(f->objs) / 2) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
+ bc->nr_freed_nonpcpu--;
f->objs[f->nr++] = ck;
}
@@ -229,6 +236,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
if (!list_empty(&bc->freed_nonpcpu)) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
+ bc->nr_freed_nonpcpu--;
}
mutex_unlock(&bc->lock);
#endif
@@ -324,7 +332,7 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
ck = bkey_cached_reuse(bc);
if (unlikely(!ck)) {
bch_err(c, "error allocating memory for key cache item, btree %s",
- bch2_btree_ids[path->btree_id]);
+ bch2_btree_id_str(path->btree_id));
return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
}
@@ -407,7 +415,7 @@ static int btree_key_cache_fill(struct btree_trans *trans,
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
if (!new_k) {
bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
- bch2_btree_ids[ck->key.btree_id], new_u64s);
+ bch2_btree_id_str(ck->key.btree_id), new_u64s);
ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
goto err;
}
@@ -509,7 +517,7 @@ fill:
* path->uptodate yet:
*/
if (!path->locks_want &&
- !__bch2_btree_path_upgrade(trans, path, 1)) {
+ !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
goto err;
@@ -664,7 +672,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
goto out;
bch2_journal_pin_drop(j, &ck->journal);
- bch2_journal_preres_put(j, &ck->res);
BUG_ON(!btree_node_locked(c_iter.path, 0));
@@ -762,18 +769,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
BUG_ON(insert->k.u64s > ck->u64s);
- if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- int difference;
-
- BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s);
-
- difference = jset_u64s(insert->k.u64s) - ck->res.u64s;
- if (difference > 0) {
- trans->journal_preres.u64s -= difference;
- ck->res.u64s += difference;
- }
- }
-
bkey_copy(ck->k, insert);
ck->valid = true;
@@ -834,8 +829,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_key_cache.shrink);
+ struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bucket_table *tbl;
struct bkey_cached *ck, *t;
@@ -851,6 +845,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
+ scanned += bc->nr_freed_nonpcpu;
+
list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@@ -860,13 +856,15 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
- scanned++;
freed++;
+ bc->nr_freed_nonpcpu--;
}
if (scanned >= nr)
goto out;
+ scanned += bc->nr_freed_pcpu;
+
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@@ -876,8 +874,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
- scanned++;
freed++;
+ bc->nr_freed_pcpu--;
}
if (scanned >= nr)
@@ -932,8 +930,7 @@ out:
static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_key_cache.shrink);
+ struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
long nr = atomic_long_read(&bc->nr_keys) -
atomic_long_read(&bc->nr_dirty);
@@ -953,7 +950,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
int cpu;
#endif
- unregister_shrinker(&bc->shrink);
+ shrinker_free(bc->shrink);
mutex_lock(&bc->lock);
@@ -984,6 +981,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
}
#endif
+ BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
+ BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
+
list_splice(&bc->freed_pcpu, &items);
list_splice(&bc->freed_nonpcpu, &items);
@@ -993,7 +993,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
cond_resched();
bch2_journal_pin_drop(&c->journal, &ck->journal);
- bch2_journal_preres_put(&c->journal, &ck->res);
list_del(&ck->list);
kfree(ck->k);
@@ -1027,6 +1026,7 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+ struct shrinker *shrink;
#ifdef __KERNEL__
bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
@@ -1039,11 +1039,15 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
bc->table_init_done = true;
- bc->shrink.seeks = 0;
- bc->shrink.count_objects = bch2_btree_key_cache_count;
- bc->shrink.scan_objects = bch2_btree_key_cache_scan;
- if (register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name))
+ shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
+ if (!shrink)
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ bc->shrink = shrink;
+ shrink->seeks = 0;
+ shrink->count_objects = bch2_btree_key_cache_count;
+ shrink->scan_objects = bch2_btree_key_cache_scan;
+ shrink->private_data = c;
+ shrinker_register(shrink);
return 0;
}
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
new file mode 100644
index 000000000000..290e4e57df5b
--- /dev/null
+++ b/fs/bcachefs/btree_key_cache_types.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+
+struct btree_key_cache_freelist {
+ struct bkey_cached *objs[16];
+ unsigned nr;
+};
+
+struct btree_key_cache {
+ struct mutex lock;
+ struct rhashtable table;
+ bool table_init_done;
+
+ struct list_head freed_pcpu;
+ size_t nr_freed_pcpu;
+ struct list_head freed_nonpcpu;
+ size_t nr_freed_nonpcpu;
+
+ struct shrinker *shrink;
+ unsigned shrink_iter;
+ struct btree_key_cache_freelist __percpu *pcpu_freed;
+
+ atomic_long_t nr_freed;
+ atomic_long_t nr_keys;
+ atomic_long_t nr_dirty;
+};
+
+struct bkey_cached_key {
+ u32 btree_id;
+ struct bpos pos;
+} __packed __aligned(4);
+
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 40c8ed8f7bf1..3d48834d091f 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -431,7 +431,8 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
static inline bool btree_path_get_locks(struct btree_trans *trans,
struct btree_path *path,
- bool upgrade)
+ bool upgrade,
+ struct get_locks_fail *f)
{
unsigned l = path->level;
int fail_idx = -1;
@@ -442,8 +443,14 @@ static inline bool btree_path_get_locks(struct btree_trans *trans,
if (!(upgrade
? bch2_btree_node_upgrade(trans, path, l)
- : bch2_btree_node_relock(trans, path, l)))
- fail_idx = l;
+ : bch2_btree_node_relock(trans, path, l))) {
+ fail_idx = l;
+
+ if (f) {
+ f->l = l;
+ f->b = path->l[l].b;
+ }
+ }
l++;
} while (l < path->locks_want);
@@ -584,7 +591,9 @@ __flatten
bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
- return btree_path_get_locks(trans, path, false);
+ struct get_locks_fail f;
+
+ return btree_path_get_locks(trans, path, false, &f);
}
int __bch2_btree_path_relock(struct btree_trans *trans,
@@ -600,22 +609,24 @@ int __bch2_btree_path_relock(struct btree_trans *trans,
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
struct btree_path *path,
- unsigned new_locks_want)
+ unsigned new_locks_want,
+ struct get_locks_fail *f)
{
EBUG_ON(path->locks_want >= new_locks_want);
path->locks_want = new_locks_want;
- return btree_path_get_locks(trans, path, true);
+ return btree_path_get_locks(trans, path, true, f);
}
bool __bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
- unsigned new_locks_want)
+ unsigned new_locks_want,
+ struct get_locks_fail *f)
{
struct btree_path *linked;
- if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want))
+ if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
return true;
/*
@@ -644,7 +655,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
linked->btree_id == path->btree_id &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
- btree_path_get_locks(trans, linked, true);
+ btree_path_get_locks(trans, linked, true, NULL);
}
return false;
@@ -656,6 +667,9 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
{
unsigned l;
+ if (trans->restarted)
+ return;
+
EBUG_ON(path->locks_want < new_locks_want);
path->locks_want = new_locks_want;
@@ -674,6 +688,9 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
}
bch2_btree_path_verify_locks(path);
+
+ path->downgrade_seq++;
+ trace_path_downgrade(trans, _RET_IP_, path);
}
/* Btree transaction locking: */
@@ -682,6 +699,9 @@ void bch2_trans_downgrade(struct btree_trans *trans)
{
struct btree_path *path;
+ if (trans->restarted)
+ return;
+
trans_for_each_path(trans, path)
bch2_btree_path_downgrade(trans, path);
}
@@ -733,6 +753,12 @@ void bch2_trans_unlock(struct btree_trans *trans)
__bch2_btree_path_unlock(trans, path);
}
+void bch2_trans_unlock_long(struct btree_trans *trans)
+{
+ bch2_trans_unlock(trans);
+ bch2_trans_srcu_unlock(trans);
+}
+
bool bch2_trans_locked(struct btree_trans *trans)
{
struct btree_path *path;
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 6231e9ffc5d7..11b0a2c8cd69 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -355,26 +355,36 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
/* upgrade */
+
+struct get_locks_fail {
+ unsigned l;
+ struct btree *b;
+};
+
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
- struct btree_path *, unsigned);
+ struct btree_path *, unsigned,
+ struct get_locks_fail *);
+
bool __bch2_btree_path_upgrade(struct btree_trans *,
- struct btree_path *, unsigned);
+ struct btree_path *, unsigned,
+ struct get_locks_fail *);
static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
+ struct get_locks_fail f;
unsigned old_locks_want = path->locks_want;
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
if (path->locks_want < new_locks_want
- ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
+ ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
: path->uptodate == BTREE_ITER_UPTODATE)
return 0;
trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
- old_locks_want, new_locks_want);
+ old_locks_want, new_locks_want, &f);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
}
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 04c1f4610972..12907beda98c 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -78,6 +78,53 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
bch2_btree_init_next(trans, b);
}
+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+ while (--i >= trans->updates) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
+
+ bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
+ }
+
+ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
+}
+
+static inline int bch2_trans_lock_write(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+
+ EBUG_ON(trans->write_locked);
+
+ trans_for_each_update(trans, i) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
+
+ if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
+ return trans_lock_write_fail(trans, i);
+
+ if (!i->cached)
+ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+ }
+
+ trans->write_locked = true;
+ return 0;
+}
+
+static inline void bch2_trans_unlock_write(struct btree_trans *trans)
+{
+ if (likely(trans->write_locked)) {
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ if (!same_leaf_as_prev(trans, i))
+ bch2_btree_node_unlock_write_inlined(trans, i->path,
+ insert_l(i)->b);
+ trans->write_locked = false;
+ }
+}
+
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
@@ -269,23 +316,13 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
BUG_ON(i->level != i->path->level);
BUG_ON(i->btree_id != i->path->btree_id);
EBUG_ON(!i->level &&
+ btree_type_has_snapshots(i->btree_id) &&
!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
i->k->k.p.snapshot &&
bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
}
-static noinline int
-bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags,
- unsigned long trace_ip)
-{
- return drop_locks_do(trans,
- bch2_journal_preres_get(&trans->c->journal,
- &trans->journal_preres,
- trans->journal_preres_u64s,
- (flags & BCH_WATERMARK_MASK)));
-}
-
static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
unsigned flags)
{
@@ -320,6 +357,45 @@ static inline int btree_key_can_insert(struct btree_trans *trans,
return 0;
}
+noinline static int
+btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
+ struct btree_path *path, unsigned new_u64s)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+ struct bkey_cached *ck = (void *) path->l[0].b;
+ struct bkey_i *new_k;
+ int ret;
+
+ bch2_trans_unlock_write(trans);
+ bch2_trans_unlock(trans);
+
+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+ if (!new_k) {
+ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_id_str(path->btree_id), new_u64s);
+ return -BCH_ERR_ENOMEM_btree_key_cache_insert;
+ }
+
+ ret = bch2_trans_relock(trans) ?:
+ bch2_trans_lock_write(trans);
+ if (unlikely(ret)) {
+ kfree(new_k);
+ return ret;
+ }
+
+ memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
+
+ trans_for_each_update(trans, i)
+ if (i->old_v == &ck->k->v)
+ i->old_v = &new_k->v;
+
+ kfree(ck->k);
+ ck->u64s = new_u64s;
+ ck->k = new_k;
+ return 0;
+}
+
static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
struct btree_path *path, unsigned u64s)
{
@@ -346,12 +422,9 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
return 0;
new_u64s = roundup_pow_of_two(u64s);
- new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
- if (!new_k) {
- bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
- bch2_btree_ids[path->btree_id], new_u64s);
- return -BCH_ERR_ENOMEM_btree_key_cache_insert;
- }
+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
+ if (unlikely(!new_k))
+ return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
trans_for_each_update(trans, i)
if (i->old_v == &ck->k->v)
@@ -379,11 +452,10 @@ static int run_one_mem_trigger(struct btree_trans *trans,
if (unlikely(flags & BTREE_TRIGGER_NORUN))
return 0;
- if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id))
+ if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
return 0;
- if (old_ops->atomic_trigger == new_ops->atomic_trigger &&
- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+ if (old_ops->atomic_trigger == new_ops->atomic_trigger) {
ret = bch2_mark_key(trans, i->btree_id, i->level,
old, bkey_i_to_s_c(new),
BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
@@ -425,8 +497,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
if (!i->insert_trigger_run &&
!i->overwrite_trigger_run &&
- old_ops->trans_trigger == new_ops->trans_trigger &&
- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+ old_ops->trans_trigger == new_ops->trans_trigger) {
i->overwrite_trigger_run = true;
i->insert_trigger_run = true;
return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
@@ -683,7 +754,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
BCH_JSET_ENTRY_overwrite,
i->btree_id, i->level,
i->old_k.u64s);
- bkey_reassemble(&entry->start[0],
+ bkey_reassemble((struct bkey_i *) entry->start,
(struct bkey_s_c) { &i->old_k, i->old_v });
}
@@ -691,7 +762,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
BCH_JSET_ENTRY_btree_keys,
i->btree_id, i->level,
i->k->k.u64s);
- bkey_copy(&entry->start[0], i->k);
+ bkey_copy((struct bkey_i *) entry->start, i->k);
}
trans_for_each_wb_update(trans, wb) {
@@ -699,7 +770,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
BCH_JSET_ENTRY_btree_keys,
wb->btree, 0,
wb->k.k.u64s);
- bkey_copy(&entry->start[0], &wb->k);
+ bkey_copy((struct bkey_i *) entry->start, &wb->k);
}
if (trans->journal_seq)
@@ -733,37 +804,6 @@ revert_fs_usage:
return ret;
}
-static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
-{
- while (--i >= trans->updates) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
- }
-
- trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
-}
-
-static inline int trans_lock_write(struct btree_trans *trans)
-{
- struct btree_insert_entry *i;
-
- trans_for_each_update(trans, i) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
- return trans_lock_write_fail(trans, i);
-
- if (!i->cached)
- bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
- }
-
- return 0;
-}
-
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
struct btree_insert_entry *i;
@@ -776,12 +816,12 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans
bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
}
-static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags,
+static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
+ enum bkey_invalid_flags flags,
struct btree_insert_entry *i,
struct printbuf *err)
{
struct bch_fs *c = trans->c;
- int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
printbuf_reset(err);
prt_printf(err, "invalid bkey on insert from %s -> %ps",
@@ -792,8 +832,7 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, un
bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
prt_newline(err);
- bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
- i->bkey_type, rw, err);
+ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err);
bch2_print_string_as_lines(KERN_ERR, err->buf);
bch2_inconsistent_error(c);
@@ -832,15 +871,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
}
}
- ret = bch2_journal_preres_get(&c->journal,
- &trans->journal_preres, trans->journal_preres_u64s,
- (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK);
- if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked))
- ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip);
- if (unlikely(ret))
- return ret;
-
- ret = trans_lock_write(trans);
+ ret = bch2_trans_lock_write(trans);
if (unlikely(ret))
return ret;
@@ -849,10 +880,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
if (!ret && unlikely(trans->journal_replay_not_finished))
bch2_drop_overwrites_from_journal(trans);
- trans_for_each_update(trans, i)
- if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_unlock_write_inlined(trans, i->path,
- insert_l(i)->b);
+ bch2_trans_unlock_write(trans);
if (!ret && trans->journal_pin)
bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
@@ -864,12 +892,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
*/
bch2_journal_res_put(&c->journal, &trans->journal_res);
- if (unlikely(ret))
- return ret;
-
- bch2_trans_downgrade(trans);
-
- return 0;
+ return ret;
}
static int journal_reclaim_wait_done(struct bch_fs *c)
@@ -1010,7 +1033,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
struct bch_fs *c = trans->c;
struct btree_insert_entry *i = NULL;
struct btree_write_buffered_key *wb;
- unsigned u64s;
int ret = 0;
if (!trans->nr_updates &&
@@ -1034,7 +1056,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
i->bkey_type, invalid_flags, &buf)))
- ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf);
+ ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf);
btree_insert_entry_checks(trans, i);
printbuf_exit(&buf);
@@ -1070,13 +1092,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
- memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
-
trans->journal_u64s = trans->extra_journal_entries.nr;
- trans->journal_preres_u64s = 0;
-
trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
-
if (trans->journal_transaction_names)
trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
@@ -1092,16 +1109,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (i->key_cache_already_flushed)
continue;
- /* we're going to journal the key being updated: */
- u64s = jset_u64s(i->k->k.u64s);
- if (i->cached &&
- likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY)))
- trans->journal_preres_u64s += u64s;
-
if (i->flags & BTREE_UPDATE_NOJOURNAL)
continue;
- trans->journal_u64s += u64s;
+ /* we're going to journal the key being updated: */
+ trans->journal_u64s += jset_u64s(i->k->k.u64s);
/* and we're also going to log the overwrite: */
if (trans->journal_transaction_names)
@@ -1133,11 +1145,11 @@ retry:
trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
- bch2_journal_preres_put(&c->journal, &trans->journal_preres);
-
if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
bch2_write_ref_put(c, BCH_WRITE_REF_trans);
out_reset:
+ if (!ret)
+ bch2_trans_downgrade(trans);
bch2_trans_reset_updates(trans);
return ret;
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index c9a38e254949..60453ba86c4b 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -5,7 +5,7 @@
#include <linux/list.h>
#include <linux/rhashtable.h>
-//#include "bkey_methods.h"
+#include "btree_key_cache_types.h"
#include "buckets_types.h"
#include "darray.h"
#include "errcode.h"
@@ -163,7 +163,7 @@ struct btree_cache {
unsigned used;
unsigned reserve;
atomic_t dirty;
- struct shrinker shrink;
+ struct shrinker *shrink;
/*
* If we need to allocate memory for a new btree node and that
@@ -228,6 +228,8 @@ struct btree_path {
u8 sorted_idx;
u8 ref;
u8 intent_ref;
+ u32 alloc_seq;
+ u32 downgrade_seq;
/* btree_iter_copy starts here: */
struct bpos pos;
@@ -310,31 +312,6 @@ struct btree_iter {
#endif
};
-struct btree_key_cache_freelist {
- struct bkey_cached *objs[16];
- unsigned nr;
-};
-
-struct btree_key_cache {
- struct mutex lock;
- struct rhashtable table;
- bool table_init_done;
- struct list_head freed_pcpu;
- struct list_head freed_nonpcpu;
- struct shrinker shrink;
- unsigned shrink_iter;
- struct btree_key_cache_freelist __percpu *pcpu_freed;
-
- atomic_long_t nr_freed;
- atomic_long_t nr_keys;
- atomic_long_t nr_dirty;
-};
-
-struct bkey_cached_key {
- u32 btree_id;
- struct bpos pos;
-} __packed __aligned(4);
-
#define BKEY_CACHED_ACCESSED 0
#define BKEY_CACHED_DIRTY 1
@@ -350,7 +327,6 @@ struct bkey_cached {
struct rhash_head hash;
struct list_head list;
- struct journal_preres res;
struct journal_entry_pin journal;
u64 seq;
@@ -387,11 +363,7 @@ struct btree_insert_entry {
unsigned long ip_allocated;
};
-#ifndef CONFIG_LOCKDEP
#define BTREE_ITER_MAX 64
-#else
-#define BTREE_ITER_MAX 32
-#endif
struct btree_trans_commit_hook;
typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
@@ -424,6 +396,7 @@ struct btree_trans {
u8 nr_updates;
u8 nr_wb_updates;
u8 wb_updates_size;
+ bool srcu_held:1;
bool used_mempool:1;
bool in_traverse_all:1;
bool paths_sorted:1;
@@ -431,6 +404,7 @@ struct btree_trans {
bool journal_transaction_names:1;
bool journal_replay_not_finished:1;
bool notrace_relock_fail:1;
+ bool write_locked:1;
enum bch_errcode restarted:16;
u32 restart_count;
unsigned long last_begin_ip;
@@ -462,11 +436,9 @@ struct btree_trans {
struct journal_entry_pin *journal_pin;
struct journal_res journal_res;
- struct journal_preres journal_preres;
u64 *journal_seq;
struct disk_reservation *disk_res;
unsigned journal_u64s;
- unsigned journal_preres_u64s;
struct replicas_delta_list *fs_usage_deltas;
};
@@ -636,16 +608,17 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i)
}
enum btree_node_type {
-#define x(kwd, val, ...) BKEY_TYPE_##kwd = val,
+ BKEY_TYPE_btree,
+#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1,
BCH_BTREE_IDS()
#undef x
- BKEY_TYPE_btree,
+ BKEY_TYPE_NR
};
/* Type of a key in btree @id at level @level: */
static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
{
- return level ? BKEY_TYPE_btree : (enum btree_node_type) id;
+ return level ? BKEY_TYPE_btree : (unsigned) id + 1;
}
/* Type of keys @b contains: */
@@ -654,19 +627,21 @@ static inline enum btree_node_type btree_node_type(struct btree *b)
return __btree_node_type(b->c.level, b->c.btree_id);
}
+const char *bch2_btree_node_type_str(enum btree_node_type);
+
#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
- (BIT(BKEY_TYPE_extents)| \
- BIT(BKEY_TYPE_alloc)| \
- BIT(BKEY_TYPE_inodes)| \
- BIT(BKEY_TYPE_stripes)| \
- BIT(BKEY_TYPE_reflink)| \
- BIT(BKEY_TYPE_btree))
+ (BIT_ULL(BKEY_TYPE_extents)| \
+ BIT_ULL(BKEY_TYPE_alloc)| \
+ BIT_ULL(BKEY_TYPE_inodes)| \
+ BIT_ULL(BKEY_TYPE_stripes)| \
+ BIT_ULL(BKEY_TYPE_reflink)| \
+ BIT_ULL(BKEY_TYPE_btree))
#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
- (BIT(BKEY_TYPE_alloc)| \
- BIT(BKEY_TYPE_inodes)| \
- BIT(BKEY_TYPE_stripes)| \
- BIT(BKEY_TYPE_snapshots))
+ (BIT_ULL(BKEY_TYPE_alloc)| \
+ BIT_ULL(BKEY_TYPE_inodes)| \
+ BIT_ULL(BKEY_TYPE_stripes)| \
+ BIT_ULL(BKEY_TYPE_snapshots))
#define BTREE_NODE_TYPE_HAS_TRIGGERS \
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
@@ -674,13 +649,13 @@ static inline enum btree_node_type btree_node_type(struct btree *b)
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
- return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
+ return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type);
}
static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
const unsigned mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << nr)
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
BCH_BTREE_IDS()
#undef x
;
@@ -690,7 +665,7 @@ static inline bool btree_node_type_is_extents(enum btree_node_type type)
static inline bool btree_id_is_extents(enum btree_id btree)
{
- return btree_node_type_is_extents((enum btree_node_type) btree);
+ return btree_node_type_is_extents(__btree_node_type(0, btree));
}
static inline bool btree_type_has_snapshots(enum btree_id id)
@@ -704,6 +679,17 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
return (1U << id) & mask;
}
+static inline bool btree_type_has_snapshot_field(enum btree_id id)
+{
+ const unsigned mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return (1U << id) & mask;
+}
+
static inline bool btree_type_has_ptrs(enum btree_id id)
{
const unsigned mask = 0
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 7dbf6b6c7f34..76f27bc9fa24 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -513,8 +513,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
up_read(&c->gc_lock);
as->took_gc_lock = false;
- bch2_journal_preres_put(&c->journal, &as->journal_preres);
-
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
bch2_disk_reservation_put(c, &as->disk_res);
@@ -734,8 +732,6 @@ err:
bch2_journal_pin_drop(&c->journal, &as->journal);
- bch2_journal_preres_put(&c->journal, &as->journal_preres);
-
mutex_lock(&c->btree_interior_update_lock);
for (i = 0; i < as->nr_new_nodes; i++) {
b = as->new_nodes[i];
@@ -1047,7 +1043,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
unsigned nr_nodes[2] = { 0, 0 };
unsigned update_level = level;
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- unsigned journal_flags = 0;
int ret = 0;
u32 restart_count = trans->restart_count;
@@ -1061,10 +1056,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
- if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
- journal_flags |= JOURNAL_RES_GET_NONBLOCK;
- journal_flags |= watermark;
-
while (1) {
nr_nodes[!!update_level] += 1 + split;
update_level++;
@@ -1129,27 +1120,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
if (ret)
goto err;
- ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
- BTREE_UPDATE_JOURNAL_RES,
- journal_flags|JOURNAL_RES_GET_NONBLOCK);
- if (ret) {
- if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
- goto err;
- }
-
- ret = drop_locks_do(trans,
- bch2_journal_preres_get(&c->journal, &as->journal_preres,
- BTREE_UPDATE_JOURNAL_RES,
- journal_flags));
- if (ret == -BCH_ERR_journal_preres_get_blocked) {
- trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
- }
- if (ret)
- goto err;
- }
-
ret = bch2_disk_reservation_get(c, &as->disk_res,
(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
c->opts.metadata_replicas,
@@ -1274,14 +1244,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
btree_node_type(b), WRITE, &buf) ?:
- bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) {
+ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) {
printbuf_reset(&buf);
prt_printf(&buf, "inserting invalid bkey\n ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
prt_printf(&buf, "\n ");
bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
btree_node_type(b), WRITE, &buf);
- bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf);
+ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf);
bch2_fs_inconsistent(c, "%s", buf.buf);
dump_stack();
@@ -1987,7 +1957,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
out:
if (new_path)
bch2_path_put(trans, new_path, true);
- bch2_btree_path_downgrade(trans, iter->path);
+ bch2_trans_downgrade(trans);
return ret;
err:
bch2_btree_node_free_never_used(as, trans, n);
@@ -2411,30 +2381,24 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry
r->level = entry->level;
r->alive = true;
- bkey_copy(&r->key, &entry->start[0]);
+ bkey_copy(&r->key, (struct bkey_i *) entry->start);
mutex_unlock(&c->btree_root_lock);
}
struct jset_entry *
bch2_btree_roots_to_journal_entries(struct bch_fs *c,
- struct jset_entry *start,
- struct jset_entry *end)
+ struct jset_entry *end,
+ unsigned long skip)
{
- struct jset_entry *entry;
- unsigned long have = 0;
unsigned i;
- for (entry = start; entry < end; entry = vstruct_next(entry))
- if (entry->type == BCH_JSET_ENTRY_btree_root)
- __set_bit(entry->btree_id, &have);
-
mutex_lock(&c->btree_root_lock);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
- if (r->alive && !test_bit(i, &have)) {
+ if (r->alive && !test_bit(i, &skip)) {
journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
i, r->level, &r->key, r->key.k.u64s);
end = vstruct_next(end);
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 5e0a467fe905..031076e75fa1 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -55,7 +55,6 @@ struct btree_update {
unsigned update_level;
struct disk_reservation disk_res;
- struct journal_preres journal_preres;
/*
* BTREE_INTERIOR_UPDATING_NODE:
@@ -271,7 +270,7 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
ssize_t remaining_space =
- __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
+ __bch_btree_u64s_remaining(c, b, bne->keys.start);
if (unlikely(bset_written(b, bset(b, t)))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
@@ -303,7 +302,7 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
k.needs_whiteout = true;
b->whiteout_u64s += k.u64s;
- bkey_copy(unwritten_whiteouts_start(c, b), &k);
+ bkey_p_copy(unwritten_whiteouts_start(c, b), &k);
}
/*
@@ -325,7 +324,7 @@ bool bch2_btree_interior_updates_flush(struct bch_fs *);
void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
- struct jset_entry *, struct jset_entry *);
+ struct jset_entry *, unsigned long);
void bch2_do_pending_node_rewrites(struct bch_fs *);
void bch2_free_pending_node_rewrites(struct bch_fs *);
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index a1a4b5feadaa..58d8c6ffd955 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -370,8 +370,8 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
idx = bch2_replicas_entry_idx(c, r);
if (idx < 0 &&
- fsck_err(c, "no replicas entry\n"
- " while marking %s",
+ fsck_err(c, ptr_to_missing_replicas_entry,
+ "no replicas entry\n while marking %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
percpu_up_read(&c->mark_lock);
ret = bch2_mark_replicas(c, r);
@@ -695,6 +695,7 @@ static int check_bucket_ref(struct btree_trans *trans,
if (gen_after(ptr->gen, b_gen)) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -707,6 +708,7 @@ static int check_bucket_ref(struct btree_trans *trans,
if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_ptr_too_stale,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -720,6 +722,7 @@ static int check_bucket_ref(struct btree_trans *trans,
if (b_gen != ptr->gen && !ptr->cached) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_stale_dirty_ptr,
"bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -741,6 +744,7 @@ static int check_bucket_ref(struct btree_trans *trans,
ptr_data_type &&
bucket_data_type != ptr_data_type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_ptr_bucket_data_type_mismatch,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -754,6 +758,7 @@ static int check_bucket_ref(struct btree_trans *trans,
if ((u64) bucket_sectors + sectors > U32_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_bucket_sector_count_overflow,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -935,14 +940,12 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans,
return 0;
}
-int bch2_mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+static int __mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
@@ -1018,6 +1021,14 @@ int bch2_mark_extent(struct btree_trans *trans,
return 0;
}
+int bch2_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags);
+}
+
int bch2_mark_stripe(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
@@ -1124,13 +1135,11 @@ int bch2_mark_stripe(struct btree_trans *trans,
return 0;
}
-int bch2_mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+static int __mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bch_fs_usage *fs_usage;
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
@@ -1157,6 +1166,14 @@ int bch2_mark_reservation(struct btree_trans *trans,
return 0;
}
+int bch2_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags);
+}
+
static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
u64 start, u64 end,
@@ -1183,7 +1200,8 @@ static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
*idx = r->offset;
return 0;
not_found:
- if (fsck_err(c, "pointer to missing indirect extent\n"
+ if (fsck_err(c, reflink_p_to_missing_reflink_v,
+ "pointer to missing indirect extent\n"
" %s\n"
" missing range %llu-%llu",
(bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
@@ -1211,13 +1229,11 @@ fsck_err:
return ret;
}
-int bch2_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+static int __mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
struct reflink_gc *ref;
size_t l, r, m;
@@ -1251,6 +1267,14 @@ int bch2_mark_reflink_p(struct btree_trans *trans,
return ret;
}
+int bch2_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
void bch2_trans_fs_usage_revert(struct btree_trans *trans,
struct replicas_delta_list *deltas)
{
@@ -1298,7 +1322,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
struct bch_fs *c = trans->c;
static int warned_disk_usage = 0;
bool warn = false;
- unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
struct replicas_delta *d, *d2;
struct replicas_delta *top = (void *) deltas->d + deltas->used;
struct bch_fs_usage *dst;
@@ -1357,7 +1381,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
bch2_trans_inconsistent(trans,
- "disk usage increased %lli more than %u sectors reserved)",
+ "disk usage increased %lli more than %llu sectors reserved)",
should_not_have_added, disk_res_sectors);
return 0;
need_mark:
@@ -1452,15 +1476,11 @@ err:
return ret;
}
-int bch2_trans_mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
+static int __trans_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
- ? old
- : bkey_i_to_s_c(new);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
@@ -1517,6 +1537,24 @@ int bch2_trans_mark_extent(struct btree_trans *trans,
return ret;
}
+int bch2_trans_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) -
+ (int) bch2_bkey_needs_rebalance(c, old);
+
+ if (mod) {
+ int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0);
+ if (ret)
+ return ret;
+ }
+
+ return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags);
+}
+
static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
struct bkey_s_c_stripe s,
unsigned idx, bool deleting)
@@ -1670,15 +1708,10 @@ int bch2_trans_mark_stripe(struct btree_trans *trans,
return ret;
}
-int bch2_trans_mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
+static int __trans_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
- ? old
- : bkey_i_to_s_c(new);
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
struct replicas_delta_list *d;
@@ -1700,7 +1733,16 @@ int bch2_trans_mark_reservation(struct btree_trans *trans,
return 0;
}
-static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
+int bch2_trans_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
+{
+ return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags);
+}
+
+static int trans_mark_reflink_p_segment(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
u64 *idx, unsigned flags)
{
@@ -1767,35 +1809,38 @@ err:
return ret;
}
-int bch2_trans_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
+static int __trans_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
- ? old
- : bkey_i_to_s_c(new);
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
u64 idx, end_idx;
int ret = 0;
- if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
-
- v->front_pad = v->back_pad = 0;
- }
-
idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
end_idx = le64_to_cpu(p.v->idx) + p.k->size +
le32_to_cpu(p.v->back_pad);
while (idx < end_idx && !ret)
- ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags);
-
+ ret = trans_mark_reflink_p_segment(trans, p, &idx, flags);
return ret;
}
+int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
+{
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v;
+
+ v->front_pad = v->back_pad = 0;
+ }
+
+ return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
enum bch_data_type type,
@@ -1818,6 +1863,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
if (a->v.data_type && type && a->v.data_type != type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_bucket_metadata_type_mismatch,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
iter.pos.inode, iter.pos.offset, a->v.gen,
@@ -1825,16 +1871,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
bch2_data_types[type],
bch2_data_types[type]);
ret = -EIO;
- goto out;
+ goto err;
}
- a->v.data_type = type;
- a->v.dirty_sectors = sectors;
-
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
- if (ret)
- goto out;
-out:
+ if (a->v.data_type != type ||
+ a->v.dirty_sectors != sectors) {
+ a->v.data_type = type;
+ a->v.dirty_sectors = sectors;
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ }
+err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -1929,6 +1975,22 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
return ret;
}
+int bch2_trans_mark_dev_sbs(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_online_member(ca, c, i) {
+ int ret = bch2_trans_mark_dev_sb(c, ca);
+ if (ret) {
+ percpu_ref_put(&ca->ref);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
/* Disk reservations: */
#define SECTORS_CACHE 1024
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index bf8d7f407e9c..21f6cb356921 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -339,12 +339,27 @@ int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct
int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
+({ \
+ int ret = 0; \
+ \
+ if (_old.k->type) \
+ ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \
+ if (!ret && _new.k->type) \
+ ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \
+ ret; \
+})
+
+#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \
+ mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags)
+
void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
size_t, enum bch_data_type, unsigned);
int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
+int bch2_trans_mark_dev_sbs(struct bch_fs *);
static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
{
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index f69e15dc699c..4bb88aefed12 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -332,8 +332,8 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
struct bch_ioctl_data_event e = {
.type = BCH_DATA_EVENT_PROGRESS,
.p.data_type = ctx->stats.data_type,
- .p.btree_id = ctx->stats.btree_id,
- .p.pos = ctx->stats.pos,
+ .p.btree_id = ctx->stats.pos.btree,
+ .p.pos = ctx->stats.pos.pos,
.p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
.p.sectors_total = bch2_fs_usage_read_short(c).used,
};
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 1480b64547b0..a8b148ec2a2b 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -697,14 +697,32 @@ err:
return ret;
}
+void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
+{
+ struct bch_compression_opt opt = bch2_compression_decode(v);
+
+ if (opt.type < BCH_COMPRESSION_OPT_NR)
+ prt_str(out, bch2_compression_opts[opt.type]);
+ else
+ prt_printf(out, "(unknown compression opt %u)", opt.type);
+ if (opt.level)
+ prt_printf(out, ":%u", opt.level);
+}
+
void bch2_opt_compression_to_text(struct printbuf *out,
struct bch_fs *c,
struct bch_sb *sb,
u64 v)
{
- struct bch_compression_opt opt = bch2_compression_decode(v);
+ return bch2_compression_opt_to_text(out, v);
+}
- prt_str(out, bch2_compression_opts[opt.type]);
- if (opt.level)
- prt_printf(out, ":%u", opt.level);
+int bch2_opt_compression_validate(u64 v, struct printbuf *err)
+{
+ if (!bch2_compression_opt_valid(v)) {
+ prt_printf(err, "invalid compression opt %llu", v);
+ return -BCH_ERR_invalid_sb_opt_compression;
+ }
+
+ return 0;
}
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
index 052ea303241f..607fd5e232c9 100644
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
@@ -4,12 +4,18 @@
#include "extents_types.h"
+static const unsigned __bch2_compression_opt_to_type[] = {
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
+ BCH_COMPRESSION_OPTS()
+#undef x
+};
+
struct bch_compression_opt {
u8 type:4,
level:4;
};
-static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
+static inline struct bch_compression_opt __bch2_compression_decode(unsigned v)
{
return (struct bch_compression_opt) {
.type = v & 15,
@@ -17,17 +23,25 @@ static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
};
}
+static inline bool bch2_compression_opt_valid(unsigned v)
+{
+ struct bch_compression_opt opt = __bch2_compression_decode(v);
+
+ return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level);
+}
+
+static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
+{
+ return bch2_compression_opt_valid(v)
+ ? __bch2_compression_decode(v)
+ : (struct bch_compression_opt) { 0 };
+}
+
static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
{
return opt.type|(opt.level << 4);
}
-static const unsigned __bch2_compression_opt_to_type[] = {
-#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
- BCH_COMPRESSION_OPTS()
-#undef x
-};
-
static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
{
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
@@ -44,12 +58,16 @@ int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
void bch2_fs_compress_exit(struct bch_fs *);
int bch2_fs_compress_init(struct bch_fs *);
+void bch2_compression_opt_to_text(struct printbuf *, u64);
+
int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+int bch2_opt_compression_validate(u64, struct printbuf *);
#define bch2_opt_compression (struct bch_opt_fn) { \
- .parse = bch2_opt_compression_parse, \
- .to_text = bch2_opt_compression_to_text, \
+ .parse = bch2_opt_compression_parse, \
+ .to_text = bch2_opt_compression_to_text, \
+ .validate = bch2_opt_compression_validate, \
}
#endif /* _BCACHEFS_COMPRESS_H */
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index 114f86b45fd5..87b4b2d1ec76 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -69,9 +69,15 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more,
_ret; \
})
+#define darray_remove_item(_d, _pos) \
+ array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
+
#define darray_for_each(_d, _i) \
for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+#define darray_for_each_reverse(_d, _i) \
+ for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
+
#define darray_init(_d) \
do { \
(_d)->data = NULL; \
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 899ff46de8e0..5ed66202c226 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -13,6 +13,7 @@
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
+#include "rebalance.h"
#include "subvolume.h"
#include "trace.h"
@@ -161,11 +162,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
if (((1U << i) & m->data_opts.rewrite_ptrs) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached) {
- bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
- /*
- * See comment below:
bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
- */
rewrites_found |= 1U << i;
}
i++;
@@ -211,14 +208,8 @@ restart_drop_extra_replicas:
if (!p.ptr.cached &&
durability - ptr_durability >= m->op.opts.data_replicas) {
durability -= ptr_durability;
- bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr);
- /*
- * Currently, we're dropping unneeded replicas
- * instead of marking them as cached, since
- * cached data in stripe buckets prevents them
- * from being reused:
+
bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
- */
goto restart_drop_extra_replicas;
}
}
@@ -248,14 +239,42 @@ restart_drop_extra_replicas:
next_pos = insert->k.p;
+ /*
+ * Check for nonce offset inconsistency:
+ * This is debug code - we've been seeing this bug rarely, and
+ * it's been hard to reproduce, so this should give us some more
+ * information when it does occur:
+ */
+ struct printbuf err = PRINTBUF;
+ int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
+ printbuf_exit(&err);
+
+ if (invalid) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "about to insert invalid key in data update path");
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+
+ bch2_fatal_error(c);
+ goto out;
+ }
+
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
- k.k->p, insert->k.p);
- if (ret)
- goto err;
-
- ret = bch2_trans_update(trans, &iter, insert,
+ k.k->p, insert->k.p) ?:
+ bch2_bkey_set_needs_rebalance(c, insert,
+ op->opts.background_target,
+ op->opts.background_compression) ?:
+ bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
NULL,
@@ -281,11 +300,11 @@ next:
}
continue;
nowork:
- if (m->ctxt && m->ctxt->stats) {
+ if (m->stats && m->stats) {
BUG_ON(k.k->p.offset <= iter.pos.offset);
- atomic64_inc(&m->ctxt->stats->keys_raced);
+ atomic64_inc(&m->stats->keys_raced);
atomic64_add(k.k->p.offset - iter.pos.offset,
- &m->ctxt->stats->sectors_raced);
+ &m->stats->sectors_raced);
}
this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
@@ -439,6 +458,8 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_bkey_buf_reassemble(&m->k, c, k);
m->btree_id = btree_id;
m->data_opts = data_opts;
+ m->ctxt = ctxt;
+ m->stats = ctxt ? ctxt->stats : NULL;
bch2_write_op_init(&m->op, c, io_opts);
m->op.pos = bkey_start_pos(k.k);
@@ -487,7 +508,7 @@ int bch2_data_update_init(struct btree_trans *trans,
if (c->opts.nocow_enabled) {
if (ctxt) {
- move_ctxt_wait_event(ctxt, trans,
+ move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
PTR_BUCKET_POS(c, &p.ptr), 0)) ||
!atomic_read(&ctxt->read_sectors));
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index 7ca1f98d7e94..9dc17b9d8379 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -23,6 +23,7 @@ struct data_update {
struct bkey_buf k;
struct data_update_opts data_opts;
struct moving_context *ctxt;
+ struct bch_move_stats *stats;
struct bch_write_op op;
};
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 75a3dc7cbd47..57c5128db173 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -517,7 +517,7 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
prt_printf(out, "%px btree=%s l=%u ",
b,
- bch2_btree_ids[b->c.btree_id],
+ bch2_btree_id_str(b->c.btree_id),
b->c.level);
prt_newline(out);
@@ -919,18 +919,18 @@ void bch2_fs_debug_init(struct bch_fs *c)
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
- debugfs_create_file(bch2_btree_ids[bd->id],
+ debugfs_create_file(bch2_btree_id_str(bd->id),
0400, c->btree_debug_dir, bd,
&btree_debug_ops);
snprintf(name, sizeof(name), "%s-formats",
- bch2_btree_ids[bd->id]);
+ bch2_btree_id_str(bd->id));
debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
&btree_format_debug_ops);
snprintf(name, sizeof(name), "%s-bfloat-failed",
- bch2_btree_ids[bd->id]);
+ bch2_btree_id_str(bd->id));
debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
&bfloat_failed_debug_ops);
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 6c6c8d57d72b..1a0f2d571569 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -97,61 +97,51 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
.is_visible = dirent_is_visible,
};
-int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
struct qstr d_name = bch2_dirent_get_name(d);
+ int ret = 0;
- if (!d_name.len) {
- prt_printf(err, "empty name");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(!d_name.len, c, err,
+ dirent_empty_name,
+ "empty name");
- if (bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len)) {
- prt_printf(err, "value too big (%zu > %u)",
- bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err,
+ dirent_val_too_big,
+ "value too big (%zu > %u)",
+ bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
/*
* Check new keys don't exceed the max length
* (older keys may be larger.)
*/
- if ((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX) {
- prt_printf(err, "dirent name too big (%u > %u)",
- d_name.len, BCH_NAME_MAX);
- return -BCH_ERR_invalid_bkey;
- }
-
- if (d_name.len != strnlen(d_name.name, d_name.len)) {
- prt_printf(err, "dirent has stray data after name's NUL");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (d_name.len == 1 && !memcmp(d_name.name, ".", 1)) {
- prt_printf(err, "invalid name");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (d_name.len == 2 && !memcmp(d_name.name, "..", 2)) {
- prt_printf(err, "invalid name");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (memchr(d_name.name, '/', d_name.len)) {
- prt_printf(err, "invalid name");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (d.v->d_type != DT_SUBVOL &&
- le64_to_cpu(d.v->d_inum) == d.k->p.inode) {
- prt_printf(err, "dirent points to own directory");
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err,
+ dirent_name_too_long,
+ "dirent name too big (%u > %u)",
+ d_name.len, BCH_NAME_MAX);
+
+ bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err,
+ dirent_name_embedded_nul,
+ "dirent has stray data after name's NUL");
+
+ bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) ||
+ (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err,
+ dirent_name_dot_or_dotdot,
+ "invalid name");
+
+ bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err,
+ dirent_name_has_slash,
+ "name with /");
+
+ bkey_fsck_err_on(d.v->d_type != DT_SUBVOL &&
+ le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err,
+ dirent_to_itself,
+ "dirent points to own directory");
+fsck_err:
+ return ret;
}
void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index e9fa1df38232..cd262bf4d9c5 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -7,7 +7,7 @@
enum bkey_invalid_flags;
extern const struct bch_hash_desc bch2_dirent_hash_desc;
-int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index e00133b6ea51..4d0cb0ccff32 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -175,6 +175,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
dst->deleted = BCH_GROUP_DELETED(src);
dst->parent = BCH_GROUP_PARENT(src);
+ memcpy(dst->label, src->label, sizeof(dst->label));
}
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
@@ -382,7 +383,57 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
return v;
}
-void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v)
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
+{
+ struct bch_disk_groups_cpu *groups;
+ struct bch_disk_group_cpu *g;
+ unsigned nr = 0;
+ u16 path[32];
+
+ out->atomic++;
+ rcu_read_lock();
+ groups = rcu_dereference(c->disk_groups);
+ if (!groups)
+ goto invalid;
+
+ while (1) {
+ if (nr == ARRAY_SIZE(path))
+ goto invalid;
+
+ if (v >= groups->nr)
+ goto invalid;
+
+ g = groups->entries + v;
+
+ if (g->deleted)
+ goto invalid;
+
+ path[nr++] = v;
+
+ if (!g->parent)
+ break;
+
+ v = g->parent - 1;
+ }
+
+ while (nr) {
+ v = path[--nr];
+ g = groups->entries + v;
+
+ prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
+ if (nr)
+ prt_printf(out, ".");
+ }
+out:
+ rcu_read_unlock();
+ out->atomic--;
+ return;
+invalid:
+ prt_printf(out, "invalid label %u", v);
+ goto out;
+}
+
+void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
{
struct bch_sb_field_disk_groups *groups =
bch2_sb_field_get(sb, disk_groups);
@@ -493,10 +544,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
return -EINVAL;
}
-void bch2_opt_target_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_sb *sb,
- u64 v)
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
{
struct target t = target_decode(v);
@@ -504,47 +552,71 @@ void bch2_opt_target_to_text(struct printbuf *out,
case TARGET_NULL:
prt_printf(out, "none");
break;
- case TARGET_DEV:
- if (c) {
- struct bch_dev *ca;
-
- rcu_read_lock();
- ca = t.dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[t.dev])
- : NULL;
-
- if (ca && percpu_ref_tryget(&ca->io_ref)) {
- prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
- percpu_ref_put(&ca->io_ref);
- } else if (ca) {
- prt_printf(out, "offline device %u", t.dev);
- } else {
- prt_printf(out, "invalid device %u", t.dev);
- }
-
- rcu_read_unlock();
+ case TARGET_DEV: {
+ struct bch_dev *ca;
+
+ out->atomic++;
+ rcu_read_lock();
+ ca = t.dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[t.dev])
+ : NULL;
+
+ if (ca && percpu_ref_tryget(&ca->io_ref)) {
+ prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
+ percpu_ref_put(&ca->io_ref);
+ } else if (ca) {
+ prt_printf(out, "offline device %u", t.dev);
} else {
- struct bch_member m = bch2_sb_member_get(sb, t.dev);
-
- if (bch2_dev_exists(sb, t.dev)) {
- prt_printf(out, "Device ");
- pr_uuid(out, m.uuid.b);
- prt_printf(out, " (%u)", t.dev);
- } else {
- prt_printf(out, "Bad device %u", t.dev);
- }
+ prt_printf(out, "invalid device %u", t.dev);
}
+
+ rcu_read_unlock();
+ out->atomic--;
break;
+ }
case TARGET_GROUP:
- if (c) {
- mutex_lock(&c->sb_lock);
- bch2_disk_path_to_text(out, c->disk_sb.sb, t.group);
- mutex_unlock(&c->sb_lock);
+ bch2_disk_path_to_text(out, c, t.group);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
+{
+ struct target t = target_decode(v);
+
+ switch (t.type) {
+ case TARGET_NULL:
+ prt_printf(out, "none");
+ break;
+ case TARGET_DEV: {
+ struct bch_member m = bch2_sb_member_get(sb, t.dev);
+
+ if (bch2_dev_exists(sb, t.dev)) {
+ prt_printf(out, "Device ");
+ pr_uuid(out, m.uuid.b);
+ prt_printf(out, " (%u)", t.dev);
} else {
- bch2_disk_path_to_text(out, sb, t.group);
+ prt_printf(out, "Bad device %u", t.dev);
}
break;
+ }
+ case TARGET_GROUP:
+ bch2_disk_path_to_text_sb(out, sb, t.group);
+ break;
default:
BUG();
}
}
+
+void bch2_opt_target_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
+{
+ if (c)
+ bch2_target_to_text(out, c, v);
+ else
+ bch2_target_to_text_sb(out, sb, v);
+}
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
index bd7711767fd4..441826fff224 100644
--- a/fs/bcachefs/disk_groups.h
+++ b/fs/bcachefs/disk_groups.h
@@ -2,6 +2,8 @@
#ifndef _BCACHEFS_DISK_GROUPS_H
#define _BCACHEFS_DISK_GROUPS_H
+#include "disk_groups_types.h"
+
extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
@@ -83,7 +85,10 @@ int bch2_disk_path_find(struct bch_sb_handle *, const char *);
/* Exported for userspace bcachefs-tools: */
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
-void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned);
+void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned);
+void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned);
+
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned);
int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
diff --git a/fs/bcachefs/disk_groups_types.h b/fs/bcachefs/disk_groups_types.h
new file mode 100644
index 000000000000..a54ef085b13d
--- /dev/null
+++ b/fs/bcachefs/disk_groups_types.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H
+#define _BCACHEFS_DISK_GROUPS_TYPES_H
+
+struct bch_disk_group_cpu {
+ bool deleted;
+ u16 parent;
+ u8 label[BCH_SB_LABEL_SIZE];
+ struct bch_devs_mask devs;
+};
+
+struct bch_disk_groups_cpu {
+ struct rcu_head rcu;
+ unsigned nr;
+ struct bch_disk_group_cpu entries[] __counted_by(nr);
+};
+
+#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 8646856e4539..2a77de18c004 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -105,29 +105,26 @@ struct ec_bio {
/* Stripes btree keys: */
-int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ int ret = 0;
- if (bkey_eq(k.k->p, POS_MIN)) {
- prt_printf(err, "stripe at POS_MIN");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
+ bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
+ stripe_pos_bad,
+ "stripe at bad pos");
- if (k.k->p.inode) {
- prt_printf(err, "nonzero inode field");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) {
- prt_printf(err, "incorrect value size (%zu < %u)",
- bkey_val_u64s(k.k), stripe_val_u64s(s));
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
+ stripe_val_size_bad,
+ "incorrect value size (%zu < %u)",
+ bkey_val_u64s(k.k), stripe_val_u64s(s));
- return bch2_bkey_ptrs_invalid(c, k, flags, err);
+ ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
}
void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
@@ -153,6 +150,7 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
if (i < nr_data)
prt_printf(out, "#%u", stripe_blockcount_get(s, i));
+ prt_printf(out, " gen %u", ptr->gen);
if (ptr_stale(ca, ptr))
prt_printf(out, " stale");
}
@@ -306,16 +304,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) {
- struct printbuf buf2 = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key));
+ struct printbuf err = PRINTBUF;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
+
+ prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
+ want.hi, want.lo,
+ got.hi, got.lo,
+ bch2_csum_types[v->csum_type]);
+ prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
+ bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
+ bch_err_ratelimited(ca, "%s", err.buf);
+ printbuf_exit(&err);
- bch_err_ratelimited(c,
- "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
- (void *) _RET_IP_, i, j, v->csum_type,
- want.lo, got.lo, buf2.buf);
- printbuf_exit(&buf2);
clear_bit(i, buf->valid);
+
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
break;
}
@@ -373,7 +376,11 @@ static void ec_block_endio(struct bio *bio)
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca,
+ bio_data_dir(bio)
+ ? BCH_MEMBER_ERROR_write
+ : BCH_MEMBER_ERROR_read,
+ "erasure coding %s error: %s",
bio_data_dir(bio) ? "write" : "read",
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
@@ -474,14 +481,10 @@ err:
return ret;
}
-static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
-{
- return bch2_trans_run(c, get_stripe_key_trans(trans, idx, stripe));
-}
-
/* recovery read path: */
-int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
{
+ struct bch_fs *c = trans->c;
struct ec_stripe_buf *buf;
struct closure cl;
struct bch_stripe *v;
@@ -496,7 +499,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
if (!buf)
return -BCH_ERR_ENOMEM_ec_read_extent;
- ret = get_stripe_key(c, rbio->pick.ec.idx, buf);
+ ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
if (ret) {
bch_err_ratelimited(c,
"error doing reconstruct read: error %i looking up stripe", ret);
@@ -1370,6 +1373,15 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
h->nr_active_devs++;
rcu_read_unlock();
+
+ /*
+ * If we only have redundancy + 1 devices, we're better off with just
+ * replication:
+ */
+ if (h->nr_active_devs < h->redundancy + 2)
+ bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
+ h->nr_active_devs, h->redundancy + 2);
+
list_add(&h->list, &c->ec_stripe_head_list);
return h;
}
@@ -1421,6 +1433,11 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
found:
+ if (!IS_ERR_OR_NULL(h) &&
+ h->nr_active_devs < h->redundancy + 2) {
+ mutex_unlock(&h->lock);
+ h = NULL;
+ }
mutex_unlock(&c->ec_stripe_head_lock);
return h;
}
@@ -1678,8 +1695,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
int ret;
h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
- if (!h)
- bch_err(c, "no stripe head");
if (IS_ERR_OR_NULL(h))
return h;
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 966d165a3b66..7d0237c9819f 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -8,7 +8,7 @@
enum bkey_invalid_flags;
-int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
@@ -199,7 +199,7 @@ struct ec_stripe_head {
struct ec_stripe_new *s;
};
-int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
+int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 7cc083776a2e..68a1a96bb7ca 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -3,6 +3,8 @@
#define _BCACHEFS_ERRCODE_H
#define BCH_ERRCODES() \
+ x(ERANGE, ERANGE_option_too_small) \
+ x(ERANGE, ERANGE_option_too_big) \
x(ENOMEM, ENOMEM_stripe_buf) \
x(ENOMEM, ENOMEM_replicas_table) \
x(ENOMEM, ENOMEM_cpu_replicas) \
@@ -213,6 +215,8 @@
x(BCH_ERR_invalid_sb, invalid_sb_crypt) \
x(BCH_ERR_invalid_sb, invalid_sb_clean) \
x(BCH_ERR_invalid_sb, invalid_sb_quota) \
+ x(BCH_ERR_invalid_sb, invalid_sb_errors) \
+ x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \
x(BCH_ERR_invalid, invalid_bkey) \
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
x(EIO, btree_node_read_err) \
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 2a5af8872613..7b28d37922fd 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -56,8 +56,9 @@ void bch2_io_error_work(struct work_struct *work)
up_write(&c->state_lock);
}
-void bch2_io_error(struct bch_dev *ca)
+void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
{
+ atomic64_inc(&ca->errors[type]);
//queue_work(system_long_wq, &ca->io_error_work);
}
@@ -116,31 +117,34 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
return NULL;
- list_for_each_entry(s, &c->fsck_errors, list)
+ list_for_each_entry(s, &c->fsck_error_msgs, list)
if (s->fmt == fmt) {
/*
* move it to the head of the list: repeated fsck errors
* are common
*/
- list_move(&s->list, &c->fsck_errors);
+ list_move(&s->list, &c->fsck_error_msgs);
return s;
}
s = kzalloc(sizeof(*s), GFP_NOFS);
if (!s) {
- if (!c->fsck_alloc_err)
+ if (!c->fsck_alloc_msgs_err)
bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
- c->fsck_alloc_err = true;
+ c->fsck_alloc_msgs_err = true;
return NULL;
}
INIT_LIST_HEAD(&s->list);
s->fmt = fmt;
- list_add(&s->list, &c->fsck_errors);
+ list_add(&s->list, &c->fsck_error_msgs);
return s;
}
-int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
+int bch2_fsck_err(struct bch_fs *c,
+ enum bch_fsck_flags flags,
+ enum bch_sb_error_id err,
+ const char *fmt, ...)
{
struct fsck_err_state *s = NULL;
va_list args;
@@ -148,11 +152,13 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
struct printbuf buf = PRINTBUF, *out = &buf;
int ret = -BCH_ERR_fsck_ignore;
+ bch2_sb_error_count(c, err);
+
va_start(args, fmt);
prt_vprintf(out, fmt, args);
va_end(args);
- mutex_lock(&c->fsck_error_lock);
+ mutex_lock(&c->fsck_error_msgs_lock);
s = fsck_err_get(c, fmt);
if (s) {
/*
@@ -162,7 +168,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
*/
if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
ret = s->ret;
- mutex_unlock(&c->fsck_error_lock);
+ mutex_unlock(&c->fsck_error_msgs_lock);
printbuf_exit(&buf);
return ret;
}
@@ -257,7 +263,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
if (s)
s->ret = ret;
- mutex_unlock(&c->fsck_error_lock);
+ mutex_unlock(&c->fsck_error_msgs_lock);
printbuf_exit(&buf);
@@ -278,9 +284,9 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
{
struct fsck_err_state *s, *n;
- mutex_lock(&c->fsck_error_lock);
+ mutex_lock(&c->fsck_error_msgs_lock);
- list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
+ list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
if (s->ratelimited && s->last_msg)
bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
@@ -289,5 +295,5 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
kfree(s);
}
- mutex_unlock(&c->fsck_error_lock);
+ mutex_unlock(&c->fsck_error_msgs_lock);
}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 7ce9540052e5..d167d65986e0 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -4,6 +4,7 @@
#include <linux/list.h>
#include <linux/printk.h>
+#include "sb-errors.h"
struct bch_dev;
struct bch_fs;
@@ -101,18 +102,26 @@ struct fsck_err_state {
char *last_msg;
};
-#define FSCK_CAN_FIX (1 << 0)
-#define FSCK_CAN_IGNORE (1 << 1)
-#define FSCK_NEED_FSCK (1 << 2)
-#define FSCK_NO_RATELIMIT (1 << 3)
+enum bch_fsck_flags {
+ FSCK_CAN_FIX = 1 << 0,
+ FSCK_CAN_IGNORE = 1 << 1,
+ FSCK_NEED_FSCK = 1 << 2,
+ FSCK_NO_RATELIMIT = 1 << 3,
+};
+
+#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
-__printf(3, 4) __cold
-int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...);
+__printf(4, 5) __cold
+int bch2_fsck_err(struct bch_fs *,
+ enum bch_fsck_flags,
+ enum bch_sb_error_id,
+ const char *, ...);
void bch2_flush_fsck_errs(struct bch_fs *);
-#define __fsck_err(c, _flags, msg, ...) \
+#define __fsck_err(c, _flags, _err_type, ...) \
({ \
- int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \
+ int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type, \
+ __VA_ARGS__); \
\
if (_ret != -BCH_ERR_fsck_fix && \
_ret != -BCH_ERR_fsck_ignore) { \
@@ -127,26 +136,53 @@ void bch2_flush_fsck_errs(struct bch_fs *);
/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
-#define __fsck_err_on(cond, c, _flags, ...) \
- (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
+#define __fsck_err_on(cond, c, _flags, _err_type, ...) \
+ (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false)
+
+#define need_fsck_err_on(cond, c, _err_type, ...) \
+ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
+
+#define need_fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
+
+#define mustfix_fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
+
+#define mustfix_fsck_err_on(cond, c, _err_type, ...) \
+ __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
-#define need_fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+#define fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-#define need_fsck_err(c, ...) \
- __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+#define fsck_err_on(cond, c, _err_type, ...) \
+ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-#define mustfix_fsck_err(c, ...) \
- __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)
+static inline void bch2_bkey_fsck_err(struct bch_fs *c,
+ struct printbuf *err_msg,
+ enum bch_sb_error_id err_type,
+ const char *fmt, ...)
+{
+ va_list args;
-#define mustfix_fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)
+ va_start(args, fmt);
+ prt_vprintf(err_msg, fmt, args);
+ va_end(args);
-#define fsck_err(c, ...) \
- __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
+}
-#define fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
+#define bkey_fsck_err(c, _err_msg, _err_type, ...) \
+do { \
+ prt_printf(_err_msg, __VA_ARGS__); \
+ bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \
+ ret = -BCH_ERR_invalid_bkey; \
+ goto fsck_err; \
+} while (0)
+
+#define bkey_fsck_err_on(cond, ...) \
+do { \
+ if (unlikely(cond)) \
+ bkey_fsck_err(__VA_ARGS__); \
+} while (0)
/*
* Fatal errors: these don't indicate a bug, but we can't continue running in RW
@@ -179,26 +215,26 @@ do { \
void bch2_io_error_work(struct work_struct *);
/* Does the error handling without logging a message */
-void bch2_io_error(struct bch_dev *);
+void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
-#define bch2_dev_io_err_on(cond, ca, ...) \
+#define bch2_dev_io_err_on(cond, ca, _type, ...) \
({ \
bool _ret = (cond); \
\
if (_ret) { \
bch_err_dev_ratelimited(ca, __VA_ARGS__); \
- bch2_io_error(ca); \
+ bch2_io_error(ca, _type); \
} \
_ret; \
})
-#define bch2_dev_inum_io_err_on(cond, ca, ...) \
+#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \
({ \
bool _ret = (cond); \
\
if (_ret) { \
bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \
- bch2_io_error(ca); \
+ bch2_io_error(ca, _type); \
} \
_ret; \
})
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 1b25f84e4b9c..a864de231b69 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -13,6 +13,7 @@
#include "btree_iter.h"
#include "buckets.h"
#include "checksum.h"
+#include "compress.h"
#include "debug.h"
#include "disk_groups.h"
#include "error.h"
@@ -162,17 +163,19 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
/* KEY_TYPE_btree_ptr: */
-int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) {
- prt_printf(err, "value too big (%zu > %u)",
- bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err,
+ btree_ptr_val_too_big,
+ "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
- return bch2_bkey_ptrs_invalid(c, k, flags, err);
+ ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
}
void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
@@ -181,17 +184,20 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
bch2_bkey_ptrs_to_text(out, c, k);
}
-int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) {
- prt_printf(err, "value too big (%zu > %zu)",
- bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err,
+ btree_ptr_v2_val_too_big,
+ "value too big (%zu > %zu)",
+ bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
- return bch2_bkey_ptrs_invalid(c, k, flags, err);
+ ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
}
void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
@@ -372,19 +378,18 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
/* KEY_TYPE_reservation: */
-int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+ int ret = 0;
- if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) {
- prt_printf(err, "invalid nr_replicas (%u)",
- r.v->nr_replicas);
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err,
+ reservation_key_nr_replicas_invalid,
+ "invalid nr_replicas (%u)", r.v->nr_replicas);
+fsck_err:
+ return ret;
}
void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
@@ -757,18 +762,6 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
return i;
}
-static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
-{
- union bch_extent_entry *next = extent_entry_next(entry);
-
- /* stripes have ptrs, but their layout doesn't work with this code */
- BUG_ON(k.k->type == KEY_TYPE_stripe);
-
- memmove_u64s_down(entry, next,
- (u64 *) bkey_val_end(k) - (u64 *) next);
- k.k->u64s -= (u64 *) next - (u64 *) entry;
-}
-
/*
* Returns pointer to the next entry after the one being dropped:
*/
@@ -992,10 +985,6 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
- struct bch_extent_crc_unpacked crc;
- const struct bch_extent_ptr *ptr;
- const struct bch_extent_stripe_ptr *ec;
- struct bch_dev *ca;
bool first = true;
if (c)
@@ -1006,9 +995,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " ");
switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- ptr = entry_to_ptr(entry);
- ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ case BCH_EXTENT_ENTRY_ptr: {
+ const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
+ struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
@@ -1030,10 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " stale");
}
break;
+ }
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128:
- crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+ case BCH_EXTENT_ENTRY_crc128: {
+ struct bch_extent_crc_unpacked crc =
+ bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
crc.compressed_size,
@@ -1042,12 +1033,26 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
bch2_csum_types[crc.csum_type],
bch2_compression_types[crc.compression_type]);
break;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- ec = &entry->stripe_ptr;
+ }
+ case BCH_EXTENT_ENTRY_stripe_ptr: {
+ const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
prt_printf(out, "ec: idx %llu block %u",
(u64) ec->idx, ec->block);
break;
+ }
+ case BCH_EXTENT_ENTRY_rebalance: {
+ const struct bch_extent_rebalance *r = &entry->rebalance;
+
+ prt_str(out, "rebalance: target ");
+ if (c)
+ bch2_target_to_text(out, c, r->target);
+ else
+ prt_printf(out, "%u", r->target);
+ prt_str(out, " compression ");
+ bch2_compression_opt_to_text(out, r->compression);
+ break;
+ }
default:
prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
@@ -1057,7 +1062,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
}
}
-static int extent_ptr_invalid(const struct bch_fs *c,
+static int extent_ptr_invalid(struct bch_fs *c,
struct bkey_s_c k,
enum bkey_invalid_flags flags,
const struct bch_extent_ptr *ptr,
@@ -1070,6 +1075,7 @@ static int extent_ptr_invalid(const struct bch_fs *c,
u64 bucket;
u32 bucket_offset;
struct bch_dev *ca;
+ int ret = 0;
if (!bch2_dev_exists2(c, ptr->dev)) {
/*
@@ -1080,41 +1086,33 @@ static int extent_ptr_invalid(const struct bch_fs *c,
if (flags & BKEY_INVALID_WRITE)
return 0;
- prt_printf(err, "pointer to invalid device (%u)", ptr->dev);
- return -BCH_ERR_invalid_bkey;
+ bkey_fsck_err(c, err, ptr_to_invalid_device,
+ "pointer to invalid device (%u)", ptr->dev);
}
ca = bch_dev_bkey_exists(c, ptr->dev);
bkey_for_each_ptr(ptrs, ptr2)
- if (ptr != ptr2 && ptr->dev == ptr2->dev) {
- prt_printf(err, "multiple pointers to same device (%u)", ptr->dev);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err,
+ ptr_to_duplicate_device,
+ "multiple pointers to same device (%u)", ptr->dev);
bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
- if (bucket >= ca->mi.nbuckets) {
- prt_printf(err, "pointer past last bucket (%llu > %llu)",
- bucket, ca->mi.nbuckets);
- return -BCH_ERR_invalid_bkey;
- }
-
- if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) {
- prt_printf(err, "pointer before first bucket (%llu < %u)",
- bucket, ca->mi.first_bucket);
- return -BCH_ERR_invalid_bkey;
- }
-
- if (bucket_offset + size_ondisk > ca->mi.bucket_size) {
- prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)",
+ bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err,
+ ptr_after_last_bucket,
+ "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets);
+ bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err,
+ ptr_before_first_bucket,
+ "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket);
+ bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err,
+ ptr_spans_multiple_buckets,
+ "pointer spans multiple buckets (%u + %u > %u)",
bucket_offset, size_ondisk, ca->mi.bucket_size);
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+fsck_err:
+ return ret;
}
-int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
@@ -1124,24 +1122,22 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
unsigned size_ondisk = k.k->size;
unsigned nonce = UINT_MAX;
unsigned nr_ptrs = 0;
- bool unwritten = false, have_ec = false, crc_since_last_ptr = false;
- int ret;
+ bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
+ int ret = 0;
if (bkey_is_btree_ptr(k.k))
size_ondisk = btree_sectors(c);
bkey_extent_entry_for_each(ptrs, entry) {
- if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) {
- prt_printf(err, "invalid extent entry type (got %u, max %u)",
- __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err,
+ extent_ptrs_invalid_entry,
+ "invalid extent entry type (got %u, max %u)",
+ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
- if (bkey_is_btree_ptr(k.k) &&
- !extent_entry_is_ptr(entry)) {
- prt_printf(err, "has non ptr field");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
+ !extent_entry_is_ptr(entry), c, err,
+ btree_ptr_has_non_ptr,
+ "has non ptr field");
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
@@ -1150,22 +1146,15 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
if (ret)
return ret;
- if (nr_ptrs && unwritten != entry->ptr.unwritten) {
- prt_printf(err, "extent with unwritten and written ptrs");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) {
- prt_printf(err, "has unwritten ptrs");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err,
+ ptr_cached_and_erasure_coded,
+ "cached, erasure coded ptr");
- if (entry->ptr.cached && have_ec) {
- prt_printf(err, "cached, erasure coded ptr");
- return -BCH_ERR_invalid_bkey;
- }
+ if (!entry->ptr.unwritten)
+ have_written = true;
+ else
+ have_unwritten = true;
- unwritten = entry->ptr.unwritten;
have_ec = false;
crc_since_last_ptr = false;
nr_ptrs++;
@@ -1175,72 +1164,77 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
case BCH_EXTENT_ENTRY_crc128:
crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- if (crc.offset + crc.live_size >
- crc.uncompressed_size) {
- prt_printf(err, "checksum offset + key size > uncompressed size");
- return -BCH_ERR_invalid_bkey;
- }
-
- size_ondisk = crc.compressed_size;
-
- if (!bch2_checksum_type_valid(c, crc.csum_type)) {
- prt_printf(err, "invalid checksum type");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) {
- prt_printf(err, "invalid compression type");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err,
+ ptr_crc_uncompressed_size_too_small,
+ "checksum offset + key size > uncompressed size");
+ bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err,
+ ptr_crc_csum_type_unknown,
+ "invalid checksum type");
+ bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err,
+ ptr_crc_compression_type_unknown,
+ "invalid compression type");
if (bch2_csum_type_is_encryption(crc.csum_type)) {
if (nonce == UINT_MAX)
nonce = crc.offset + crc.nonce;
- else if (nonce != crc.offset + crc.nonce) {
- prt_printf(err, "incorrect nonce");
- return -BCH_ERR_invalid_bkey;
- }
+ else if (nonce != crc.offset + crc.nonce)
+ bkey_fsck_err(c, err, ptr_crc_nonce_mismatch,
+ "incorrect nonce");
}
- if (crc_since_last_ptr) {
- prt_printf(err, "redundant crc entry");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(crc_since_last_ptr, c, err,
+ ptr_crc_redundant,
+ "redundant crc entry");
crc_since_last_ptr = true;
+
+ bkey_fsck_err_on(crc_is_encoded(crc) &&
+ (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
+ (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err,
+ ptr_crc_uncompressed_size_too_big,
+ "too large encoded extent");
+
+ size_ondisk = crc.compressed_size;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
- if (have_ec) {
- prt_printf(err, "redundant stripe entry");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(have_ec, c, err,
+ ptr_stripe_redundant,
+ "redundant stripe entry");
have_ec = true;
break;
- case BCH_EXTENT_ENTRY_rebalance:
+ case BCH_EXTENT_ENTRY_rebalance: {
+ const struct bch_extent_rebalance *r = &entry->rebalance;
+
+ if (!bch2_compression_opt_valid(r->compression)) {
+ struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
+ prt_printf(err, "invalid compression opt %u:%u",
+ opt.type, opt.level);
+ return -BCH_ERR_invalid_bkey;
+ }
break;
}
+ }
}
- if (!nr_ptrs) {
- prt_str(err, "no ptrs");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (nr_ptrs >= BCH_BKEY_PTRS_MAX) {
- prt_str(err, "too many ptrs");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (crc_since_last_ptr) {
- prt_printf(err, "redundant crc entry");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (have_ec) {
- prt_printf(err, "redundant stripe entry");
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ bkey_fsck_err_on(!nr_ptrs, c, err,
+ extent_ptrs_no_ptrs,
+ "no ptrs");
+ bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err,
+ extent_ptrs_too_many_ptrs,
+ "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
+ bkey_fsck_err_on(have_written && have_unwritten, c, err,
+ extent_ptrs_written_and_unwritten,
+ "extent with unwritten and written ptrs");
+ bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err,
+ extent_ptrs_unwritten,
+ "has unwritten ptrs");
+ bkey_fsck_err_on(crc_since_last_ptr, c, err,
+ extent_ptrs_redundant_crc,
+ "redundant crc entry");
+ bkey_fsck_err_on(have_ec, c, err,
+ extent_ptrs_redundant_stripe,
+ "redundant stripe entry");
+fsck_err:
+ return ret;
}
void bch2_ptr_swab(struct bkey_s k)
@@ -1281,6 +1275,125 @@ void bch2_ptr_swab(struct bkey_s k)
}
}
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
+ return &entry->rebalance;
+
+ return NULL;
+}
+
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned rewrite_ptrs = 0;
+
+ if (compression) {
+ unsigned compression_type = bch2_compression_opt_to_type(compression);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned i = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) {
+ rewrite_ptrs = 0;
+ goto incompressible;
+ }
+
+ if (!p.ptr.cached && p.crc.compression_type != compression_type)
+ rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
+incompressible:
+ if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
+ const struct bch_extent_ptr *ptr;
+ unsigned i = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
+ rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
+
+ return rewrite_ptrs;
+}
+
+bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+ /*
+ * If it's an indirect extent, we don't delete the rebalance entry when
+ * done so that we know what options were applied - check if it still
+ * needs work done:
+ */
+ if (r &&
+ k.k->type == KEY_TYPE_reflink_v &&
+ !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
+ r = NULL;
+
+ return r != NULL;
+}
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_s k = bkey_i_to_s(_k);
+ struct bch_extent_rebalance *r;
+ bool needs_rebalance;
+
+ if (!bkey_extent_is_direct_data(k.k))
+ return 0;
+
+ /* get existing rebalance entry: */
+ r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
+ if (r) {
+ if (k.k->type == KEY_TYPE_reflink_v) {
+ /*
+ * indirect extents: existing options take precedence,
+ * so that we don't move extents back and forth if
+ * they're referenced by different inodes with different
+ * options:
+ */
+ if (r->target)
+ target = r->target;
+ if (r->compression)
+ compression = r->compression;
+ }
+
+ r->target = target;
+ r->compression = compression;
+ }
+
+ needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);
+
+ if (needs_rebalance && !r) {
+ union bch_extent_entry *new = bkey_val_end(k);
+
+ new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance;
+ new->rebalance.compression = compression;
+ new->rebalance.target = target;
+ new->rebalance.unused = 0;
+ k.k->u64s += extent_entry_u64s(new);
+ } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
+ /*
+ * For indirect extents, don't delete the rebalance entry when
+ * we're finished so that we know we specifically moved it or
+ * compressed it to its current location/compression type
+ */
+ extent_entry_drop(k, (union bch_extent_entry *) r);
+ }
+
+ return 0;
+}
+
/* Generic extent code: */
int bch2_cut_front_s(struct bpos where, struct bkey_s k)
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 879e7d218b6a..a2ce8a3be13c 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -89,6 +89,18 @@ static inline void __extent_entry_insert(struct bkey_i *k,
memcpy_u64s_small(dst, new, extent_entry_u64s(new));
}
+static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
+{
+ union bch_extent_entry *next = extent_entry_next(entry);
+
+ /* stripes have ptrs, but their layout doesn't work with this code */
+ BUG_ON(k.k->type == KEY_TYPE_stripe);
+
+ memmove_u64s_down(entry, next,
+ (u64 *) bkey_val_end(k) - (u64 *) next);
+ k.k->u64s -= (u64 *) next - (u64 *) entry;
+}
+
static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
@@ -190,6 +202,11 @@ static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
}
+static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
+{
+ return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
+}
+
/* bkey_ptrs: generically over any key type that has ptrs */
struct bkey_ptrs_c {
@@ -383,12 +400,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
/* KEY_TYPE_btree_ptr: */
-int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
@@ -428,7 +445,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
/* KEY_TYPE_reservation: */
-int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
@@ -688,11 +705,19 @@ void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_ptr_swab(struct bkey_s);
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
+ unsigned, unsigned);
+bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
+ unsigned, unsigned);
+
/* Generic extent code: */
enum bch_extent_overlap {
@@ -737,22 +762,4 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
k->size = new_size;
}
-/*
- * In extent_sort_fix_overlapping(), insert_fixup_extent(),
- * extent_merge_inline() - we're modifying keys in place that are packed. To do
- * that we have to unpack the key, modify the unpacked key - then this
- * copies/repacks the unpacked to the original as necessary.
- */
-static inline void extent_save(struct btree *b, struct bkey_packed *dst,
- struct bkey *src)
-{
- struct bkey_format *f = &b->format;
- struct bkey_i *dst_unpacked;
-
- if ((dst_unpacked = packed_to_bkey(dst)))
- dst_unpacked->k = *src;
- else
- BUG_ON(!bch2_bkey_pack_key(dst, src, f));
-}
-
#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index bb5305441f27..4496cf91a4c1 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -51,7 +51,7 @@ int bch2_create_trans(struct btree_trans *trans,
bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
if (flags & BCH_CREATE_TMPFILE)
- new_inode->bi_flags |= BCH_INODE_UNLINKED;
+ new_inode->bi_flags |= BCH_INODE_unlinked;
ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
if (ret)
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 58ccc7b91ac7..52f0e7acda3d 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -389,6 +389,21 @@ static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs
return ret;
}
+/*
+ * Determine when a writepage io is full. We have to limit writepage bios to a
+ * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
+ * what the bounce path in bch2_write_extent() can handle. In theory we could
+ * loosen this restriction for non-bounce I/O, but we don't have that context
+ * here. Ideally, we can up this limit and make it configurable in the future
+ * when the bounce path can be enhanced to accommodate larger source bios.
+ */
+static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len)
+{
+ struct bio *bio = &io->op.wbio.bio;
+ return bio_full(bio, len) ||
+ (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE);
+}
+
static void bch2_writepage_io_done(struct bch_write_op *op)
{
struct bch_writepage_io *io =
@@ -606,9 +621,7 @@ do_io:
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
- bio_full(&w->io->op.wbio.bio, sectors << 9) ||
- w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
- (BIO_MAX_VECS * PAGE_SIZE) ||
+ bch_io_full(w->io, sectors << 9) ||
bio_end_sector(&w->io->op.wbio.bio) != sector))
bch2_writepage_do_io(w);
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 6a9557e7ecab..5b42a76c4796 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -113,6 +113,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
} else {
atomic_set(&dio->cl.remaining,
CLOSURE_REMAINING_INITIALIZER + 1);
+ dio->cl.closure_get_happened = true;
}
dio->req = req;
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index 8bd9bcdd27f7..ff664fd0d8ef 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -13,7 +13,7 @@
int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
loff_t start, u64 end,
- int fgp_flags, gfp_t gfp,
+ fgf_t fgp_flags, gfp_t gfp,
folios *fs)
{
struct folio *f;
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
index a2222ad586e9..27f712ae37a6 100644
--- a/fs/bcachefs/fs-io-pagecache.h
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -7,7 +7,7 @@
typedef DARRAY(struct folio *) folios;
int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
- u64, int, gfp_t, folios *);
+ u64, fgf_t, gfp_t, folios *);
int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
/*
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 6040bd3f0778..5a39bcb597a3 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -45,13 +45,13 @@ static int bch2_inode_flags_set(struct btree_trans *trans,
unsigned newflags = s->flags;
unsigned oldflags = bi->bi_flags & s->mask;
- if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) &&
+ if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) &&
!capable(CAP_LINUX_IMMUTABLE))
return -EPERM;
if (!S_ISREG(bi->bi_mode) &&
!S_ISDIR(bi->bi_mode) &&
- (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags)
+ (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
return -EINVAL;
if (s->set_projinherit) {
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
index 54a9c21a3b83..d30f9bb056fd 100644
--- a/fs/bcachefs/fs-ioctl.h
+++ b/fs/bcachefs/fs-ioctl.h
@@ -6,28 +6,28 @@
/* bcachefs inode flags -> vfs inode flags: */
static const __maybe_unused unsigned bch_flags_to_vfs[] = {
- [__BCH_INODE_SYNC] = S_SYNC,
- [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
- [__BCH_INODE_APPEND] = S_APPEND,
- [__BCH_INODE_NOATIME] = S_NOATIME,
+ [__BCH_INODE_sync] = S_SYNC,
+ [__BCH_INODE_immutable] = S_IMMUTABLE,
+ [__BCH_INODE_append] = S_APPEND,
+ [__BCH_INODE_noatime] = S_NOATIME,
};
/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
static const __maybe_unused unsigned bch_flags_to_uflags[] = {
- [__BCH_INODE_SYNC] = FS_SYNC_FL,
- [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
- [__BCH_INODE_APPEND] = FS_APPEND_FL,
- [__BCH_INODE_NODUMP] = FS_NODUMP_FL,
- [__BCH_INODE_NOATIME] = FS_NOATIME_FL,
+ [__BCH_INODE_sync] = FS_SYNC_FL,
+ [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
+ [__BCH_INODE_append] = FS_APPEND_FL,
+ [__BCH_INODE_nodump] = FS_NODUMP_FL,
+ [__BCH_INODE_noatime] = FS_NOATIME_FL,
};
/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
static const __maybe_unused unsigned bch_flags_to_xflags[] = {
- [__BCH_INODE_SYNC] = FS_XFLAG_SYNC,
- [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE,
- [__BCH_INODE_APPEND] = FS_XFLAG_APPEND,
- [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP,
- [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME,
+ [__BCH_INODE_sync] = FS_XFLAG_SYNC,
+ [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
+ [__BCH_INODE_append] = FS_XFLAG_APPEND,
+ [__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
+ [__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
//[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
};
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 6642b88c41a0..8ef817304e4a 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -764,15 +764,15 @@ static int bch2_getattr(struct mnt_idmap *idmap,
stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
}
- if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
+ if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
stat->attributes |= STATX_ATTR_IMMUTABLE;
stat->attributes_mask |= STATX_ATTR_IMMUTABLE;
- if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
+ if (inode->ei_inode.bi_flags & BCH_INODE_append)
stat->attributes |= STATX_ATTR_APPEND;
stat->attributes_mask |= STATX_ATTR_APPEND;
- if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
+ if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
stat->attributes |= STATX_ATTR_NODUMP;
stat->attributes_mask |= STATX_ATTR_NODUMP;
@@ -1213,9 +1213,6 @@ static struct dentry *bch2_get_parent(struct dentry *child)
.inum = inode->ei_inode.bi_dir,
};
- if (!parent_inum.inum)
- return NULL;
-
return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
}
@@ -1904,7 +1901,7 @@ got_sb:
sb->s_flags |= SB_POSIXACL;
#endif
- sb->s_shrink.seeks = 0;
+ sb->s_shrink->seeks = 0;
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);
@@ -1925,10 +1922,7 @@ out:
return dget(sb->s_root);
err_put_super:
- sb->s_fs_info = NULL;
- c->vfs_sb = NULL;
deactivate_locked_super(sb);
- bch2_fs_stop(c);
return ERR_PTR(bch2_err_class(ret));
}
@@ -1936,11 +1930,8 @@ static void bch2_kill_sb(struct super_block *sb)
{
struct bch_fs *c = sb->s_fs_info;
- if (c)
- c->vfs_sb = NULL;
generic_shutdown_super(sb);
- if (c)
- bch2_fs_free(c);
+ bch2_fs_free(c);
}
static struct file_system_type bcache_fs_type = {
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index b8f9e7475dc5..e0c5cd119acc 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "bkey_buf.h"
+#include "btree_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "darray.h"
@@ -444,9 +445,10 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
if (i->equiv == n.equiv) {
bch_err(c, "snapshot deletion did not finish:\n"
" duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
- bch2_btree_ids[btree_id],
+ bch2_btree_id_str(btree_id),
pos.inode, pos.offset,
i->id, n.id, n.equiv);
+ set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
}
}
@@ -719,8 +721,9 @@ static int check_key_has_snapshot(struct btree_trans *trans,
int ret = 0;
if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
- "key in missing snapshot: %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ bkey_in_missing_snapshot,
+ "key in missing snapshot: %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
fsck_err:
@@ -789,6 +792,7 @@ static int hash_check_key(struct btree_trans *trans,
if (fsck_err_on(k.k->type == desc.key_type &&
!desc.cmp_bkey(k, hash_k), c,
+ hash_table_key_duplicate,
"duplicate hash table keys:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, hash_k),
@@ -807,8 +811,9 @@ out:
printbuf_exit(&buf);
return ret;
bad_hash:
- if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
- bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash,
+ if (fsck_err(c, hash_table_key_wrong_offset,
+ "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
+ bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
@@ -849,22 +854,23 @@ static int check_inode(struct btree_trans *trans,
BUG_ON(bch2_inode_unpack(k, &u));
if (!full &&
- !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
- BCH_INODE_I_SECTORS_DIRTY|
- BCH_INODE_UNLINKED)))
+ !(u.bi_flags & (BCH_INODE_i_size_dirty|
+ BCH_INODE_i_sectors_dirty|
+ BCH_INODE_unlinked)))
return 0;
if (prev->bi_inum != u.bi_inum)
*prev = u;
if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed ||
- inode_d_type(prev) != inode_d_type(&u), c,
+ inode_d_type(prev) != inode_d_type(&u),
+ c, inode_snapshot_mismatch,
"inodes in different snapshots don't match")) {
bch_err(c, "repair not implemented yet");
return -EINVAL;
}
- if ((u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED)) &&
+ if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
struct bpos new_min_pos;
@@ -872,7 +878,7 @@ static int check_inode(struct btree_trans *trans,
if (ret)
goto err;
- u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED;
+ u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
ret = __write_inode(trans, &u, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck updating inode");
@@ -884,9 +890,10 @@ static int check_inode(struct btree_trans *trans,
return 0;
}
- if (u.bi_flags & BCH_INODE_UNLINKED &&
+ if (u.bi_flags & BCH_INODE_unlinked &&
(!c->sb.clean ||
- fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
+ fsck_err(c, inode_unlinked_but_clean,
+ "filesystem marked clean, but inode %llu unlinked",
u.bi_inum))) {
bch2_trans_unlock(trans);
bch2_fs_lazy_rw(c);
@@ -896,9 +903,10 @@ static int check_inode(struct btree_trans *trans,
return ret;
}
- if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
+ if (u.bi_flags & BCH_INODE_i_size_dirty &&
(!c->sb.clean ||
- fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
+ fsck_err(c, inode_i_size_dirty_but_clean,
+ "filesystem marked clean, but inode %llu has i_size dirty",
u.bi_inum))) {
bch_verbose(c, "truncating inode %llu", u.bi_inum);
@@ -922,15 +930,16 @@ static int check_inode(struct btree_trans *trans,
* We truncated without our normal sector accounting hook, just
* make sure we recalculate it:
*/
- u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
+ u.bi_flags |= BCH_INODE_i_sectors_dirty;
- u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+ u.bi_flags &= ~BCH_INODE_i_size_dirty;
do_update = true;
}
- if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
+ if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
(!c->sb.clean ||
- fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
+ fsck_err(c, inode_i_sectors_dirty_but_clean,
+ "filesystem marked clean, but inode %llu has i_sectors dirty",
u.bi_inum))) {
s64 sectors;
@@ -944,14 +953,14 @@ static int check_inode(struct btree_trans *trans,
}
u.bi_sectors = sectors;
- u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
+ u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
do_update = true;
}
- if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
+ if (u.bi_flags & BCH_INODE_backptr_untrusted) {
u.bi_dir = 0;
u.bi_dir_offset = 0;
- u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
+ u.bi_flags &= ~BCH_INODE_backptr_untrusted;
do_update = true;
}
@@ -1056,10 +1065,11 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
return -BCH_ERR_internal_fsck_err;
}
- if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
- "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
- w->last_pos.inode, i->snapshot,
- i->inode.bi_sectors, i->count)) {
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
+ c, inode_i_sectors_wrong,
+ "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
+ w->last_pos.inode, i->snapshot,
+ i->inode.bi_sectors, i->count)) {
i->inode.bi_sectors = i->count;
ret = fsck_write_inode(trans, &i->inode, i->snapshot);
if (ret)
@@ -1200,7 +1210,8 @@ static int overlapping_extents_found(struct btree_trans *trans,
prt_printf(&buf, "\n overwriting %s extent",
pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
- if (fsck_err(c, "overlapping extents%s", buf.buf)) {
+ if (fsck_err(c, extent_overlapping,
+ "overlapping extents%s", buf.buf)) {
struct btree_iter *old_iter = &iter1;
struct disk_reservation res = { 0 };
@@ -1297,6 +1308,28 @@ err:
return ret;
}
+static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
+ unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
+
+ bkey_for_each_crc(k.k, ptrs, crc, i)
+ if (crc_is_encoded(crc) &&
+ crc.uncompressed_size > encoded_extent_max_sectors) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ return 0;
+}
+
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k,
struct inode_walker *inode,
@@ -1333,7 +1366,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
if (k.k->type != KEY_TYPE_whiteout) {
- if (fsck_err_on(!i, c,
+ if (fsck_err_on(!i, c, extent_in_missing_inode,
"extent in missing inode:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
@@ -1341,7 +1374,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
if (fsck_err_on(i &&
!S_ISREG(i->inode.bi_mode) &&
- !S_ISLNK(i->inode.bi_mode), c,
+ !S_ISLNK(i->inode.bi_mode),
+ c, extent_in_non_reg_inode,
"extent in non regular inode mode %o:\n %s",
i->inode.bi_mode,
(printbuf_reset(&buf),
@@ -1371,9 +1405,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
continue;
if (k.k->type != KEY_TYPE_whiteout) {
- if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
- !bkey_extent_is_reservation(k), c,
+ !bkey_extent_is_reservation(k),
+ c, extent_past_end_of_inode,
"extent type past end of inode %llu:%u, i_size %llu\n %s",
i->inode.bi_inum, i->snapshot, i->inode.bi_size,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
@@ -1432,7 +1467,8 @@ int bch2_check_extents(struct bch_fs *c)
&res, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
bch2_disk_reservation_put(c, &res);
- check_extent(trans, &iter, k, &w, &s, &extent_ends);
+ check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
+ check_extent_overbig(trans, &iter, k);
})) ?:
check_i_sectors(trans, &w);
@@ -1446,6 +1482,30 @@ int bch2_check_extents(struct bch_fs *c)
return ret;
}
+int bch2_check_indirect_extents(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct disk_reservation res = { 0 };
+ int ret = 0;
+
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
+ POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ &res, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
+ bch2_disk_reservation_put(c, &res);
+ check_extent_overbig(trans, &iter, k);
+ }));
+
+ bch2_disk_reservation_put(c, &res);
+ bch2_trans_put(trans);
+
+ bch_err_fn(c, ret);
+ return ret;
+}
+
static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
@@ -1470,7 +1530,8 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
continue;
}
- if (fsck_err_on(i->inode.bi_nlink != i->count, c,
+ if (fsck_err_on(i->inode.bi_nlink != i->count,
+ c, inode_dir_wrong_nlink,
"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
i->inode.bi_nlink = i->count;
@@ -1514,27 +1575,28 @@ static int check_dirent_target(struct btree_trans *trans,
backpointer_exists = ret;
ret = 0;
- if (fsck_err_on(S_ISDIR(target->bi_mode) &&
- backpointer_exists, c,
+ if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
+ c, inode_dir_multiple_links,
"directory %llu with multiple links",
target->bi_inum)) {
ret = __remove_dirent(trans, d.k->p);
goto out;
}
- if (fsck_err_on(backpointer_exists &&
- !target->bi_nlink, c,
+ if (fsck_err_on(backpointer_exists && !target->bi_nlink,
+ c, inode_multiple_links_but_nlink_0,
"inode %llu type %s has multiple links but i_nlink 0",
target->bi_inum, bch2_d_types[d.v->d_type])) {
target->bi_nlink++;
- target->bi_flags &= ~BCH_INODE_UNLINKED;
+ target->bi_flags &= ~BCH_INODE_unlinked;
ret = __write_inode(trans, target, target_snapshot);
if (ret)
goto err;
}
- if (fsck_err_on(!backpointer_exists, c,
+ if (fsck_err_on(!backpointer_exists,
+ c, inode_wrong_backpointer,
"inode %llu:%u has wrong backpointer:\n"
"got %llu:%llu\n"
"should be %llu:%llu",
@@ -1552,7 +1614,8 @@ static int check_dirent_target(struct btree_trans *trans,
}
}
- if (fsck_err_on(d.v->d_type != inode_d_type(target), c,
+ if (fsck_err_on(d.v->d_type != inode_d_type(target),
+ c, dirent_d_type_wrong,
"incorrect d_type: got %s, should be %s:\n%s",
bch2_d_type_str(d.v->d_type),
bch2_d_type_str(inode_d_type(target)),
@@ -1576,7 +1639,8 @@ static int check_dirent_target(struct btree_trans *trans,
if (d.v->d_type == DT_SUBVOL &&
target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
(c->sb.version < bcachefs_metadata_version_subvol_dirent ||
- fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
+ fsck_err(c, dirent_d_parent_subvol_wrong,
+ "dirent has wrong d_parent_subvol field: got %u, should be %u",
le32_to_cpu(d.v->d_parent_subvol),
target->bi_parent_subvol))) {
n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
@@ -1648,7 +1712,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
dir->first_this_inode = false;
- if (fsck_err_on(!i, c,
+ if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
"dirent in nonexisting directory:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
@@ -1660,7 +1724,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (!i)
goto out;
- if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
+ if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
+ c, dirent_in_non_dir_inode,
"dirent in non directory inode type %s:\n%s",
bch2_d_type_str(inode_d_type(&i->inode)),
(printbuf_reset(&buf),
@@ -1694,7 +1759,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
- if (fsck_err_on(ret, c,
+ if (fsck_err_on(ret, c, dirent_to_missing_subvol,
"dirent points to missing subvolume %u",
le32_to_cpu(d.v->d_child_subvol))) {
ret = __remove_dirent(trans, d.k->p);
@@ -1706,7 +1771,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
- if (fsck_err_on(ret, c,
+ if (fsck_err_on(ret, c, subvol_to_missing_root,
"subvolume %u points to missing subvolume root %llu",
target_subvol,
target_inum)) {
@@ -1715,7 +1780,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
}
- if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
+ if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
+ c, subvol_root_wrong_bi_subvol,
"subvol root %llu has wrong bi_subvol field: got %u, should be %u",
target_inum,
subvol_root.bi_subvol, target_subvol)) {
@@ -1734,7 +1800,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
- if (fsck_err_on(!target->inodes.nr, c,
+ if (fsck_err_on(!target->inodes.nr,
+ c, dirent_to_missing_inode,
"dirent points to missing inode: (equiv %u)\n%s",
equiv.snapshot,
(printbuf_reset(&buf),
@@ -1820,7 +1887,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
inode->first_this_inode = false;
- if (fsck_err_on(!i, c,
+ if (fsck_err_on(!i, c, xattr_in_missing_inode,
"xattr for missing inode %llu",
k.k->p.inode))
return bch2_btree_delete_at(trans, iter, 0);
@@ -1869,7 +1936,8 @@ static int check_root_trans(struct btree_trans *trans)
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
- if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
+ if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
+ "root subvol missing")) {
struct bkey_i_subvolume root_subvol;
snapshot = U32_MAX;
@@ -1895,8 +1963,10 @@ static int check_root_trans(struct btree_trans *trans)
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
- if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
- mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
+ if (mustfix_fsck_err_on(ret, c, root_dir_missing,
+ "root directory missing") ||
+ mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
+ c, root_inode_not_dir,
"root inode not a directory")) {
bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
0, NULL);
@@ -2000,7 +2070,8 @@ static int check_path(struct btree_trans *trans,
}
if (bch2_err_matches(ret, ENOENT)) {
- if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
+ if (fsck_err(c, inode_unreachable,
+ "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
inode->bi_inum, snapshot,
bch2_d_type_str(inode_d_type(inode)),
inode->bi_nlink,
@@ -2040,7 +2111,8 @@ static int check_path(struct btree_trans *trans,
pr_err("%llu:%u", i->inum, i->snapshot);
pr_err("%llu:%u", inode->bi_inum, snapshot);
- if (!fsck_err(c, "directory structure loop"))
+ if (!fsck_err(c, dir_loop,
+ "directory structure loop"))
return 0;
ret = commit_do(trans, NULL, NULL,
@@ -2088,7 +2160,7 @@ int bch2_check_directory_structure(struct bch_fs *c)
break;
}
- if (u.bi_flags & BCH_INODE_UNLINKED)
+ if (u.bi_flags & BCH_INODE_unlinked)
continue;
ret = check_path(trans, &path, &u, iter.pos.snapshot);
@@ -2148,7 +2220,7 @@ static int nlink_cmp(const void *_l, const void *_r)
const struct nlink *l = _l;
const struct nlink *r = _r;
- return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
+ return cmp_int(l->inum, r->inum);
}
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
@@ -2300,7 +2372,8 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
link = &links->d[++*idx];
}
- if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
+ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
+ c, inode_wrong_nlink,
"inode %llu type %s has wrong i_nlink (%u, should be %u)",
u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
bch2_inode_nlink_get(&u), link->count)) {
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
index 90c87b5089a0..da991e8cf27e 100644
--- a/fs/bcachefs/fsck.h
+++ b/fs/bcachefs/fsck.h
@@ -4,6 +4,7 @@
int bch2_check_inodes(struct bch_fs *);
int bch2_check_extents(struct bch_fs *);
+int bch2_check_indirect_extents(struct bch_fs *);
int bch2_check_dirents(struct bch_fs *);
int bch2_check_xattrs(struct bch_fs *);
int bch2_check_root(struct bch_fs *);
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index bb3f443d8381..c7849b0753e7 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -6,6 +6,7 @@
#include "bkey_methods.h"
#include "btree_update.h"
#include "buckets.h"
+#include "compress.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
@@ -19,13 +20,18 @@
#include <asm/unaligned.h>
-const char * const bch2_inode_opts[] = {
#define x(name, ...) #name,
+const char * const bch2_inode_opts[] = {
BCH_INODE_OPTS()
-#undef x
NULL,
};
+static const char * const bch2_inode_flag_strs[] = {
+ BCH_INODE_FLAGS()
+ NULL
+};
+#undef x
+
static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
static int inode_decode_field(const u8 *in, const u8 *end,
@@ -361,9 +367,10 @@ int bch2_inode_peek(struct btree_trans *trans,
return ret;
}
-int bch2_inode_write(struct btree_trans *trans,
+int bch2_inode_write_flags(struct btree_trans *trans,
struct btree_iter *iter,
- struct bch_inode_unpacked *inode)
+ struct bch_inode_unpacked *inode,
+ enum btree_update_flags flags)
{
struct bkey_inode_buf *inode_p;
@@ -373,7 +380,7 @@ int bch2_inode_write(struct btree_trans *trans,
bch2_inode_pack_inlined(inode_p, inode);
inode_p->inode.k.p.snapshot = iter->snapshot;
- return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
+ return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
}
struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
@@ -397,117 +404,121 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
return &inode_p->inode.k_i;
}
-static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err)
+static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err)
{
struct bch_inode_unpacked unpacked;
+ int ret = 0;
- if (k.k->p.inode) {
- prt_printf(err, "nonzero k.p.inode");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (k.k->p.offset < BLOCKDEV_INODE_MAX) {
- prt_printf(err, "fs inode in blockdev range");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(k.k->p.inode, c, err,
+ inode_pos_inode_nonzero,
+ "nonzero k.p.inode");
- if (bch2_inode_unpack(k, &unpacked)) {
- prt_printf(err, "invalid variable length fields");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err,
+ inode_pos_blockdev_range,
+ "fs inode in blockdev range");
- if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) {
- prt_printf(err, "invalid data checksum type (%u >= %u",
- unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err,
+ inode_unpack_error,
+ "invalid variable length fields");
- if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) {
- prt_printf(err, "invalid data checksum type (%u >= %u)",
- unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err,
+ inode_checksum_type_invalid,
+ "invalid data checksum type (%u >= %u",
+ unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
- if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
- unpacked.bi_nlink != 0) {
- prt_printf(err, "flagged as unlinked but bi_nlink != 0");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(unpacked.bi_compression &&
+ !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err,
+ inode_compression_type_invalid,
+ "invalid compression opt %u", unpacked.bi_compression - 1);
- if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) {
- prt_printf(err, "subvolume root but not a directory");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
+ unpacked.bi_nlink != 0, c, err,
+ inode_unlinked_but_nlink_nonzero,
+ "flagged as unlinked but bi_nlink != 0");
- return 0;
+ bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err,
+ inode_subvol_root_but_not_dir,
+ "subvolume root but not a directory");
+fsck_err:
+ return ret;
}
-int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+ int ret = 0;
- if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
- prt_printf(err, "invalid str hash type (%llu >= %u)",
- INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
+ inode_str_hash_invalid,
+ "invalid str hash type (%llu >= %u)",
+ INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
- return __bch2_inode_invalid(k, err);
+ ret = __bch2_inode_invalid(c, k, err);
+fsck_err:
+ return ret;
}
-int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+ int ret = 0;
- if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
- prt_printf(err, "invalid str hash type (%llu >= %u)",
- INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
+ inode_str_hash_invalid,
+ "invalid str hash type (%llu >= %u)",
+ INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
- return __bch2_inode_invalid(k, err);
+ ret = __bch2_inode_invalid(c, k, err);
+fsck_err:
+ return ret;
}
-int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+ int ret = 0;
- if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
- INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) {
- prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)",
- INODEv3_FIELDS_START(inode.v),
- INODEv3_FIELDS_START_INITIAL,
- bkey_val_u64s(inode.k));
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
+ INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err,
+ inode_v3_fields_start_bad,
+ "invalid fields_start (got %llu, min %u max %zu)",
+ INODEv3_FIELDS_START(inode.v),
+ INODEv3_FIELDS_START_INITIAL,
+ bkey_val_u64s(inode.k));
- if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
- prt_printf(err, "invalid str hash type (%llu >= %u)",
- INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
+ inode_str_hash_invalid,
+ "invalid str hash type (%llu >= %u)",
+ INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
- return __bch2_inode_invalid(k, err);
+ ret = __bch2_inode_invalid(c, k, err);
+fsck_err:
+ return ret;
}
static void __bch2_inode_unpacked_to_text(struct printbuf *out,
struct bch_inode_unpacked *inode)
{
- prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu",
- inode->bi_mode, inode->bi_flags,
+ prt_printf(out, "mode=%o ", inode->bi_mode);
+
+ prt_str(out, "flags=");
+ prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
+ prt_printf(out, " (%x)", inode->bi_flags);
+
+ prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu",
inode->bi_journal_seq,
inode->bi_size,
inode->bi_sectors,
inode->bi_version);
#define x(_name, _bits) \
- prt_printf(out, " "#_name " %llu", (u64) inode->_name);
+ prt_printf(out, " "#_name "=%llu", (u64) inode->_name);
BCH_INODE_FIELDS_v3()
#undef x
}
@@ -546,7 +557,7 @@ static inline u64 bkey_inode_flags(struct bkey_s_c k)
static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
{
- return bkey_inode_flags(k) & BCH_INODE_UNLINKED;
+ return bkey_inode_flags(k) & BCH_INODE_unlinked;
}
int bch2_trans_mark_inode(struct btree_trans *trans,
@@ -610,16 +621,17 @@ int bch2_mark_inode(struct btree_trans *trans,
return 0;
}
-int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- if (k.k->p.inode) {
- prt_printf(err, "nonzero k.p.inode");
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
- return 0;
+ bkey_fsck_err_on(k.k->p.inode, c, err,
+ inode_pos_inode_nonzero,
+ "nonzero k.p.inode");
+fsck_err:
+ return ret;
}
void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
@@ -926,8 +938,8 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
{
- if (bi->bi_flags & BCH_INODE_UNLINKED)
- bi->bi_flags &= ~BCH_INODE_UNLINKED;
+ if (bi->bi_flags & BCH_INODE_unlinked)
+ bi->bi_flags &= ~BCH_INODE_unlinked;
else {
if (bi->bi_nlink == U32_MAX)
return -EINVAL;
@@ -940,13 +952,13 @@ int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
{
- if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) {
+ if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
bi->bi_inum);
return;
}
- if (bi->bi_flags & BCH_INODE_UNLINKED) {
+ if (bi->bi_flags & BCH_INODE_unlinked) {
bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
return;
}
@@ -954,7 +966,7 @@ void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *
if (bi->bi_nlink)
bi->bi_nlink--;
else
- bi->bi_flags |= BCH_INODE_UNLINKED;
+ bi->bi_flags |= BCH_INODE_unlinked;
}
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
@@ -979,6 +991,18 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
}
+int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
+{
+ struct bch_inode_unpacked inode;
+ int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
+
+ if (ret)
+ return ret;
+
+ bch2_inode_opts_get(opts, trans->c, &inode);
+ return 0;
+}
+
int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
struct bch_fs *c = trans->c;
@@ -1042,53 +1066,85 @@ err:
return ret ?: -BCH_ERR_transaction_restart_nested;
}
-static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos)
+static int may_delete_deleted_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos,
+ bool *need_another_pass)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
+ struct btree_iter inode_iter;
struct bkey_s_c k;
struct bch_inode_unpacked inode;
int ret;
- if (bch2_snapshot_is_internal_node(c, pos.snapshot))
- return 0;
-
- if (!fsck_err_on(c->sb.clean, c,
- "filesystem marked as clean but have deleted inode %llu:%u",
- pos.offset, pos.snapshot))
- return 0;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
+ k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
ret = bkey_err(k);
if (ret)
return ret;
ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
if (fsck_err_on(!bkey_is_inode(k.k), c,
+ deleted_inode_missing,
"nonexistent inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
ret = bch2_inode_unpack(k, &inode);
if (ret)
- goto err;
+ goto out;
if (fsck_err_on(S_ISDIR(inode.bi_mode), c,
+ deleted_inode_is_dir,
"directory %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
- if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c,
+ if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c,
+ deleted_inode_not_unlinked,
"non-deleted inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
- return 1;
-err:
+ if (c->sb.clean &&
+ !fsck_err(c,
+ deleted_inode_but_clean,
+ "filesystem marked as clean but have deleted inode %llu:%u",
+ pos.offset, pos.snapshot)) {
+ ret = 0;
+ goto out;
+ }
+
+ if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
+ struct bpos new_min_pos;
+
+ ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
+ if (ret)
+ goto out;
+
+ inode.bi_flags &= ~BCH_INODE_unlinked;
+
+ ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch_err_msg(c, ret, "clearing inode unlinked flag");
+ if (ret)
+ goto out;
+
+ /*
+ * We'll need another write buffer flush to pick up the new
+ * unlinked inodes in the snapshot leaves:
+ */
+ *need_another_pass = true;
+ goto out;
+ }
+
+ ret = 1;
+out:
fsck_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
return ret;
delete:
- return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+ ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+ goto out;
}
int bch2_delete_dead_inodes(struct bch_fs *c)
@@ -1096,7 +1152,10 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
+ bool need_another_pass;
int ret;
+again:
+ need_another_pass = false;
ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
@@ -1110,7 +1169,10 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
*/
for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- ret = lockrestart_do(trans, may_delete_deleted_inode(trans, k.k->p));
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass));
if (ret < 0)
break;
@@ -1120,12 +1182,17 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
bch2_fs_lazy_rw(c);
}
+ bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);
+
ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
}
}
bch2_trans_iter_exit(trans, &iter);
+
+ if (!ret && need_another_pass)
+ goto again;
err:
bch2_trans_put(trans);
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index a7464e1b6960..88818a332b1e 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -3,16 +3,17 @@
#define _BCACHEFS_INODE_H
#include "bkey.h"
+#include "bkey_methods.h"
#include "opts.h"
enum bkey_invalid_flags;
extern const char * const bch2_inode_opts[];
-int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
-int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
-int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
@@ -52,7 +53,7 @@ static inline bool bkey_is_inode(const struct bkey *k)
k->type == KEY_TYPE_inode_v3;
}
-int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
@@ -101,8 +102,16 @@ void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *)
int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, subvol_inum, unsigned);
-int bch2_inode_write(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *);
+
+int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, enum btree_update_flags);
+
+static inline int bch2_inode_write(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode)
+{
+ return bch2_inode_write_flags(trans, iter, inode, 0);
+}
void bch2_inode_init_early(struct bch_fs *,
struct bch_inode_unpacked *);
@@ -177,7 +186,7 @@ static inline unsigned nlink_bias(umode_t mode)
static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
{
- return bi->bi_flags & BCH_INODE_UNLINKED
+ return bi->bi_flags & BCH_INODE_unlinked
? 0
: bi->bi_nlink + nlink_bias(bi->bi_mode);
}
@@ -187,10 +196,10 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
{
if (nlink) {
bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);
- bi->bi_flags &= ~BCH_INODE_UNLINKED;
+ bi->bi_flags &= ~BCH_INODE_unlinked;
} else {
bi->bi_nlink = 0;
- bi->bi_flags |= BCH_INODE_UNLINKED;
+ bi->bi_flags |= BCH_INODE_unlinked;
}
}
@@ -200,6 +209,7 @@ void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
+int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
int bch2_delete_dead_inodes(struct bch_fs *);
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 119834cb8f9e..bebc11444ef5 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -16,13 +16,14 @@
#include "io_misc.h"
#include "io_write.h"
#include "logged_ops.h"
+#include "rebalance.h"
#include "subvolume.h"
/* Overwrites whatever was present with zeroes: */
int bch2_extent_fallocate(struct btree_trans *trans,
subvol_inum inum,
struct btree_iter *iter,
- unsigned sectors,
+ u64 sectors,
struct bch_io_opts opts,
s64 *i_sectors_delta,
struct write_point_specifier write_point)
@@ -104,7 +105,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
if (ret)
goto err;
- sectors = min(sectors, wp->sectors_free);
+ sectors = min_t(u64, sectors, wp->sectors_free);
sectors_allocated = sectors;
bch2_key_resize(&e->k, sectors);
@@ -355,6 +356,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+ struct bch_io_opts opts;
u64 dst_offset = le64_to_cpu(op->v.dst_offset);
u64 src_offset = le64_to_cpu(op->v.src_offset);
s64 shift = dst_offset - src_offset;
@@ -363,6 +365,10 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
bool insert = shift > 0;
int ret = 0;
+ ret = bch2_inum_opts_get(trans, inum, &opts);
+ if (ret)
+ return ret;
+
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inum.inum, 0),
BTREE_ITER_INTENT);
@@ -443,7 +449,10 @@ case LOGGED_OP_FINSERT_shift_extents:
op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, copy,
+ opts.background_target,
+ opts.background_compression) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:
bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
index c9e6ed40e1b8..9cb44a7c43c1 100644
--- a/fs/bcachefs/io_misc.h
+++ b/fs/bcachefs/io_misc.h
@@ -3,7 +3,7 @@
#define _BCACHEFS_IO_MISC_H
int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
- unsigned, struct bch_io_opts, s64 *,
+ u64, struct bch_io_opts, s64 *,
struct write_point_specifier);
int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
subvol_inum, u64, s64 *);
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 443c3ea65527..a56ed553dc15 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -643,7 +643,7 @@ csum_err:
"data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
- bch2_io_error(ca);
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
decompression_err:
@@ -677,7 +677,7 @@ static void bch2_read_endio(struct bio *bio)
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
rbio->read_pos.inode,
rbio->read_pos.offset,
"data read error: %s",
@@ -1025,7 +1025,7 @@ get_bio:
trans->notrace_relock_fail = true;
} else {
/* Attempting reconstruct read: */
- if (bch2_ec_read_extent(c, rbio)) {
+ if (bch2_ec_read_extent(trans, rbio)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 6e4f85eb6ec8..d704a8f829c8 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -202,6 +202,17 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_i *k;
struct bkey_i_inode_v3 *inode;
+ /*
+ * Crazy performance optimization:
+ * Every extent update needs to also update the inode: the inode trigger
+ * will set bi->journal_seq to the journal sequence number of this
+ * transaction - for fsync.
+ *
+ * But if that's the only reason we're updating the inode (we're not
+ * updating bi_size or bi_sectors), then we don't need the inode update
+ * to be journalled - if we crash, the bi_journal_seq update will be
+ * lost, but that's fine.
+ */
unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
int ret;
@@ -223,7 +234,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
inode = bkey_i_to_inode_v3(k);
- if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
+ if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
new_i_size > le64_to_cpu(inode->v.bi_size)) {
inode->v.bi_size = cpu_to_le64(new_i_size);
inode_update_flags = 0;
@@ -351,10 +362,13 @@ static int bch2_write_index_default(struct bch_write_op *op)
bkey_start_pos(&sk.k->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- ret = bch2_extent_update(trans, inum, &iter, sk.k,
- &op->res,
- op->new_i_size, &op->i_sectors_delta,
- op->flags & BCH_WRITE_CHECK_ENOSPC);
+ ret = bch2_bkey_set_needs_rebalance(c, sk.k,
+ op->opts.background_target,
+ op->opts.background_compression) ?:
+ bch2_extent_update(trans, inum, &iter, sk.k,
+ &op->res,
+ op->new_i_size, &op->i_sectors_delta,
+ op->flags & BCH_WRITE_CHECK_ENOSPC);
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -495,7 +509,6 @@ static void __bch2_write_index(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
- struct bkey_i *k;
unsigned dev;
int ret = 0;
@@ -505,14 +518,6 @@ static void __bch2_write_index(struct bch_write_op *op)
goto err;
}
- /*
- * probably not the ideal place to hook this in, but I don't
- * particularly want to plumb io_opts all the way through the btree
- * update stack right now
- */
- for_each_keylist_key(keys, k)
- bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
-
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
@@ -643,7 +648,7 @@ static void bch2_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
- if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
op->pos.inode,
wbio->inode_offset << 9,
"data write error: %s",
@@ -790,7 +795,7 @@ static int bch2_write_decrypt(struct bch_write_op *op)
* checksum:
*/
csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- if (bch2_crc_cmp(op->crc.csum, csum))
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
return -EIO;
ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
@@ -816,6 +821,7 @@ static enum prep_encoded_ret {
/* Can we just write the entire extent as is? */
if (op->crc.uncompressed_size == op->crc.live_size &&
+ op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
op->crc.compressed_size <= wp->sectors_free &&
(op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
op->incompressible)) {
@@ -1091,9 +1097,7 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
e = bkey_s_c_to_extent(k);
extent_for_each_ptr_decode(e, p, entry) {
- if (p.crc.csum_type ||
- crc_is_compressed(p.crc) ||
- p.has_ec)
+ if (crc_is_encoded(p.crc) || p.has_ec)
return false;
replicas += bch2_extent_ptr_durability(c, &p);
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 0e7a9ffa3671..23a9b7845d11 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -526,36 +526,6 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
return ret;
}
-/* journal_preres: */
-
-static bool journal_preres_available(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags)
-{
- bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true);
-
- if (!ret && mutex_trylock(&j->reclaim_lock)) {
- bch2_journal_reclaim(j);
- mutex_unlock(&j->reclaim_lock);
- }
-
- return ret;
-}
-
-int __bch2_journal_preres_get(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags)
-{
- int ret;
-
- closure_wait_event(&j->preres_wait,
- (ret = bch2_journal_error(j)) ||
- journal_preres_available(j, res, new_u64s, flags));
- return ret;
-}
-
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *j,
@@ -1019,6 +989,25 @@ err:
return ret;
}
+int bch2_fs_journal_alloc(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_online_member(ca, c, i) {
+ if (ca->journal.nr)
+ continue;
+
+ int ret = bch2_dev_journal_alloc(ca);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
/* startup/shutdown: */
static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
@@ -1287,7 +1276,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
- prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining);
prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 491133cc52f3..c85d01cf4948 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -395,104 +395,6 @@ out:
return 0;
}
-/* journal_preres: */
-
-static inline void journal_set_watermark(struct journal *j)
-{
- union journal_preres_state s = READ_ONCE(j->prereserved);
- unsigned watermark = BCH_WATERMARK_stripe;
-
- if (fifo_free(&j->pin) < j->pin.size / 4)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
- if (fifo_free(&j->pin) < j->pin.size / 8)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
- if (s.reserved > s.remaining)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
- if (!s.remaining)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
- if (watermark == j->watermark)
- return;
-
- swap(watermark, j->watermark);
- if (watermark > j->watermark)
- journal_wake(j);
-}
-
-static inline void bch2_journal_preres_put(struct journal *j,
- struct journal_preres *res)
-{
- union journal_preres_state s = { .reserved = res->u64s };
-
- if (!res->u64s)
- return;
-
- s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
- res->u64s = 0;
-
- if (unlikely(s.waiting)) {
- clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
- (unsigned long *) &j->prereserved.v);
- closure_wake_up(&j->preres_wait);
- }
-
- if (s.reserved <= s.remaining && j->watermark)
- journal_set_watermark(j);
-}
-
-int __bch2_journal_preres_get(struct journal *,
- struct journal_preres *, unsigned, unsigned);
-
-static inline int bch2_journal_preres_get_fast(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags,
- bool set_waiting)
-{
- int d = new_u64s - res->u64s;
- union journal_preres_state old, new;
- u64 v = atomic64_read(&j->prereserved.counter);
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- int ret;
-
- do {
- old.v = new.v = v;
- ret = 0;
-
- if (watermark == BCH_WATERMARK_reclaim ||
- new.reserved + d < new.remaining) {
- new.reserved += d;
- ret = 1;
- } else if (set_waiting && !new.waiting)
- new.waiting = true;
- else
- return 0;
- } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
- old.v, new.v)) != old.v);
-
- if (ret)
- res->u64s += d;
- return ret;
-}
-
-static inline int bch2_journal_preres_get(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags)
-{
- if (new_u64s <= res->u64s)
- return 0;
-
- if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
- return 0;
-
- if (flags & JOURNAL_RES_GET_NONBLOCK)
- return -BCH_ERR_journal_preres_get_blocked;
-
- return __bch2_journal_preres_get(j, res, new_u64s, flags);
-}
-
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *,
@@ -534,6 +436,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
int bch2_dev_journal_alloc(struct bch_dev *);
+int bch2_fs_journal_alloc(struct bch_fs *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 6a3d6a374e9c..786a09285509 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -140,7 +140,8 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
if (!dup->csum_good)
goto replace;
- fsck_err(c, "found duplicate but non identical journal entries (seq %llu)",
+ fsck_err(c, journal_entry_replicas_data_mismatch,
+ "found duplicate but non identical journal entries (seq %llu)",
le64_to_cpu(j->seq));
i = dup;
goto found;
@@ -235,7 +236,7 @@ static void journal_entry_err_msg(struct printbuf *out,
prt_str(out, ": ");
}
-#define journal_entry_err(c, version, jset, entry, msg, ...) \
+#define journal_entry_err(c, version, jset, entry, _err, msg, ...) \
({ \
struct printbuf _buf = PRINTBUF; \
\
@@ -244,9 +245,10 @@ static void journal_entry_err_msg(struct printbuf *out,
\
switch (flags & BKEY_INVALID_WRITE) { \
case READ: \
- mustfix_fsck_err(c, "%s", _buf.buf); \
+ mustfix_fsck_err(c, _err, "%s", _buf.buf); \
break; \
case WRITE: \
+ bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \
bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
if (bch2_fs_inconsistent(c)) { \
ret = -BCH_ERR_fsck_errors_not_fixed; \
@@ -259,8 +261,8 @@ static void journal_entry_err_msg(struct printbuf *out,
true; \
})
-#define journal_entry_err_on(cond, c, version, jset, entry, msg, ...) \
- ((cond) ? journal_entry_err(c, version, jset, entry, msg, ##__VA_ARGS__) : false)
+#define journal_entry_err_on(cond, ...) \
+ ((cond) ? journal_entry_err(__VA_ARGS__) : false)
#define FSCK_DELETED_KEY 5
@@ -277,7 +279,10 @@ static int journal_validate_key(struct bch_fs *c,
struct printbuf buf = PRINTBUF;
int ret = 0;
- if (journal_entry_err_on(!k->k.u64s, c, version, jset, entry, "k->u64s 0")) {
+ if (journal_entry_err_on(!k->k.u64s,
+ c, version, jset, entry,
+ journal_entry_bkey_u64s_0,
+ "k->u64s 0")) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
return FSCK_DELETED_KEY;
@@ -286,6 +291,7 @@ static int journal_validate_key(struct bch_fs *c,
if (journal_entry_err_on((void *) bkey_next(k) >
(void *) vstruct_next(entry),
c, version, jset, entry,
+ journal_entry_bkey_past_end,
"extends past end of journal entry")) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
@@ -294,6 +300,7 @@ static int journal_validate_key(struct bch_fs *c,
if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
c, version, jset, entry,
+ journal_entry_bkey_bad_format,
"bad format %u", k->k.format)) {
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
@@ -317,7 +324,8 @@ static int journal_validate_key(struct bch_fs *c,
bch2_bkey_invalid(c, bkey_i_to_s_c(k),
__btree_node_type(level, btree_id), write, &buf);
- mustfix_fsck_err(c, "%s", buf.buf);
+ mustfix_fsck_err(c, journal_entry_bkey_invalid,
+ "%s", buf.buf);
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
@@ -369,7 +377,7 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs
prt_newline(out);
prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
}
- prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
+ prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
first = false;
}
@@ -387,6 +395,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c,
if (journal_entry_err_on(!entry->u64s ||
le16_to_cpu(entry->u64s) != k->k.u64s,
c, version, jset, entry,
+ journal_entry_btree_root_bad_size,
"invalid btree root journal entry: wrong number of keys")) {
void *next = vstruct_next(entry);
/*
@@ -436,6 +445,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c,
if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
c, version, jset, entry,
+ journal_entry_blacklist_bad_size,
"invalid journal seq blacklist entry: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
}
@@ -463,6 +473,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
c, version, jset, entry,
+ journal_entry_blacklist_v2_bad_size,
"invalid journal seq blacklist entry: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
goto out;
@@ -473,6 +484,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
le64_to_cpu(bl_entry->end),
c, version, jset, entry,
+ journal_entry_blacklist_v2_start_past_end,
"invalid journal seq blacklist entry: start > end")) {
journal_entry_null_range(entry, vstruct_next(entry));
}
@@ -505,6 +517,7 @@ static int journal_entry_usage_validate(struct bch_fs *c,
if (journal_entry_err_on(bytes < sizeof(*u),
c, version, jset, entry,
+ journal_entry_usage_bad_size,
"invalid journal entry usage: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
@@ -539,6 +552,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
if (journal_entry_err_on(bytes < sizeof(*u) ||
bytes < sizeof(*u) + u->r.nr_devs,
c, version, jset, entry,
+ journal_entry_data_usage_bad_size,
"invalid journal entry usage: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
@@ -570,13 +584,17 @@ static int journal_entry_clock_validate(struct bch_fs *c,
int ret = 0;
if (journal_entry_err_on(bytes != sizeof(*clock),
- c, version, jset, entry, "bad size")) {
+ c, version, jset, entry,
+ journal_entry_clock_bad_size,
+ "bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(clock->rw > 1,
- c, version, jset, entry, "bad rw")) {
+ c, version, jset, entry,
+ journal_entry_clock_bad_rw,
+ "bad rw")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
@@ -608,7 +626,9 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
int ret = 0;
if (journal_entry_err_on(bytes < expected,
- c, version, jset, entry, "bad size (%u < %u)",
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_size,
+ "bad size (%u < %u)",
bytes, expected)) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
@@ -617,13 +637,17 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
dev = le32_to_cpu(u->dev);
if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
- c, version, jset, entry, "bad dev")) {
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_dev,
+ "bad dev")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(u->pad,
- c, version, jset, entry, "bad pad")) {
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_pad,
+ "bad pad")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
@@ -738,7 +762,8 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
vstruct_for_each(jset, entry) {
if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
- c, version, jset, entry,
+ c, version, jset, entry,
+ journal_entry_past_jset_end,
"journal entry extends past end of jset")) {
jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
break;
@@ -767,6 +792,7 @@ static int jset_validate(struct bch_fs *c,
version = le32_to_cpu(jset->version);
if (journal_entry_err_on(!bch2_version_compatible(version),
c, version, jset, NULL,
+ jset_unsupported_version,
"%s sector %llu seq %llu: incompatible journal entry version %u.%u",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
@@ -777,7 +803,8 @@ static int jset_validate(struct bch_fs *c,
}
if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
- c, version, jset, NULL,
+ c, version, jset, NULL,
+ jset_unknown_csum,
"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
@@ -788,6 +815,7 @@ static int jset_validate(struct bch_fs *c,
if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
c, version, jset, NULL,
+ jset_last_seq_newer_than_seq,
"invalid journal entry: last_seq > seq (%llu > %llu)",
le64_to_cpu(jset->last_seq),
le64_to_cpu(jset->seq))) {
@@ -816,7 +844,8 @@ static int jset_validate_early(struct bch_fs *c,
version = le32_to_cpu(jset->version);
if (journal_entry_err_on(!bch2_version_compatible(version),
- c, version, jset, NULL,
+ c, version, jset, NULL,
+ jset_unsupported_version,
"%s sector %llu seq %llu: unknown journal entry version %u.%u",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
@@ -831,7 +860,8 @@ static int jset_validate_early(struct bch_fs *c,
return JOURNAL_ENTRY_REREAD;
if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
- c, version, jset, NULL,
+ c, version, jset, NULL,
+ jset_past_bucket_end,
"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq), bytes))
@@ -900,7 +930,7 @@ reread:
ret = submit_bio_wait(bio);
kfree(bio);
- if (bch2_dev_io_err_on(ret, ca,
+ if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
"journal read error: sector %llu",
offset) ||
bch2_meta_read_fault("journal")) {
@@ -956,7 +986,8 @@ reread:
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
csum_good = jset_csum_good(c, j);
- if (!csum_good)
+ if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
+ "journal checksum error"))
saw_bad = true;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
@@ -1048,6 +1079,12 @@ found:
if (ja->bucket_seq[ja->cur_idx] &&
ja->sectors_free == ca->mi.bucket_size) {
+#if 0
+ /*
+ * Debug code for ZNS support, where we (probably) want to be
+ * correlated where we stopped in the journal to the zone write
+ * points:
+ */
bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
for (i = 0; i < 3; i++) {
@@ -1055,6 +1092,7 @@ found:
bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
}
+#endif
ja->sectors_free = 0;
}
@@ -1172,6 +1210,7 @@ int bch2_journal_read(struct bch_fs *c,
if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
c, le32_to_cpu(i->j.version), &i->j, NULL,
+ jset_last_seq_newer_than_seq,
"invalid journal entry: last_seq > seq (%llu > %llu)",
le64_to_cpu(i->j.last_seq),
le64_to_cpu(i->j.seq)))
@@ -1188,7 +1227,8 @@ int bch2_journal_read(struct bch_fs *c,
}
if (!*last_seq) {
- fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+ fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
+ "journal read done, but no entries found after dropping non-flushes");
return 0;
}
@@ -1214,6 +1254,7 @@ int bch2_journal_read(struct bch_fs *c,
if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+ jset_seq_blacklisted,
"found blacklisted journal entry %llu", seq);
i->ignore = true;
}
@@ -1254,7 +1295,8 @@ int bch2_journal_read(struct bch_fs *c,
bch2_journal_ptrs_to_text(&buf2, c, i);
missing_end = seq - 1;
- fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
+ fsck_err(c, journal_entries_missing,
+ "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
" prev at %s\n"
" next at %s",
missing_start, missing_end,
@@ -1309,7 +1351,8 @@ int bch2_journal_read(struct bch_fs *c,
if (!degraded &&
!bch2_replicas_marked(c, &replicas.e) &&
(le64_to_cpu(i->j.seq) == *last_seq ||
- fsck_err(c, "superblock not marked as containing replicas for journal entry %llu\n %s",
+ fsck_err(c, journal_entry_replicas_not_marked,
+ "superblock not marked as containing replicas for journal entry %llu\n %s",
le64_to_cpu(i->j.seq), buf.buf))) {
ret = bch2_mark_replicas(c, &replicas.e);
if (ret)
@@ -1581,7 +1624,8 @@ static void journal_write_endio(struct bio *bio)
struct journal_buf *w = journal_last_unwritten_buf(j);
unsigned long flags;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ "error writing journal entry %llu: %s",
le64_to_cpu(w->data->seq),
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
@@ -1641,9 +1685,15 @@ static void do_journal_write(struct closure *cl)
continue_at(cl, journal_write_done, c->io_complete_wq);
}
-static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset)
+static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{
- struct jset_entry *i, *next, *prev = NULL;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct jset_entry *start, *end, *i, *next, *prev = NULL;
+ struct jset *jset = w->data;
+ unsigned sectors, bytes, u64s;
+ bool validate_before_checksum = false;
+ unsigned long btree_roots_have = 0;
+ int ret;
/*
* Simple compaction, dropping empty jset_entries (from journal
@@ -1660,8 +1710,20 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset
if (!u64s)
continue;
- if (i->type == BCH_JSET_ENTRY_btree_root)
+ /*
+ * New btree roots are set by journalling them; when the journal
+ * entry gets written we have to propagate them to
+ * c->btree_roots
+ *
+ * But, every journal entry we write has to contain all the
+ * btree roots (at least for now); so after we copy btree roots
+ * to c->btree_roots we have to get any missing btree roots and
+ * add them to this journal entry:
+ */
+ if (i->type == BCH_JSET_ENTRY_btree_root) {
bch2_journal_entry_to_btree_root(c, i);
+ __set_bit(i->btree_id, &btree_roots_have);
+ }
/* Can we merge with previous entry? */
if (prev &&
@@ -1685,85 +1747,10 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset
prev = prev ? vstruct_next(prev) : jset->start;
jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
-}
-
-void bch2_journal_write(struct closure *cl)
-{
- struct journal *j = container_of(cl, struct journal, io);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- struct journal_buf *w = journal_last_unwritten_buf(j);
- struct bch_replicas_padded replicas;
- struct jset_entry *start, *end;
- struct jset *jset;
- struct bio *bio;
- struct printbuf journal_debug_buf = PRINTBUF;
- bool validate_before_checksum = false;
- unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
- int ret;
-
- BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
-
- journal_buf_realloc(j, w);
- jset = w->data;
-
- j->write_start_time = local_clock();
-
- spin_lock(&j->lock);
-
- /*
- * If the journal is in an error state - we did an emergency shutdown -
- * we prefer to continue doing journal writes. We just mark them as
- * noflush so they'll never be used, but they'll still be visible by the
- * list_journal tool - this helps in debugging.
- *
- * There's a caveat: the first journal write after marking the
- * superblock dirty must always be a flush write, because on startup
- * from a clean shutdown we didn't necessarily read the journal and the
- * new journal write might overwrite whatever was in the journal
- * previously - we can't leave the journal without any flush writes in
- * it.
- *
- * So if we're in an error state, and we're still starting up, we don't
- * write anything at all.
- */
- if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) &&
- (bch2_journal_error(j) ||
- w->noflush ||
- (!w->must_flush &&
- (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
- test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
- w->noflush = true;
- SET_JSET_NO_FLUSH(jset, true);
- jset->last_seq = 0;
- w->last_seq = 0;
-
- j->nr_noflush_writes++;
- } else if (!bch2_journal_error(j)) {
- j->last_flush_write = jiffies;
- j->nr_flush_writes++;
- clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
- } else {
- spin_unlock(&j->lock);
- goto err;
- }
- spin_unlock(&j->lock);
-
- /*
- * New btree roots are set by journalling them; when the journal entry
- * gets written we have to propagate them to c->btree_roots
- *
- * But, every journal entry we write has to contain all the btree roots
- * (at least for now); so after we copy btree roots to c->btree_roots we
- * have to get any missing btree roots and add them to this journal
- * entry:
- */
-
- bch2_journal_entries_postprocess(c, jset);
start = end = vstruct_last(jset);
- end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
+ end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
bch2_journal_super_entries_add_common(c, &end,
le64_to_cpu(jset->seq));
@@ -1779,7 +1766,7 @@ void bch2_journal_write(struct closure *cl)
bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
vstruct_bytes(jset), w->sectors << 9,
u64s, w->u64s_reserved, j->entry_u64s_reserved);
- goto err;
+ return -EINVAL;
}
jset->magic = cpu_to_le64(jset_magic(c));
@@ -1798,37 +1785,117 @@ void bch2_journal_write(struct closure *cl)
validate_before_checksum = true;
if (validate_before_checksum &&
- jset_validate(c, NULL, jset, 0, WRITE))
- goto err;
+ (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+ return ret;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
if (bch2_fs_fatal_err_on(ret, c,
"error decrypting journal entry: %i", ret))
- goto err;
+ return ret;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
journal_nonce(jset), jset);
if (!validate_before_checksum &&
- jset_validate(c, NULL, jset, 0, WRITE))
- goto err;
+ (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+ return ret;
memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
+ return 0;
+}
+
+static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ int error = bch2_journal_error(j);
+
+ /*
+ * If the journal is in an error state - we did an emergency shutdown -
+ * we prefer to continue doing journal writes. We just mark them as
+ * noflush so they'll never be used, but they'll still be visible by the
+ * list_journal tool - this helps in debugging.
+ *
+ * There's a caveat: the first journal write after marking the
+ * superblock dirty must always be a flush write, because on startup
+ * from a clean shutdown we didn't necessarily read the journal and the
+ * new journal write might overwrite whatever was in the journal
+ * previously - we can't leave the journal without any flush writes in
+ * it.
+ *
+ * So if we're in an error state, and we're still starting up, we don't
+ * write anything at all.
+ */
+ if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
+ return -EIO;
+
+ if (error ||
+ w->noflush ||
+ (!w->must_flush &&
+ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
+ w->noflush = true;
+ SET_JSET_NO_FLUSH(w->data, true);
+ w->data->last_seq = 0;
+ w->last_seq = 0;
+
+ j->nr_noflush_writes++;
+ } else {
+ j->last_flush_write = jiffies;
+ j->nr_flush_writes++;
+ clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
+ }
+
+ return 0;
+}
+
+void bch2_journal_write(struct closure *cl)
+{
+ struct journal *j = container_of(cl, struct journal, io);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ struct journal_buf *w = journal_last_unwritten_buf(j);
+ struct bch_replicas_padded replicas;
+ struct bio *bio;
+ struct printbuf journal_debug_buf = PRINTBUF;
+ unsigned i, nr_rw_members = 0;
+ int ret;
+
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
+ j->write_start_time = local_clock();
-retry_alloc:
spin_lock(&j->lock);
- ret = journal_write_alloc(j, w);
+ ret = bch2_journal_write_pick_flush(j, w);
+ spin_unlock(&j->lock);
+ if (ret)
+ goto err;
+
+ journal_buf_realloc(j, w);
+
+ ret = bch2_journal_write_prep(j, w);
+ if (ret)
+ goto err;
+
+ while (1) {
+ spin_lock(&j->lock);
+ ret = journal_write_alloc(j, w);
+ if (!ret || !j->can_discard)
+ break;
- if (ret && j->can_discard) {
spin_unlock(&j->lock);
bch2_journal_do_discards(j);
- goto retry_alloc;
}
- if (ret)
+ if (ret) {
__bch2_journal_debug_to_text(&journal_debug_buf, j);
+ spin_unlock(&j->lock);
+ bch_err(c, "Unable to allocate journal write:\n%s",
+ journal_debug_buf.buf);
+ printbuf_exit(&journal_debug_buf);
+ goto err;
+ }
/*
* write is allocated, no longer need to account for it in
@@ -1843,13 +1910,6 @@ retry_alloc:
bch2_journal_space_available(j);
spin_unlock(&j->lock);
- if (ret) {
- bch_err(c, "Unable to allocate journal write:\n%s",
- journal_debug_buf.buf);
- printbuf_exit(&journal_debug_buf);
- goto err;
- }
-
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
if (c->opts.nochanges)
@@ -1871,7 +1931,7 @@ retry_alloc:
if (ret)
goto err;
- if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
+ if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
for_each_rw_member(ca, c, i) {
percpu_ref_get(&ca->io_ref);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 9a584aaaa2eb..e63c6eda86af 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -50,16 +50,21 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
return available;
}
-static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
+static inline void journal_set_watermark(struct journal *j, bool low_on_space)
{
- union journal_preres_state old, new;
- u64 v = atomic64_read(&j->prereserved.counter);
+ unsigned watermark = BCH_WATERMARK_stripe;
- do {
- old.v = new.v = v;
- new.remaining = u64s_remaining;
- } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
- old.v, new.v)) != old.v);
+ if (low_on_space)
+ watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
+ if (fifo_free(&j->pin) < j->pin.size / 4)
+ watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
+
+ if (watermark == j->watermark)
+ return;
+
+ swap(watermark, j->watermark);
+ if (watermark > j->watermark)
+ journal_wake(j);
}
static struct journal_space
@@ -162,7 +167,6 @@ void bch2_journal_space_available(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned clean, clean_ondisk, total;
- s64 u64s_remaining = 0;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
unsigned i, nr_online = 0, nr_devs_want;
@@ -222,16 +226,10 @@ void bch2_journal_space_available(struct journal *j)
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
- u64s_remaining = (u64) clean << 6;
- u64s_remaining -= (u64) total << 3;
- u64s_remaining = max(0LL, u64s_remaining);
- u64s_remaining /= 4;
- u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
+ journal_set_watermark(j, clean * 4 <= total);
out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
- journal_set_remaining(j, u64s_remaining);
- journal_set_watermark(j);
if (!ret)
journal_wake(j);
@@ -555,11 +553,6 @@ static u64 journal_seq_to_flush(struct journal *j)
/* Try to keep the journal at most half full: */
nr_buckets = ja->nr / 2;
- /* And include pre-reservations: */
- nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
- (ca->mi.bucket_size << 6) -
- journal_entry_overhead(j));
-
nr_buckets = min(nr_buckets, ja->nr);
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
@@ -638,10 +631,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
msecs_to_jiffies(c->opts.journal_reclaim_delay)))
min_nr = 1;
- if (j->prereserved.reserved * 4 > j->prereserved.remaining)
- min_nr = 1;
-
- if (fifo_free(&j->pin) <= 32)
+ if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;
if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
@@ -652,8 +642,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
- j->prereserved.reserved,
- j->prereserved.remaining,
atomic_read(&c->btree_cache.dirty),
c->btree_cache.used,
atomic_long_read(&c->btree_key_cache.nr_dirty),
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 42504e16acb6..a756b69582e3 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -76,14 +76,6 @@ struct journal_res {
u64 seq;
};
-/*
- * For reserving space in the journal prior to getting a reservation on a
- * particular journal entry:
- */
-struct journal_preres {
- unsigned u64s;
-};
-
union journal_res_state {
struct {
atomic64_t counter;
@@ -104,22 +96,6 @@ union journal_res_state {
};
};
-union journal_preres_state {
- struct {
- atomic64_t counter;
- };
-
- struct {
- u64 v;
- };
-
- struct {
- u64 waiting:1,
- reserved:31,
- remaining:32;
- };
-};
-
/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
@@ -180,8 +156,6 @@ struct journal {
union journal_res_state reservations;
enum bch_watermark watermark;
- union journal_preres_state prereserved;
-
} __aligned(SMP_CACHE_BYTES);
unsigned long flags;
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index 215a653322f3..a5cc0ed195d6 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -10,17 +10,17 @@
#include "recovery.h"
/* KEY_TYPE_lru is obsolete: */
-int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- if (!lru_pos_time(k.k->p)) {
- prt_printf(err, "lru entry at time=0");
- return -BCH_ERR_invalid_bkey;
-
- }
+ int ret = 0;
- return 0;
+ bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err,
+ lru_entry_at_time_0,
+ "lru entry at time=0");
+fsck_err:
+ return ret;
}
void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
@@ -95,6 +95,7 @@ static int bch2_check_lru_key(struct btree_trans *trans,
int ret;
if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
+ lru_entry_to_invalid_bucket,
"lru key points to nonexistent device:bucket %llu:%llu",
alloc_pos.inode, alloc_pos.offset))
return bch2_btree_delete_at(trans, lru_iter, 0);
@@ -125,7 +126,8 @@ static int bch2_check_lru_key(struct btree_trans *trans,
}
if (c->opts.reconstruct_alloc ||
- fsck_err(c, "incorrect lru entry: lru %s time %llu\n"
+ fsck_err(c, lru_entry_bad,
+ "incorrect lru entry: lru %s time %llu\n"
" %s\n"
" for %s",
bch2_lru_types[type],
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index be66bf9ad809..429dca816df5 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -48,7 +48,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l)
return BCH_LRU_read;
}
-int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 39a14e321680..ab749bf2fcbc 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -20,6 +20,7 @@
#include "keylist.h"
#include "move.h"
#include "replicas.h"
+#include "snapshot.h"
#include "super-io.h"
#include "trace.h"
@@ -59,20 +60,6 @@ static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c
}
}
-static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
-{
- mutex_lock(&c->data_progress_lock);
- list_add(&stats->list, &c->data_progress_list);
- mutex_unlock(&c->data_progress_lock);
-}
-
-static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
-{
- mutex_lock(&c->data_progress_lock);
- list_del(&stats->list);
- mutex_unlock(&c->data_progress_lock);
-}
-
struct moving_io {
struct list_head read_list;
struct list_head io_list;
@@ -156,35 +143,31 @@ static void move_read_endio(struct bio *bio)
closure_put(&ctxt->cl);
}
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
- struct btree_trans *trans)
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
struct moving_io *io;
- if (trans)
- bch2_trans_unlock(trans);
-
while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
+ bch2_trans_unlock_long(ctxt->trans);
list_del(&io->read_list);
move_write(io);
}
}
-static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
- struct btree_trans *trans)
+void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
- move_ctxt_wait_event(ctxt, trans,
+ move_ctxt_wait_event(ctxt,
!atomic_read(&ctxt->write_sectors) ||
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
- struct bch_fs *c = ctxt->c;
+ struct bch_fs *c = ctxt->trans->c;
- move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
closure_sync(&ctxt->cl);
EBUG_ON(atomic_read(&ctxt->write_sectors));
@@ -192,16 +175,12 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt)
EBUG_ON(atomic_read(&ctxt->read_sectors));
EBUG_ON(atomic_read(&ctxt->read_ios));
- if (ctxt->stats) {
- progress_list_del(c, ctxt->stats);
- trace_move_data(c,
- atomic64_read(&ctxt->stats->sectors_moved),
- atomic64_read(&ctxt->stats->keys_moved));
- }
-
mutex_lock(&c->moving_context_lock);
list_del(&ctxt->list);
mutex_unlock(&c->moving_context_lock);
+
+ bch2_trans_put(ctxt->trans);
+ memset(ctxt, 0, sizeof(*ctxt));
}
void bch2_moving_ctxt_init(struct moving_context *ctxt,
@@ -213,7 +192,7 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
{
memset(ctxt, 0, sizeof(*ctxt));
- ctxt->c = c;
+ ctxt->trans = bch2_trans_get(c);
ctxt->fn = (void *) _RET_IP_;
ctxt->rate = rate;
ctxt->stats = stats;
@@ -230,16 +209,17 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
mutex_lock(&c->moving_context_lock);
list_add(&ctxt->list, &c->moving_context_list);
mutex_unlock(&c->moving_context_lock);
+}
- if (stats) {
- progress_list_add(c, stats);
- stats->data_type = BCH_DATA_user;
- }
+void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
+{
+ trace_move_data(c, stats);
}
void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
{
memset(stats, 0, sizeof(*stats));
+ stats->data_type = BCH_DATA_user;
scnprintf(stats->name, sizeof(stats->name), "%s", name);
}
@@ -286,15 +266,14 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}
-static int bch2_move_extent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct moving_context *ctxt,
- struct move_bucket_in_flight *bucket_in_flight,
- struct bch_io_opts io_opts,
- enum btree_id btree_id,
- struct bkey_s_c k,
- struct data_update_opts data_opts)
+int bch2_move_extent(struct moving_context *ctxt,
+ struct move_bucket_in_flight *bucket_in_flight,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_io_opts io_opts,
+ struct data_update_opts data_opts)
{
+ struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct moving_io *io;
@@ -303,6 +282,8 @@ static int bch2_move_extent(struct btree_trans *trans,
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
+ if (ctxt->stats)
+ ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
trace_move_extent2(c, k);
bch2_data_update_opts_normalize(k, &data_opts);
@@ -355,7 +336,7 @@ static int bch2_move_extent(struct btree_trans *trans,
io->rbio.bio.bi_end_io = move_read_endio;
ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
- io_opts, data_opts, btree_id, k);
+ io_opts, data_opts, iter->btree_id, k);
if (ret && ret != -BCH_ERR_unwritten_extent_update)
goto err_free_pages;
@@ -367,9 +348,11 @@ static int bch2_move_extent(struct btree_trans *trans,
BUG_ON(ret);
- io->write.ctxt = ctxt;
io->write.op.end_io = move_write_done;
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate, k.k->size);
+
if (ctxt->stats) {
atomic64_inc(&ctxt->stats->keys_moved);
atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
@@ -399,7 +382,7 @@ static int bch2_move_extent(struct btree_trans *trans,
closure_get(&ctxt->cl);
bch2_read_extent(trans, &io->rbio,
bkey_start_pos(k.k),
- btree_id, k, 0,
+ iter->btree_id, k, 0,
BCH_READ_NODECODE|
BCH_READ_LAST_FRAGMENT);
return 0;
@@ -413,45 +396,96 @@ err:
return ret;
}
-static int lookup_inode(struct btree_trans *trans, struct bpos pos,
- struct bch_inode_unpacked *inode)
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
+ struct per_snapshot_io_opts *io_opts,
+ struct bkey_s_c extent_k)
+{
+ struct bch_fs *c = trans->c;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+
+ if (io_opts->cur_inum != extent_k.k->p.inode) {
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ io_opts->d.nr = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (k.k->p.offset != extent_k.k->p.inode)
+ break;
+
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ struct bch_inode_unpacked inode;
+ BUG_ON(bch2_inode_unpack(k, &inode));
+
+ struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
+ bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
+
+ ret = darray_push(&io_opts->d, e);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ io_opts->cur_inum = extent_k.k->p.inode;
+ }
+
+ ret = ret ?: trans_was_restarted(trans, restart_count);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (extent_k.k->p.snapshot) {
+ struct snapshot_io_opts_entry *i;
+ darray_for_each(io_opts->d, i)
+ if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
+ return &i->io_opts;
+ }
+
+ return &io_opts->fs_io_opts;
+}
+
+int bch2_move_get_io_opts_one(struct btree_trans *trans,
+ struct bch_io_opts *io_opts,
+ struct bkey_s_c extent_k)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
- BTREE_ITER_ALL_SNAPSHOTS);
- k = bch2_btree_iter_peek(&iter);
+ /* reflink btree? */
+ if (!extent_k.k->p.inode) {
+ *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
+ return 0;
+ }
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
+ BTREE_ITER_CACHED);
ret = bkey_err(k);
- if (ret)
- goto err;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
- if (!k.k || !bkey_eq(k.k->p, pos)) {
- ret = -BCH_ERR_ENOENT_inode;
- goto err;
+ if (!ret && bkey_is_inode(k.k)) {
+ struct bch_inode_unpacked inode;
+ bch2_inode_unpack(k, &inode);
+ bch2_inode_opts_get(io_opts, trans->c, &inode);
+ } else {
+ *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
}
- ret = bkey_is_inode(k.k) ? 0 : -EIO;
- if (ret)
- goto err;
-
- ret = bch2_inode_unpack(k, inode);
- if (ret)
- goto err;
-err:
bch2_trans_iter_exit(trans, &iter);
- return ret;
+ return 0;
}
-static int move_ratelimit(struct btree_trans *trans,
- struct moving_context *ctxt)
+int bch2_move_ratelimit(struct moving_context *ctxt)
{
- struct bch_fs *c = trans->c;
+ struct bch_fs *c = ctxt->trans->c;
u64 delay;
- if (ctxt->wait_on_copygc) {
- bch2_trans_unlock(trans);
+ if (ctxt->wait_on_copygc && !c->copygc_running) {
+ bch2_trans_unlock_long(ctxt->trans);
wait_event_killable(c->copygc_running_wq,
!c->copygc_running ||
kthread_should_stop());
@@ -460,8 +494,12 @@ static int move_ratelimit(struct btree_trans *trans,
do {
delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
+
if (delay) {
- bch2_trans_unlock(trans);
+ if (delay > HZ / 10)
+ bch2_trans_unlock_long(ctxt->trans);
+ else
+ bch2_trans_unlock(ctxt->trans);
set_current_state(TASK_INTERRUPTIBLE);
}
@@ -474,7 +512,7 @@ static int move_ratelimit(struct btree_trans *trans,
schedule_timeout(delay);
if (unlikely(freezing(current))) {
- move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
try_to_freeze();
}
} while (delay);
@@ -483,7 +521,7 @@ static int move_ratelimit(struct btree_trans *trans,
* XXX: these limits really ought to be per device, SSDs and hard drives
* will want different limits
*/
- move_ctxt_wait_event(ctxt, trans,
+ move_ctxt_wait_event(ctxt,
atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
@@ -492,52 +530,28 @@ static int move_ratelimit(struct btree_trans *trans,
return 0;
}
-static int move_get_io_opts(struct btree_trans *trans,
- struct bch_io_opts *io_opts,
- struct bkey_s_c k, u64 *cur_inum)
-{
- struct bch_inode_unpacked inode;
- int ret;
-
- if (*cur_inum == k.k->p.inode)
- return 0;
-
- ret = lookup_inode(trans,
- SPOS(0, k.k->p.inode, k.k->p.snapshot),
- &inode);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
-
- if (!ret)
- bch2_inode_opts_get(io_opts, trans->c, &inode);
- else
- *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
- *cur_inum = k.k->p.inode;
- return 0;
-}
-
-static int __bch2_move_data(struct moving_context *ctxt,
- struct bpos start,
- struct bpos end,
- move_pred_fn pred, void *arg,
- enum btree_id btree_id)
+static int bch2_move_data_btree(struct moving_context *ctxt,
+ struct bpos start,
+ struct bpos end,
+ move_pred_fn pred, void *arg,
+ enum btree_id btree_id)
{
- struct bch_fs *c = ctxt->c;
- struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct per_snapshot_io_opts snapshot_io_opts;
+ struct bch_io_opts *io_opts;
struct bkey_buf sk;
- struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct data_update_opts data_opts;
- u64 cur_inum = U64_MAX;
int ret = 0, ret2;
+ per_snapshot_io_opts_init(&snapshot_io_opts, c);
bch2_bkey_buf_init(&sk);
if (ctxt->stats) {
ctxt->stats->data_type = BCH_DATA_user;
- ctxt->stats->btree_id = btree_id;
- ctxt->stats->pos = start;
+ ctxt->stats->pos = BBPOS(btree_id, start);
}
bch2_trans_iter_init(trans, &iter, btree_id, start,
@@ -547,7 +561,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
if (ctxt->rate)
bch2_ratelimit_reset(ctxt->rate);
- while (!move_ratelimit(trans, ctxt)) {
+ while (!bch2_move_ratelimit(ctxt)) {
bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
@@ -564,17 +578,18 @@ static int __bch2_move_data(struct moving_context *ctxt,
break;
if (ctxt->stats)
- ctxt->stats->pos = iter.pos;
+ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
- ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
+ io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
+ ret = PTR_ERR_OR_ZERO(io_opts);
if (ret)
continue;
memset(&data_opts, 0, sizeof(data_opts));
- if (!pred(c, arg, k, &io_opts, &data_opts))
+ if (!pred(c, arg, k, io_opts, &data_opts))
goto next;
/*
@@ -584,24 +599,20 @@ static int __bch2_move_data(struct moving_context *ctxt,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret2 = bch2_move_extent(trans, &iter, ctxt, NULL,
- io_opts, btree_id, k, data_opts);
+ ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
if (ret2) {
if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
continue;
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt, trans);
+ bch2_move_ctxt_wait_for_io(ctxt);
continue;
}
/* XXX signal failure */
goto next;
}
-
- if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate, k.k->size);
next:
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
@@ -610,59 +621,68 @@ next_nondata:
}
bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
bch2_bkey_buf_exit(&sk, c);
+ per_snapshot_io_opts_exit(&snapshot_io_opts);
return ret;
}
-int bch2_move_data(struct bch_fs *c,
- enum btree_id start_btree_id, struct bpos start_pos,
- enum btree_id end_btree_id, struct bpos end_pos,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc,
- move_pred_fn pred, void *arg)
+int __bch2_move_data(struct moving_context *ctxt,
+ struct bbpos start,
+ struct bbpos end,
+ move_pred_fn pred, void *arg)
{
- struct moving_context ctxt;
+ struct bch_fs *c = ctxt->trans->c;
enum btree_id id;
int ret = 0;
- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
-
- for (id = start_btree_id;
- id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
+ for (id = start.btree;
+ id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
id++) {
- stats->btree_id = id;
+ ctxt->stats->pos = BBPOS(id, POS_MIN);
- if (id != BTREE_ID_extents &&
- id != BTREE_ID_reflink)
+ if (!btree_type_has_ptrs(id) ||
+ !bch2_btree_id_root(c, id)->b)
continue;
- if (!bch2_btree_id_root(c, id)->b)
- continue;
-
- ret = __bch2_move_data(&ctxt,
- id == start_btree_id ? start_pos : POS_MIN,
- id == end_btree_id ? end_pos : POS_MAX,
+ ret = bch2_move_data_btree(ctxt,
+ id == start.btree ? start.pos : POS_MIN,
+ id == end.btree ? end.pos : POS_MAX,
pred, arg, id);
if (ret)
break;
}
+ return ret;
+}
+
+int bch2_move_data(struct bch_fs *c,
+ struct bbpos start,
+ struct bbpos end,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc,
+ move_pred_fn pred, void *arg)
+{
+
+ struct moving_context ctxt;
+ int ret;
+
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+ ret = __bch2_move_data(&ctxt, start, end, pred, arg);
bch2_moving_ctxt_exit(&ctxt);
return ret;
}
-int __bch2_evacuate_bucket(struct btree_trans *trans,
- struct moving_context *ctxt,
+int __bch2_evacuate_bucket(struct moving_context *ctxt,
struct move_bucket_in_flight *bucket_in_flight,
struct bpos bucket, int gen,
struct data_update_opts _data_opts)
{
- struct bch_fs *c = ctxt->c;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree_iter iter;
struct bkey_buf sk;
@@ -673,7 +693,6 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
struct data_update_opts data_opts;
unsigned dirty_sectors, bucket_size;
u64 fragmentation;
- u64 cur_inum = U64_MAX;
struct bpos bp_pos = POS_MIN;
int ret = 0;
@@ -708,7 +727,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
goto err;
}
- while (!(ret = move_ratelimit(trans, ctxt))) {
+ while (!(ret = bch2_move_ratelimit(ctxt))) {
bch2_trans_begin(trans);
ret = bch2_get_next_backpointer(trans, bucket, gen,
@@ -737,7 +756,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
+ ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
if (ret) {
bch2_trans_iter_exit(trans, &iter);
continue;
@@ -758,23 +777,20 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
i++;
}
- ret = bch2_move_extent(trans, &iter, ctxt,
- bucket_in_flight,
- io_opts, bp.btree_id, k, data_opts);
+ ret = bch2_move_extent(ctxt, bucket_in_flight,
+ &iter, k, io_opts, data_opts);
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt, trans);
+ bch2_move_ctxt_wait_for_io(ctxt);
continue;
}
if (ret)
goto err;
- if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate, k.k->size);
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
} else {
@@ -825,14 +841,12 @@ int bch2_evacuate_bucket(struct bch_fs *c,
struct write_point_specifier wp,
bool wait_on_copygc)
{
- struct btree_trans *trans = bch2_trans_get(c);
struct moving_context ctxt;
int ret;
bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts);
+ ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
bch2_moving_ctxt_exit(&ctxt);
- bch2_trans_put(trans);
return ret;
}
@@ -849,21 +863,25 @@ static int bch2_move_btree(struct bch_fs *c,
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- struct btree_trans *trans = bch2_trans_get(c);
+ struct moving_context ctxt;
+ struct btree_trans *trans;
struct btree_iter iter;
struct btree *b;
enum btree_id id;
struct data_update_opts data_opts;
int ret = 0;
- progress_list_add(c, stats);
+ bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
+ writepoint_ptr(&c->btree_write_point),
+ true);
+ trans = ctxt.trans;
stats->data_type = BCH_DATA_btree;
for (id = start_btree_id;
id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
id++) {
- stats->btree_id = id;
+ stats->pos = BBPOS(id, POS_MIN);
if (!bch2_btree_id_root(c, id)->b)
continue;
@@ -882,7 +900,7 @@ retry:
bpos_cmp(b->key.k.p, end_pos)) > 0)
break;
- stats->pos = iter.pos;
+ stats->pos = BBPOS(iter.btree_id, iter.pos);
if (!pred(c, arg, b, &io_opts, &data_opts))
goto next;
@@ -904,14 +922,10 @@ next:
break;
}
- bch2_trans_put(trans);
-
- if (ret)
- bch_err_fn(c, ret);
-
+ bch_err_fn(c, ret);
+ bch2_moving_ctxt_exit(&ctxt);
bch2_btree_interior_updates_flush(c);
- progress_list_del(c, stats);
return ret;
}
@@ -1032,8 +1046,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
mutex_unlock(&c->sb_lock);
}
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1056,14 +1069,16 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ (struct bbpos) { op.start_btree, op.start_pos },
+ (struct bbpos) { op.end_btree, op.end_pos },
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
rereplicate_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
+
+ bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_MIGRATE:
if (op.migrate.dev >= c->sb.nr_devices)
@@ -1080,18 +1095,21 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ (struct bbpos) { op.start_btree, op.start_pos },
+ (struct bbpos) { op.end_btree, op.end_pos },
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
migrate_pred, &op) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
+
+ bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_REWRITE_OLD_NODES:
bch2_move_stats_init(stats, "rewrite_old_nodes");
ret = bch2_scan_old_btree_nodes(c, stats);
+ bch2_move_stats_exit(stats, c);
break;
default:
ret = -EINVAL;
@@ -1100,19 +1118,43 @@ int bch2_data_job(struct bch_fs *c,
return ret;
}
-static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
+void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
- struct bch_move_stats *stats = ctxt->stats;
- struct moving_io *io;
+ prt_printf(out, "%s: data type=%s pos=",
+ stats->name,
+ bch2_data_types[stats->data_type]);
+ bch2_bbpos_to_text(out, stats->pos);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_str(out, "keys moved: ");
+ prt_u64(out, atomic64_read(&stats->keys_moved));
+ prt_newline(out);
+
+ prt_str(out, "keys raced: ");
+ prt_u64(out, atomic64_read(&stats->keys_raced));
+ prt_newline(out);
+
+ prt_str(out, "bytes seen: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
+ prt_newline(out);
- prt_printf(out, "%s (%ps):", stats->name, ctxt->fn);
+ prt_str(out, "bytes moved: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
prt_newline(out);
- prt_printf(out, " data type %s btree_id %s position: ",
- bch2_data_types[stats->data_type],
- bch2_btree_ids[stats->btree_id]);
- bch2_bpos_to_text(out, stats->pos);
+ prt_str(out, "bytes raced: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
+{
+ struct moving_io *io;
+
+ bch2_move_stats_to_text(out, ctxt->stats);
printbuf_indent_add(out, 2);
prt_printf(out, "reads: ios %u/%u sectors %u/%u",
@@ -1153,7 +1195,4 @@ void bch2_fs_move_init(struct bch_fs *c)
{
INIT_LIST_HEAD(&c->moving_context_list);
mutex_init(&c->moving_context_lock);
-
- INIT_LIST_HEAD(&c->data_progress_list);
- mutex_init(&c->data_progress_lock);
}
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index cbdd58db8782..07cf9d42643b 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_MOVE_H
#define _BCACHEFS_MOVE_H
+#include "bbpos.h"
#include "bcachefs_ioctl.h"
#include "btree_iter.h"
#include "buckets.h"
@@ -11,7 +12,7 @@
struct bch_read_bio;
struct moving_context {
- struct bch_fs *c;
+ struct btree_trans *trans;
struct list_head list;
void *fn;
@@ -37,13 +38,14 @@ struct moving_context {
wait_queue_head_t wait;
};
-#define move_ctxt_wait_event(_ctxt, _trans, _cond) \
+#define move_ctxt_wait_event(_ctxt, _cond) \
do { \
bool cond_finished = false; \
- bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \
+ bch2_moving_ctxt_do_pending_writes(_ctxt); \
\
if (_cond) \
break; \
+ bch2_trans_unlock_long((_ctxt)->trans); \
__wait_event((_ctxt)->wait, \
bch2_moving_ctxt_next_pending_write(_ctxt) || \
(cond_finished = (_cond))); \
@@ -59,22 +61,60 @@ void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
struct bch_ratelimit *, struct bch_move_stats *,
struct write_point_specifier, bool);
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *,
- struct btree_trans *);
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
+void bch2_move_ctxt_wait_for_io(struct moving_context *);
+int bch2_move_ratelimit(struct moving_context *);
+
+/* Inodes in different snapshots may have different IO options: */
+struct snapshot_io_opts_entry {
+ u32 snapshot;
+ struct bch_io_opts io_opts;
+};
+
+struct per_snapshot_io_opts {
+ u64 cur_inum;
+ struct bch_io_opts fs_io_opts;
+ DARRAY(struct snapshot_io_opts_entry) d;
+};
+
+static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
+{
+ memset(io_opts, 0, sizeof(*io_opts));
+ io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
+}
+
+static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
+{
+ darray_exit(&io_opts->d);
+}
+
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
+ struct per_snapshot_io_opts *, struct bkey_s_c);
+int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c);
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
+int bch2_move_extent(struct moving_context *,
+ struct move_bucket_in_flight *,
+ struct btree_iter *,
+ struct bkey_s_c,
+ struct bch_io_opts,
+ struct data_update_opts);
+
+int __bch2_move_data(struct moving_context *,
+ struct bbpos,
+ struct bbpos,
+ move_pred_fn, void *);
int bch2_move_data(struct bch_fs *,
- enum btree_id, struct bpos,
- enum btree_id, struct bpos,
+ struct bbpos start,
+ struct bbpos end,
struct bch_ratelimit *,
struct bch_move_stats *,
struct write_point_specifier,
bool,
move_pred_fn, void *);
-int __bch2_evacuate_bucket(struct btree_trans *,
- struct moving_context *,
+int __bch2_evacuate_bucket(struct moving_context *,
struct move_bucket_in_flight *,
struct bpos, int,
struct data_update_opts);
@@ -88,7 +128,10 @@ int bch2_data_job(struct bch_fs *,
struct bch_move_stats *,
struct bch_ioctl_data);
-void bch2_move_stats_init(struct bch_move_stats *stats, char *name);
+void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
+void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
+void bch2_move_stats_init(struct bch_move_stats *, char *);
+
void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
void bch2_fs_move_init(struct bch_fs *);
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
index baf1f8570b3f..e22841ef31e4 100644
--- a/fs/bcachefs/move_types.h
+++ b/fs/bcachefs/move_types.h
@@ -2,17 +2,17 @@
#ifndef _BCACHEFS_MOVE_TYPES_H
#define _BCACHEFS_MOVE_TYPES_H
+#include "bbpos_types.h"
+
struct bch_move_stats {
enum bch_data_type data_type;
- enum btree_id btree_id;
- struct bpos pos;
- struct list_head list;
+ struct bbpos pos;
char name[32];
atomic64_t keys_moved;
atomic64_t keys_raced;
- atomic64_t sectors_moved;
atomic64_t sectors_seen;
+ atomic64_t sectors_moved;
atomic64_t sectors_raced;
};
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 4017120baeee..0a0576326c5b 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -101,8 +101,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
return ret;
}
-static void move_buckets_wait(struct btree_trans *trans,
- struct moving_context *ctxt,
+static void move_buckets_wait(struct moving_context *ctxt,
struct buckets_in_flight *list,
bool flush)
{
@@ -111,7 +110,7 @@ static void move_buckets_wait(struct btree_trans *trans,
while ((i = list->first)) {
if (flush)
- move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count));
+ move_ctxt_wait_event(ctxt, !atomic_read(&i->count));
if (atomic_read(&i->count))
break;
@@ -129,7 +128,7 @@ static void move_buckets_wait(struct btree_trans *trans,
kfree(i);
}
- bch2_trans_unlock(trans);
+ bch2_trans_unlock_long(ctxt->trans);
}
static bool bucket_in_flight(struct buckets_in_flight *list,
@@ -140,11 +139,11 @@ static bool bucket_in_flight(struct buckets_in_flight *list,
typedef DARRAY(struct move_bucket) move_buckets;
-static int bch2_copygc_get_buckets(struct btree_trans *trans,
- struct moving_context *ctxt,
+static int bch2_copygc_get_buckets(struct moving_context *ctxt,
struct buckets_in_flight *buckets_in_flight,
move_buckets *buckets)
{
+ struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
@@ -152,7 +151,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
int ret;
- move_buckets_wait(trans, ctxt, buckets_in_flight, false);
+ move_buckets_wait(ctxt, buckets_in_flight, false);
ret = bch2_btree_write_buffer_flush(trans);
if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
@@ -188,10 +187,11 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
}
noinline
-static int bch2_copygc(struct btree_trans *trans,
- struct moving_context *ctxt,
- struct buckets_in_flight *buckets_in_flight)
+static int bch2_copygc(struct moving_context *ctxt,
+ struct buckets_in_flight *buckets_in_flight,
+ bool *did_work)
{
+ struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct data_update_opts data_opts = {
.btree_insert_flags = BCH_WATERMARK_copygc,
@@ -202,7 +202,7 @@ static int bch2_copygc(struct btree_trans *trans,
u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
- ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets);
+ ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
if (ret)
goto err;
@@ -221,10 +221,12 @@ static int bch2_copygc(struct btree_trans *trans,
break;
}
- ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket,
+ ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
f->bucket.k.gen, data_opts);
if (ret)
goto err;
+
+ *did_work = true;
}
err:
darray_exit(&buckets);
@@ -300,24 +302,24 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
static int bch2_copygc_thread(void *arg)
{
struct bch_fs *c = arg;
- struct btree_trans *trans;
struct moving_context ctxt;
struct bch_move_stats move_stats;
struct io_clock *clock = &c->io_clock[WRITE];
- struct buckets_in_flight buckets;
+ struct buckets_in_flight *buckets;
u64 last, wait;
int ret = 0;
- memset(&buckets, 0, sizeof(buckets));
-
- ret = rhashtable_init(&buckets.table, &bch_move_bucket_params);
+ buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL);
+ if (!buckets)
+ return -ENOMEM;
+ ret = rhashtable_init(&buckets->table, &bch_move_bucket_params);
if (ret) {
+ kfree(buckets);
bch_err_msg(c, ret, "allocating copygc buckets in flight");
return ret;
}
set_freezable();
- trans = bch2_trans_get(c);
bch2_move_stats_init(&move_stats, "copygc");
bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
@@ -325,16 +327,18 @@ static int bch2_copygc_thread(void *arg)
false);
while (!ret && !kthread_should_stop()) {
- bch2_trans_unlock(trans);
+ bool did_work = false;
+
+ bch2_trans_unlock_long(ctxt.trans);
cond_resched();
if (!c->copy_gc_enabled) {
- move_buckets_wait(trans, &ctxt, &buckets, true);
+ move_buckets_wait(&ctxt, buckets, true);
kthread_wait_freezable(c->copy_gc_enabled);
}
if (unlikely(freezing(current))) {
- move_buckets_wait(trans, &ctxt, &buckets, true);
+ move_buckets_wait(&ctxt, buckets, true);
__refrigerator(false);
continue;
}
@@ -345,7 +349,7 @@ static int bch2_copygc_thread(void *arg)
if (wait > clock->max_slop) {
c->copygc_wait_at = last;
c->copygc_wait = last + wait;
- move_buckets_wait(trans, &ctxt, &buckets, true);
+ move_buckets_wait(&ctxt, buckets, true);
trace_and_count(c, copygc_wait, c, wait, last + wait);
bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);
@@ -355,16 +359,29 @@ static int bch2_copygc_thread(void *arg)
c->copygc_wait = 0;
c->copygc_running = true;
- ret = bch2_copygc(trans, &ctxt, &buckets);
+ ret = bch2_copygc(&ctxt, buckets, &did_work);
c->copygc_running = false;
wake_up(&c->copygc_running_wq);
+
+ if (!wait && !did_work) {
+ u64 min_member_capacity = bch2_min_rw_member_capacity(c);
+
+ if (min_member_capacity == U64_MAX)
+ min_member_capacity = 128 * 2048;
+
+ bch2_trans_unlock_long(ctxt.trans);
+ bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
+ MAX_SCHEDULE_TIMEOUT);
+ }
}
- move_buckets_wait(trans, &ctxt, &buckets, true);
- rhashtable_destroy(&buckets.table);
- bch2_trans_put(trans);
+ move_buckets_wait(&ctxt, buckets, true);
+
+ rhashtable_destroy(&buckets->table);
+ kfree(buckets);
bch2_moving_ctxt_exit(&ctxt);
+ bch2_move_stats_exit(&move_stats, c);
return 0;
}
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 232f50c73a94..8dd4046cca41 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -12,11 +12,6 @@
#define x(t, n, ...) [n] = #t,
-const char * const bch2_iops_measurements[] = {
- BCH_IOPS_MEASUREMENTS()
- NULL
-};
-
const char * const bch2_error_actions[] = {
BCH_ERROR_ACTIONS()
NULL
@@ -42,9 +37,8 @@ const char * const bch2_sb_compat[] = {
NULL
};
-const char * const bch2_btree_ids[] = {
+const char * const __bch2_btree_ids[] = {
BCH_BTREE_IDS()
- "interior btree node",
NULL
};
@@ -271,14 +265,14 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
if (err)
prt_printf(err, "%s: too small (min %llu)",
opt->attr.name, opt->min);
- return -ERANGE;
+ return -BCH_ERR_ERANGE_option_too_small;
}
if (opt->max && v >= opt->max) {
if (err)
prt_printf(err, "%s: too big (max %llu)",
opt->attr.name, opt->max);
- return -ERANGE;
+ return -BCH_ERR_ERANGE_option_too_big;
}
if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
@@ -295,6 +289,9 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
return -EINVAL;
}
+ if (opt->fn.validate)
+ return opt->fn.validate(v, err);
+
return 0;
}
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 55014336c5f7..8526f177450a 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -10,13 +10,12 @@
struct bch_fs;
-extern const char * const bch2_iops_measurements[];
extern const char * const bch2_error_actions[];
extern const char * const bch2_fsck_fix_opts[];
extern const char * const bch2_version_upgrade_opts[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
-extern const char * const bch2_btree_ids[];
+extern const char * const __bch2_btree_ids[];
extern const char * const bch2_csum_types[];
extern const char * const bch2_csum_opts[];
extern const char * const bch2_compression_types[];
@@ -74,6 +73,7 @@ enum opt_type {
struct bch_opt_fn {
int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *);
void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+ int (*validate)(u64, struct printbuf *);
};
/**
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
index de41f9a14492..5e653eb81d54 100644
--- a/fs/bcachefs/printbuf.c
+++ b/fs/bcachefs/printbuf.c
@@ -415,11 +415,11 @@ void bch2_prt_bitflags(struct printbuf *out,
while (list[nr])
nr++;
- while (flags && (bit = __ffs(flags)) < nr) {
+ while (flags && (bit = __ffs64(flags)) < nr) {
if (!first)
bch2_prt_printf(out, ",");
first = false;
bch2_prt_printf(out, "%s", list[bit]);
- flags ^= 1 << bit;
+ flags ^= BIT_ULL(bit);
}
}
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index cb68ae44d597..a54647c36b85 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -59,17 +59,18 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = {
.to_text = bch2_sb_quota_to_text,
};
-int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- if (k.k->p.inode >= QTYP_NR) {
- prt_printf(err, "invalid quota type (%llu >= %u)",
- k.k->p.inode, QTYP_NR);
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
- return 0;
+ bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err,
+ quota_type_invalid,
+ "invalid quota type (%llu >= %u)",
+ k.k->p.inode, QTYP_NR);
+fsck_err:
+ return ret;
}
void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
index 2f463874a362..884f601f41c4 100644
--- a/fs/bcachefs/quota.h
+++ b/fs/bcachefs/quota.h
@@ -8,7 +8,7 @@
enum bkey_invalid_flags;
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
-int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 568f1e8e7507..3319190b8d9c 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -1,15 +1,21 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
#include "errcode.h"
+#include "error.h"
+#include "inode.h"
#include "move.h"
#include "rebalance.h"
+#include "subvolume.h"
#include "super-io.h"
#include "trace.h"
@@ -17,302 +23,396 @@
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
-/*
- * Check if an extent should be moved:
- * returns -1 if it should not be moved, or
- * device of pointer that should be moved, if known, or INT_MAX if unknown
- */
-static bool rebalance_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned i;
-
- data_opts->rewrite_ptrs = 0;
- data_opts->target = io_opts->background_target;
- data_opts->extra_replicas = 0;
- data_opts->btree_insert_flags = 0;
-
- if (io_opts->background_compression &&
- !bch2_bkey_is_incompressible(k)) {
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- i = 0;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (!p.ptr.cached &&
- p.crc.compression_type !=
- bch2_compression_opt_to_type(io_opts->background_compression))
- data_opts->rewrite_ptrs |= 1U << i;
- i++;
- }
- }
+#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
- if (io_opts->background_target) {
- const struct bch_extent_ptr *ptr;
+static const char * const bch2_rebalance_state_strs[] = {
+#define x(t) #t,
+ BCH_REBALANCE_STATES()
+ NULL
+#undef x
+};
- i = 0;
- bkey_for_each_ptr(ptrs, ptr) {
- if (!ptr->cached &&
- !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
- bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target))
- data_opts->rewrite_ptrs |= 1U << i;
- i++;
- }
- }
+static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie *cookie;
+ u64 v;
+ int ret;
- return data_opts->rewrite_ptrs != 0;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ v = k.k->type == KEY_TYPE_cookie
+ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+ : 0;
+
+ cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
+ ret = PTR_ERR_OR_ZERO(cookie);
+ if (ret)
+ goto err;
+
+ bkey_cookie_init(&cookie->k_i);
+ cookie->k.p = iter.pos;
+ cookie->v.cookie = cpu_to_le64(v + 1);
+
+ ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
-void bch2_rebalance_add_key(struct bch_fs *c,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts)
+int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
- struct data_update_opts update_opts = { 0 };
- struct bkey_ptrs_c ptrs;
- const struct bch_extent_ptr *ptr;
- unsigned i;
-
- if (!rebalance_pred(c, NULL, k, io_opts, &update_opts))
- return;
-
- i = 0;
- ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr) {
- if ((1U << i) && update_opts.rewrite_ptrs)
- if (atomic64_add_return(k.k->size,
- &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) ==
- k.k->size)
- rebalance_wakeup(c);
- i++;
- }
+ int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ __bch2_set_rebalance_needs_scan(trans, inum));
+ rebalance_wakeup(c);
+ return ret;
}
-void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+int bch2_set_fs_needs_rebalance(struct bch_fs *c)
{
- if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
- sectors)
- rebalance_wakeup(c);
+ return bch2_set_rebalance_needs_scan(c, 0);
}
-struct rebalance_work {
- int dev_most_full_idx;
- unsigned dev_most_full_percent;
- u64 dev_most_full_work;
- u64 dev_most_full_capacity;
- u64 total_work;
-};
+static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 v;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ v = k.k->type == KEY_TYPE_cookie
+ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+ : 0;
+
+ if (v == cookie)
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
-static void rebalance_work_accumulate(struct rebalance_work *w,
- u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
+static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
+ struct btree_iter *work_iter)
{
- unsigned percent_full;
- u64 work = dev_work + unknown_dev;
+ return !kthread_should_stop()
+ ? bch2_btree_iter_peek(work_iter)
+ : bkey_s_c_null;
+}
- /* avoid divide by 0 */
- if (!capacity)
- return;
+static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ extent_entry_drop(bkey_i_to_s(n),
+ (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
+ return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
+static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
+ struct bpos work_pos,
+ struct btree_iter *extent_iter,
+ struct data_update_opts *data_opts)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+
+ bch2_trans_iter_exit(trans, extent_iter);
+ bch2_trans_iter_init(trans, extent_iter,
+ work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
+ work_pos,
+ BTREE_ITER_ALL_SNAPSHOTS);
+ k = bch2_btree_iter_peek_slot(extent_iter);
+ if (bkey_err(k))
+ return k;
+
+ const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL;
+ if (!r) {
+ /* raced due to btree write buffer, nothing to do */
+ return bkey_s_c_null;
+ }
- if (work < dev_work || work < unknown_dev)
- work = U64_MAX;
- work = min(work, capacity);
+ memset(data_opts, 0, sizeof(*data_opts));
- percent_full = div64_u64(work * 100, capacity);
+ data_opts->rewrite_ptrs =
+ bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
+ data_opts->target = r->target;
- if (percent_full >= w->dev_most_full_percent) {
- w->dev_most_full_idx = idx;
- w->dev_most_full_percent = percent_full;
- w->dev_most_full_work = work;
- w->dev_most_full_capacity = capacity;
+ if (!data_opts->rewrite_ptrs) {
+ /*
+ * device we would want to write to offline? devices in target
+ * changed?
+ *
+ * We'll now need a full scan before this extent is picked up
+ * again:
+ */
+ int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
+ if (ret)
+ return bkey_s_c_err(ret);
+ return bkey_s_c_null;
}
- if (w->total_work + dev_work >= w->total_work &&
- w->total_work + dev_work >= dev_work)
- w->total_work += dev_work;
+ return k;
}
-static struct rebalance_work rebalance_work(struct bch_fs *c)
+noinline_for_stack
+static int do_rebalance_extent(struct moving_context *ctxt,
+ struct bpos work_pos,
+ struct btree_iter *extent_iter)
{
- struct bch_dev *ca;
- struct rebalance_work ret = { .dev_most_full_idx = -1 };
- u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
- unsigned i;
-
- for_each_online_member(ca, c, i)
- rebalance_work_accumulate(&ret,
- atomic64_read(&ca->rebalance_work),
- unknown_dev,
- bucket_to_sector(ca, ca->mi.nbuckets -
- ca->mi.first_bucket),
- i);
-
- rebalance_work_accumulate(&ret,
- unknown_dev, 0, c->capacity, -1);
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &trans->c->rebalance;
+ struct data_update_opts data_opts;
+ struct bch_io_opts io_opts;
+ struct bkey_s_c k;
+ struct bkey_buf sk;
+ int ret;
+
+ ctxt->stats = &r->work_stats;
+ r->state = BCH_REBALANCE_working;
+
+ bch2_bkey_buf_init(&sk);
+
+ ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
+ extent_iter, &data_opts));
+ if (ret || !k.k)
+ goto out;
+ ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
+ if (ret)
+ goto out;
+
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+
+ /*
+ * The iterator gets unlocked by __bch2_read_extent - need to
+ * save a copy of @k elsewhere:
+ */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
+ if (ret) {
+ if (bch2_err_matches(ret, ENOMEM)) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt);
+ ret = -BCH_ERR_transaction_restart_nested;
+ }
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto out;
+
+ /* skip it and continue, XXX signal failure */
+ ret = 0;
+ }
+out:
+ bch2_bkey_buf_exit(&sk, c);
return ret;
}
-static void rebalance_work_reset(struct bch_fs *c)
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
- struct bch_dev *ca;
- unsigned i;
+ unsigned target, compression;
+
+ if (k.k->p.inode) {
+ target = io_opts->background_target;
+ compression = io_opts->background_compression ?: io_opts->compression;
+ } else {
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
- for_each_online_member(ca, c, i)
- atomic64_set(&ca->rebalance_work, 0);
+ target = r ? r->target : io_opts->background_target;
+ compression = r ? r->compression :
+ (io_opts->background_compression ?: io_opts->compression);
+ }
- atomic64_set(&c->rebalance.work_unknown_dev, 0);
+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
+ data_opts->target = target;
+ return data_opts->rewrite_ptrs != 0;
}
-static unsigned long curr_cputime(void)
+static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
{
- u64 utime, stime;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs_rebalance *r = &trans->c->rebalance;
+ int ret;
+
+ bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
+ ctxt->stats = &r->scan_stats;
- task_cputime_adjusted(current, &utime, &stime);
- return nsecs_to_jiffies(utime + stime);
+ if (!inum) {
+ r->scan_start = BBPOS_MIN;
+ r->scan_end = BBPOS_MAX;
+ } else {
+ r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
+ r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
+ }
+
+ r->state = BCH_REBALANCE_scanning;
+
+ ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
+ commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+
+ bch2_move_stats_exit(&r->scan_stats, trans->c);
+ return ret;
}
-static int bch2_rebalance_thread(void *arg)
+static void rebalance_wait(struct bch_fs *c)
{
- struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
struct io_clock *clock = &c->io_clock[WRITE];
- struct rebalance_work w, p;
- struct bch_move_stats move_stats;
- unsigned long start, prev_start;
- unsigned long prev_run_time, prev_run_cputime;
- unsigned long cputime, prev_cputime;
- u64 io_start;
- long throttle;
+ u64 now = atomic64_read(&clock->now);
+ u64 min_member_capacity = bch2_min_rw_member_capacity(c);
- set_freezable();
+ if (min_member_capacity == U64_MAX)
+ min_member_capacity = 128 * 2048;
+
+ r->wait_iotime_end = now + (min_member_capacity >> 6);
- io_start = atomic64_read(&clock->now);
- p = rebalance_work(c);
- prev_start = jiffies;
- prev_cputime = curr_cputime();
+ if (r->state != BCH_REBALANCE_waiting) {
+ r->wait_iotime_start = now;
+ r->wait_wallclock_start = ktime_get_real_ns();
+ r->state = BCH_REBALANCE_waiting;
+ }
+
+ bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
+}
- bch2_move_stats_init(&move_stats, "rebalance");
- while (!kthread_wait_freezable(r->enabled)) {
- cond_resched();
+static int do_rebalance(struct moving_context *ctxt)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &c->rebalance;
+ struct btree_iter rebalance_work_iter, extent_iter = { NULL };
+ struct bkey_s_c k;
+ int ret = 0;
- start = jiffies;
- cputime = curr_cputime();
+ bch2_move_stats_init(&r->work_stats, "rebalance_work");
+ bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
- prev_run_time = start - prev_start;
- prev_run_cputime = cputime - prev_cputime;
+ bch2_trans_iter_init(trans, &rebalance_work_iter,
+ BTREE_ID_rebalance_work, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS);
- w = rebalance_work(c);
- BUG_ON(!w.dev_most_full_capacity);
+ while (!bch2_move_ratelimit(ctxt) &&
+ !kthread_wait_freezable(r->enabled)) {
+ bch2_trans_begin(trans);
- if (!w.total_work) {
- r->state = REBALANCE_WAITING;
- kthread_wait_freezable(rebalance_work(c).total_work);
+ ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
- }
+ if (ret || !k.k)
+ break;
- /*
- * If there isn't much work to do, throttle cpu usage:
- */
- throttle = prev_run_cputime * 100 /
- max(1U, w.dev_most_full_percent) -
- prev_run_time;
-
- if (w.dev_most_full_percent < 20 && throttle > 0) {
- r->throttled_until_iotime = io_start +
- div_u64(w.dev_most_full_capacity *
- (20 - w.dev_most_full_percent),
- 50);
-
- if (atomic64_read(&clock->now) + clock->max_slop <
- r->throttled_until_iotime) {
- r->throttled_until_cputime = start + throttle;
- r->state = REBALANCE_THROTTLED;
-
- bch2_kthread_io_clock_wait(clock,
- r->throttled_until_iotime,
- throttle);
- continue;
- }
- }
+ ret = k.k->type == KEY_TYPE_cookie
+ ? do_rebalance_scan(ctxt, k.k->p.inode,
+ le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
+ : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
- /* minimum 1 mb/sec: */
- r->pd.rate.rate =
- max_t(u64, 1 << 11,
- r->pd.rate.rate *
- max(p.dev_most_full_percent, 1U) /
- max(w.dev_most_full_percent, 1U));
-
- io_start = atomic64_read(&clock->now);
- p = w;
- prev_start = start;
- prev_cputime = cputime;
-
- r->state = REBALANCE_RUNNING;
- memset(&move_stats, 0, sizeof(move_stats));
- rebalance_work_reset(c);
-
- bch2_move_data(c,
- 0, POS_MIN,
- BTREE_ID_NR, POS_MAX,
- /* ratelimiting disabled for now */
- NULL, /* &r->pd.rate, */
- &move_stats,
- writepoint_ptr(&c->rebalance_write_point),
- true,
- rebalance_pred, NULL);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ bch2_btree_iter_advance(&rebalance_work_iter);
}
- return 0;
+ bch2_trans_iter_exit(trans, &extent_iter);
+ bch2_trans_iter_exit(trans, &rebalance_work_iter);
+ bch2_move_stats_exit(&r->scan_stats, c);
+
+ if (!ret &&
+ !kthread_should_stop() &&
+ !atomic64_read(&r->work_stats.sectors_seen) &&
+ !atomic64_read(&r->scan_stats.sectors_seen)) {
+ bch2_trans_unlock_long(trans);
+ rebalance_wait(c);
+ }
+
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_fn(c, ret);
+ return ret;
}
-void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
+static int bch2_rebalance_thread(void *arg)
{
+ struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
- struct rebalance_work w = rebalance_work(c);
+ struct moving_context ctxt;
+ int ret;
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 20);
+ set_freezable();
- prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx);
- prt_tab(out);
+ bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
+ writepoint_ptr(&c->rebalance_write_point),
+ true);
- prt_human_readable_u64(out, w.dev_most_full_work << 9);
- prt_printf(out, "/");
- prt_human_readable_u64(out, w.dev_most_full_capacity << 9);
- prt_newline(out);
+ while (!kthread_should_stop() &&
+ !(ret = do_rebalance(&ctxt)))
+ ;
- prt_printf(out, "total work:");
- prt_tab(out);
+ bch2_moving_ctxt_exit(&ctxt);
- prt_human_readable_u64(out, w.total_work << 9);
- prt_printf(out, "/");
- prt_human_readable_u64(out, c->capacity << 9);
- prt_newline(out);
+ return 0;
+}
+
+void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_fs_rebalance *r = &c->rebalance;
- prt_printf(out, "rate:");
- prt_tab(out);
- prt_printf(out, "%u", r->pd.rate.rate);
+ prt_str(out, bch2_rebalance_state_strs[r->state]);
prt_newline(out);
+ printbuf_indent_add(out, 2);
switch (r->state) {
- case REBALANCE_WAITING:
- prt_printf(out, "waiting");
+ case BCH_REBALANCE_waiting: {
+ u64 now = atomic64_read(&c->io_clock[WRITE].now);
+
+ prt_str(out, "io wait duration: ");
+ bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
+ prt_newline(out);
+
+ prt_str(out, "io wait remaining: ");
+ bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
+ prt_newline(out);
+
+ prt_str(out, "duration waited: ");
+ bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
+ prt_newline(out);
break;
- case REBALANCE_THROTTLED:
- prt_printf(out, "throttled for %lu sec or ",
- (r->throttled_until_cputime - jiffies) / HZ);
- prt_human_readable_u64(out,
- (r->throttled_until_iotime -
- atomic64_read(&c->io_clock[WRITE].now)) << 9);
- prt_printf(out, " io");
+ }
+ case BCH_REBALANCE_working:
+ bch2_move_stats_to_text(out, &r->work_stats);
break;
- case REBALANCE_RUNNING:
- prt_printf(out, "running");
+ case BCH_REBALANCE_scanning:
+ bch2_move_stats_to_text(out, &r->scan_stats);
break;
}
prt_newline(out);
+ printbuf_indent_sub(out, 2);
}
void bch2_rebalance_stop(struct bch_fs *c)
@@ -361,6 +461,4 @@ int bch2_rebalance_start(struct bch_fs *c)
void bch2_fs_rebalance_init(struct bch_fs *c)
{
bch2_pd_controller_init(&c->rebalance.pd);
-
- atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
}
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index 7ade0bb81cce..28a52638f16c 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -4,6 +4,9 @@
#include "rebalance_types.h"
+int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
+int bch2_set_fs_needs_rebalance(struct bch_fs *);
+
static inline void rebalance_wakeup(struct bch_fs *c)
{
struct task_struct *p;
@@ -15,11 +18,7 @@ static inline void rebalance_wakeup(struct bch_fs *c)
rcu_read_unlock();
}
-void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
- struct bch_io_opts *);
-void bch2_rebalance_add_work(struct bch_fs *, u64);
-
-void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *);
+void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
index 7462a92e9598..0fffb536c1d0 100644
--- a/fs/bcachefs/rebalance_types.h
+++ b/fs/bcachefs/rebalance_types.h
@@ -2,25 +2,36 @@
#ifndef _BCACHEFS_REBALANCE_TYPES_H
#define _BCACHEFS_REBALANCE_TYPES_H
+#include "bbpos_types.h"
#include "move_types.h"
-enum rebalance_state {
- REBALANCE_WAITING,
- REBALANCE_THROTTLED,
- REBALANCE_RUNNING,
+#define BCH_REBALANCE_STATES() \
+ x(waiting) \
+ x(working) \
+ x(scanning)
+
+enum bch_rebalance_states {
+#define x(t) BCH_REBALANCE_##t,
+ BCH_REBALANCE_STATES()
+#undef x
};
struct bch_fs_rebalance {
- struct task_struct __rcu *thread;
+ struct task_struct __rcu *thread;
struct bch_pd_controller pd;
- atomic64_t work_unknown_dev;
+ enum bch_rebalance_states state;
+ u64 wait_iotime_start;
+ u64 wait_iotime_end;
+ u64 wait_wallclock_start;
+
+ struct bch_move_stats work_stats;
- enum rebalance_state state;
- u64 throttled_until_iotime;
- unsigned long throttled_until_cputime;
+ struct bbpos scan_start;
+ struct bbpos scan_end;
+ struct bch_move_stats scan_stats;
- unsigned enabled:1;
+ unsigned enabled:1;
};
#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 4cd660650e5b..9c30500ce920 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -23,6 +23,7 @@
#include "logged_ops.h"
#include "move.h"
#include "quota.h"
+#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
@@ -182,7 +183,7 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_journal_replay_key(trans, k));
if (ret) {
bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
- bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret));
+ bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret));
goto err;
}
}
@@ -225,7 +226,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
if (entry->u64s) {
r->level = entry->level;
- bkey_copy(&r->key, &entry->start[0]);
+ bkey_copy(&r->key, (struct bkey_i *) entry->start);
r->error = 0;
} else {
r->error = -EIO;
@@ -364,10 +365,12 @@ static int read_btree_roots(struct bch_fs *c)
}
if (r->error) {
- __fsck_err(c, btree_id_is_alloc(i)
+ __fsck_err(c,
+ btree_id_is_alloc(i)
? FSCK_CAN_IGNORE : 0,
+ btree_root_bkey_invalid,
"invalid btree root %s",
- bch2_btree_ids[i]);
+ bch2_btree_id_str(i));
if (i == BTREE_ID_alloc)
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
}
@@ -375,8 +378,9 @@ static int read_btree_roots(struct bch_fs *c)
ret = bch2_btree_root_read(c, i, &r->key, r->level);
if (ret) {
fsck_err(c,
+ btree_root_read_error,
"error reading btree root %s",
- bch2_btree_ids[i]);
+ bch2_btree_id_str(i));
if (btree_id_is_alloc(i))
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
ret = 0;
@@ -713,6 +717,7 @@ int bch2_fs_recovery(struct bch_fs *c)
if (mustfix_fsck_err_on(c->sb.clean &&
last_journal_entry &&
!journal_entry_empty(last_journal_entry), c,
+ clean_but_journal_not_empty,
"filesystem marked clean but journal not empty")) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
@@ -720,7 +725,9 @@ int bch2_fs_recovery(struct bch_fs *c)
}
if (!last_journal_entry) {
- fsck_err_on(!c->sb.clean, c, "no journal entries found");
+ fsck_err_on(!c->sb.clean, c,
+ dirty_but_no_journal_entries,
+ "no journal entries found");
if (clean)
goto use_clean;
@@ -728,6 +735,13 @@ int bch2_fs_recovery(struct bch_fs *c)
if (*i) {
last_journal_entry = &(*i)->j;
(*i)->ignore = false;
+ /*
+ * This was probably a NO_FLUSH entry,
+ * so last_seq was garbage - but we know
+ * we're only using a single journal
+ * entry, set it here:
+ */
+ (*i)->j.last_seq = (*i)->j.seq;
break;
}
}
@@ -901,7 +915,7 @@ out:
}
kfree(clean);
- if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) {
+ if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) {
bch2_fs_read_write_early(c);
bch2_delete_dead_snapshots_async(c);
}
@@ -946,16 +960,12 @@ int bch2_fs_initialize(struct bch_fs *c)
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
- for_each_online_member(ca, c, i)
+ for_each_member_device(ca, c, i)
bch2_dev_usage_init(ca);
- for_each_online_member(ca, c, i) {
- ret = bch2_dev_journal_alloc(ca);
- if (ret) {
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
- }
+ ret = bch2_fs_journal_alloc(c);
+ if (ret)
+ goto err;
/*
* journal_res_get() will crash if called before this has
@@ -973,15 +983,13 @@ int bch2_fs_initialize(struct bch_fs *c)
* btree updates
*/
bch_verbose(c, "marking superblocks");
- for_each_member_device(ca, c, i) {
- ret = bch2_trans_mark_dev_sb(c, ca);
- if (ret) {
- percpu_ref_put(&ca->ref);
- goto err;
- }
+ ret = bch2_trans_mark_dev_sbs(c);
+ bch_err_msg(c, ret, "marking superblocks");
+ if (ret)
+ goto err;
+ for_each_online_member(ca, c, i)
ca->new_fs_bucket_idx = 0;
- }
ret = bch2_fs_freespace_init(c);
if (ret)
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h
index fbfa9d831d6f..515e3d62c2ac 100644
--- a/fs/bcachefs/recovery_types.h
+++ b/fs/bcachefs/recovery_types.h
@@ -14,6 +14,8 @@
x(snapshots_read, PASS_ALWAYS) \
x(check_topology, 0) \
x(check_allocations, PASS_FSCK) \
+ x(trans_mark_dev_sbs, PASS_ALWAYS|PASS_SILENT) \
+ x(fs_journal_alloc, PASS_ALWAYS|PASS_SILENT) \
x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \
x(journal_replay, PASS_ALWAYS) \
x(check_alloc_info, PASS_FSCK) \
@@ -27,11 +29,12 @@
x(check_snapshot_trees, PASS_FSCK) \
x(check_snapshots, PASS_FSCK) \
x(check_subvols, PASS_FSCK) \
- x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \
+ x(delete_dead_snapshots, PASS_FSCK) \
x(fs_upgrade_for_subvolumes, 0) \
x(resume_logged_ops, PASS_ALWAYS) \
x(check_inodes, PASS_FSCK) \
x(check_extents, PASS_FSCK) \
+ x(check_indirect_extents, PASS_FSCK) \
x(check_dirents, PASS_FSCK) \
x(check_xattrs, PASS_FSCK) \
x(check_root, PASS_FSCK) \
@@ -39,6 +42,7 @@
x(check_nlinks, PASS_FSCK) \
x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \
x(fix_reflink_p, 0) \
+ x(set_fs_needs_rebalance, 0) \
enum bch_recovery_pass {
#define x(n, when) BCH_RECOVERY_PASS_##n,
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index d77d0ea9afff..6e1bfe9feb59 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -7,6 +7,7 @@
#include "inode.h"
#include "io_misc.h"
#include "io_write.h"
+#include "rebalance.h"
#include "reflink.h"
#include "subvolume.h"
#include "super-io.h"
@@ -27,7 +28,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k)
/* reflink pointers */
-int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
@@ -74,7 +75,7 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
/* indirect extents */
-int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
@@ -103,28 +104,29 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
}
#endif
+static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags)
+{
+ if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
+ new->k.type = KEY_TYPE_deleted;
+ new->k.size = 0;
+ set_bkey_val_u64s(&new->k, 0);;
+ *flags &= ~BTREE_TRIGGER_INSERT;
+ }
+}
+
int bch2_trans_mark_reflink_v(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
- if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
- struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new);
-
- if (!r->v.refcount) {
- r->k.type = KEY_TYPE_deleted;
- r->k.size = 0;
- set_bkey_val_u64s(&r->k, 0);
- return 0;
- }
- }
+ check_indirect_extent_deleting(new, &flags);
return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
}
/* indirect inline data */
-int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
@@ -132,7 +134,7 @@ int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
}
void bch2_indirect_inline_data_to_text(struct printbuf *out,
- struct bch_fs *c, struct bkey_s_c k)
+ struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
unsigned datalen = bkey_inline_data_bytes(k.k);
@@ -147,16 +149,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
- if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
- struct bkey_i_indirect_inline_data *r =
- bkey_i_to_indirect_inline_data(new);
-
- if (!r->v.refcount) {
- r->k.type = KEY_TYPE_deleted;
- r->k.size = 0;
- set_bkey_val_u64s(&r->k, 0);
- }
- }
+ check_indirect_extent_deleting(new, &flags);
return 0;
}
@@ -260,8 +253,9 @@ s64 bch2_remap_range(struct bch_fs *c,
struct bpos dst_start = POS(dst_inum.inum, dst_offset);
struct bpos src_start = POS(src_inum.inum, src_offset);
struct bpos dst_end = dst_start, src_end = src_start;
+ struct bch_io_opts opts;
struct bpos src_want;
- u64 dst_done;
+ u64 dst_done = 0;
u32 dst_snapshot, src_snapshot;
int ret = 0, ret2 = 0;
@@ -277,6 +271,10 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_bkey_buf_init(&new_src);
trans = bch2_trans_get(c);
+ ret = bch2_inum_opts_get(trans, src_inum, &opts);
+ if (ret)
+ goto err;
+
bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
BTREE_ITER_INTENT);
bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
@@ -360,10 +358,13 @@ s64 bch2_remap_range(struct bch_fs *c,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
- ret = bch2_extent_update(trans, dst_inum, &dst_iter,
- new_dst.k, &disk_res,
- new_i_size, i_sectors_delta,
- true);
+ ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
+ opts.background_target,
+ opts.background_compression) ?:
+ bch2_extent_update(trans, dst_inum, &dst_iter,
+ new_dst.k, &disk_res,
+ new_i_size, i_sectors_delta,
+ true);
bch2_disk_reservation_put(c, &disk_res);
}
bch2_trans_iter_exit(trans, &dst_iter);
@@ -394,7 +395,7 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_trans_iter_exit(trans, &inode_iter);
} while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
-
+err:
bch2_trans_put(trans);
bch2_bkey_buf_exit(&new_src, c);
bch2_bkey_buf_exit(&new_dst, c);
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index fe52538efb52..8ccf3f9c4939 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -4,7 +4,7 @@
enum bkey_invalid_flags;
-int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
@@ -19,7 +19,7 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
.min_val_size = 16, \
})
-int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
@@ -35,7 +35,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
.min_val_size = 8, \
})
-int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index cef2a0447b86..1c3ae13bfced 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -462,18 +462,13 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
lockdep_assert_held(&c->replicas_gc_lock);
- if (ret)
- goto err;
-
mutex_lock(&c->sb_lock);
percpu_down_write(&c->mark_lock);
- ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
- if (ret)
- goto err;
+ ret = ret ?:
+ bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
+ replicas_table_update(c, &c->replicas_gc);
- ret = replicas_table_update(c, &c->replicas_gc);
-err:
kfree(c->replicas_gc.entries);
c->replicas_gc.entries = NULL;
@@ -579,12 +574,9 @@ retry:
bch2_cpu_replicas_sort(&new);
- ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
- if (ret)
- goto err;
+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
+ replicas_table_update(c, &new);
- ret = replicas_table_update(c, &new);
-err:
kfree(new.entries);
percpu_up_write(&c->mark_lock);
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index 61203d7c8d36..e151ada1c8bd 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -82,6 +82,7 @@ int bch2_verify_superblock_clean(struct bch_fs *c,
int ret = 0;
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
+ sb_clean_journal_seq_mismatch,
"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
le64_to_cpu(clean->journal_seq),
le64_to_cpu(j->seq))) {
@@ -119,6 +120,7 @@ int bch2_verify_superblock_clean(struct bch_fs *c,
k1->k.u64s != k2->k.u64s ||
memcmp(k1, k2, bkey_bytes(&k1->k)) ||
l1 != l2, c,
+ sb_clean_btree_root_mismatch,
"superblock btree root %u doesn't match journal after clean shutdown\n"
"sb: l=%u %s\n"
"journal: l=%u %s\n", i,
@@ -140,6 +142,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean);
if (fsck_err_on(!sb_clean, c,
+ sb_clean_missing,
"superblock marked clean but clean section not present")) {
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
@@ -373,7 +376,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
entry = sb_clean->start;
bch2_journal_super_entries_add_common(c, &entry, 0);
- entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
+ entry = bch2_btree_roots_to_journal_entries(c, entry, 0);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
memset(entry, 0,
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
new file mode 100644
index 000000000000..f0930ab7f036
--- /dev/null
+++ b/fs/bcachefs/sb-errors.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "sb-errors.h"
+#include "super-io.h"
+
+static const char * const bch2_sb_error_strs[] = {
+#define x(t, n, ...) [n] = #t,
+ BCH_SB_ERRS()
+ NULL
+};
+
+static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
+{
+ if (id < BCH_SB_ERR_MAX)
+ prt_str(out, bch2_sb_error_strs[id]);
+ else
+ prt_printf(out, "(unknown error %u)", id);
+}
+
+static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e)
+{
+ return e
+ ? (bch2_sb_field_bytes(&e->field) - sizeof(*e)) / sizeof(e->entries[0])
+ : 0;
+}
+
+static inline unsigned bch2_sb_field_errors_u64s(unsigned nr)
+{
+ return (sizeof(struct bch_sb_field_errors) +
+ sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64);
+}
+
+static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_errors *e = field_to_type(f, errors);
+ unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
+
+ for (i = 0; i < nr; i++) {
+ if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) {
+ prt_printf(err, "entry with count 0 (id ");
+ bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
+ prt_printf(err, ")");
+ return -BCH_ERR_invalid_sb_errors;
+ }
+
+ if (i + 1 < nr &&
+ BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >=
+ BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) {
+ prt_printf(err, "entries out of order");
+ return -BCH_ERR_invalid_sb_errors;
+ }
+ }
+
+ return 0;
+}
+
+static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_errors *e = field_to_type(f, errors);
+ unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
+
+ if (out->nr_tabstops <= 1)
+ printbuf_tabstop_push(out, 16);
+
+ for (i = 0; i < nr; i++) {
+ bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
+ prt_tab(out);
+ prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i]));
+ prt_tab(out);
+ bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time));
+ prt_newline(out);
+ }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_errors = {
+ .validate = bch2_sb_errors_validate,
+ .to_text = bch2_sb_errors_to_text,
+};
+
+void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
+{
+ bch_sb_errors_cpu *e = &c->fsck_error_counts;
+ struct bch_sb_error_entry_cpu n = {
+ .id = err,
+ .nr = 1,
+ .last_error_time = ktime_get_real_seconds()
+ };
+ unsigned i;
+
+ mutex_lock(&c->fsck_error_counts_lock);
+ for (i = 0; i < e->nr; i++) {
+ if (err == e->data[i].id) {
+ e->data[i].nr++;
+ e->data[i].last_error_time = n.last_error_time;
+ goto out;
+ }
+ if (err < e->data[i].id)
+ break;
+ }
+
+ if (darray_make_room(e, 1))
+ goto out;
+
+ darray_insert_item(e, i, n);
+out:
+ mutex_unlock(&c->fsck_error_counts_lock);
+}
+
+void bch2_sb_errors_from_cpu(struct bch_fs *c)
+{
+ bch_sb_errors_cpu *src = &c->fsck_error_counts;
+ struct bch_sb_field_errors *dst =
+ bch2_sb_field_resize(&c->disk_sb, errors,
+ bch2_sb_field_errors_u64s(src->nr));
+ unsigned i;
+
+ if (!dst)
+ return;
+
+ for (i = 0; i < src->nr; i++) {
+ SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
+ SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
+ dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
+ }
+}
+
+static int bch2_sb_errors_to_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors);
+ bch_sb_errors_cpu *dst = &c->fsck_error_counts;
+ unsigned i, nr = bch2_sb_field_errors_nr_entries(src);
+ int ret;
+
+ if (!nr)
+ return 0;
+
+ mutex_lock(&c->fsck_error_counts_lock);
+ ret = darray_make_room(dst, nr);
+ if (ret)
+ goto err;
+
+ dst->nr = nr;
+
+ for (i = 0; i < nr; i++) {
+ dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]);
+ dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]);
+ dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time);
+ }
+err:
+ mutex_unlock(&c->fsck_error_counts_lock);
+
+ return ret;
+}
+
+void bch2_fs_sb_errors_exit(struct bch_fs *c)
+{
+ darray_exit(&c->fsck_error_counts);
+}
+
+void bch2_fs_sb_errors_init_early(struct bch_fs *c)
+{
+ mutex_init(&c->fsck_error_counts_lock);
+ darray_init(&c->fsck_error_counts);
+}
+
+int bch2_fs_sb_errors_init(struct bch_fs *c)
+{
+ return bch2_sb_errors_to_cpu(c);
+}
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
new file mode 100644
index 000000000000..5a09a53966be
--- /dev/null
+++ b/fs/bcachefs/sb-errors.h
@@ -0,0 +1,270 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_ERRORS_H
+#define _BCACHEFS_SB_ERRORS_H
+
+#include "sb-errors_types.h"
+
+#define BCH_SB_ERRS() \
+ x(clean_but_journal_not_empty, 0) \
+ x(dirty_but_no_journal_entries, 1) \
+ x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \
+ x(sb_clean_journal_seq_mismatch, 3) \
+ x(sb_clean_btree_root_mismatch, 4) \
+ x(sb_clean_missing, 5) \
+ x(jset_unsupported_version, 6) \
+ x(jset_unknown_csum, 7) \
+ x(jset_last_seq_newer_than_seq, 8) \
+ x(jset_past_bucket_end, 9) \
+ x(jset_seq_blacklisted, 10) \
+ x(journal_entries_missing, 11) \
+ x(journal_entry_replicas_not_marked, 12) \
+ x(journal_entry_past_jset_end, 13) \
+ x(journal_entry_replicas_data_mismatch, 14) \
+ x(journal_entry_bkey_u64s_0, 15) \
+ x(journal_entry_bkey_past_end, 16) \
+ x(journal_entry_bkey_bad_format, 17) \
+ x(journal_entry_bkey_invalid, 18) \
+ x(journal_entry_btree_root_bad_size, 19) \
+ x(journal_entry_blacklist_bad_size, 20) \
+ x(journal_entry_blacklist_v2_bad_size, 21) \
+ x(journal_entry_blacklist_v2_start_past_end, 22) \
+ x(journal_entry_usage_bad_size, 23) \
+ x(journal_entry_data_usage_bad_size, 24) \
+ x(journal_entry_clock_bad_size, 25) \
+ x(journal_entry_clock_bad_rw, 26) \
+ x(journal_entry_dev_usage_bad_size, 27) \
+ x(journal_entry_dev_usage_bad_dev, 28) \
+ x(journal_entry_dev_usage_bad_pad, 29) \
+ x(btree_node_unreadable, 30) \
+ x(btree_node_fault_injected, 31) \
+ x(btree_node_bad_magic, 32) \
+ x(btree_node_bad_seq, 33) \
+ x(btree_node_unsupported_version, 34) \
+ x(btree_node_bset_older_than_sb_min, 35) \
+ x(btree_node_bset_newer_than_sb, 36) \
+ x(btree_node_data_missing, 37) \
+ x(btree_node_bset_after_end, 38) \
+ x(btree_node_replicas_sectors_written_mismatch, 39) \
+ x(btree_node_replicas_data_mismatch, 40) \
+ x(bset_unknown_csum, 41) \
+ x(bset_bad_csum, 42) \
+ x(bset_past_end_of_btree_node, 43) \
+ x(bset_wrong_sector_offset, 44) \
+ x(bset_empty, 45) \
+ x(bset_bad_seq, 46) \
+ x(bset_blacklisted_journal_seq, 47) \
+ x(first_bset_blacklisted_journal_seq, 48) \
+ x(btree_node_bad_btree, 49) \
+ x(btree_node_bad_level, 50) \
+ x(btree_node_bad_min_key, 51) \
+ x(btree_node_bad_max_key, 52) \
+ x(btree_node_bad_format, 53) \
+ x(btree_node_bkey_past_bset_end, 54) \
+ x(btree_node_bkey_bad_format, 55) \
+ x(btree_node_bad_bkey, 56) \
+ x(btree_node_bkey_out_of_order, 57) \
+ x(btree_root_bkey_invalid, 58) \
+ x(btree_root_read_error, 59) \
+ x(btree_root_bad_min_key, 50) \
+ x(btree_root_bad_max_key, 61) \
+ x(btree_node_read_error, 62) \
+ x(btree_node_topology_bad_min_key, 63) \
+ x(btree_node_topology_bad_max_key, 64) \
+ x(btree_node_topology_overwritten_by_prev_node, 65) \
+ x(btree_node_topology_overwritten_by_next_node, 66) \
+ x(btree_node_topology_interior_node_empty, 67) \
+ x(fs_usage_hidden_wrong, 68) \
+ x(fs_usage_btree_wrong, 69) \
+ x(fs_usage_data_wrong, 70) \
+ x(fs_usage_cached_wrong, 71) \
+ x(fs_usage_reserved_wrong, 72) \
+ x(fs_usage_persistent_reserved_wrong, 73) \
+ x(fs_usage_nr_inodes_wrong, 74) \
+ x(fs_usage_replicas_wrong, 75) \
+ x(dev_usage_buckets_wrong, 76) \
+ x(dev_usage_sectors_wrong, 77) \
+ x(dev_usage_fragmented_wrong, 78) \
+ x(dev_usage_buckets_ec_wrong, 79) \
+ x(bkey_version_in_future, 80) \
+ x(bkey_u64s_too_small, 81) \
+ x(bkey_invalid_type_for_btree, 82) \
+ x(bkey_extent_size_zero, 83) \
+ x(bkey_extent_size_greater_than_offset, 84) \
+ x(bkey_size_nonzero, 85) \
+ x(bkey_snapshot_nonzero, 86) \
+ x(bkey_snapshot_zero, 87) \
+ x(bkey_at_pos_max, 88) \
+ x(bkey_before_start_of_btree_node, 89) \
+ x(bkey_after_end_of_btree_node, 90) \
+ x(bkey_val_size_nonzero, 91) \
+ x(bkey_val_size_too_small, 92) \
+ x(alloc_v1_val_size_bad, 93) \
+ x(alloc_v2_unpack_error, 94) \
+ x(alloc_v3_unpack_error, 95) \
+ x(alloc_v4_val_size_bad, 96) \
+ x(alloc_v4_backpointers_start_bad, 97) \
+ x(alloc_key_data_type_bad, 98) \
+ x(alloc_key_empty_but_have_data, 99) \
+ x(alloc_key_dirty_sectors_0, 100) \
+ x(alloc_key_data_type_inconsistency, 101) \
+ x(alloc_key_to_missing_dev_bucket, 102) \
+ x(alloc_key_cached_inconsistency, 103) \
+ x(alloc_key_cached_but_read_time_zero, 104) \
+ x(alloc_key_to_missing_lru_entry, 105) \
+ x(alloc_key_data_type_wrong, 106) \
+ x(alloc_key_gen_wrong, 107) \
+ x(alloc_key_dirty_sectors_wrong, 108) \
+ x(alloc_key_cached_sectors_wrong, 109) \
+ x(alloc_key_stripe_wrong, 110) \
+ x(alloc_key_stripe_redundancy_wrong, 111) \
+ x(bucket_sector_count_overflow, 112) \
+ x(bucket_metadata_type_mismatch, 113) \
+ x(need_discard_key_wrong, 114) \
+ x(freespace_key_wrong, 115) \
+ x(freespace_hole_missing, 116) \
+ x(bucket_gens_val_size_bad, 117) \
+ x(bucket_gens_key_wrong, 118) \
+ x(bucket_gens_hole_wrong, 119) \
+ x(bucket_gens_to_invalid_dev, 120) \
+ x(bucket_gens_to_invalid_buckets, 121) \
+ x(bucket_gens_nonzero_for_invalid_buckets, 122) \
+ x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \
+ x(need_discard_freespace_key_bad, 124) \
+ x(backpointer_pos_wrong, 125) \
+ x(backpointer_to_missing_device, 126) \
+ x(backpointer_to_missing_alloc, 127) \
+ x(backpointer_to_missing_ptr, 128) \
+ x(lru_entry_at_time_0, 129) \
+ x(lru_entry_to_invalid_bucket, 130) \
+ x(lru_entry_bad, 131) \
+ x(btree_ptr_val_too_big, 132) \
+ x(btree_ptr_v2_val_too_big, 133) \
+ x(btree_ptr_has_non_ptr, 134) \
+ x(extent_ptrs_invalid_entry, 135) \
+ x(extent_ptrs_no_ptrs, 136) \
+ x(extent_ptrs_too_many_ptrs, 137) \
+ x(extent_ptrs_redundant_crc, 138) \
+ x(extent_ptrs_redundant_stripe, 139) \
+ x(extent_ptrs_unwritten, 140) \
+ x(extent_ptrs_written_and_unwritten, 141) \
+ x(ptr_to_invalid_device, 142) \
+ x(ptr_to_duplicate_device, 143) \
+ x(ptr_after_last_bucket, 144) \
+ x(ptr_before_first_bucket, 145) \
+ x(ptr_spans_multiple_buckets, 146) \
+ x(ptr_to_missing_backpointer, 147) \
+ x(ptr_to_missing_alloc_key, 148) \
+ x(ptr_to_missing_replicas_entry, 149) \
+ x(ptr_to_missing_stripe, 150) \
+ x(ptr_to_incorrect_stripe, 151) \
+ x(ptr_gen_newer_than_bucket_gen, 152) \
+ x(ptr_too_stale, 153) \
+ x(stale_dirty_ptr, 154) \
+ x(ptr_bucket_data_type_mismatch, 155) \
+ x(ptr_cached_and_erasure_coded, 156) \
+ x(ptr_crc_uncompressed_size_too_small, 157) \
+ x(ptr_crc_csum_type_unknown, 158) \
+ x(ptr_crc_compression_type_unknown, 159) \
+ x(ptr_crc_redundant, 160) \
+ x(ptr_crc_uncompressed_size_too_big, 161) \
+ x(ptr_crc_nonce_mismatch, 162) \
+ x(ptr_stripe_redundant, 163) \
+ x(reservation_key_nr_replicas_invalid, 164) \
+ x(reflink_v_refcount_wrong, 165) \
+ x(reflink_p_to_missing_reflink_v, 166) \
+ x(stripe_pos_bad, 167) \
+ x(stripe_val_size_bad, 168) \
+ x(stripe_sector_count_wrong, 169) \
+ x(snapshot_tree_pos_bad, 170) \
+ x(snapshot_tree_to_missing_snapshot, 171) \
+ x(snapshot_tree_to_missing_subvol, 172) \
+ x(snapshot_tree_to_wrong_subvol, 173) \
+ x(snapshot_tree_to_snapshot_subvol, 174) \
+ x(snapshot_pos_bad, 175) \
+ x(snapshot_parent_bad, 176) \
+ x(snapshot_children_not_normalized, 177) \
+ x(snapshot_child_duplicate, 178) \
+ x(snapshot_child_bad, 179) \
+ x(snapshot_skiplist_not_normalized, 180) \
+ x(snapshot_skiplist_bad, 181) \
+ x(snapshot_should_not_have_subvol, 182) \
+ x(snapshot_to_bad_snapshot_tree, 183) \
+ x(snapshot_bad_depth, 184) \
+ x(snapshot_bad_skiplist, 185) \
+ x(subvol_pos_bad, 186) \
+ x(subvol_not_master_and_not_snapshot, 187) \
+ x(subvol_to_missing_root, 188) \
+ x(subvol_root_wrong_bi_subvol, 189) \
+ x(bkey_in_missing_snapshot, 190) \
+ x(inode_pos_inode_nonzero, 191) \
+ x(inode_pos_blockdev_range, 192) \
+ x(inode_unpack_error, 193) \
+ x(inode_str_hash_invalid, 194) \
+ x(inode_v3_fields_start_bad, 195) \
+ x(inode_snapshot_mismatch, 196) \
+ x(inode_unlinked_but_clean, 197) \
+ x(inode_unlinked_but_nlink_nonzero, 198) \
+ x(inode_checksum_type_invalid, 199) \
+ x(inode_compression_type_invalid, 200) \
+ x(inode_subvol_root_but_not_dir, 201) \
+ x(inode_i_size_dirty_but_clean, 202) \
+ x(inode_i_sectors_dirty_but_clean, 203) \
+ x(inode_i_sectors_wrong, 204) \
+ x(inode_dir_wrong_nlink, 205) \
+ x(inode_dir_multiple_links, 206) \
+ x(inode_multiple_links_but_nlink_0, 207) \
+ x(inode_wrong_backpointer, 208) \
+ x(inode_wrong_nlink, 209) \
+ x(inode_unreachable, 210) \
+ x(deleted_inode_but_clean, 211) \
+ x(deleted_inode_missing, 212) \
+ x(deleted_inode_is_dir, 213) \
+ x(deleted_inode_not_unlinked, 214) \
+ x(extent_overlapping, 215) \
+ x(extent_in_missing_inode, 216) \
+ x(extent_in_non_reg_inode, 217) \
+ x(extent_past_end_of_inode, 218) \
+ x(dirent_empty_name, 219) \
+ x(dirent_val_too_big, 220) \
+ x(dirent_name_too_long, 221) \
+ x(dirent_name_embedded_nul, 222) \
+ x(dirent_name_dot_or_dotdot, 223) \
+ x(dirent_name_has_slash, 224) \
+ x(dirent_d_type_wrong, 225) \
+ x(dirent_d_parent_subvol_wrong, 226) \
+ x(dirent_in_missing_dir_inode, 227) \
+ x(dirent_in_non_dir_inode, 228) \
+ x(dirent_to_missing_inode, 229) \
+ x(dirent_to_missing_subvol, 230) \
+ x(dirent_to_itself, 231) \
+ x(quota_type_invalid, 232) \
+ x(xattr_val_size_too_small, 233) \
+ x(xattr_val_size_too_big, 234) \
+ x(xattr_invalid_type, 235) \
+ x(xattr_name_invalid_chars, 236) \
+ x(xattr_in_missing_inode, 237) \
+ x(root_subvol_missing, 238) \
+ x(root_dir_missing, 239) \
+ x(root_inode_not_dir, 240) \
+ x(dir_loop, 241) \
+ x(hash_table_key_duplicate, 242) \
+ x(hash_table_key_wrong_offset, 243)
+
+enum bch_sb_error_id {
+#define x(t, n) BCH_FSCK_ERR_##t = n,
+ BCH_SB_ERRS()
+#undef x
+ BCH_SB_ERR_MAX
+};
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
+
+void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);
+
+void bch2_sb_errors_from_cpu(struct bch_fs *);
+
+void bch2_fs_sb_errors_exit(struct bch_fs *);
+void bch2_fs_sb_errors_init_early(struct bch_fs *);
+int bch2_fs_sb_errors_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_SB_ERRORS_H */
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
new file mode 100644
index 000000000000..b1c099843a39
--- /dev/null
+++ b/fs/bcachefs/sb-errors_types.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_ERRORS_TYPES_H
+#define _BCACHEFS_SB_ERRORS_TYPES_H
+
+#include "darray.h"
+
+struct bch_sb_error_entry_cpu {
+ u64 id:16,
+ nr:48;
+ u64 last_error_time;
+};
+
+typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu;
+
+#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */
+
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 6dd85bb996fe..bed0f857fe5b 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -7,21 +7,28 @@
#include "sb-members.h"
#include "super-io.h"
-/* Code for bch_sb_field_members_v1: */
+#define x(t, n, ...) [n] = #t,
+static const char * const bch2_iops_measurements[] = {
+ BCH_IOPS_MEASUREMENTS()
+ NULL
+};
-static struct bch_member *members_v2_get_mut(struct bch_sb_field_members_v2 *mi, int i)
-{
- return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes));
-}
+char * const bch2_member_error_strs[] = {
+ BCH_MEMBER_ERROR_TYPES()
+ NULL
+};
+#undef x
+
+/* Code for bch_sb_field_members_v1: */
struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i)
{
- return members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i);
+ return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i);
}
static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i)
{
- struct bch_member ret, *p = members_v2_get_mut(mi, i);
+ struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i);
memset(&ret, 0, sizeof(ret));
memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret)));
return ret;
@@ -36,7 +43,8 @@ static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int
{
struct bch_member ret, *p = members_v1_get_mut(mi, i);
memset(&ret, 0, sizeof(ret));
- memcpy(&ret, p, min_t(size_t, sizeof(struct bch_member), sizeof(ret))); return ret;
+ memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret)));
+ return ret;
}
struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i)
@@ -62,7 +70,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c)
for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
- memmove(dst, members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes));
+ memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes));
memset(dst + le16_to_cpu(mi->member_bytes),
0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes)));
}
@@ -71,7 +79,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c)
return 0;
}
-int bch2_members_v2_init(struct bch_fs *c)
+int bch2_sb_members_v2_init(struct bch_fs *c)
{
struct bch_sb_field_members_v1 *mi1;
struct bch_sb_field_members_v2 *mi2;
@@ -91,7 +99,7 @@ int bch2_members_v2_init(struct bch_fs *c)
return sb_members_v2_resize_entries(c);
}
-int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
+int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
{
struct bch_sb_field_members_v1 *mi1;
struct bch_sb_field_members_v2 *mi2;
@@ -105,7 +113,7 @@ int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
mi2 = bch2_sb_field_get(disk_sb->sb, members_v2);
for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++)
- memcpy(members_v1_get_mut(mi1, i), members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES);
+ memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES);
return 0;
}
@@ -155,6 +163,8 @@ static void member_to_text(struct printbuf *out,
u64 bucket_size = le16_to_cpu(m.bucket_size);
u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size;
+ if (!bch2_member_exists(&m))
+ return;
prt_printf(out, "Device:");
prt_tab(out);
@@ -163,6 +173,21 @@ static void member_to_text(struct printbuf *out,
printbuf_indent_add(out, 2);
+ prt_printf(out, "Label:");
+ prt_tab(out);
+ if (BCH_MEMBER_GROUP(&m)) {
+ unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
+
+ if (idx < disk_groups_nr(gi))
+ prt_printf(out, "%s (%u)",
+ gi->entries[idx].label, idx);
+ else
+ prt_printf(out, "(bad disk labels section)");
+ } else {
+ prt_printf(out, "(none)");
+ }
+ prt_newline(out);
+
prt_printf(out, "UUID:");
prt_tab(out);
pr_uuid(out, m.uuid.b);
@@ -173,6 +198,13 @@ static void member_to_text(struct printbuf *out,
prt_units_u64(out, device_size << 9);
prt_newline(out);
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
+ prt_printf(out, "%s errors:", bch2_member_error_strs[i]);
+ prt_tab(out);
+ prt_u64(out, le64_to_cpu(m.errors[i]));
+ prt_newline(out);
+ }
+
for (unsigned i = 0; i < BCH_IOPS_NR; i++) {
prt_printf(out, "%s iops:", bch2_iops_measurements[i]);
prt_tab(out);
@@ -198,7 +230,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Last mount:");
prt_tab(out);
if (m.last_mount)
- pr_time(out, le64_to_cpu(m.last_mount));
+ bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
else
prt_printf(out, "(never)");
prt_newline(out);
@@ -211,21 +243,6 @@ static void member_to_text(struct printbuf *out,
: "unknown");
prt_newline(out);
- prt_printf(out, "Label:");
- prt_tab(out);
- if (BCH_MEMBER_GROUP(&m)) {
- unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
-
- if (idx < disk_groups_nr(gi))
- prt_printf(out, "%s (%u)",
- gi->entries[idx].label, idx);
- else
- prt_printf(out, "(bad disk labels section)");
- } else {
- prt_printf(out, "(none)");
- }
- prt_newline(out);
-
prt_printf(out, "Data allowed:");
prt_tab(out);
if (BCH_MEMBER_DATA_ALLOWED(&m))
@@ -262,8 +279,7 @@ static int bch2_sb_members_v1_validate(struct bch_sb *sb,
struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
unsigned i;
- if ((void *) members_v1_get_mut(mi, sb->nr_devices) >
- vstruct_end(&mi->field)) {
+ if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) {
prt_printf(err, "too many devices for section size");
return -BCH_ERR_invalid_sb_members;
}
@@ -286,10 +302,8 @@ static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
unsigned i;
- for (i = 0; i < sb->nr_devices; i++) {
- struct bch_member m = members_v1_get(mi, i);
- member_to_text(out, m, gi, sb, i);
- }
+ for (i = 0; i < sb->nr_devices; i++)
+ member_to_text(out, members_v1_get(mi, i), gi, sb, i);
}
const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = {
@@ -304,10 +318,8 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
unsigned i;
- for (i = 0; i < sb->nr_devices; i++) {
- struct bch_member m = members_v2_get(mi, i);
- member_to_text(out, m, gi, sb, i);
- }
+ for (i = 0; i < sb->nr_devices; i++)
+ member_to_text(out, members_v2_get(mi, i), gi, sb, i);
}
static int bch2_sb_members_v2_validate(struct bch_sb *sb,
@@ -315,7 +327,7 @@ static int bch2_sb_members_v2_validate(struct bch_sb *sb,
struct printbuf *err)
{
struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
- size_t mi_bytes = (void *) members_v2_get_mut(mi, sb->nr_devices) -
+ size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) -
(void *) mi;
if (mi_bytes > vstruct_bytes(&mi->field)) {
@@ -337,3 +349,72 @@ const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = {
.validate = bch2_sb_members_v2_validate,
.to_text = bch2_sb_members_v2_to_text,
};
+
+void bch2_sb_members_from_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ struct bch_dev *ca;
+ unsigned i, e;
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL) {
+ struct bch_member *m = __bch2_members_v2_get_mut(mi, i);
+
+ for (e = 0; e < BCH_MEMBER_ERROR_NR; e++)
+ m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
+ }
+ rcu_read_unlock();
+}
+
+void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bch_member m;
+
+ mutex_lock(&ca->fs->sb_lock);
+ m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
+ mutex_unlock(&ca->fs->sb_lock);
+
+ printbuf_tabstop_push(out, 12);
+
+ prt_str(out, "IO errors since filesystem creation");
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
+ prt_printf(out, "%s:", bch2_member_error_strs[i]);
+ prt_tab(out);
+ prt_u64(out, atomic64_read(&ca->errors[i]));
+ prt_newline(out);
+ }
+ printbuf_indent_sub(out, 2);
+
+ prt_str(out, "IO errors since ");
+ bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC);
+ prt_str(out, " ago");
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
+ prt_printf(out, "%s:", bch2_member_error_strs[i]);
+ prt_tab(out);
+ prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
+ prt_newline(out);
+ }
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_dev_errors_reset(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bch_member *m;
+
+ mutex_lock(&c->sb_lock);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
+ m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
+ m->errors_reset_time = ktime_get_real_seconds();
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index 430f3457bfd4..03613e3eb8e3 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -2,8 +2,16 @@
#ifndef _BCACHEFS_SB_MEMBERS_H
#define _BCACHEFS_SB_MEMBERS_H
-int bch2_members_v2_init(struct bch_fs *c);
-int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb);
+extern char * const bch2_member_error_strs[];
+
+static inline struct bch_member *
+__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i)
+{
+ return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes));
+}
+
+int bch2_sb_members_v2_init(struct bch_fs *c);
+int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb);
struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i);
struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
@@ -179,4 +187,41 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
+static inline bool bch2_member_exists(struct bch_member *m)
+{
+ return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
+}
+
+static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev)
+{
+ if (dev < sb->nr_devices) {
+ struct bch_member m = bch2_sb_member_get(sb, dev);
+ return bch2_member_exists(&m);
+ }
+ return false;
+}
+
+static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
+{
+ return (struct bch_member_cpu) {
+ .nbuckets = le64_to_cpu(mi->nbuckets),
+ .first_bucket = le16_to_cpu(mi->first_bucket),
+ .bucket_size = le16_to_cpu(mi->bucket_size),
+ .group = BCH_MEMBER_GROUP(mi),
+ .state = BCH_MEMBER_STATE(mi),
+ .discard = BCH_MEMBER_DISCARD(mi),
+ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
+ .durability = BCH_MEMBER_DURABILITY(mi)
+ ? BCH_MEMBER_DURABILITY(mi) - 1
+ : 1,
+ .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
+ .valid = bch2_member_exists(mi),
+ };
+}
+
+void bch2_sb_members_from_cpu(struct bch_fs *);
+
+void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
+void bch2_dev_errors_reset(struct bch_dev *);
+
#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
index b684b9f00c1b..97790445e67a 100644
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
@@ -11,6 +11,8 @@
#include <linux/sched/task.h>
#include <linux/slab.h>
+#include <trace/events/lock.h>
+
#include "six.h"
#ifdef DEBUG
@@ -161,8 +163,11 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
this_cpu_sub(*lock->readers, !ret);
preempt_enable();
- if (!ret && (old & SIX_LOCK_WAITING_write))
- ret = -1 - SIX_LOCK_write;
+ if (!ret) {
+ smp_mb();
+ if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write)
+ ret = -1 - SIX_LOCK_write;
+ }
} else if (type == SIX_LOCK_write && lock->readers) {
if (try) {
atomic_add(SIX_LOCK_HELD_write, &lock->state);
@@ -462,11 +467,12 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
smp_mb__after_atomic();
}
+ trace_contention_begin(lock, 0);
+ lock_contended(&lock->dep_map, ip);
+
if (six_optimistic_spin(lock, type))
goto out;
- lock_contended(&lock->dep_map, ip);
-
wait->task = current;
wait->lock_want = type;
wait->lock_acquired = false;
@@ -546,6 +552,7 @@ out:
six_clear_bitmask(lock, SIX_LOCK_HELD_write);
six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
}
+ trace_contention_end(lock, 0);
return ret;
}
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 4982468bfe11..e9af77b384c7 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -30,17 +30,18 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
le32_to_cpu(t.v->root_snapshot));
}
-int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
- bkey_lt(k.k->p, POS(0, 1))) {
- prt_printf(err, "bad pos");
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
- return 0;
+ bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
+ bkey_lt(k.k->p, POS(0, 1)), c, err,
+ snapshot_tree_pos_bad,
+ "bad pos");
+fsck_err:
+ return ret;
}
int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
@@ -202,68 +203,60 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
le32_to_cpu(s.v->skip[2]));
}
-int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_snapshot s;
u32 i, id;
+ int ret = 0;
- if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
- bkey_lt(k.k->p, POS(0, 1))) {
- prt_printf(err, "bad pos");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
+ bkey_lt(k.k->p, POS(0, 1)), c, err,
+ snapshot_pos_bad,
+ "bad pos");
s = bkey_s_c_to_snapshot(k);
id = le32_to_cpu(s.v->parent);
- if (id && id <= k.k->p.offset) {
- prt_printf(err, "bad parent node (%u <= %llu)",
- id, k.k->p.offset);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(id && id <= k.k->p.offset, c, err,
+ snapshot_parent_bad,
+ "bad parent node (%u <= %llu)",
+ id, k.k->p.offset);
- if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
- prt_printf(err, "children not normalized");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err,
+ snapshot_children_not_normalized,
+ "children not normalized");
- if (s.v->children[0] &&
- s.v->children[0] == s.v->children[1]) {
- prt_printf(err, "duplicate child nodes");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err,
+ snapshot_child_duplicate,
+ "duplicate child nodes");
for (i = 0; i < 2; i++) {
id = le32_to_cpu(s.v->children[i]);
- if (id >= k.k->p.offset) {
- prt_printf(err, "bad child node (%u >= %llu)",
- id, k.k->p.offset);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(id >= k.k->p.offset, c, err,
+ snapshot_child_bad,
+ "bad child node (%u >= %llu)",
+ id, k.k->p.offset);
}
if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
- if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
- le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) {
- prt_printf(err, "skiplist not normalized");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
+ le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err,
+ snapshot_skiplist_not_normalized,
+ "skiplist not normalized");
for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
id = le32_to_cpu(s.v->skip[i]);
- if ((id && !s.v->parent) ||
- (id && id <= k.k->p.offset)) {
- prt_printf(err, "bad skiplist node %u", id);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err,
+ snapshot_skiplist_bad,
+ "bad skiplist node %u", id);
}
}
-
- return 0;
+fsck_err:
+ return ret;
}
static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
@@ -325,8 +318,9 @@ int bch2_mark_snapshot(struct btree_trans *trans,
__set_is_ancestor_bitmap(c, id);
if (BCH_SNAPSHOT_DELETED(s.v)) {
- set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots);
+ set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
+ bch2_delete_dead_snapshots_async(c);
}
} else {
memset(t, 0, sizeof(*t));
@@ -529,7 +523,7 @@ static int check_snapshot_tree(struct btree_trans *trans,
if (fsck_err_on(ret ||
root_id != bch2_snapshot_root(c, root_id) ||
st.k->p.offset != le32_to_cpu(s.tree),
- c,
+ c, snapshot_tree_to_missing_snapshot,
"snapshot tree points to missing/incorrect snapshot:\n %s",
(bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
@@ -541,17 +535,20 @@ static int check_snapshot_tree(struct btree_trans *trans,
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
- if (fsck_err_on(ret, c,
+ if (fsck_err_on(ret,
+ c, snapshot_tree_to_missing_subvol,
"snapshot tree points to missing subvolume:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
le32_to_cpu(subvol.snapshot),
- root_id), c,
+ root_id),
+ c, snapshot_tree_to_wrong_subvol,
"snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
- fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c,
+ fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
+ c, snapshot_tree_to_snapshot_subvol,
"snapshot tree points to snapshot subvolume:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
@@ -787,7 +784,9 @@ static int check_snapshot(struct btree_trans *trans,
goto err;
}
} else {
- if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s",
+ if (fsck_err_on(s.subvol,
+ c, snapshot_should_not_have_subvol,
+ "snapshot should not point to subvol:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u);
@@ -803,7 +802,8 @@ static int check_snapshot(struct btree_trans *trans,
if (ret < 0)
goto err;
- if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s",
+ if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree,
+ "snapshot points to missing/incorrect tree:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
if (ret)
@@ -815,7 +815,8 @@ static int check_snapshot(struct btree_trans *trans,
if (le32_to_cpu(s.depth) != real_depth &&
(c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
- fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s",
+ fsck_err(c, snapshot_bad_depth,
+ "snapshot with incorrect depth field, should be %u:\n %s",
real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u);
@@ -832,7 +833,8 @@ static int check_snapshot(struct btree_trans *trans,
if (!ret &&
(c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
- fsck_err(c, "snapshot with bad skiplist field:\n %s",
+ fsck_err(c, snapshot_bad_skiplist,
+ "snapshot with bad skiplist field:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u);
@@ -1251,13 +1253,7 @@ static int move_key_to_correct_snapshot(struct btree_trans *trans,
return 0;
}
-/*
- * For a given snapshot, if it doesn't have a subvolume that points to it, and
- * it doesn't have child snapshot nodes - it's now redundant and we can mark it
- * as deleted.
- */
-static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
+static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k)
{
struct bkey_s_c_snapshot snap;
u32 children[2];
@@ -1278,10 +1274,21 @@ static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btre
bch2_snapshot_live(trans, children[1]);
if (ret < 0)
return ret;
+ return !ret;
+}
- if (!ret)
- return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
- return 0;
+/*
+ * For a given snapshot, if it doesn't have a subvolume that points to it, and
+ * it doesn't have child snapshot nodes - it's now redundant and we can mark it
+ * as deleted.
+ */
+static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k)
+{
+ int ret = bch2_snapshot_needs_delete(trans, k);
+
+ return ret <= 0
+ ? ret
+ : bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
}
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
@@ -1342,12 +1349,12 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
u32 id = le32_to_cpu(s->v.skip[j]);
if (snapshot_list_has_id(deleted, id)) {
- id = depth > 1
- ? bch2_snapshot_nth_parent_skip(c,
+ id = bch2_snapshot_nth_parent_skip(c,
parent,
- get_random_u32_below(depth - 1),
- deleted)
- : parent;
+ depth > 1
+ ? get_random_u32_below(depth - 1)
+ : 0,
+ deleted);
s->v.skip[j] = cpu_to_le32(id);
}
}
@@ -1369,6 +1376,9 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
u32 *i, id;
int ret = 0;
+ if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags))
+ return 0;
+
if (!test_bit(BCH_FS_STARTED, &c->flags)) {
ret = bch2_fs_read_write_early(c);
if (ret) {
@@ -1386,7 +1396,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
POS_MIN, 0, k,
NULL, NULL, 0,
- bch2_delete_redundant_snapshot(trans, &iter, k));
+ bch2_delete_redundant_snapshot(trans, k));
if (ret) {
bch_err_msg(c, ret, "deleting redundant snapshots");
goto err;
@@ -1427,6 +1437,15 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
if (!btree_type_has_snapshots(id))
continue;
+ /*
+ * deleted inodes btree is maintained by a trigger on the inodes
+ * btree - no work for us to do here, and it's not safe to scan
+ * it because we'll see out of date keys due to the btree write
+ * buffer:
+ */
+ if (id == BTREE_ID_deleted_inodes)
+ continue;
+
ret = for_each_btree_key_commit(trans, iter,
id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
@@ -1447,6 +1466,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
}
}
+ bch2_trans_unlock(trans);
down_write(&c->snapshot_create_lock);
for_each_btree_key(trans, iter, BTREE_ID_snapshots,
@@ -1491,8 +1511,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
goto err_create_lock;
}
}
-
- clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
err_create_lock:
up_write(&c->snapshot_create_lock);
err:
@@ -1508,8 +1526,7 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
- if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
- bch2_delete_dead_snapshots(c);
+ bch2_delete_dead_snapshots(c);
bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
@@ -1520,20 +1537,6 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *c)
bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
-int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
- struct btree_trans_commit_hook *h)
-{
- struct bch_fs *c = trans->c;
-
- set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
-
- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots)
- return 0;
-
- bch2_delete_dead_snapshots_async(c);
- return 0;
-}
-
int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
enum btree_id id,
struct bpos pos)
@@ -1664,6 +1667,26 @@ again:
return ret ?: trans_was_restarted(trans, restart_count);
}
+static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_snapshot snap;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v) ||
+ bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
+ (ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
+ set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ return 0;
+ }
+
+ return ret;
+}
+
int bch2_snapshots_read(struct bch_fs *c)
{
struct btree_iter iter;
@@ -1674,7 +1697,8 @@ int bch2_snapshots_read(struct bch_fs *c)
for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
POS_MIN, 0, k,
bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
- bch2_snapshot_set_equiv(trans, k)) ?:
+ bch2_snapshot_set_equiv(trans, k) ?:
+ bch2_check_snapshot_needs_deletion(trans, k)) ?:
for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
POS_MIN, 0, k,
(set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index de215d9d1252..f09a22f44239 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -5,7 +5,7 @@
enum bkey_invalid_flags;
void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \
@@ -19,7 +19,7 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *);
int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s_c, unsigned);
@@ -244,8 +244,6 @@ int bch2_check_snapshot_trees(struct bch_fs *);
int bch2_check_snapshots(struct bch_fs *);
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
-int bch2_delete_dead_snapshots_hook(struct btree_trans *,
- struct btree_trans_commit_hook *);
void bch2_delete_dead_snapshots_work(struct work_struct *);
int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index caf2dd7dafff..fccd25aa3242 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -62,7 +62,8 @@ static int check_subvol(struct btree_trans *trans,
if (ret)
return ret;
- if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, c,
+ if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
+ c, subvol_not_master_and_not_snapshot,
"subvolume %llu is not set as snapshot but is not master subvolume",
k.k->p.offset)) {
struct bkey_i_subvolume *s =
@@ -97,16 +98,17 @@ int bch2_check_subvols(struct bch_fs *c)
/* Subvolumes: */
-int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err)
{
- if (bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
- bkey_gt(k.k->p, SUBVOL_POS_MAX)) {
- prt_printf(err, "invalid pos");
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
- return 0;
+ bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
+ bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err,
+ subvol_pos_bad,
+ "invalid pos");
+fsck_err:
+ return ret;
}
void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
@@ -230,7 +232,6 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
{
struct btree_iter iter;
struct bkey_s_c_subvolume subvol;
- struct btree_trans_commit_hook *h;
u32 snapid;
int ret = 0;
@@ -246,22 +247,8 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
snapid = le32_to_cpu(subvol.v->snapshot);
- ret = bch2_btree_delete_at(trans, &iter, 0);
- if (ret)
- goto err;
-
- ret = bch2_snapshot_node_set_deleted(trans, snapid);
- if (ret)
- goto err;
-
- h = bch2_trans_kmalloc(trans, sizeof(*h));
- ret = PTR_ERR_OR_ZERO(h);
- if (ret)
- goto err;
-
- h->fn = bch2_delete_dead_snapshots_hook;
- bch2_trans_commit_hook(trans, h);
-err:
+ ret = bch2_btree_delete_at(trans, &iter, 0) ?:
+ bch2_snapshot_node_set_deleted(trans, snapid);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index bb14f92e8687..a1003d30ab0a 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -9,7 +9,7 @@ enum bkey_invalid_flags;
int bch2_check_subvols(struct bch_fs *);
-int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index 86833445af20..2d2e66a4e468 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -20,7 +20,7 @@ struct snapshot_t {
};
struct snapshot_table {
- struct snapshot_t s[0];
+ DECLARE_FLEX_ARRAY(struct snapshot_t, s);
};
typedef struct {
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 332d41e1c0a3..f4cad903f4d6 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -13,6 +13,7 @@
#include "replicas.h"
#include "quota.h"
#include "sb-clean.h"
+#include "sb-errors.h"
#include "sb-members.h"
#include "super-io.h"
#include "super.h"
@@ -720,7 +721,7 @@ retry:
if (opt_defined(*opts, sb))
goto err;
- printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s",
+ printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n",
path, err.buf);
printbuf_reset(&err);
@@ -782,7 +783,7 @@ got_super:
ret = bch2_sb_validate(sb, &err, READ);
if (ret) {
- printk(KERN_ERR "bcachefs (%s): error validating superblock: %s",
+ printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf);
goto err_no_print;
}
@@ -790,7 +791,7 @@ out:
printbuf_exit(&err);
return ret;
err:
- printk(KERN_ERR "bcachefs (%s): error reading superblock: %s",
+ printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
path, err.buf);
err_no_print:
bch2_free_super(sb);
@@ -805,7 +806,12 @@ static void write_super_endio(struct bio *bio)
/* XXX: return errors directly */
- if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca,
+ bio_data_dir(bio)
+ ? BCH_MEMBER_ERROR_write
+ : BCH_MEMBER_ERROR_read,
+ "superblock %s error: %s",
+ bio_data_dir(bio) ? "write" : "read",
bch2_blk_status_to_str(bio->bi_status)))
ca->sb_write_error = 1;
@@ -892,7 +898,9 @@ int bch2_write_super(struct bch_fs *c)
SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
bch2_sb_counters_from_cpu(c);
- bch_members_cpy_v2_v1(&c->disk_sb);
+ bch2_sb_members_from_cpu(c);
+ bch2_sb_members_cpy_v2_v1(&c->disk_sb);
+ bch2_sb_errors_from_cpu(c);
for_each_online_member(ca, c, i)
bch2_sb_from_fs(c, ca);
@@ -1175,7 +1183,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "Created:");
prt_tab(out);
if (sb->time_base_lo)
- pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
+ bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
else
prt_printf(out, "(not set)");
prt_newline(out);
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index b0d8584f475f..f5abd102bff7 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -23,6 +23,11 @@ u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
unsigned,
unsigned);
+static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
+{
+ return le32_to_cpu(f->u64s) * sizeof(u64);
+}
+
#define field_to_type(_f, _name) \
container_of_or_null(_f, struct bch_sb_field_##_name, field)
@@ -78,41 +83,6 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
__bch2_check_set_feature(c, feat);
}
-/* BCH_SB_FIELD_members_v1: */
-
-static inline bool bch2_member_exists(struct bch_member *m)
-{
- return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
-}
-
-static inline bool bch2_dev_exists(struct bch_sb *sb,
- unsigned dev)
-{
- if (dev < sb->nr_devices) {
- struct bch_member m = bch2_sb_member_get(sb, dev);
- return bch2_member_exists(&m);
- }
- return false;
-}
-
-static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
-{
- return (struct bch_member_cpu) {
- .nbuckets = le64_to_cpu(mi->nbuckets),
- .first_bucket = le16_to_cpu(mi->first_bucket),
- .bucket_size = le16_to_cpu(mi->bucket_size),
- .group = BCH_MEMBER_GROUP(mi),
- .state = BCH_MEMBER_STATE(mi),
- .discard = BCH_MEMBER_DISCARD(mi),
- .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
- .durability = BCH_MEMBER_DURABILITY(mi)
- ? BCH_MEMBER_DURABILITY(mi) - 1
- : 1,
- .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
- .valid = bch2_member_exists(mi),
- };
-}
-
void bch2_sb_maybe_downgrade(struct bch_fs *);
void bch2_sb_upgrade(struct bch_fs *, unsigned);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 0e85c22672be..24672bb31cbe 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -49,6 +49,7 @@
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
+#include "sb-errors.h"
#include "sb-members.h"
#include "snapshot.h"
#include "subvolume.h"
@@ -400,7 +401,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch_info(c, "going read-write");
- ret = bch2_members_v2_init(c);
+ ret = bch2_sb_members_v2_init(c);
if (ret)
goto err;
@@ -481,6 +482,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_time_stats_exit(&c->times[i]);
bch2_free_pending_node_rewrites(c);
+ bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
bch2_fs_snapshots_exit(c);
bch2_fs_quota_exit(c);
@@ -713,6 +715,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_quota_init(c);
bch2_fs_ec_init_early(c);
bch2_fs_move_init(c);
+ bch2_fs_sb_errors_init_early(c);
INIT_LIST_HEAD(&c->list);
@@ -729,8 +732,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_LIST_HEAD(&c->journal_iters);
- INIT_LIST_HEAD(&c->fsck_errors);
- mutex_init(&c->fsck_error_lock);
+ INIT_LIST_HEAD(&c->fsck_error_msgs);
+ mutex_init(&c->fsck_error_msgs_lock);
seqcount_init(&c->gc_pos_lock);
@@ -840,6 +843,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
}
ret = bch2_fs_counters_init(c) ?:
+ bch2_fs_sb_errors_init(c) ?:
bch2_io_clock_init(&c->io_clock[READ]) ?:
bch2_io_clock_init(&c->io_clock[WRITE]) ?:
bch2_fs_journal_init(&c->journal) ?:
@@ -942,16 +946,13 @@ int bch2_fs_start(struct bch_fs *c)
mutex_lock(&c->sb_lock);
- ret = bch2_members_v2_init(c);
+ ret = bch2_sb_members_v2_init(c);
if (ret) {
mutex_unlock(&c->sb_lock);
goto err;
}
for_each_online_member(ca, c, i)
- bch2_sb_from_fs(c, ca);
-
- for_each_online_member(ca, c, i)
bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now);
mutex_unlock(&c->sb_lock);
@@ -960,12 +961,6 @@ int bch2_fs_start(struct bch_fs *c)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- for (i = 0; i < BCH_TRANSACTIONS_NR; i++) {
- mutex_lock(&c->btree_transaction_stats[i].lock);
- bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times);
- mutex_unlock(&c->btree_transaction_stats[i].lock);
- }
-
ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
? bch2_fs_recovery(c)
: bch2_fs_initialize(c);
@@ -1140,6 +1135,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
struct bch_member *member)
{
struct bch_dev *ca;
+ unsigned i;
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
@@ -1157,6 +1153,10 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
bch2_time_stats_init(&ca->io_latency[WRITE]);
ca->mi = bch2_mi_to_cpu(member);
+
+ for (i = 0; i < ARRAY_SIZE(member->errors); i++)
+ atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));
+
ca->uuid = member->uuid;
ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
@@ -1591,7 +1591,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
if (BCH_MEMBER_GROUP(&dev_mi)) {
- bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
+ bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
if (label.allocation_failure) {
ret = -ENOMEM;
goto err;
@@ -1631,16 +1631,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err_unlock;
}
- mi = bch2_sb_field_get(ca->disk_sb.sb, members_v2);
-
- if (!bch2_sb_field_resize(&ca->disk_sb, members_v2,
- le32_to_cpu(mi->field.u64s) +
- sizeof(dev_mi) / sizeof(u64))) {
- ret = -BCH_ERR_ENOSPC_sb_members;
- bch_err_msg(c, ret, "setting up new superblock");
- goto err_unlock;
- }
-
if (dynamic_fault("bcachefs:add:no_slot"))
goto no_slot;
@@ -1654,6 +1644,8 @@ no_slot:
have_slot:
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
+
+ mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
@@ -1689,13 +1681,13 @@ have_slot:
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
- bch_err_msg(c, ret, "marking new superblock");
+ bch_err_msg(ca, ret, "marking new superblock");
goto err_late;
}
ret = bch2_fs_freespace_init(c);
if (ret) {
- bch_err_msg(c, ret, "initializing free space");
+ bch_err_msg(ca, ret, "initializing free space");
goto err_late;
}
@@ -1763,19 +1755,26 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
if (ca->mi.state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
- mutex_lock(&c->sb_lock);
- struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ if (!ca->mi.freespace_initialized) {
+ ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
+ bch_err_msg(ca, ret, "initializing free space");
+ if (ret)
+ goto err;
+ }
- m->last_mount =
- cpu_to_le64(ktime_get_real_seconds());
+ if (!ca->journal.nr) {
+ ret = bch2_dev_journal_alloc(ca);
+ bch_err_msg(ca, ret, "allocating journal");
+ if (ret)
+ goto err;
+ }
+ mutex_lock(&c->sb_lock);
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
+ cpu_to_le64(ktime_get_real_seconds());
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- ret = bch2_fs_freespace_init(c);
- if (ret)
- bch_err_msg(c, ret, "initializing free space");
-
up_write(&c->state_lock);
return 0;
err:
@@ -1886,9 +1885,9 @@ found:
struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
struct bch_opts opts)
{
- struct bch_sb_handle *sb = NULL;
+ DARRAY(struct bch_sb_handle) sbs = { 0 };
struct bch_fs *c = NULL;
- unsigned i, best_sb = 0;
+ struct bch_sb_handle *sb, *best = NULL;
struct printbuf errbuf = PRINTBUF;
int ret = 0;
@@ -1900,49 +1899,46 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
goto err;
}
- sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
- if (!sb) {
- ret = -ENOMEM;
+ ret = darray_make_room(&sbs, nr_devices);
+ if (ret)
goto err;
- }
- for (i = 0; i < nr_devices; i++) {
- ret = bch2_read_super(devices[i], &opts, &sb[i]);
+ for (unsigned i = 0; i < nr_devices; i++) {
+ struct bch_sb_handle sb = { NULL };
+
+ ret = bch2_read_super(devices[i], &opts, &sb);
if (ret)
goto err;
+ BUG_ON(darray_push(&sbs, sb));
}
- for (i = 1; i < nr_devices; i++)
- if (le64_to_cpu(sb[i].sb->seq) >
- le64_to_cpu(sb[best_sb].sb->seq))
- best_sb = i;
-
- i = 0;
- while (i < nr_devices) {
- if (i != best_sb &&
- !bch2_dev_exists(sb[best_sb].sb, sb[i].sb->dev_idx)) {
- pr_info("%pg has been removed, skipping", sb[i].bdev);
- bch2_free_super(&sb[i]);
- array_remove_item(sb, nr_devices, i);
+ darray_for_each(sbs, sb)
+ if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq))
+ best = sb;
+
+ darray_for_each_reverse(sbs, sb) {
+ if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) {
+ pr_info("%pg has been removed, skipping", sb->bdev);
+ bch2_free_super(sb);
+ darray_remove_item(&sbs, sb);
+ best -= best > sb;
continue;
}
- ret = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
+ ret = bch2_dev_in_fs(best->sb, sb->sb);
if (ret)
goto err_print;
- i++;
}
- c = bch2_fs_alloc(sb[best_sb].sb, opts);
- if (IS_ERR(c)) {
- ret = PTR_ERR(c);
+ c = bch2_fs_alloc(best->sb, opts);
+ ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
goto err;
- }
down_write(&c->state_lock);
- for (i = 0; i < nr_devices; i++) {
- ret = bch2_dev_attach_bdev(c, &sb[i]);
+ darray_for_each(sbs, sb) {
+ ret = bch2_dev_attach_bdev(c, sb);
if (ret) {
up_write(&c->state_lock);
goto err;
@@ -1961,7 +1957,9 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
goto err;
}
out:
- kfree(sb);
+ darray_for_each(sbs, sb)
+ bch2_free_super(sb);
+ darray_exit(&sbs);
printbuf_exit(&errbuf);
module_put(THIS_MODULE);
return c;
@@ -1971,9 +1969,6 @@ err_print:
err:
if (!IS_ERR_OR_NULL(c))
bch2_fs_stop(c);
- if (sb)
- for (i = 0; i < nr_devices; i++)
- bch2_free_super(&sb[i]);
c = ERR_PTR(ret);
goto out;
}
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 78d6138db62d..7dda4985b99f 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -37,16 +37,4 @@ struct bch_member_cpu {
u8 valid;
};
-struct bch_disk_group_cpu {
- bool deleted;
- u16 parent;
- struct bch_devs_mask devs;
-};
-
-struct bch_disk_groups_cpu {
- struct rcu_head rcu;
- unsigned nr;
- struct bch_disk_group_cpu entries[] __counted_by(nr);
-};
-
#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index eb764b9a4629..ab743115f169 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -149,7 +149,9 @@ read_attribute(bucket_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
rw_attribute(durability);
-read_attribute(iodone);
+read_attribute(io_done);
+read_attribute(io_errors);
+write_attribute(io_errors_reset);
read_attribute(io_latency_read);
read_attribute(io_latency_write);
@@ -212,7 +214,7 @@ read_attribute(copy_gc_wait);
rw_attribute(rebalance_enabled);
sysfs_pd_controller_attribute(rebalance);
-read_attribute(rebalance_work);
+read_attribute(rebalance_status);
rw_attribute(promote_whole_extents);
read_attribute(new_stripes);
@@ -341,7 +343,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
{
- prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
+ prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree));
bch2_bpos_to_text(out, c->gc_gens_pos);
prt_printf(out, "\n");
}
@@ -386,8 +388,8 @@ SHOW(bch2_fs)
if (attr == &sysfs_copy_gc_wait)
bch2_copygc_wait_to_text(out, c);
- if (attr == &sysfs_rebalance_work)
- bch2_rebalance_work_to_text(out, c);
+ if (attr == &sysfs_rebalance_status)
+ bch2_rebalance_status_to_text(out, c);
sysfs_print(promote_whole_extents, c->promote_whole_extents);
@@ -494,7 +496,7 @@ STORE(bch2_fs)
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
+ c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
}
if (attr == &sysfs_btree_wakeup)
@@ -646,7 +648,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_copy_gc_wait,
&sysfs_rebalance_enabled,
- &sysfs_rebalance_work,
+ &sysfs_rebalance_status,
sysfs_pd_controller_files(rebalance),
&sysfs_moving_ctxts,
@@ -707,10 +709,8 @@ STORE(bch2_fs_opts_dir)
bch2_opt_set_by_id(&c->opts, id, v);
if ((id == Opt_background_target ||
- id == Opt_background_compression) && v) {
- bch2_rebalance_add_work(c, S64_MAX);
- rebalance_wakeup(c);
- }
+ id == Opt_background_compression) && v)
+ bch2_set_rebalance_needs_scan(c, 0);
ret = size;
err:
@@ -882,7 +882,7 @@ static const char * const bch2_rw[] = {
NULL
};
-static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca)
+static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
{
int rw, i;
@@ -910,13 +910,8 @@ SHOW(bch2_dev)
sysfs_print(discard, ca->mi.discard);
if (attr == &sysfs_label) {
- if (ca->mi.group) {
- mutex_lock(&c->sb_lock);
- bch2_disk_path_to_text(out, c->disk_sb.sb,
- ca->mi.group - 1);
- mutex_unlock(&c->sb_lock);
- }
-
+ if (ca->mi.group)
+ bch2_disk_path_to_text(out, c, ca->mi.group - 1);
prt_char(out, '\n');
}
@@ -930,8 +925,11 @@ SHOW(bch2_dev)
prt_char(out, '\n');
}
- if (attr == &sysfs_iodone)
- dev_iodone_to_text(out, ca);
+ if (attr == &sysfs_io_done)
+ dev_io_done_to_text(out, ca);
+
+ if (attr == &sysfs_io_errors)
+ bch2_dev_io_errors_to_text(out, ca);
sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
@@ -998,6 +996,9 @@ STORE(bch2_dev)
return ret;
}
+ if (attr == &sysfs_io_errors_reset)
+ bch2_dev_errors_reset(ca);
+
return size;
}
SYSFS_OPS(bch2_dev);
@@ -1015,7 +1016,9 @@ struct attribute *bch2_dev_files[] = {
&sysfs_label,
&sysfs_has_data,
- &sysfs_iodone,
+ &sysfs_io_done,
+ &sysfs_io_errors,
+ &sysfs_io_errors_reset,
&sysfs_io_latency_read,
&sysfs_io_latency_write,
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
index 33efa6005c6f..dc48b52b01b4 100644
--- a/fs/bcachefs/trace.c
+++ b/fs/bcachefs/trace.c
@@ -7,6 +7,7 @@
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "keylist.h"
+#include "move_types.h"
#include "opts.h"
#include "six.h"
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 19264492151b..7857671159b4 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(btree_node,
TP_printk("%d,%d %u %s %llu:%llu:%u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->level,
- bch2_btree_ids[__entry->btree_id],
+ bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
);
@@ -196,10 +196,9 @@ DEFINE_EVENT(bio, journal_write,
TRACE_EVENT(journal_reclaim_start,
TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
u64 min_nr, u64 min_key_cache,
- u64 prereserved, u64 prereserved_total,
u64 btree_cache_dirty, u64 btree_cache_total,
u64 btree_key_cache_dirty, u64 btree_key_cache_total),
- TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total,
+ TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
btree_cache_dirty, btree_cache_total,
btree_key_cache_dirty, btree_key_cache_total),
@@ -209,8 +208,6 @@ TRACE_EVENT(journal_reclaim_start,
__field(bool, kicked )
__field(u64, min_nr )
__field(u64, min_key_cache )
- __field(u64, prereserved )
- __field(u64, prereserved_total )
__field(u64, btree_cache_dirty )
__field(u64, btree_cache_total )
__field(u64, btree_key_cache_dirty )
@@ -223,22 +220,18 @@ TRACE_EVENT(journal_reclaim_start,
__entry->kicked = kicked;
__entry->min_nr = min_nr;
__entry->min_key_cache = min_key_cache;
- __entry->prereserved = prereserved;
- __entry->prereserved_total = prereserved_total;
__entry->btree_cache_dirty = btree_cache_dirty;
__entry->btree_cache_total = btree_cache_total;
__entry->btree_key_cache_dirty = btree_key_cache_dirty;
__entry->btree_key_cache_total = btree_key_cache_total;
),
- TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+ TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->direct,
__entry->kicked,
__entry->min_nr,
__entry->min_key_cache,
- __entry->prereserved,
- __entry->prereserved_total,
__entry->btree_cache_dirty,
__entry->btree_cache_total,
__entry->btree_key_cache_dirty,
@@ -461,7 +454,7 @@ TRACE_EVENT(btree_path_relock_fail,
TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
- bch2_btree_ids[__entry->btree_id],
+ bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot,
@@ -522,7 +515,7 @@ TRACE_EVENT(btree_path_upgrade_fail,
TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
- bch2_btree_ids[__entry->btree_id],
+ bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot,
@@ -767,25 +760,36 @@ DEFINE_EVENT(bkey, move_extent_alloc_mem_fail,
);
TRACE_EVENT(move_data,
- TP_PROTO(struct bch_fs *c, u64 sectors_moved,
- u64 keys_moved),
- TP_ARGS(c, sectors_moved, keys_moved),
+ TP_PROTO(struct bch_fs *c,
+ struct bch_move_stats *stats),
+ TP_ARGS(c, stats),
TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, sectors_moved )
+ __field(dev_t, dev )
__field(u64, keys_moved )
+ __field(u64, keys_raced )
+ __field(u64, sectors_seen )
+ __field(u64, sectors_moved )
+ __field(u64, sectors_raced )
),
TP_fast_assign(
- __entry->dev = c->dev;
- __entry->sectors_moved = sectors_moved;
- __entry->keys_moved = keys_moved;
+ __entry->dev = c->dev;
+ __entry->keys_moved = atomic64_read(&stats->keys_moved);
+ __entry->keys_raced = atomic64_read(&stats->keys_raced);
+ __entry->sectors_seen = atomic64_read(&stats->sectors_seen);
+ __entry->sectors_moved = atomic64_read(&stats->sectors_moved);
+ __entry->sectors_raced = atomic64_read(&stats->sectors_raced);
),
- TP_printk("%d,%d sectors_moved %llu keys_moved %llu",
+ TP_printk("%d,%d keys moved %llu raced %llu"
+ "sectors seen %llu moved %llu raced %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->sectors_moved, __entry->keys_moved)
+ __entry->keys_moved,
+ __entry->keys_raced,
+ __entry->sectors_seen,
+ __entry->sectors_moved,
+ __entry->sectors_raced)
);
TRACE_EVENT(evacuate_bucket,
@@ -1012,7 +1016,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter,
TP_printk("%s %pS btree %s pos %llu:%llu:%u",
__entry->trans_fn,
(void *) __entry->caller_ip,
- bch2_btree_ids[__entry->btree_id],
+ bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot)
@@ -1032,13 +1036,16 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
TP_ARGS(trans, caller_ip, path)
);
+struct get_locks_fail;
+
TRACE_EVENT(trans_restart_upgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
struct btree_path *path,
unsigned old_locks_want,
- unsigned new_locks_want),
- TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want),
+ unsigned new_locks_want,
+ struct get_locks_fail *f),
+ TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f),
TP_STRUCT__entry(
__array(char, trans_fn, 32 )
@@ -1046,6 +1053,11 @@ TRACE_EVENT(trans_restart_upgrade,
__field(u8, btree_id )
__field(u8, old_locks_want )
__field(u8, new_locks_want )
+ __field(u8, level )
+ __field(u32, path_seq )
+ __field(u32, node_seq )
+ __field(u32, path_alloc_seq )
+ __field(u32, downgrade_seq)
TRACE_BPOS_entries(pos)
),
@@ -1055,18 +1067,28 @@ TRACE_EVENT(trans_restart_upgrade,
__entry->btree_id = path->btree_id;
__entry->old_locks_want = old_locks_want;
__entry->new_locks_want = new_locks_want;
+ __entry->level = f->l;
+ __entry->path_seq = path->l[f->l].lock_seq;
+ __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
+ __entry->path_alloc_seq = path->alloc_seq;
+ __entry->downgrade_seq = path->downgrade_seq;
TRACE_BPOS_assign(pos, path->pos)
),
- TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u",
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
- bch2_btree_ids[__entry->btree_id],
+ bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot,
__entry->old_locks_want,
- __entry->new_locks_want)
+ __entry->new_locks_want,
+ __entry->level,
+ __entry->path_seq,
+ __entry->node_seq,
+ __entry->path_alloc_seq,
+ __entry->downgrade_seq)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
@@ -1219,7 +1241,7 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
- bch2_btree_ids[__entry->btree_id],
+ bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot,
@@ -1227,6 +1249,27 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
__entry->new_u64s)
);
+TRACE_EVENT(path_downgrade,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ ),
+
+ TP_printk("%s %pS",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip)
+);
+
DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 08bac0ba8d0b..84b142fcc3df 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -467,6 +467,24 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
prt_printf(out, "%s", u->name);
}
+#ifndef __KERNEL__
+#include <time.h>
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ time_t t = sec;
+ char buf[64];
+ ctime_r(&t, buf);
+ prt_str(out, buf);
+}
+#else
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ char buf[64];
+ snprintf(buf, sizeof(buf), "%ptT", &sec);
+ prt_u64(out, sec);
+}
+#endif
+
#define TABSTOP_SIZE 12
static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 849a37ae497c..2984b57b2958 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -245,26 +245,7 @@ do { \
#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__)
void bch2_pr_time_units(struct printbuf *, u64);
-
-#ifdef __KERNEL__
-static inline void pr_time(struct printbuf *out, u64 time)
-{
- prt_printf(out, "%llu", time);
-}
-#else
-#include <time.h>
-static inline void pr_time(struct printbuf *out, u64 _time)
-{
- char time_str[64];
- time_t time = _time;
- struct tm *tm = localtime(&time);
- size_t err = strftime(time_str, sizeof(time_str), "%c", tm);
- if (!err)
- prt_printf(out, "(formatting error)");
- else
- prt_printf(out, "%s", time_str);
-}
-#endif
+void bch2_prt_datetime(struct printbuf *, time64_t);
#ifdef __KERNEL__
static inline void uuid_unparse_lower(u8 *uuid, char *out)
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index b069b1a62e25..79d982674c18 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -70,46 +70,38 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
.cmp_bkey = xattr_cmp_bkey,
};
-int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- const struct xattr_handler *handler;
struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
+ unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len));
+ int ret = 0;
- if (bkey_val_u64s(k.k) <
- xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len))) {
- prt_printf(err, "value too small (%zu < %u)",
- bkey_val_u64s(k.k),
- xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len)));
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err,
+ xattr_val_size_too_small,
+ "value too small (%zu < %u)",
+ bkey_val_u64s(k.k), val_u64s);
/* XXX why +4 ? */
- if (bkey_val_u64s(k.k) >
- xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len) + 4)) {
- prt_printf(err, "value too big (%zu > %u)",
- bkey_val_u64s(k.k),
- xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len) + 4));
- return -BCH_ERR_invalid_bkey;
- }
-
- handler = bch2_xattr_type_to_handler(xattr.v->x_type);
- if (!handler) {
- prt_printf(err, "invalid type (%u)", xattr.v->x_type);
- return -BCH_ERR_invalid_bkey;
- }
-
- if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) {
- prt_printf(err, "xattr name has invalid characters");
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ val_u64s = xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len) + 4);
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err,
+ xattr_val_size_too_big,
+ "value too big (%zu > %u)",
+ bkey_val_u64s(k.k), val_u64s);
+
+ bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err,
+ xattr_invalid_type,
+ "invalid type (%u)", xattr.v->x_type);
+
+ bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err,
+ xattr_name_invalid_chars,
+ "xattr name has invalid characters");
+fsck_err:
+ return ret;
}
void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
@@ -560,6 +552,14 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
s.v = v + 1;
s.defined = true;
} else {
+ /*
+ * Check if this option was set on the parent - if so, switched
+ * back to inheriting from the parent:
+ *
+ * rename() also has to deal with keeping inherited options up
+ * to date - see bch2_reinherit_attrs()
+ */
+ spin_lock(&dentry->d_lock);
if (!IS_ROOT(dentry)) {
struct bch_inode_info *dir =
to_bch_ei(d_inode(dentry->d_parent));
@@ -568,6 +568,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
} else {
s.v = 0;
}
+ spin_unlock(&dentry->d_lock);
s.defined = false;
}
@@ -590,7 +591,7 @@ err:
if (value &&
(opt_id == Opt_background_compression ||
opt_id == Opt_background_target))
- bch2_rebalance_add_work(c, inode->v.i_blocks);
+ bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
return bch2_err_class(ret);
}
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
index f5a52e3a6016..1337f31a5c49 100644
--- a/fs/bcachefs/xattr.h
+++ b/fs/bcachefs/xattr.h
@@ -6,7 +6,7 @@
extern const struct bch_hash_desc bch2_xattr_hash_desc;
-int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 9acdec56f626..a93d76df8ed8 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -96,6 +96,7 @@ static const struct address_space_operations befs_symlink_aops = {
};
static const struct export_operations befs_export_operations = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = befs_fh_to_dentry,
.fh_to_parent = befs_fh_to_parent,
.get_parent = befs_get_parent,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2a9344a3fcee..35c1d24d4a78 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -432,7 +432,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (btrfs_block_can_be_shared(trans, root, buf)) {
ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
btrfs_header_level(buf), 1,
- &refs, &flags);
+ &refs, &flags, NULL);
if (ret)
return ret;
if (unlikely(refs == 0)) {
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 9223934d95f4..891ea2fa263c 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -1041,7 +1041,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) {
+ if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
@@ -1144,7 +1144,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) {
+ if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c8e5b4715b49..0455935ff558 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -102,7 +102,8 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
*/
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 offset, int metadata, u64 *refs, u64 *flags)
+ u64 offset, int metadata, u64 *refs, u64 *flags,
+ u64 *owning_root)
{
struct btrfs_root *extent_root;
struct btrfs_delayed_ref_head *head;
@@ -114,6 +115,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
u32 item_size;
u64 num_refs;
u64 extent_flags;
+ u64 owner = 0;
int ret;
/*
@@ -167,6 +169,8 @@ search_again:
struct btrfs_extent_item);
num_refs = btrfs_extent_refs(leaf, ei);
extent_flags = btrfs_extent_flags(leaf, ei);
+ owner = btrfs_get_extent_owner_root(fs_info, leaf,
+ path->slots[0]);
} else {
ret = -EUCLEAN;
btrfs_err(fs_info,
@@ -226,6 +230,8 @@ out:
*refs = num_refs;
if (flags)
*flags = extent_flags;
+ if (owning_root)
+ *owning_root = owner;
out_free:
btrfs_free_path(path);
return ret;
@@ -5234,7 +5240,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
/* We don't lock the tree block, it's OK to be racy here */
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
wc->level - 1, 1, &refs,
- &flags);
+ &flags, NULL);
/* We don't care about errors in readahead. */
if (ret < 0)
continue;
@@ -5301,7 +5307,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level],
+ NULL);
BUG_ON(ret == -ENOMEM);
if (ret)
return ret;
@@ -5391,6 +5398,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
u64 bytenr;
u64 generation;
u64 parent;
+ u64 owner_root = 0;
struct btrfs_tree_parent_check check = { 0 };
struct btrfs_key key;
struct btrfs_ref ref = { 0 };
@@ -5434,7 +5442,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
- &wc->flags[level - 1]);
+ &wc->flags[level - 1],
+ &owner_root);
if (ret < 0)
goto out_unlock;
@@ -5567,8 +5576,7 @@ skip:
find_next_key(path, level, &wc->drop_progress);
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- fs_info->nodesize, parent,
- btrfs_header_owner(next));
+ fs_info->nodesize, parent, owner_root);
btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
0, false);
ret = btrfs_free_extent(trans, &ref);
@@ -5635,7 +5643,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level],
+ NULL);
if (ret < 0) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
@@ -5880,7 +5889,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
ret = btrfs_lookup_extent_info(trans, fs_info,
path->nodes[level]->start,
level, 1, &wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level], NULL);
if (ret < 0) {
err = ret;
goto out_end_trans;
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 0716f65d9753..2e066035ccee 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -99,7 +99,8 @@ u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 offset, int metadata, u64 *refs, u64 *flags);
+ u64 offset, int metadata, u64 *refs, u64 *flags,
+ u64 *owner_root);
int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5e3fccddde0c..9f5a9894f88f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6983,8 +6983,15 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
int ret;
alloc_hint = get_extent_allocation_hint(inode, start, len);
+again:
ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
0, alloc_hint, &ins, 1, 1);
+ if (ret == -EAGAIN) {
+ ASSERT(btrfs_is_zoned(fs_info));
+ wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
+ goto again;
+ }
if (ret)
return ERR_PTR(ret);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 752acff2c734..dfe257e1845b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1528,7 +1528,7 @@ static noinline int key_in_sk(struct btrfs_key *key,
static noinline int copy_to_sk(struct btrfs_path *path,
struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk,
- size_t *buf_size,
+ u64 *buf_size,
char __user *ubuf,
unsigned long *sk_offset,
int *num_found)
@@ -1660,7 +1660,7 @@ out:
static noinline int search_ioctl(struct inode *inode,
struct btrfs_ioctl_search_key *sk,
- size_t *buf_size,
+ u64 *buf_size,
char __user *ubuf)
{
struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
@@ -1733,7 +1733,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
struct btrfs_ioctl_search_args __user *uargs = argp;
struct btrfs_ioctl_search_key sk;
int ret;
- size_t buf_size;
+ u64 buf_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1763,8 +1763,8 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
struct btrfs_ioctl_search_args_v2 args;
int ret;
- size_t buf_size;
- const size_t buf_limit = SZ_16M;
+ u64 buf_size;
+ const u64 buf_limit = SZ_16M;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index edb84cc03237..ce446d9d7f23 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1888,7 +1888,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
u64 bytenr = record->bytenr;
if (!btrfs_qgroup_full_accounting(fs_info))
- return 0;
+ return 1;
lockdep_assert_held(&delayed_refs->lock);
trace_btrfs_qgroup_trace_extent(fs_info, record);
@@ -2875,12 +2875,18 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
num_bytes, seq);
/*
+ * We're done using the iterator, release all its qgroups while holding
+ * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup()
+ * and trigger use-after-free accesses to qgroups.
+ */
+ qgroup_iterator_nested_clean(&qgroups);
+
+ /*
* Bump qgroup_seq to avoid seq overlap
*/
fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
spin_unlock(&fs_info->qgroup_lock);
out_free:
- qgroup_iterator_nested_clean(&qgroups);
ulist_free(old_roots);
ulist_free(new_roots);
return ret;
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 944e8f1862aa..9589362acfbf 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -145,7 +145,7 @@ int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
btrfs_put_bioc(bioc);
}
- return ret;
+ return 0;
}
int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 9ce5be21b036..f62a408671cb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1868,6 +1868,9 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
*/
ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);
+ /* @found_logical_ret must be specified. */
+ ASSERT(found_logical_ret);
+
stripe = &sctx->stripes[sctx->cur_stripe];
scrub_reset_stripe(stripe);
ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
@@ -1876,8 +1879,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
/* Either >0 as no more extents or <0 for error. */
if (ret)
return ret;
- if (found_logical_ret)
- *found_logical_ret = stripe->logical;
+ *found_logical_ret = stripe->logical;
sctx->cur_stripe++;
/* We filled one group, submit it. */
@@ -2080,7 +2082,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
/* Go through each extent items inside the logical range */
while (cur_logical < logical_end) {
- u64 found_logical;
+ u64 found_logical = U64_MAX;
u64 cur_physical = physical + cur_logical - logical_start;
/* Canceled? */
@@ -2115,6 +2117,8 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
if (ret < 0)
break;
+ /* queue_scrub_stripe() returned 0, @found_logical must be updated. */
+ ASSERT(found_logical != U64_MAX);
cur_logical = found_logical + BTRFS_STRIPE_LEN;
/* Don't hold CPU for too long time */
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6ecf78d09694..f638dc339693 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1472,7 +1472,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
error = -EBUSY;
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
- shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
+ shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name,
s->s_id);
btrfs_sb(s)->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c87e18827a0a..c6f16625af51 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -748,13 +748,13 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (!fs_devices) {
fs_devices = alloc_fs_devices(disk_super->fsid);
+ if (IS_ERR(fs_devices))
+ return ERR_CAST(fs_devices);
+
if (has_metadata_uuid)
memcpy(fs_devices->metadata_uuid,
disk_super->metadata_uuid, BTRFS_FSID_SIZE);
- if (IS_ERR(fs_devices))
- return ERR_CAST(fs_devices);
-
if (same_fsid_diff_dev) {
generate_random_uuid(fs_devices->fsid);
fs_devices->temp_fsid = true;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 3504ade30cb0..188378ca19c7 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1661,13 +1661,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
out:
- if (cache->alloc_offset > fs_info->zone_size) {
- btrfs_err(fs_info,
- "zoned: invalid write pointer %llu in block group %llu",
- cache->alloc_offset, cache->start);
- ret = -EIO;
- }
-
if (cache->alloc_offset > cache->zone_capacity) {
btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
diff --git a/fs/buffer.c b/fs/buffer.c
index 12e9a71c693d..967f34b70aa8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -282,13 +282,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
} while (tmp != bh);
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- /*
- * If all of the buffers are uptodate then we can set the page
- * uptodate.
- */
- if (folio_uptodate)
- folio_mark_uptodate(folio);
- folio_unlock(folio);
+ folio_end_read(folio, folio_uptodate);
return;
still_busy:
@@ -915,16 +909,12 @@ int remove_inode_buffers(struct inode *inode)
* which may not fail from ordinary buffer allocations.
*/
struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
- bool retry)
+ gfp_t gfp)
{
struct buffer_head *bh, *head;
- gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
long offset;
struct mem_cgroup *memcg, *old_memcg;
- if (retry)
- gfp |= __GFP_NOFAIL;
-
/* The folio lock pins the memcg */
memcg = folio_memcg(folio);
old_memcg = set_active_memcg(memcg);
@@ -967,7 +957,11 @@ EXPORT_SYMBOL_GPL(folio_alloc_buffers);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
bool retry)
{
- return folio_alloc_buffers(page_folio(page), size, retry);
+ gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
+ if (retry)
+ gfp |= __GFP_NOFAIL;
+
+ return folio_alloc_buffers(page_folio(page), size, gfp);
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);
@@ -1043,20 +1037,11 @@ grow_dev_page(struct block_device *bdev, sector_t block,
struct buffer_head *bh;
sector_t end_block;
int ret = 0;
- gfp_t gfp_mask;
-
- gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
-
- /*
- * XXX: __getblk_slow() can not really deal with failure and
- * will endlessly loop on improvised global reclaim. Prefer
- * looping in the allocator rather than here, at least that
- * code knows what it's doing.
- */
- gfp_mask |= __GFP_NOFAIL;
folio = __filemap_get_folio(inode->i_mapping, index,
- FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask);
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
bh = folio_buffers(folio);
if (bh) {
@@ -1069,7 +1054,10 @@ grow_dev_page(struct block_device *bdev, sector_t block,
goto failed;
}
- bh = folio_alloc_buffers(folio, size, true);
+ ret = -ENOMEM;
+ bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
+ if (!bh)
+ goto failed;
/*
* Link the folio to the buffers and initialise them. Take the
@@ -1420,33 +1408,36 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
}
EXPORT_SYMBOL(__find_get_block);
-/*
- * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
- * which corresponds to the passed block_device, block and size. The
- * returned buffer has its reference count incremented.
+/**
+ * bdev_getblk - Get a buffer_head in a block device's buffer cache.
+ * @bdev: The block device.
+ * @block: The block number.
+ * @size: The size of buffer_heads for this @bdev.
+ * @gfp: The memory allocation flags to use.
*
- * __getblk_gfp() will lock up the machine if grow_dev_page's
- * try_to_free_buffers() attempt is failing. FIXME, perhaps?
+ * Return: The buffer head, or NULL if memory could not be allocated.
*/
-struct buffer_head *
-__getblk_gfp(struct block_device *bdev, sector_t block,
- unsigned size, gfp_t gfp)
+struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
{
struct buffer_head *bh = __find_get_block(bdev, block, size);
- might_sleep();
- if (bh == NULL)
- bh = __getblk_slow(bdev, block, size, gfp);
- return bh;
+ might_alloc(gfp);
+ if (bh)
+ return bh;
+
+ return __getblk_slow(bdev, block, size, gfp);
}
-EXPORT_SYMBOL(__getblk_gfp);
+EXPORT_SYMBOL(bdev_getblk);
/*
* Do async read-ahead on a buffer..
*/
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
- struct buffer_head *bh = __getblk(bdev, block, size);
+ struct buffer_head *bh = bdev_getblk(bdev, block, size,
+ GFP_NOWAIT | __GFP_MOVABLE);
+
if (likely(bh)) {
bh_readahead(bh, REQ_RAHEAD);
brelse(bh);
@@ -1470,7 +1461,17 @@ struct buffer_head *
__bread_gfp(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
- struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
+ struct buffer_head *bh;
+
+ gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+
+ /*
+ * Prefer looping in the allocator rather than here, at least that
+ * code knows what it's doing.
+ */
+ gfp |= __GFP_NOFAIL;
+
+ bh = bdev_getblk(bdev, block, size, gfp);
if (likely(bh) && !buffer_uptodate(bh))
bh = __bread_slow(bh);
@@ -1640,12 +1641,13 @@ EXPORT_SYMBOL(block_invalidate_folio);
* block_dirty_folio() via private_lock. try_to_free_buffers
* is already excluded via the folio lock.
*/
-void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
- unsigned long b_state)
+struct buffer_head *create_empty_buffers(struct folio *folio,
+ unsigned long blocksize, unsigned long b_state)
{
struct buffer_head *bh, *head, *tail;
+ gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;
- head = folio_alloc_buffers(folio, blocksize, true);
+ head = folio_alloc_buffers(folio, blocksize, gfp);
bh = head;
do {
bh->b_state |= b_state;
@@ -1667,13 +1669,8 @@ void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
}
folio_attach_private(folio, head);
spin_unlock(&folio->mapping->private_lock);
-}
-EXPORT_SYMBOL(folio_create_empty_buffers);
-void create_empty_buffers(struct page *page,
- unsigned long blocksize, unsigned long b_state)
-{
- folio_create_empty_buffers(page_folio(page), blocksize, b_state);
+ return head;
}
EXPORT_SYMBOL(create_empty_buffers);
@@ -1768,13 +1765,15 @@ static struct buffer_head *folio_create_buffers(struct folio *folio,
struct inode *inode,
unsigned int b_state)
{
+ struct buffer_head *bh;
+
BUG_ON(!folio_test_locked(folio));
- if (!folio_buffers(folio))
- folio_create_empty_buffers(folio,
- 1 << READ_ONCE(inode->i_blkbits),
- b_state);
- return folio_buffers(folio);
+ bh = folio_buffers(folio);
+ if (!bh)
+ bh = create_empty_buffers(folio,
+ 1 << READ_ONCE(inode->i_blkbits), b_state);
+ return bh;
}
/*
@@ -2425,12 +2424,10 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
if (!nr) {
/*
- * All buffers are uptodate - we can set the folio uptodate
- * as well. But not if get_block() returned an error.
+ * All buffers are uptodate or get_block() returned an
+ * error when trying to map them - we can finish the read.
*/
- if (!page_error)
- folio_mark_uptodate(folio);
- folio_unlock(folio);
+ folio_end_read(folio, !page_error);
return 0;
}
@@ -2676,10 +2673,8 @@ int block_truncate_page(struct address_space *mapping,
return PTR_ERR(folio);
bh = folio_buffers(folio);
- if (!bh) {
- folio_create_empty_buffers(folio, blocksize, 0);
- bh = folio_buffers(folio);
- }
+ if (!bh)
+ bh = create_empty_buffers(folio, blocksize, 0);
/* Find the buffer that contains "offset" */
offset = offset_in_folio(folio, from);
@@ -2988,13 +2983,13 @@ EXPORT_SYMBOL(try_to_free_buffers);
/*
* Buffer-head allocation
*/
-static struct kmem_cache *bh_cachep __read_mostly;
+static struct kmem_cache *bh_cachep __ro_after_init;
/*
* Once the number of bh's in the machine exceeds this level, we start
* stripping them in writeback.
*/
-static unsigned long max_buffer_heads;
+static unsigned long max_buffer_heads __ro_after_init;
int buffer_heads_over_limit;
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index c53a1d220622..1564eacc253d 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include "super.h"
+#include "mds_client.h"
static inline void ceph_set_cached_acl(struct inode *inode,
int type, struct posix_acl *acl)
@@ -31,6 +32,7 @@ static inline void ceph_set_cached_acl(struct inode *inode,
struct posix_acl *ceph_get_acl(struct inode *inode, int type, bool rcu)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int size;
unsigned int retry_cnt = 0;
const char *name;
@@ -72,8 +74,8 @@ retry:
} else if (size == -ENODATA || size == 0) {
acl = NULL;
} else {
- pr_err_ratelimited("get acl %llx.%llx failed, err=%d\n",
- ceph_vinop(inode), size);
+ pr_err_ratelimited_client(cl, "%llx.%llx failed, err=%d\n",
+ ceph_vinop(inode), size);
acl = ERR_PTR(-EIO);
}
@@ -105,7 +107,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
- ret = posix_acl_update_mode(&nop_mnt_idmap, inode,
+ ret = posix_acl_update_mode(idmap, inode,
&new_mode, &acl);
if (ret)
goto out;
@@ -140,7 +142,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
newattrs.ia_ctime = current_time(inode);
newattrs.ia_mode = new_mode;
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- ret = __ceph_setattr(inode, &newattrs, NULL);
+ ret = __ceph_setattr(idmap, inode, &newattrs, NULL);
if (ret)
goto out_free;
}
@@ -151,7 +153,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
newattrs.ia_ctime = old_ctime;
newattrs.ia_mode = old_mode;
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- __ceph_setattr(inode, &newattrs, NULL);
+ __ceph_setattr(idmap, inode, &newattrs, NULL);
}
goto out_free;
}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 936b9e0b351d..85be3bf18cdf 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -79,18 +79,18 @@ static inline struct ceph_snap_context *page_snap_context(struct page *page)
*/
static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- struct inode *inode;
+ struct inode *inode = mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
if (folio_test_dirty(folio)) {
- dout("%p dirty_folio %p idx %lu -- already dirty\n",
- mapping->host, folio, folio->index);
+ doutc(cl, "%llx.%llx %p idx %lu -- already dirty\n",
+ ceph_vinop(inode), folio, folio->index);
VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
return false;
}
- inode = mapping->host;
ci = ceph_inode(inode);
/* dirty the head */
@@ -111,12 +111,12 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
if (ci->i_wrbuffer_ref == 0)
ihold(inode);
++ci->i_wrbuffer_ref;
- dout("%p dirty_folio %p idx %lu head %d/%d -> %d/%d "
- "snapc %p seq %lld (%d snaps)\n",
- mapping->host, folio, folio->index,
- ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
- ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
- snapc, snapc->seq, snapc->num_snaps);
+ doutc(cl, "%llx.%llx %p idx %lu head %d/%d -> %d/%d "
+ "snapc %p seq %lld (%d snaps)\n",
+ ceph_vinop(inode), folio, folio->index,
+ ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
+ ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+ snapc, snapc->seq, snapc->num_snaps);
spin_unlock(&ci->i_ceph_lock);
/*
@@ -137,23 +137,22 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
static void ceph_invalidate_folio(struct folio *folio, size_t offset,
size_t length)
{
- struct inode *inode;
- struct ceph_inode_info *ci;
+ struct inode *inode = folio->mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc;
- inode = folio->mapping->host;
- ci = ceph_inode(inode);
if (offset != 0 || length != folio_size(folio)) {
- dout("%p invalidate_folio idx %lu partial dirty page %zu~%zu\n",
- inode, folio->index, offset, length);
+ doutc(cl, "%llx.%llx idx %lu partial dirty page %zu~%zu\n",
+ ceph_vinop(inode), folio->index, offset, length);
return;
}
WARN_ON(!folio_test_locked(folio));
if (folio_test_private(folio)) {
- dout("%p invalidate_folio idx %lu full dirty page\n",
- inode, folio->index);
+ doutc(cl, "%llx.%llx idx %lu full dirty page\n",
+ ceph_vinop(inode), folio->index);
snapc = folio_detach_private(folio);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
@@ -166,10 +165,10 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset,
static bool ceph_release_folio(struct folio *folio, gfp_t gfp)
{
struct inode *inode = folio->mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
- dout("%llx:%llx release_folio idx %lu (%sdirty)\n",
- ceph_vinop(inode),
- folio->index, folio_test_dirty(folio) ? "" : "not ");
+ doutc(cl, "%llx.%llx idx %lu (%sdirty)\n", ceph_vinop(inode),
+ folio->index, folio_test_dirty(folio) ? "" : "not ");
if (folio_test_private(folio))
return false;
@@ -229,7 +228,7 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
{
struct inode *inode = subreq->rreq->inode;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 objno, objoff;
u32 xlen;
@@ -244,7 +243,8 @@ static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
static void finish_netfs_read(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
struct netfs_io_subrequest *subreq = req->r_priv;
struct ceph_osd_req_op *op = &req->r_ops[0];
@@ -254,8 +254,8 @@ static void finish_netfs_read(struct ceph_osd_request *req)
ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, osd_data->length, err);
- dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result,
- subreq->len, i_size_read(req->r_inode));
+ doutc(cl, "result %d subreq->len=%zu i_size=%lld\n", req->r_result,
+ subreq->len, i_size_read(req->r_inode));
/* no object means success but no data */
if (err == -ENOENT)
@@ -348,7 +348,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
struct netfs_io_request *rreq = subreq->rreq;
struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_request *req = NULL;
struct ceph_vino vino = ceph_vino(inode);
struct iov_iter iter;
@@ -383,7 +384,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
goto out;
}
- dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
+ doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n",
+ ceph_vinop(inode), subreq->start, subreq->len, len);
iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
@@ -400,8 +402,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
if (err < 0) {
- dout("%s: iov_ter_get_pages_alloc returned %d\n",
- __func__, err);
+ doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
+ ceph_vinop(inode), err);
goto out;
}
@@ -429,12 +431,13 @@ out:
ceph_osdc_put_request(req);
if (err)
netfs_subreq_terminated(subreq, err, false);
- dout("%s: result %d\n", __func__, err);
+ doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err);
}
static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
{
struct inode *inode = rreq->inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int got = 0, want = CEPH_CAP_FILE_CACHE;
struct ceph_netfs_request_data *priv;
int ret = 0;
@@ -466,12 +469,12 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
*/
ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
if (ret < 0) {
- dout("start_read %p, error getting cap\n", inode);
+ doutc(cl, "%llx.%llx, error getting cap\n", ceph_vinop(inode));
goto out;
}
if (!(got & want)) {
- dout("start_read %p, no cache cap\n", inode);
+ doutc(cl, "%llx.%llx, no cache cap\n", ceph_vinop(inode));
ret = -EACCES;
goto out;
}
@@ -563,13 +566,14 @@ get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl,
struct ceph_snap_context *page_snapc)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc = NULL;
struct ceph_cap_snap *capsnap = NULL;
spin_lock(&ci->i_ceph_lock);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
- capsnap->context, capsnap->dirty_pages);
+ doutc(cl, " capsnap %p snapc %p has %d dirty pages\n",
+ capsnap, capsnap->context, capsnap->dirty_pages);
if (!capsnap->dirty_pages)
continue;
@@ -601,8 +605,8 @@ get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl,
}
if (!snapc && ci->i_wrbuffer_ref_head) {
snapc = ceph_get_snap_context(ci->i_head_snapc);
- dout(" head snapc %p has %d dirty pages\n",
- snapc, ci->i_wrbuffer_ref_head);
+ doutc(cl, " head snapc %p has %d dirty pages\n", snapc,
+ ci->i_wrbuffer_ref_head);
if (ctl) {
ctl->i_size = i_size_read(inode);
ctl->truncate_size = ci->i_truncate_size;
@@ -658,7 +662,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_snap_context *snapc, *oldest;
loff_t page_off = page_offset(page);
int err;
@@ -670,7 +675,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
bool caching = ceph_is_cache_enabled(inode);
struct page *bounce_page = NULL;
- dout("writepage %p idx %lu\n", page, page->index);
+ doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page,
+ page->index);
if (ceph_inode_is_shutdown(inode))
return -EIO;
@@ -678,13 +684,14 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
/* verify this is a writeable snap context */
snapc = page_snap_context(page);
if (!snapc) {
- dout("writepage %p page %p not dirty?\n", inode, page);
+ doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode),
+ page);
return 0;
}
oldest = get_oldest_context(inode, &ceph_wbc, snapc);
if (snapc->seq > oldest->seq) {
- dout("writepage %p page %p snapc %p not writeable - noop\n",
- inode, page, snapc);
+ doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n",
+ ceph_vinop(inode), page, snapc);
/* we should only noop if called by kswapd */
WARN_ON(!(current->flags & PF_MEMALLOC));
ceph_put_snap_context(oldest);
@@ -695,8 +702,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
/* is this a partial page at end of file? */
if (page_off >= ceph_wbc.i_size) {
- dout("folio at %lu beyond eof %llu\n", folio->index,
- ceph_wbc.i_size);
+ doutc(cl, "%llx.%llx folio at %lu beyond eof %llu\n",
+ ceph_vinop(inode), folio->index, ceph_wbc.i_size);
folio_invalidate(folio, 0, folio_size(folio));
return 0;
}
@@ -705,8 +712,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
len = ceph_wbc.i_size - page_off;
wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
- dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
- inode, page, page->index, page_off, wlen, snapc, snapc->seq);
+ doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n",
+ ceph_vinop(inode), page, page->index, page_off, wlen, snapc,
+ snapc->seq);
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
@@ -747,8 +755,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
osd_req_op_extent_osd_data_pages(req, 0,
bounce_page ? &bounce_page : &page, wlen, 0,
false, false);
- dout("writepage %llu~%llu (%llu bytes, %sencrypted)\n",
- page_off, len, wlen, IS_ENCRYPTED(inode) ? "" : "not ");
+ doutc(cl, "%llx.%llx %llu~%llu (%llu bytes, %sencrypted)\n",
+ ceph_vinop(inode), page_off, len, wlen,
+ IS_ENCRYPTED(inode) ? "" : "not ");
req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(osdc, req);
@@ -767,19 +776,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
wbc = &tmp_wbc;
if (err == -ERESTARTSYS) {
/* killed by SIGKILL */
- dout("writepage interrupted page %p\n", page);
+ doutc(cl, "%llx.%llx interrupted page %p\n",
+ ceph_vinop(inode), page);
redirty_page_for_writepage(wbc, page);
end_page_writeback(page);
return err;
}
if (err == -EBLOCKLISTED)
fsc->blocklisted = true;
- dout("writepage setting page/mapping error %d %p\n",
- err, page);
+ doutc(cl, "%llx.%llx setting page/mapping error %d %p\n",
+ ceph_vinop(inode), err, page);
mapping_set_error(&inode->i_data, err);
wbc->pages_skipped++;
} else {
- dout("writepage cleaned page %p\n", page);
+ doutc(cl, "%llx.%llx cleaned page %p\n",
+ ceph_vinop(inode), page);
err = 0; /* vfs expects us to return 0 */
}
oldest = detach_page_private(page);
@@ -803,7 +814,7 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
ihold(inode);
if (wbc->sync_mode == WB_SYNC_NONE &&
- ceph_inode_to_client(inode)->write_congested)
+ ceph_inode_to_fs_client(inode)->write_congested)
return AOP_WRITEPAGE_ACTIVATE;
wait_on_page_fscache(page);
@@ -829,6 +840,7 @@ static void writepages_finish(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_osd_data *osd_data;
struct page *page;
int num_pages, total_pages = 0;
@@ -836,11 +848,11 @@ static void writepages_finish(struct ceph_osd_request *req)
int rc = req->r_result;
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
unsigned int len = 0;
bool remove_page;
- dout("writepages_finish %p rc %d\n", inode, rc);
+ doutc(cl, "%llx.%llx rc %d\n", ceph_vinop(inode), rc);
if (rc < 0) {
mapping_set_error(mapping, rc);
ceph_set_error_write(ci);
@@ -862,8 +874,10 @@ static void writepages_finish(struct ceph_osd_request *req)
/* clean all pages */
for (i = 0; i < req->r_num_ops; i++) {
if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
- pr_warn("%s incorrect op %d req %p index %d tid %llu\n",
- __func__, req->r_ops[i].op, req, i, req->r_tid);
+ pr_warn_client(cl,
+ "%llx.%llx incorrect op %d req %p index %d tid %llu\n",
+ ceph_vinop(inode), req->r_ops[i].op, req, i,
+ req->r_tid);
break;
}
@@ -890,7 +904,7 @@ static void writepages_finish(struct ceph_osd_request *req)
ceph_put_snap_context(detach_page_private(page));
end_page_writeback(page);
- dout("unlocking %p\n", page);
+ doutc(cl, "unlocking %p\n", page);
if (remove_page)
generic_error_remove_page(inode->i_mapping,
@@ -898,8 +912,9 @@ static void writepages_finish(struct ceph_osd_request *req)
unlock_page(page);
}
- dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
- inode, osd_data->length, rc >= 0 ? num_pages : 0);
+ doutc(cl, "%llx.%llx wrote %llu bytes cleaned %d pages\n",
+ ceph_vinop(inode), osd_data->length,
+ rc >= 0 ? num_pages : 0);
release_pages(osd_data->pages, num_pages);
}
@@ -926,7 +941,8 @@ static int ceph_writepages_start(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_vino vino = ceph_vino(inode);
pgoff_t index, start_index, end = -1;
struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
@@ -944,15 +960,15 @@ static int ceph_writepages_start(struct address_space *mapping,
fsc->write_congested)
return 0;
- dout("writepages_start %p (mode=%s)\n", inode,
- wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
- (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
+ doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode),
+ wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
+ (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
if (ceph_inode_is_shutdown(inode)) {
if (ci->i_wrbuffer_ref > 0) {
- pr_warn_ratelimited(
- "writepage_start %p %lld forced umount\n",
- inode, ceph_ino(inode));
+ pr_warn_ratelimited_client(cl,
+ "%llx.%llx %lld forced umount\n",
+ ceph_vinop(inode), ceph_ino(inode));
}
mapping_set_error(mapping, -EIO);
return -EIO; /* we're in a forced umount, don't write! */
@@ -976,11 +992,11 @@ retry:
if (!snapc) {
/* hmm, why does writepages get called when there
is no dirty data? */
- dout(" no snap context with dirty data?\n");
+ doutc(cl, " no snap context with dirty data?\n");
goto out;
}
- dout(" oldest snapc is %p seq %lld (%d snaps)\n",
- snapc, snapc->seq, snapc->num_snaps);
+ doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc,
+ snapc->seq, snapc->num_snaps);
should_loop = false;
if (ceph_wbc.head_snapc && snapc != last_snapc) {
@@ -990,13 +1006,13 @@ retry:
end = -1;
if (index > 0)
should_loop = true;
- dout(" cyclic, start at %lu\n", index);
+ doutc(cl, " cyclic, start at %lu\n", index);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = true;
- dout(" not cyclic, %lu to %lu\n", index, end);
+ doutc(cl, " not cyclic, %lu to %lu\n", index, end);
}
} else if (!ceph_wbc.head_snapc) {
/* Do not respect wbc->range_{start,end}. Dirty pages
@@ -1005,7 +1021,7 @@ retry:
* associated with 'snapc' get written */
if (index > 0)
should_loop = true;
- dout(" non-head snapc, range whole\n");
+ doutc(cl, " non-head snapc, range whole\n");
}
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
@@ -1028,12 +1044,12 @@ retry:
get_more_pages:
nr_folios = filemap_get_folios_tag(mapping, &index,
end, tag, &fbatch);
- dout("pagevec_lookup_range_tag got %d\n", nr_folios);
+ doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios);
if (!nr_folios && !locked_pages)
break;
for (i = 0; i < nr_folios && locked_pages < max_pages; i++) {
page = &fbatch.folios[i]->page;
- dout("? %p idx %lu\n", page, page->index);
+ doutc(cl, "? %p idx %lu\n", page, page->index);
if (locked_pages == 0)
lock_page(page); /* first page */
else if (!trylock_page(page))
@@ -1042,15 +1058,15 @@ get_more_pages:
/* only dirty pages, or our accounting breaks */
if (unlikely(!PageDirty(page)) ||
unlikely(page->mapping != mapping)) {
- dout("!dirty or !mapping %p\n", page);
+ doutc(cl, "!dirty or !mapping %p\n", page);
unlock_page(page);
continue;
}
/* only if matching snap context */
pgsnapc = page_snap_context(page);
if (pgsnapc != snapc) {
- dout("page snapc %p %lld != oldest %p %lld\n",
- pgsnapc, pgsnapc->seq, snapc, snapc->seq);
+ doutc(cl, "page snapc %p %lld != oldest %p %lld\n",
+ pgsnapc, pgsnapc->seq, snapc, snapc->seq);
if (!should_loop &&
!ceph_wbc.head_snapc &&
wbc->sync_mode != WB_SYNC_NONE)
@@ -1061,8 +1077,8 @@ get_more_pages:
if (page_offset(page) >= ceph_wbc.i_size) {
struct folio *folio = page_folio(page);
- dout("folio at %lu beyond eof %llu\n",
- folio->index, ceph_wbc.i_size);
+ doutc(cl, "folio at %lu beyond eof %llu\n",
+ folio->index, ceph_wbc.i_size);
if ((ceph_wbc.size_stable ||
folio_pos(folio) >= i_size_read(inode)) &&
folio_clear_dirty_for_io(folio))
@@ -1072,23 +1088,23 @@ get_more_pages:
continue;
}
if (strip_unit_end && (page->index > strip_unit_end)) {
- dout("end of strip unit %p\n", page);
+ doutc(cl, "end of strip unit %p\n", page);
unlock_page(page);
break;
}
if (PageWriteback(page) || PageFsCache(page)) {
if (wbc->sync_mode == WB_SYNC_NONE) {
- dout("%p under writeback\n", page);
+ doutc(cl, "%p under writeback\n", page);
unlock_page(page);
continue;
}
- dout("waiting on writeback %p\n", page);
+ doutc(cl, "waiting on writeback %p\n", page);
wait_on_page_writeback(page);
wait_on_page_fscache(page);
}
if (!clear_page_dirty_for_io(page)) {
- dout("%p !clear_page_dirty_for_io\n", page);
+ doutc(cl, "%p !clear_page_dirty_for_io\n", page);
unlock_page(page);
continue;
}
@@ -1143,8 +1159,8 @@ get_more_pages:
}
/* note position of first page in fbatch */
- dout("%p will write page %p idx %lu\n",
- inode, page, page->index);
+ doutc(cl, "%llx.%llx will write page %p idx %lu\n",
+ ceph_vinop(inode), page, page->index);
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(
@@ -1158,8 +1174,9 @@ get_more_pages:
locked_pages ? GFP_NOWAIT : GFP_NOFS);
if (IS_ERR(pages[locked_pages])) {
if (PTR_ERR(pages[locked_pages]) == -EINVAL)
- pr_err("%s: inode->i_blkbits=%hhu\n",
- __func__, inode->i_blkbits);
+ pr_err_client(cl,
+ "inode->i_blkbits=%hhu\n",
+ inode->i_blkbits);
/* better not fail on first page! */
BUG_ON(locked_pages == 0);
pages[locked_pages] = NULL;
@@ -1193,7 +1210,7 @@ get_more_pages:
if (nr_folios && i == nr_folios &&
locked_pages < max_pages) {
- dout("reached end fbatch, trying for more\n");
+ doutc(cl, "reached end fbatch, trying for more\n");
folio_batch_release(&fbatch);
goto get_more_pages;
}
@@ -1254,8 +1271,8 @@ new_request:
/* Start a new extent */
osd_req_op_extent_dup_last(req, op_idx,
cur_offset - offset);
- dout("writepages got pages at %llu~%llu\n",
- offset, len);
+ doutc(cl, "got pages at %llu~%llu\n", offset,
+ len);
osd_req_op_extent_osd_data_pages(req, op_idx,
data_pages, len, 0,
from_pool, false);
@@ -1288,12 +1305,13 @@ new_request:
if (IS_ENCRYPTED(inode))
len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);
- dout("writepages got pages at %llu~%llu\n", offset, len);
+ doutc(cl, "got pages at %llu~%llu\n", offset, len);
if (IS_ENCRYPTED(inode) &&
((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK))
- pr_warn("%s: bad encrypted write offset=%lld len=%llu\n",
- __func__, offset, len);
+ pr_warn_client(cl,
+ "bad encrypted write offset=%lld len=%llu\n",
+ offset, len);
osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
0, from_pool, false);
@@ -1345,14 +1363,14 @@ new_request:
done = true;
release_folios:
- dout("folio_batch release on %d folios (%p)\n", (int)fbatch.nr,
- fbatch.nr ? fbatch.folios[0] : NULL);
+ doutc(cl, "folio_batch release on %d folios (%p)\n",
+ (int)fbatch.nr, fbatch.nr ? fbatch.folios[0] : NULL);
folio_batch_release(&fbatch);
}
if (should_loop && !done) {
/* more to do; loop back to beginning of file */
- dout("writepages looping back to beginning of file\n");
+ doutc(cl, "looping back to beginning of file\n");
end = start_index - 1; /* OK even when start_index == 0 */
/* to write dirty pages associated with next snapc,
@@ -1390,7 +1408,8 @@ release_folios:
out:
ceph_osdc_put_request(req);
ceph_put_snap_context(last_snapc);
- dout("writepages dend - startone, rc = %d\n", rc);
+ doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode),
+ rc);
return rc;
}
@@ -1424,11 +1443,12 @@ static struct ceph_snap_context *
ceph_find_incompatible(struct page *page)
{
struct inode *inode = page->mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
if (ceph_inode_is_shutdown(inode)) {
- dout(" page %p %llx:%llx is shutdown\n", page,
- ceph_vinop(inode));
+ doutc(cl, " %llx.%llx page %p is shutdown\n",
+ ceph_vinop(inode), page);
return ERR_PTR(-ESTALE);
}
@@ -1449,13 +1469,15 @@ ceph_find_incompatible(struct page *page)
if (snapc->seq > oldest->seq) {
/* not writeable -- return it for the caller to deal with */
ceph_put_snap_context(oldest);
- dout(" page %p snapc %p not current or oldest\n", page, snapc);
+ doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n",
+ ceph_vinop(inode), page, snapc);
return ceph_get_snap_context(snapc);
}
ceph_put_snap_context(oldest);
/* yay, writeable, do it now (without dropping page lock) */
- dout(" page %p snapc %p not current, but oldest\n", page, snapc);
+ doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n",
+ ceph_vinop(inode), page, snapc);
if (clear_page_dirty_for_io(page)) {
int r = writepage_nounlock(page, NULL);
if (r < 0)
@@ -1524,10 +1546,11 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
{
struct folio *folio = page_folio(subpage);
struct inode *inode = file_inode(file);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
bool check_cap = false;
- dout("write_end file %p inode %p folio %p %d~%d (%d)\n", file,
- inode, folio, (int)pos, (int)copied, (int)len);
+ doutc(cl, "%llx.%llx file %p folio %p %d~%d (%d)\n", ceph_vinop(inode),
+ file, folio, (int)pos, (int)copied, (int)len);
if (!folio_test_uptodate(folio)) {
/* just return that nothing was copied on a short copy */
@@ -1587,6 +1610,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
int want, got, err;
@@ -1598,8 +1622,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
ceph_block_sigs(&oldset);
- dout("filemap_fault %p %llx.%llx %llu trying to get caps\n",
- inode, ceph_vinop(inode), off);
+ doutc(cl, "%llx.%llx %llu trying to get caps\n",
+ ceph_vinop(inode), off);
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
@@ -1610,8 +1634,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
if (err < 0)
goto out_restore;
- dout("filemap_fault %p %llu got cap refs on %s\n",
- inode, off, ceph_cap_string(got));
+ doutc(cl, "%llx.%llx %llu got cap refs on %s\n", ceph_vinop(inode),
+ off, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
!ceph_has_inline_data(ci)) {
@@ -1619,8 +1643,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
ceph_add_rw_context(fi, &rw_ctx);
ret = filemap_fault(vmf);
ceph_del_rw_context(fi, &rw_ctx);
- dout("filemap_fault %p %llu drop cap refs %s ret %x\n",
- inode, off, ceph_cap_string(got), ret);
+ doutc(cl, "%llx.%llx %llu drop cap refs %s ret %x\n",
+ ceph_vinop(inode), off, ceph_cap_string(got), ret);
} else
err = -EAGAIN;
@@ -1661,8 +1685,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
out_inline:
filemap_invalidate_unlock_shared(mapping);
- dout("filemap_fault %p %llu read inline data ret %x\n",
- inode, off, ret);
+ doutc(cl, "%llx.%llx %llu read inline data ret %x\n",
+ ceph_vinop(inode), off, ret);
}
out_restore:
ceph_restore_sigs(&oldset);
@@ -1676,6 +1700,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
struct ceph_cap_flush *prealloc_cf;
@@ -1702,8 +1727,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
else
len = offset_in_thp(page, size);
- dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
- inode, ceph_vinop(inode), off, len, size);
+ doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n",
+ ceph_vinop(inode), off, len, size);
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
else
@@ -1714,8 +1739,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
if (err < 0)
goto out_free;
- dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
- inode, off, len, ceph_cap_string(got));
+ doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode),
+ off, len, ceph_cap_string(got));
/* Update time before taking page lock */
file_update_time(vma->vm_file);
@@ -1763,8 +1788,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
__mark_inode_dirty(inode, dirty);
}
- dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %x\n",
- inode, off, len, ceph_cap_string(got), ret);
+ doutc(cl, "%llx.%llx %llu~%zd dropping cap refs on %s ret %x\n",
+ ceph_vinop(inode), off, len, ceph_cap_string(got), ret);
ceph_put_cap_refs_async(ci, got);
out_free:
ceph_restore_sigs(&oldset);
@@ -1778,6 +1803,7 @@ out_free:
void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
char *data, size_t len)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct address_space *mapping = inode->i_mapping;
struct page *page;
@@ -1798,8 +1824,8 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
}
}
- dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n",
- inode, ceph_vinop(inode), len, locked_page);
+ doutc(cl, "%p %llx.%llx len %zu locked_page %p\n", inode,
+ ceph_vinop(inode), len, locked_page);
if (len > 0) {
void *kaddr = kmap_atomic(page);
@@ -1823,7 +1849,8 @@ int ceph_uninline_data(struct file *file)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_request *req = NULL;
struct ceph_cap_flush *prealloc_cf = NULL;
struct folio *folio = NULL;
@@ -1836,8 +1863,8 @@ int ceph_uninline_data(struct file *file)
inline_version = ci->i_inline_version;
spin_unlock(&ci->i_ceph_lock);
- dout("uninline_data %p %llx.%llx inline_version %llu\n",
- inode, ceph_vinop(inode), inline_version);
+ doutc(cl, "%llx.%llx inline_version %llu\n", ceph_vinop(inode),
+ inline_version);
if (ceph_inode_is_shutdown(inode)) {
err = -EIO;
@@ -1949,8 +1976,8 @@ out_unlock:
}
out:
ceph_free_cap_flush(prealloc_cf);
- dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
- inode, ceph_vinop(inode), inline_version, err);
+ doutc(cl, "%llx.%llx inline_version %llu = %d\n",
+ ceph_vinop(inode), inline_version, err);
return err;
}
@@ -1977,8 +2004,9 @@ enum {
static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
s64 pool, struct ceph_string *pool_ns)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->netfs.inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(&ci->netfs.inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
struct rb_node **p, *parent;
struct ceph_pool_perm *perm;
@@ -2013,10 +2041,10 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
goto out;
if (pool_ns)
- dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n",
- pool, (int)pool_ns->len, pool_ns->str);
+ doutc(cl, "pool %lld ns %.*s no perm cached\n", pool,
+ (int)pool_ns->len, pool_ns->str);
else
- dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
+ doutc(cl, "pool %lld no perm cached\n", pool);
down_write(&mdsc->pool_perm_rwsem);
p = &mdsc->pool_perm_tree.rb_node;
@@ -2141,15 +2169,16 @@ out:
if (!err)
err = have;
if (pool_ns)
- dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n",
- pool, (int)pool_ns->len, pool_ns->str, err);
+ doutc(cl, "pool %lld ns %.*s result = %d\n", pool,
+ (int)pool_ns->len, pool_ns->str, err);
else
- dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
+ doutc(cl, "pool %lld result = %d\n", pool, err);
return err;
}
int ceph_pool_perm_check(struct inode *inode, int need)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_string *pool_ns;
s64 pool;
@@ -2168,7 +2197,7 @@ int ceph_pool_perm_check(struct inode *inode, int need)
return 0;
}
- if (ceph_test_mount_opt(ceph_inode_to_client(inode),
+ if (ceph_test_mount_opt(ceph_inode_to_fs_client(inode),
NOPOOLPERM))
return 0;
@@ -2179,13 +2208,11 @@ int ceph_pool_perm_check(struct inode *inode, int need)
check:
if (flags & CEPH_I_POOL_PERM) {
if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
- dout("ceph_pool_perm_check pool %lld no read perm\n",
- pool);
+ doutc(cl, "pool %lld no read perm\n", pool);
return -EPERM;
}
if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
- dout("ceph_pool_perm_check pool %lld no write perm\n",
- pool);
+ doutc(cl, "pool %lld no write perm\n", pool);
return -EPERM;
}
return 0;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index de1dee46d3df..930fbd54d2c8 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -15,7 +15,7 @@
void ceph_fscache_register_inode_cookie(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
/* No caching for filesystem? */
if (!fsc->fscache)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a104669fcf4c..2c0b8dc3dd0d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -186,10 +186,10 @@ static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps)
mdsc->caps_avail_count += nr_caps;
}
- dout("%s: caps %d = %d used + %d resv + %d avail\n",
- __func__,
- mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ doutc(mdsc->fsc->client,
+ "caps %d = %d used + %d resv + %d avail\n",
+ mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
mdsc->caps_reserve_count +
mdsc->caps_avail_count);
@@ -202,6 +202,7 @@ static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps)
int ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int i, j;
struct ceph_cap *cap;
int have;
@@ -212,7 +213,7 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *s;
LIST_HEAD(newcaps);
- dout("reserve caps ctx=%p need=%d\n", ctx, need);
+ doutc(cl, "ctx=%p need=%d\n", ctx, need);
/* first reserve any caps that are already allocated */
spin_lock(&mdsc->caps_list_lock);
@@ -272,8 +273,8 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
continue;
}
- pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
- ctx, need, have + alloc);
+ pr_warn_client(cl, "ctx=%p ENOMEM need=%d got=%d\n", ctx, need,
+ have + alloc);
err = -ENOMEM;
break;
}
@@ -298,20 +299,21 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
spin_unlock(&mdsc->caps_list_lock);
- dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
- ctx, mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ doutc(cl, "ctx=%p %d = %d used + %d resv + %d avail\n", ctx,
+ mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
return err;
}
void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx)
{
+ struct ceph_client *cl = mdsc->fsc->client;
bool reclaim = false;
if (!ctx->count)
return;
- dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
+ doutc(cl, "ctx=%p count=%d\n", ctx, ctx->count);
spin_lock(&mdsc->caps_list_lock);
__ceph_unreserve_caps(mdsc, ctx->count);
ctx->count = 0;
@@ -328,6 +330,7 @@ void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap *cap = NULL;
/* temporary, until we do something about cap import/export */
@@ -359,9 +362,9 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
}
spin_lock(&mdsc->caps_list_lock);
- dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
- ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ doutc(cl, "ctx=%p (%d) %d = %d used + %d resv + %d avail\n", ctx,
+ ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
BUG_ON(!ctx->count);
BUG_ON(ctx->count > mdsc->caps_reserve_count);
BUG_ON(list_empty(&mdsc->caps_list));
@@ -382,10 +385,12 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
{
+ struct ceph_client *cl = mdsc->fsc->client;
+
spin_lock(&mdsc->caps_list_lock);
- dout("put_cap %p %d = %d used + %d resv + %d avail\n",
- cap, mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ doutc(cl, "%p %d = %d used + %d resv + %d avail\n", cap,
+ mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
mdsc->caps_use_count--;
/*
* Keep some preallocated caps around (ceph_min_count), to
@@ -491,11 +496,13 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
+
ci->i_hold_caps_max = round_jiffies(jiffies +
opt->caps_wanted_delay_max * HZ);
- dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode,
- ci->i_hold_caps_max - jiffies);
+ doutc(mdsc->fsc->client, "%p %llx.%llx %lu\n", inode,
+ ceph_vinop(inode), ci->i_hold_caps_max - jiffies);
}
/*
@@ -509,8 +516,11 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode,
- ci->i_ceph_flags, ci->i_hold_caps_max);
+ struct inode *inode = &ci->netfs.inode;
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx flags 0x%lx at %lu\n",
+ inode, ceph_vinop(inode), ci->i_ceph_flags,
+ ci->i_hold_caps_max);
if (!mdsc->stopping) {
spin_lock(&mdsc->cap_delay_lock);
if (!list_empty(&ci->i_cap_delay_list)) {
@@ -533,7 +543,9 @@ no_change:
static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode);
+ struct inode *inode = &ci->netfs.inode;
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode));
spin_lock(&mdsc->cap_delay_lock);
ci->i_ceph_flags |= CEPH_I_FLUSH;
if (!list_empty(&ci->i_cap_delay_list))
@@ -550,7 +562,9 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_cancel %p\n", &ci->netfs.inode);
+ struct inode *inode = &ci->netfs.inode;
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode));
if (list_empty(&ci->i_cap_delay_list))
return;
spin_lock(&mdsc->cap_delay_lock);
@@ -562,6 +576,9 @@ static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
unsigned issued)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
unsigned had = __ceph_caps_issued(ci, NULL);
lockdep_assert_held(&ci->i_ceph_lock);
@@ -586,7 +603,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
if (issued & CEPH_CAP_FILE_SHARED)
atomic_inc(&ci->i_shared_gen);
if (S_ISDIR(ci->netfs.inode.i_mode)) {
- dout(" marking %p NOT complete\n", &ci->netfs.inode);
+ doutc(cl, " marking %p NOT complete\n", inode);
__ceph_dir_clear_complete(ci);
}
}
@@ -635,7 +652,8 @@ void ceph_add_cap(struct inode *inode,
unsigned seq, unsigned mseq, u64 realmino, int flags,
struct ceph_cap **new_cap)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap *cap;
int mds = session->s_mds;
@@ -644,8 +662,9 @@ void ceph_add_cap(struct inode *inode,
lockdep_assert_held(&ci->i_ceph_lock);
- dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
- session->s_mds, cap_id, ceph_cap_string(issued), seq);
+ doutc(cl, "%p %llx.%llx mds%d cap %llx %s seq %d\n", inode,
+ ceph_vinop(inode), session->s_mds, cap_id,
+ ceph_cap_string(issued), seq);
gen = atomic_read(&session->s_cap_gen);
@@ -723,9 +742,9 @@ void ceph_add_cap(struct inode *inode,
actual_wanted = __ceph_caps_wanted(ci);
if ((wanted & ~actual_wanted) ||
(issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
- dout(" issued %s, mds wanted %s, actual %s, queueing\n",
- ceph_cap_string(issued), ceph_cap_string(wanted),
- ceph_cap_string(actual_wanted));
+ doutc(cl, "issued %s, mds wanted %s, actual %s, queueing\n",
+ ceph_cap_string(issued), ceph_cap_string(wanted),
+ ceph_cap_string(actual_wanted));
__cap_delay_requeue(mdsc, ci);
}
@@ -742,9 +761,9 @@ void ceph_add_cap(struct inode *inode,
WARN_ON(ci->i_auth_cap == cap);
}
- dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
- inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
- ceph_cap_string(issued|cap->issued), seq, mds);
+ doutc(cl, "inode %p %llx.%llx cap %p %s now %s seq %d mds%d\n",
+ inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
+ ceph_cap_string(issued|cap->issued), seq, mds);
cap->cap_id = cap_id;
cap->issued = issued;
cap->implemented |= issued;
@@ -766,6 +785,8 @@ void ceph_add_cap(struct inode *inode,
*/
static int __cap_is_valid(struct ceph_cap *cap)
{
+ struct inode *inode = &cap->ci->netfs.inode;
+ struct ceph_client *cl = cap->session->s_mdsc->fsc->client;
unsigned long ttl;
u32 gen;
@@ -773,9 +794,9 @@ static int __cap_is_valid(struct ceph_cap *cap)
ttl = cap->session->s_cap_ttl;
if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
- dout("__cap_is_valid %p cap %p issued %s "
- "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode,
- cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
+ doutc(cl, "%p %llx.%llx cap %p issued %s but STALE (gen %u vs %u)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued), cap->cap_gen, gen);
return 0;
}
@@ -789,6 +810,8 @@ static int __cap_is_valid(struct ceph_cap *cap)
*/
int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int have = ci->i_snap_caps;
struct ceph_cap *cap;
struct rb_node *p;
@@ -799,8 +822,8 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
cap = rb_entry(p, struct ceph_cap, ci_node);
if (!__cap_is_valid(cap))
continue;
- dout("__ceph_caps_issued %p cap %p issued %s\n",
- &ci->netfs.inode, cap, ceph_cap_string(cap->issued));
+ doutc(cl, "%p %llx.%llx cap %p issued %s\n", inode,
+ ceph_vinop(inode), cap, ceph_cap_string(cap->issued));
have |= cap->issued;
if (implemented)
*implemented |= cap->implemented;
@@ -843,16 +866,18 @@ int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
*/
static void __touch_cap(struct ceph_cap *cap)
{
+ struct inode *inode = &cap->ci->netfs.inode;
struct ceph_mds_session *s = cap->session;
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
spin_lock(&s->s_cap_lock);
if (!s->s_cap_iterator) {
- dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap,
- s->s_mds);
+ doutc(cl, "%p %llx.%llx cap %p mds%d\n", inode,
+ ceph_vinop(inode), cap, s->s_mds);
list_move_tail(&cap->session_caps, &s->s_caps);
} else {
- dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
- &cap->ci->netfs.inode, cap, s->s_mds);
+ doutc(cl, "%p %llx.%llx cap %p mds%d NOP, iterating over caps\n",
+ inode, ceph_vinop(inode), cap, s->s_mds);
}
spin_unlock(&s->s_cap_lock);
}
@@ -864,15 +889,16 @@ static void __touch_cap(struct ceph_cap *cap)
*/
int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap *cap;
struct rb_node *p;
int have = ci->i_snap_caps;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
- " (mask %s)\n", ceph_ino(&ci->netfs.inode),
- ceph_cap_string(have),
- ceph_cap_string(mask));
+ doutc(cl, "mask %p %llx.%llx snap issued %s (mask %s)\n",
+ inode, ceph_vinop(inode), ceph_cap_string(have),
+ ceph_cap_string(mask));
return 1;
}
@@ -881,10 +907,10 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
if (!__cap_is_valid(cap))
continue;
if ((cap->issued & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
- " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap,
- ceph_cap_string(cap->issued),
- ceph_cap_string(mask));
+ doutc(cl, "mask %p %llx.%llx cap %p issued %s (mask %s)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
if (touch)
__touch_cap(cap);
return 1;
@@ -893,10 +919,10 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/* does a combination of caps satisfy mask? */
have |= cap->issued;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
- " (mask %s)\n", ceph_ino(&ci->netfs.inode),
- ceph_cap_string(cap->issued),
- ceph_cap_string(mask));
+ doutc(cl, "mask %p %llx.%llx combo issued %s (mask %s)\n",
+ inode, ceph_vinop(inode),
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
if (touch) {
struct rb_node *q;
@@ -922,7 +948,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
int touch)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
int r;
r = __ceph_caps_issued_mask(ci, mask, touch);
@@ -954,13 +980,14 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
{
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int ret;
spin_lock(&ci->i_ceph_lock);
ret = __ceph_caps_revoking_other(ci, NULL, mask);
spin_unlock(&ci->i_ceph_lock);
- dout("ceph_caps_revoking %p %s = %d\n", inode,
- ceph_cap_string(mask), ret);
+ doutc(cl, "%p %llx.%llx %s = %d\n", inode, ceph_vinop(inode),
+ ceph_cap_string(mask), ret);
return ret;
}
@@ -996,7 +1023,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
struct ceph_mount_options *opt =
- ceph_inode_to_client(&ci->netfs.inode)->mount_options;
+ ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options;
unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
@@ -1107,21 +1134,23 @@ int ceph_is_any_caps(struct inode *inode)
void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
{
struct ceph_mds_session *session = cap->session;
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct ceph_inode_info *ci = cap->ci;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc;
int removed = 0;
/* 'ci' being NULL means the remove have already occurred */
if (!ci) {
- dout("%s: cap inode is NULL\n", __func__);
+ doutc(cl, "inode is NULL\n");
return;
}
lockdep_assert_held(&ci->i_ceph_lock);
- dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode);
+ doutc(cl, "%p from %p %llx.%llx\n", cap, inode, ceph_vinop(inode));
- mdsc = ceph_inode_to_client(&ci->netfs.inode)->mdsc;
+ mdsc = ceph_inode_to_fs_client(&ci->netfs.inode)->mdsc;
/* remove from inode's cap rbtree, and clear auth cap */
rb_erase(&cap->ci_node, &ci->i_caps);
@@ -1132,8 +1161,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
spin_lock(&session->s_cap_lock);
if (session->s_cap_iterator == cap) {
/* not yet, we are iterating over this very cap */
- dout("__ceph_remove_cap delaying %p removal from session %p\n",
- cap, cap->session);
+ doutc(cl, "delaying %p removal from session %p\n", cap,
+ cap->session);
} else {
list_del_init(&cap->session_caps);
session->s_nr_caps--;
@@ -1178,20 +1207,21 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
}
}
-void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
+void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+ bool queue_release)
{
struct ceph_inode_info *ci = cap->ci;
struct ceph_fs_client *fsc;
/* 'ci' being NULL means the remove have already occurred */
if (!ci) {
- dout("%s: cap inode is NULL\n", __func__);
+ doutc(mdsc->fsc->client, "inode is NULL\n");
return;
}
lockdep_assert_held(&ci->i_ceph_lock);
- fsc = ceph_inode_to_client(&ci->netfs.inode);
+ fsc = ceph_inode_to_fs_client(&ci->netfs.inode);
WARN_ON_ONCE(ci->i_auth_cap == cap &&
!list_empty(&ci->i_dirty_item) &&
!fsc->blocklisted &&
@@ -1227,15 +1257,19 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
{
struct ceph_mds_caps *fc;
void *p;
- struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
-
- dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
- __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
- ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
- ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
- arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
- arg->size, arg->max_size, arg->xattr_version,
- arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
+ struct ceph_mds_client *mdsc = arg->session->s_mdsc;
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
+
+ doutc(mdsc->fsc->client,
+ "%s %llx %llx caps %s wanted %s dirty %s seq %u/%u"
+ " tid %llu/%llu mseq %u follows %lld size %llu/%llu"
+ " xattr_ver %llu xattr_len %d\n",
+ ceph_cap_op_name(arg->op), arg->cid, arg->ino,
+ ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
+ ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
+ arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
+ arg->size, arg->max_size, arg->xattr_version,
+ arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
msg->hdr.version = cpu_to_le16(12);
msg->hdr.tid = cpu_to_le64(arg->flush_tid);
@@ -1342,6 +1376,8 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
*/
void __ceph_remove_caps(struct ceph_inode_info *ci)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
struct rb_node *p;
/* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
@@ -1351,7 +1387,7 @@ void __ceph_remove_caps(struct ceph_inode_info *ci)
while (p) {
struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
p = rb_next(p);
- ceph_remove_cap(cap, true);
+ ceph_remove_cap(mdsc, cap, true);
}
spin_unlock(&ci->i_ceph_lock);
}
@@ -1370,6 +1406,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
{
struct ceph_inode_info *ci = cap->ci;
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int held, revoking;
lockdep_assert_held(&ci->i_ceph_lock);
@@ -1378,10 +1415,10 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
revoking = cap->implemented & ~cap->issued;
retain &= ~revoking;
- dout("%s %p cap %p session %p %s -> %s (revoking %s)\n",
- __func__, inode, cap, cap->session,
- ceph_cap_string(held), ceph_cap_string(held & retain),
- ceph_cap_string(revoking));
+ doutc(cl, "%p %llx.%llx cap %p session %p %s -> %s (revoking %s)\n",
+ inode, ceph_vinop(inode), cap, cap->session,
+ ceph_cap_string(held), ceph_cap_string(held & retain),
+ ceph_cap_string(revoking));
BUG_ON((retain & CEPH_CAP_PIN) == 0);
ci->i_ceph_flags &= ~CEPH_I_FLUSH;
@@ -1497,13 +1534,16 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
{
struct ceph_msg *msg;
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS,
false);
if (!msg) {
- pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
- ceph_vinop(inode), ceph_cap_string(arg->dirty),
- arg->flush_tid);
+ pr_err_client(cl,
+ "error allocating cap msg: ino (%llx.%llx)"
+ " flushing %s tid %llu, requeuing cap.\n",
+ ceph_vinop(inode), ceph_cap_string(arg->dirty),
+ arg->flush_tid);
spin_lock(&ci->i_ceph_lock);
__cap_delay_requeue(arg->session->s_mdsc, ci);
spin_unlock(&ci->i_ceph_lock);
@@ -1592,11 +1632,13 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
{
struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap_snap *capsnap;
u64 oldest_flush_tid = 0;
u64 first_tid = 1, last_tid = 0;
- dout("__flush_snaps %p session %p\n", inode, session);
+ doutc(cl, "%p %llx.%llx session %p\n", inode, ceph_vinop(inode),
+ session);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
/*
@@ -1611,7 +1653,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
/* only flush each capsnap once */
if (capsnap->cap_flush.tid > 0) {
- dout(" already flushed %p, skipping\n", capsnap);
+ doutc(cl, "already flushed %p, skipping\n", capsnap);
continue;
}
@@ -1643,8 +1685,8 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
int ret;
if (!(cap && cap->session == session)) {
- dout("__flush_snaps %p auth cap %p not mds%d, "
- "stop\n", inode, cap, session->s_mds);
+ doutc(cl, "%p %llx.%llx auth cap %p not mds%d, stop\n",
+ inode, ceph_vinop(inode), cap, session->s_mds);
break;
}
@@ -1665,15 +1707,17 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
refcount_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
- dout("__flush_snaps %p capsnap %p tid %llu %s\n",
- inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
+ doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode,
+ ceph_vinop(inode), capsnap, cf->tid,
+ ceph_cap_string(capsnap->dirty));
ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
oldest_flush_tid);
if (ret < 0) {
- pr_err("__flush_snaps: error sending cap flushsnap, "
- "ino (%llx.%llx) tid %llu follows %llu\n",
- ceph_vinop(inode), cf->tid, capsnap->follows);
+ pr_err_client(cl, "error sending cap flushsnap, "
+ "ino (%llx.%llx) tid %llu follows %llu\n",
+ ceph_vinop(inode), cf->tid,
+ capsnap->follows);
}
ceph_put_cap_snap(capsnap);
@@ -1685,28 +1729,29 @@ void ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession)
{
struct inode *inode = &ci->netfs.inode;
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_mds_session *session = NULL;
bool need_put = false;
int mds;
- dout("ceph_flush_snaps %p\n", inode);
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
if (psession)
session = *psession;
retry:
spin_lock(&ci->i_ceph_lock);
if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
- dout(" no capsnap needs flush, doing nothing\n");
+ doutc(cl, " no capsnap needs flush, doing nothing\n");
goto out;
}
if (!ci->i_auth_cap) {
- dout(" no auth cap (migrating?), doing nothing\n");
+ doutc(cl, " no auth cap (migrating?), doing nothing\n");
goto out;
}
mds = ci->i_auth_cap->session->s_mds;
if (session && session->s_mds != mds) {
- dout(" oops, wrong session %p mutex\n", session);
+ doutc(cl, " oops, wrong session %p mutex\n", session);
ceph_put_mds_session(session);
session = NULL;
}
@@ -1750,23 +1795,25 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
struct ceph_cap_flush **pcf)
{
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->netfs.inode.i_sb)->mdsc;
+ ceph_sb_to_fs_client(ci->netfs.inode.i_sb)->mdsc;
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int was = ci->i_dirty_caps;
int dirty = 0;
lockdep_assert_held(&ci->i_ceph_lock);
if (!ci->i_auth_cap) {
- pr_warn("__mark_dirty_caps %p %llx mask %s, "
- "but no auth cap (session was closed?)\n",
- inode, ceph_ino(inode), ceph_cap_string(mask));
+ pr_warn_client(cl, "%p %llx.%llx mask %s, "
+ "but no auth cap (session was closed?)\n",
+ inode, ceph_vinop(inode),
+ ceph_cap_string(mask));
return 0;
}
- dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode,
- ceph_cap_string(mask), ceph_cap_string(was),
- ceph_cap_string(was | mask));
+ doutc(cl, "%p %llx.%llx %s dirty %s -> %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(mask),
+ ceph_cap_string(was), ceph_cap_string(was | mask));
ci->i_dirty_caps |= mask;
if (was == 0) {
struct ceph_mds_session *session = ci->i_auth_cap->session;
@@ -1779,8 +1826,9 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
ci->i_head_snapc = ceph_get_snap_context(
ci->i_snap_realm->cached_context);
}
- dout(" inode %p now dirty snapc %p auth cap %p\n",
- &ci->netfs.inode, ci->i_head_snapc, ci->i_auth_cap);
+ doutc(cl, "%p %llx.%llx now dirty snapc %p auth cap %p\n",
+ inode, ceph_vinop(inode), ci->i_head_snapc,
+ ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
list_add(&ci->i_dirty_item, &session->s_cap_dirty);
@@ -1873,7 +1921,8 @@ static u64 __mark_caps_flushing(struct inode *inode,
struct ceph_mds_session *session, bool wake,
u64 *oldest_flush_tid)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap_flush *cf = NULL;
int flushing;
@@ -1884,13 +1933,13 @@ static u64 __mark_caps_flushing(struct inode *inode,
BUG_ON(!ci->i_prealloc_cap_flush);
flushing = ci->i_dirty_caps;
- dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
- ceph_cap_string(flushing),
- ceph_cap_string(ci->i_flushing_caps),
- ceph_cap_string(ci->i_flushing_caps | flushing));
+ doutc(cl, "flushing %s, flushing_caps %s -> %s\n",
+ ceph_cap_string(flushing),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps | flushing));
ci->i_flushing_caps |= flushing;
ci->i_dirty_caps = 0;
- dout(" inode %p now !dirty\n", inode);
+ doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode));
swap(cf, ci->i_prealloc_cap_flush);
cf->caps = flushing;
@@ -1921,6 +1970,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u32 invalidating_gen = ci->i_rdcache_gen;
@@ -1932,12 +1982,13 @@ static int try_nonblocking_invalidate(struct inode *inode)
if (inode->i_data.nrpages == 0 &&
invalidating_gen == ci->i_rdcache_gen) {
/* success. */
- dout("try_nonblocking_invalidate %p success\n", inode);
+ doutc(cl, "%p %llx.%llx success\n", inode,
+ ceph_vinop(inode));
/* save any racing async invalidate some trouble */
ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
return 0;
}
- dout("try_nonblocking_invalidate %p failed\n", inode);
+ doutc(cl, "%p %llx.%llx failed\n", inode, ceph_vinop(inode));
return -1;
}
@@ -1969,6 +2020,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags)
{
struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap *cap;
u64 flush_tid, oldest_flush_tid;
int file_wanted, used, cap_used;
@@ -2043,9 +2095,9 @@ retry:
}
}
- dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s"
- " issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode),
- ceph_cap_string(file_wanted),
+ doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s "
+ "flushing %s issued %s revoking %s retain %s %s%s%s\n",
+ inode, ceph_vinop(inode), ceph_cap_string(file_wanted),
ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(issued), ceph_cap_string(revoking),
@@ -2066,10 +2118,10 @@ retry:
(revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
!tried_invalidate) {
- dout("check_caps trying to invalidate on %llx.%llx\n",
- ceph_vinop(inode));
+ doutc(cl, "trying to invalidate on %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
if (try_nonblocking_invalidate(inode) < 0) {
- dout("check_caps queuing invalidate\n");
+ doutc(cl, "queuing invalidate\n");
queue_invalidate = true;
ci->i_rdcache_revoking = ci->i_rdcache_gen;
}
@@ -2097,35 +2149,35 @@ retry:
cap_used &= ~ci->i_auth_cap->issued;
revoking = cap->implemented & ~cap->issued;
- dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
- cap->mds, cap, ceph_cap_string(cap_used),
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->implemented),
- ceph_cap_string(revoking));
+ doutc(cl, " mds%d cap %p used %s issued %s implemented %s revoking %s\n",
+ cap->mds, cap, ceph_cap_string(cap_used),
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->implemented),
+ ceph_cap_string(revoking));
if (cap == ci->i_auth_cap &&
(cap->issued & CEPH_CAP_FILE_WR)) {
/* request larger max_size from MDS? */
if (ci->i_wanted_max_size > ci->i_max_size &&
ci->i_wanted_max_size > ci->i_requested_max_size) {
- dout("requesting new max_size\n");
+ doutc(cl, "requesting new max_size\n");
goto ack;
}
/* approaching file_max? */
if (__ceph_should_report_size(ci)) {
- dout("i_size approaching max_size\n");
+ doutc(cl, "i_size approaching max_size\n");
goto ack;
}
}
/* flush anything dirty? */
if (cap == ci->i_auth_cap) {
if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
- dout("flushing dirty caps\n");
+ doutc(cl, "flushing dirty caps\n");
goto ack;
}
if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
- dout("flushing snap caps\n");
+ doutc(cl, "flushing snap caps\n");
goto ack;
}
}
@@ -2133,7 +2185,7 @@ retry:
/* completed revocation? going down and there are no caps? */
if (revoking) {
if ((revoking & cap_used) == 0) {
- dout("completed revocation of %s\n",
+ doutc(cl, "completed revocation of %s\n",
ceph_cap_string(cap->implemented & ~cap->issued));
goto ack;
}
@@ -2232,7 +2284,7 @@ ack:
*/
static int try_flush_caps(struct inode *inode, u64 *ptid)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
int flushing = 0;
u64 flush_tid = 0, oldest_flush_tid = 0;
@@ -2310,7 +2362,8 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
*/
static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req1 = NULL, *req2 = NULL;
int ret, err = 0;
@@ -2400,8 +2453,9 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
kfree(sessions);
}
- dout("%s %p wait on tid %llu %llu\n", __func__,
- inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
+ doutc(cl, "%p %llx.%llx wait on tid %llu %llu\n", inode,
+ ceph_vinop(inode), req1 ? req1->r_tid : 0ULL,
+ req2 ? req2->r_tid : 0ULL);
if (req1) {
ret = !wait_for_completion_timeout(&req1->r_safe_completion,
ceph_timeout_jiffies(req1->r_timeout));
@@ -2427,11 +2481,13 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
u64 flush_tid;
int ret, err;
int dirty;
- dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
+ doutc(cl, "%p %llx.%llx%s\n", inode, ceph_vinop(inode),
+ datasync ? " datasync" : "");
ret = file_write_and_wait_range(file, start, end);
if (datasync)
@@ -2442,7 +2498,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
dirty = try_flush_caps(inode, &flush_tid);
- dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+ doutc(cl, "dirty caps are %s\n", ceph_cap_string(dirty));
err = flush_mdlog_and_wait_inode_unsafe_requests(inode);
@@ -2463,7 +2519,8 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (err < 0)
ret = err;
out:
- dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
+ doutc(cl, "%p %llx.%llx%s result=%d\n", inode, ceph_vinop(inode),
+ datasync ? " datasync" : "", ret);
return ret;
}
@@ -2476,12 +2533,13 @@ out:
int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
u64 flush_tid;
int err = 0;
int dirty;
int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
- dout("write_inode %p wait=%d\n", inode, wait);
+ doutc(cl, "%p %llx.%llx wait=%d\n", inode, ceph_vinop(inode), wait);
ceph_fscache_unpin_writeback(inode, wbc);
if (wait) {
err = ceph_wait_on_async_create(inode);
@@ -2493,7 +2551,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
caps_are_flushed(inode, flush_tid));
} else {
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(inode->i_sb)->mdsc;
+ ceph_sb_to_fs_client(inode->i_sb)->mdsc;
spin_lock(&ci->i_ceph_lock);
if (__ceph_caps_dirty(ci))
@@ -2511,6 +2569,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
__acquires(ci->i_ceph_lock)
{
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap *cap;
struct ceph_cap_flush *cf;
int ret;
@@ -2536,8 +2595,8 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
- pr_err("%p auth cap %p not mds%d ???\n",
- inode, cap, session->s_mds);
+ pr_err_client(cl, "%p auth cap %p not mds%d ???\n",
+ inode, cap, session->s_mds);
break;
}
@@ -2546,8 +2605,9 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
if (!cf->is_capsnap) {
struct cap_msg_args arg;
- dout("kick_flushing_caps %p cap %p tid %llu %s\n",
- inode, cap, cf->tid, ceph_cap_string(cf->caps));
+ doutc(cl, "%p %llx.%llx cap %p tid %llu %s\n",
+ inode, ceph_vinop(inode), cap, cf->tid,
+ ceph_cap_string(cf->caps));
__prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH,
(cf->tid < last_snap_flush ?
CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
@@ -2561,9 +2621,9 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_snap *capsnap =
container_of(cf, struct ceph_cap_snap,
cap_flush);
- dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
- inode, capsnap, cf->tid,
- ceph_cap_string(capsnap->dirty));
+ doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n",
+ inode, ceph_vinop(inode), capsnap, cf->tid,
+ ceph_cap_string(capsnap->dirty));
refcount_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
@@ -2571,11 +2631,10 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
oldest_flush_tid);
if (ret < 0) {
- pr_err("kick_flushing_caps: error sending "
- "cap flushsnap, ino (%llx.%llx) "
- "tid %llu follows %llu\n",
- ceph_vinop(inode), cf->tid,
- capsnap->follows);
+ pr_err_client(cl, "error sending cap flushsnap,"
+ " %p %llx.%llx tid %llu follows %llu\n",
+ inode, ceph_vinop(inode), cf->tid,
+ capsnap->follows);
}
ceph_put_cap_snap(capsnap);
@@ -2588,22 +2647,26 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct ceph_cap *cap;
u64 oldest_flush_tid;
- dout("early_kick_flushing_caps mds%d\n", session->s_mds);
+ doutc(cl, "mds%d\n", session->s_mds);
spin_lock(&mdsc->cap_dirty_lock);
oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock);
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+ struct inode *inode = &ci->netfs.inode;
+
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
- pr_err("%p auth cap %p not mds%d ???\n",
- &ci->netfs.inode, cap, session->s_mds);
+ pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n",
+ inode, ceph_vinop(inode), cap,
+ session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2636,24 +2699,28 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct ceph_cap *cap;
u64 oldest_flush_tid;
lockdep_assert_held(&session->s_mutex);
- dout("kick_flushing_caps mds%d\n", session->s_mds);
+ doutc(cl, "mds%d\n", session->s_mds);
spin_lock(&mdsc->cap_dirty_lock);
oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock);
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+ struct inode *inode = &ci->netfs.inode;
+
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
- pr_err("%p auth cap %p not mds%d ???\n",
- &ci->netfs.inode, cap, session->s_mds);
+ pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n",
+ inode, ceph_vinop(inode), cap,
+ session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2670,11 +2737,13 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
{
struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_cap *cap = ci->i_auth_cap;
+ struct inode *inode = &ci->netfs.inode;
lockdep_assert_held(&ci->i_ceph_lock);
- dout("%s %p flushing %s\n", __func__, &ci->netfs.inode,
- ceph_cap_string(ci->i_flushing_caps));
+ doutc(mdsc->fsc->client, "%p %llx.%llx flushing %s\n",
+ inode, ceph_vinop(inode),
+ ceph_cap_string(ci->i_flushing_caps));
if (!list_empty(&ci->i_cap_flush_list)) {
u64 oldest_flush_tid;
@@ -2696,6 +2765,9 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
bool snap_rwsem_locked)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
lockdep_assert_held(&ci->i_ceph_lock);
if (got & CEPH_CAP_PIN)
@@ -2716,10 +2788,10 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
}
if (got & CEPH_CAP_FILE_BUFFER) {
if (ci->i_wb_ref == 0)
- ihold(&ci->netfs.inode);
+ ihold(inode);
ci->i_wb_ref++;
- dout("%s %p wb %d -> %d (?)\n", __func__,
- &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref);
+ doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode,
+ ceph_vinop(inode), ci->i_wb_ref-1, ci->i_wb_ref);
}
}
@@ -2746,20 +2818,23 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
loff_t endoff, int flags, int *got)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int ret = 0;
int have, implemented;
bool snap_rwsem_locked = false;
- dout("get_cap_refs %p need %s want %s\n", inode,
- ceph_cap_string(need), ceph_cap_string(want));
+ doutc(cl, "%p %llx.%llx need %s want %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(need),
+ ceph_cap_string(want));
again:
spin_lock(&ci->i_ceph_lock);
if ((flags & CHECK_FILELOCK) &&
(ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
- dout("try_get_cap_refs %p error filelock\n", inode);
+ doutc(cl, "%p %llx.%llx error filelock\n", inode,
+ ceph_vinop(inode));
ret = -EIO;
goto out_unlock;
}
@@ -2779,8 +2854,8 @@ again:
if (have & need & CEPH_CAP_FILE_WR) {
if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
- dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
- inode, endoff, ci->i_max_size);
+ doutc(cl, "%p %llx.%llx endoff %llu > maxsize %llu\n",
+ inode, ceph_vinop(inode), endoff, ci->i_max_size);
if (endoff > ci->i_requested_max_size)
ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN;
goto out_unlock;
@@ -2790,7 +2865,8 @@ again:
* can get a final snapshot value for size+mtime.
*/
if (__ceph_have_pending_cap_snap(ci)) {
- dout("get_cap_refs %p cap_snap_pending\n", inode);
+ doutc(cl, "%p %llx.%llx cap_snap_pending\n", inode,
+ ceph_vinop(inode));
goto out_unlock;
}
}
@@ -2808,9 +2884,9 @@ again:
int not = want & ~(have & need);
int revoking = implemented & ~have;
int exclude = revoking & not;
- dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
- inode, ceph_cap_string(have), ceph_cap_string(not),
- ceph_cap_string(revoking));
+ doutc(cl, "%p %llx.%llx have %s but not %s (revoking %s)\n",
+ inode, ceph_vinop(inode), ceph_cap_string(have),
+ ceph_cap_string(not), ceph_cap_string(revoking));
if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) {
if (!snap_rwsem_locked &&
!ci->i_head_snapc &&
@@ -2850,28 +2926,31 @@ again:
spin_unlock(&s->s_cap_lock);
}
if (session_readonly) {
- dout("get_cap_refs %p need %s but mds%d readonly\n",
- inode, ceph_cap_string(need), ci->i_auth_cap->mds);
+ doutc(cl, "%p %llx.%llx need %s but mds%d readonly\n",
+ inode, ceph_vinop(inode), ceph_cap_string(need),
+ ci->i_auth_cap->mds);
ret = -EROFS;
goto out_unlock;
}
if (ceph_inode_is_shutdown(inode)) {
- dout("get_cap_refs %p inode is shutdown\n", inode);
+ doutc(cl, "%p %llx.%llx inode is shutdown\n",
+ inode, ceph_vinop(inode));
ret = -ESTALE;
goto out_unlock;
}
mds_wanted = __ceph_caps_mds_wanted(ci, false);
if (need & ~mds_wanted) {
- dout("get_cap_refs %p need %s > mds_wanted %s\n",
- inode, ceph_cap_string(need),
- ceph_cap_string(mds_wanted));
+ doutc(cl, "%p %llx.%llx need %s > mds_wanted %s\n",
+ inode, ceph_vinop(inode), ceph_cap_string(need),
+ ceph_cap_string(mds_wanted));
ret = -EUCLEAN;
goto out_unlock;
}
- dout("get_cap_refs %p have %s need %s\n", inode,
- ceph_cap_string(have), ceph_cap_string(need));
+ doutc(cl, "%p %llx.%llx have %s need %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(have),
+ ceph_cap_string(need));
}
out_unlock:
@@ -2886,8 +2965,8 @@ out_unlock:
else if (ret == 1)
ceph_update_cap_hit(&mdsc->metric);
- dout("get_cap_refs %p ret %d got %s\n", inode,
- ret, ceph_cap_string(*got));
+ doutc(cl, "%p %llx.%llx ret %d got %s\n", inode,
+ ceph_vinop(inode), ret, ceph_cap_string(*got));
return ret;
}
@@ -2899,13 +2978,14 @@ out_unlock:
static void check_max_size(struct inode *inode, loff_t endoff)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int check = 0;
/* do we need to explicitly request a larger max_size? */
spin_lock(&ci->i_ceph_lock);
if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
- dout("write %p at large endoff %llu, req max_size\n",
- inode, endoff);
+ doutc(cl, "write %p %llx.%llx at large endoff %llu, req max_size\n",
+ inode, ceph_vinop(inode), endoff);
ci->i_wanted_max_size = endoff;
}
/* duplicate ceph_check_caps()'s logic */
@@ -2964,7 +3044,7 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
int want, loff_t endoff, int *got)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
int ret, _got, flags;
ret = ceph_pool_perm_check(inode, need);
@@ -3115,10 +3195,12 @@ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
if (!capsnap->need_flush &&
!capsnap->writing && !capsnap->dirty_pages) {
- dout("dropping cap_snap %p follows %llu\n",
- capsnap, capsnap->follows);
+ doutc(cl, "%p follows %llu\n", capsnap, capsnap->follows);
BUG_ON(capsnap->cap_flush.tid > 0);
ceph_put_snap_context(capsnap->context);
if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
@@ -3150,6 +3232,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
enum put_cap_refs_mode mode)
{
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int last = 0, put = 0, flushsnaps = 0, wake = 0;
bool check_flushsnaps = false;
@@ -3172,8 +3255,8 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
put++;
check_flushsnaps = true;
}
- dout("put_cap_refs %p wb %d -> %d (?)\n",
- inode, ci->i_wb_ref+1, ci->i_wb_ref);
+ doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode,
+ ceph_vinop(inode), ci->i_wb_ref+1, ci->i_wb_ref);
}
if (had & CEPH_CAP_FILE_WR) {
if (--ci->i_wr_ref == 0) {
@@ -3213,8 +3296,8 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
}
spin_unlock(&ci->i_ceph_lock);
- dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
- last ? " last" : "", put ? " put" : "");
+ doutc(cl, "%p %llx.%llx had %s%s%s\n", inode, ceph_vinop(inode),
+ ceph_cap_string(had), last ? " last" : "", put ? " put" : "");
switch (mode) {
case PUT_CAP_REFS_SYNC:
@@ -3264,6 +3347,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc)
{
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap_snap *capsnap = NULL, *iter;
int put = 0;
bool last = false;
@@ -3287,11 +3371,10 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
ceph_put_snap_context(ci->i_head_snapc);
ci->i_head_snapc = NULL;
}
- dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
- inode,
- ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
- ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
- last ? " LAST" : "");
+ doutc(cl, "on %p %llx.%llx head %d/%d -> %d/%d %s\n",
+ inode, ceph_vinop(inode), ci->i_wrbuffer_ref+nr,
+ ci->i_wrbuffer_ref_head+nr, ci->i_wrbuffer_ref,
+ ci->i_wrbuffer_ref_head, last ? " LAST" : "");
} else {
list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
if (iter->context == snapc) {
@@ -3321,13 +3404,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
}
}
}
- dout("put_wrbuffer_cap_refs on %p cap_snap %p "
- " snap %lld %d/%d -> %d/%d %s%s\n",
- inode, capsnap, capsnap->context->seq,
- ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
- ci->i_wrbuffer_ref, capsnap->dirty_pages,
- last ? " (wrbuffer last)" : "",
- complete_capsnap ? " (complete capsnap)" : "");
+ doutc(cl, "%p %llx.%llx cap_snap %p snap %lld %d/%d -> %d/%d %s%s\n",
+ inode, ceph_vinop(inode), capsnap, capsnap->context->seq,
+ ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
+ ci->i_wrbuffer_ref, capsnap->dirty_pages,
+ last ? " (wrbuffer last)" : "",
+ complete_capsnap ? " (complete capsnap)" : "");
}
unlock:
@@ -3350,9 +3432,10 @@ unlock:
*/
static void invalidate_aliases(struct inode *inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct dentry *dn, *prev = NULL;
- dout("invalidate_aliases inode %p\n", inode);
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
d_prune_aliases(inode);
/*
* For non-directory inode, d_find_alias() only returns
@@ -3411,6 +3494,7 @@ static void handle_cap_grant(struct inode *inode,
__releases(ci->i_ceph_lock)
__releases(session->s_mdsc->snap_rwsem)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
int seq = le32_to_cpu(grant->seq);
int newcaps = le32_to_cpu(grant->caps);
@@ -3434,10 +3518,11 @@ static void handle_cap_grant(struct inode *inode,
if (IS_ENCRYPTED(inode) && size)
size = extra_info->fscrypt_file_size;
- dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
- inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
- dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
- i_size_read(inode));
+ doutc(cl, "%p %llx.%llx cap %p mds%d seq %d %s\n", inode,
+ ceph_vinop(inode), cap, session->s_mds, seq,
+ ceph_cap_string(newcaps));
+ doutc(cl, " size %llu max_size %llu, i_size %llu\n", size,
+ max_size, i_size_read(inode));
/*
@@ -3497,15 +3582,17 @@ static void handle_cap_grant(struct inode *inode,
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
ci->i_btime = extra_info->btime;
- dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- from_kuid(&init_user_ns, inode->i_uid),
- from_kgid(&init_user_ns, inode->i_gid));
+ doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode,
+ ceph_vinop(inode), inode->i_mode,
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len ||
memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth,
ci->fscrypt_auth_len))
- pr_warn_ratelimited("%s: cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n",
- __func__, ci->fscrypt_auth_len,
+ pr_warn_ratelimited_client(cl,
+ "cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n",
+ ci->fscrypt_auth_len,
extra_info->fscrypt_auth_len);
#endif
}
@@ -3523,8 +3610,8 @@ static void handle_cap_grant(struct inode *inode,
u64 version = le64_to_cpu(grant->xattr_version);
if (version > ci->i_xattrs.version) {
- dout(" got new xattrs v%llu on %p len %d\n",
- version, inode, len);
+ doutc(cl, " got new xattrs v%llu on %p %llx.%llx len %d\n",
+ version, inode, ceph_vinop(inode), len);
if (ci->i_xattrs.blob)
ceph_buffer_put(ci->i_xattrs.blob);
ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
@@ -3575,8 +3662,8 @@ static void handle_cap_grant(struct inode *inode,
if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
if (max_size != ci->i_max_size) {
- dout("max_size %lld -> %llu\n",
- ci->i_max_size, max_size);
+ doutc(cl, "max_size %lld -> %llu\n", ci->i_max_size,
+ max_size);
ci->i_max_size = max_size;
if (max_size >= ci->i_wanted_max_size) {
ci->i_wanted_max_size = 0; /* reset */
@@ -3590,10 +3677,9 @@ static void handle_cap_grant(struct inode *inode,
wanted = __ceph_caps_wanted(ci);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
- dout(" my wanted = %s, used = %s, dirty %s\n",
- ceph_cap_string(wanted),
- ceph_cap_string(used),
- ceph_cap_string(dirty));
+ doutc(cl, " my wanted = %s, used = %s, dirty %s\n",
+ ceph_cap_string(wanted), ceph_cap_string(used),
+ ceph_cap_string(dirty));
if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) &&
(wanted & ~(cap->mds_wanted | newcaps))) {
@@ -3614,10 +3700,9 @@ static void handle_cap_grant(struct inode *inode,
if (cap->issued & ~newcaps) {
int revoking = cap->issued & ~newcaps;
- dout("revocation: %s -> %s (revoking %s)\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(newcaps),
- ceph_cap_string(revoking));
+ doutc(cl, "revocation: %s -> %s (revoking %s)\n",
+ ceph_cap_string(cap->issued), ceph_cap_string(newcaps),
+ ceph_cap_string(revoking));
if (S_ISREG(inode->i_mode) &&
(revoking & used & CEPH_CAP_FILE_BUFFER))
writeback = true; /* initiate writeback; will delay ack */
@@ -3635,11 +3720,12 @@ static void handle_cap_grant(struct inode *inode,
cap->issued = newcaps;
cap->implemented |= newcaps;
} else if (cap->issued == newcaps) {
- dout("caps unchanged: %s -> %s\n",
- ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
+ doutc(cl, "caps unchanged: %s -> %s\n",
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps));
} else {
- dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
- ceph_cap_string(newcaps));
+ doutc(cl, "grant: %s -> %s\n", ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps));
/* non-auth MDS is revoking the newly grant caps ? */
if (cap == ci->i_auth_cap &&
__ceph_caps_revoking_other(ci, cap, newcaps))
@@ -3727,7 +3813,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
__releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap_flush *cf, *tmp_cf;
LIST_HEAD(to_remove);
unsigned seq = le32_to_cpu(m->seq);
@@ -3764,11 +3851,11 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
}
}
- dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
- " flushing %s -> %s\n",
- inode, session->s_mds, seq, ceph_cap_string(dirty),
- ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
- ceph_cap_string(ci->i_flushing_caps & ~cleaned));
+ doutc(cl, "%p %llx.%llx mds%d seq %d on %s cleaned %s, flushing %s -> %s\n",
+ inode, ceph_vinop(inode), session->s_mds, seq,
+ ceph_cap_string(dirty), ceph_cap_string(cleaned),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps & ~cleaned));
if (list_empty(&to_remove) && !cleaned)
goto out;
@@ -3784,18 +3871,21 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
if (list_empty(&ci->i_cap_flush_list)) {
list_del_init(&ci->i_flushing_item);
if (!list_empty(&session->s_cap_flushing)) {
- dout(" mds%d still flushing cap on %p\n",
- session->s_mds,
- &list_first_entry(&session->s_cap_flushing,
- struct ceph_inode_info,
- i_flushing_item)->netfs.inode);
+ struct inode *inode =
+ &list_first_entry(&session->s_cap_flushing,
+ struct ceph_inode_info,
+ i_flushing_item)->netfs.inode;
+ doutc(cl, " mds%d still flushing cap on %p %llx.%llx\n",
+ session->s_mds, inode, ceph_vinop(inode));
}
}
mdsc->num_cap_flushing--;
- dout(" inode %p now !flushing\n", inode);
+ doutc(cl, " %p %llx.%llx now !flushing\n", inode,
+ ceph_vinop(inode));
if (ci->i_dirty_caps == 0) {
- dout(" inode %p now clean\n", inode);
+ doutc(cl, " %p %llx.%llx now clean\n", inode,
+ ceph_vinop(inode));
BUG_ON(!list_empty(&ci->i_dirty_item));
drop = true;
if (ci->i_wr_ref == 0 &&
@@ -3833,12 +3923,14 @@ void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
bool *wake_ci, bool *wake_mdsc)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
bool ret;
lockdep_assert_held(&ci->i_ceph_lock);
- dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci);
+ doutc(cl, "removing capsnap %p, %p %llx.%llx ci %p\n", capsnap,
+ inode, ceph_vinop(inode), ci);
list_del_init(&capsnap->ci_item);
ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
@@ -3877,29 +3969,31 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_mds_session *session)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
u64 follows = le64_to_cpu(m->snap_follows);
struct ceph_cap_snap *capsnap = NULL, *iter;
bool wake_ci = false;
bool wake_mdsc = false;
- dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
- inode, ci, session->s_mds, follows);
+ doutc(cl, "%p %llx.%llx ci %p mds%d follows %lld\n", inode,
+ ceph_vinop(inode), ci, session->s_mds, follows);
spin_lock(&ci->i_ceph_lock);
list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
if (iter->follows == follows) {
if (iter->cap_flush.tid != flush_tid) {
- dout(" cap_snap %p follows %lld tid %lld !="
- " %lld\n", iter, follows,
- flush_tid, iter->cap_flush.tid);
+ doutc(cl, " cap_snap %p follows %lld "
+ "tid %lld != %lld\n", iter,
+ follows, flush_tid,
+ iter->cap_flush.tid);
break;
}
capsnap = iter;
break;
} else {
- dout(" skipping cap_snap %p follows %lld\n",
- iter, iter->follows);
+ doutc(cl, " skipping cap_snap %p follows %lld\n",
+ iter, iter->follows);
}
}
if (capsnap)
@@ -3928,6 +4022,7 @@ static bool handle_cap_trunc(struct inode *inode,
struct cap_extra_info *extra_info)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int mds = session->s_mds;
int seq = le32_to_cpu(trunc->seq);
u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
@@ -3950,8 +4045,8 @@ static bool handle_cap_trunc(struct inode *inode,
if (IS_ENCRYPTED(inode) && size)
size = extra_info->fscrypt_file_size;
- dout("%s inode %p mds%d seq %d to %lld truncate seq %d\n",
- __func__, inode, mds, seq, truncate_size, truncate_seq);
+ doutc(cl, "%p %llx.%llx mds%d seq %d to %lld truncate seq %d\n",
+ inode, ceph_vinop(inode), mds, seq, truncate_size, truncate_seq);
queue_trunc = ceph_fill_file_size(inode, issued,
truncate_seq, truncate_size, size);
return queue_trunc;
@@ -3969,7 +4064,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
struct ceph_mds_cap_peer *ph,
struct ceph_mds_session *session)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_session *tsession = NULL;
struct ceph_cap *cap, *tcap, *new_cap = NULL;
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -3989,8 +4085,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
target = -1;
}
- dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
- inode, ci, mds, mseq, target);
+ doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d target %d\n",
+ inode, ceph_vinop(inode), ci, mds, mseq, target);
retry:
down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
@@ -3999,7 +4095,7 @@ retry:
goto out_unlock;
if (target < 0) {
- ceph_remove_cap(cap, false);
+ ceph_remove_cap(mdsc, cap, false);
goto out_unlock;
}
@@ -4010,12 +4106,13 @@ retry:
issued = cap->issued;
if (issued != cap->implemented)
- pr_err_ratelimited("handle_cap_export: issued != implemented: "
- "ino (%llx.%llx) mds%d seq %d mseq %d "
- "issued %s implemented %s\n",
- ceph_vinop(inode), mds, cap->seq, cap->mseq,
- ceph_cap_string(issued),
- ceph_cap_string(cap->implemented));
+ pr_err_ratelimited_client(cl, "issued != implemented: "
+ "%p %llx.%llx mds%d seq %d mseq %d"
+ " issued %s implemented %s\n",
+ inode, ceph_vinop(inode), mds,
+ cap->seq, cap->mseq,
+ ceph_cap_string(issued),
+ ceph_cap_string(cap->implemented));
tcap = __get_cap_for_mds(ci, target);
@@ -4023,7 +4120,8 @@ retry:
/* already have caps from the target */
if (tcap->cap_id == t_cap_id &&
ceph_seq_cmp(tcap->seq, t_seq) < 0) {
- dout(" updating import cap %p mds%d\n", tcap, target);
+ doutc(cl, " updating import cap %p mds%d\n", tcap,
+ target);
tcap->cap_id = t_cap_id;
tcap->seq = t_seq - 1;
tcap->issue_seq = t_seq - 1;
@@ -4034,7 +4132,7 @@ retry:
change_auth_cap_ses(ci, tcap->session);
}
}
- ceph_remove_cap(cap, false);
+ ceph_remove_cap(mdsc, cap, false);
goto out_unlock;
} else if (tsession) {
/* add placeholder for the export tagert */
@@ -4051,7 +4149,7 @@ retry:
spin_unlock(&mdsc->cap_dirty_lock);
}
- ceph_remove_cap(cap, false);
+ ceph_remove_cap(mdsc, cap, false);
goto out_unlock;
}
@@ -4104,6 +4202,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
struct ceph_cap **target_cap, int *old_issued)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap *cap, *ocap, *new_cap = NULL;
int mds = session->s_mds;
int issued;
@@ -4124,8 +4223,8 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
peer = -1;
}
- dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
- inode, ci, mds, mseq, peer);
+ doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d peer %d\n",
+ inode, ceph_vinop(inode), ci, mds, mseq, peer);
retry:
cap = __get_cap_for_mds(ci, mds);
if (!cap) {
@@ -4151,20 +4250,20 @@ retry:
ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
if (ocap && ocap->cap_id == p_cap_id) {
- dout(" remove export cap %p mds%d flags %d\n",
- ocap, peer, ph->flags);
+ doutc(cl, " remove export cap %p mds%d flags %d\n",
+ ocap, peer, ph->flags);
if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
(ocap->seq != le32_to_cpu(ph->seq) ||
ocap->mseq != le32_to_cpu(ph->mseq))) {
- pr_err_ratelimited("handle_cap_import: "
- "mismatched seq/mseq: ino (%llx.%llx) "
- "mds%d seq %d mseq %d importer mds%d "
- "has peer seq %d mseq %d\n",
- ceph_vinop(inode), peer, ocap->seq,
- ocap->mseq, mds, le32_to_cpu(ph->seq),
+ pr_err_ratelimited_client(cl, "mismatched seq/mseq: "
+ "%p %llx.%llx mds%d seq %d mseq %d"
+ " importer mds%d has peer seq %d mseq %d\n",
+ inode, ceph_vinop(inode), peer,
+ ocap->seq, ocap->mseq, mds,
+ le32_to_cpu(ph->seq),
le32_to_cpu(ph->mseq));
}
- ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+ ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
}
*old_issued = issued;
@@ -4227,6 +4326,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_cap *cap;
@@ -4245,7 +4345,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
bool close_sessions = false;
bool do_cap_release = false;
- dout("handle_caps from mds%d\n", session->s_mds);
+ doutc(cl, "from mds%d\n", session->s_mds);
if (!ceph_inc_mds_stopping_blocker(mdsc, session))
return;
@@ -4347,15 +4447,15 @@ void ceph_handle_caps(struct ceph_mds_session *session,
/* lookup ino */
inode = ceph_find_inode(mdsc->fsc->sb, vino);
- dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
- vino.snap, inode);
+ doutc(cl, " op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op),
+ vino.ino, vino.snap, inode);
mutex_lock(&session->s_mutex);
- dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
- (unsigned)seq);
+ doutc(cl, " mds%d seq %lld cap seq %u\n", session->s_mds,
+ session->s_seq, (unsigned)seq);
if (!inode) {
- dout(" i don't have ino %llx\n", vino.ino);
+ doutc(cl, " i don't have ino %llx\n", vino.ino);
switch (op) {
case CEPH_CAP_OP_IMPORT:
@@ -4410,9 +4510,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds);
if (!cap) {
- dout(" no cap on %p ino %llx.%llx from mds%d\n",
- inode, ceph_ino(inode), ceph_snap(inode),
- session->s_mds);
+ doutc(cl, " no cap on %p ino %llx.%llx from mds%d\n",
+ inode, ceph_ino(inode), ceph_snap(inode),
+ session->s_mds);
spin_unlock(&ci->i_ceph_lock);
switch (op) {
case CEPH_CAP_OP_REVOKE:
@@ -4450,8 +4550,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
default:
spin_unlock(&ci->i_ceph_lock);
- pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
- ceph_cap_op_name(op));
+ pr_err_client(cl, "unknown cap op %d %s\n", op,
+ ceph_cap_op_name(op));
}
done:
@@ -4492,7 +4592,7 @@ flush_cap_releases:
goto done;
bad:
- pr_err("ceph_handle_caps: corrupt message\n");
+ pr_err_client(cl, "corrupt message\n");
ceph_msg_dump(msg);
goto out;
}
@@ -4506,6 +4606,7 @@ bad:
*/
unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
@@ -4513,14 +4614,14 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
unsigned long loop_start = jiffies;
unsigned long delay = 0;
- dout("check_delayed_caps\n");
+ doutc(cl, "begin\n");
spin_lock(&mdsc->cap_delay_lock);
while (!list_empty(&mdsc->cap_delay_list)) {
ci = list_first_entry(&mdsc->cap_delay_list,
struct ceph_inode_info,
i_cap_delay_list);
if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
- dout("%s caps added recently. Exiting loop", __func__);
+ doutc(cl, "caps added recently. Exiting loop");
delay = ci->i_hold_caps_max;
break;
}
@@ -4532,13 +4633,15 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
inode = igrab(&ci->netfs.inode);
if (inode) {
spin_unlock(&mdsc->cap_delay_lock);
- dout("check_delayed_caps on %p\n", inode);
+ doutc(cl, "on %p %llx.%llx\n", inode,
+ ceph_vinop(inode));
ceph_check_caps(ci, 0);
iput(inode);
spin_lock(&mdsc->cap_delay_lock);
}
}
spin_unlock(&mdsc->cap_delay_lock);
+ doutc(cl, "done\n");
return delay;
}
@@ -4549,17 +4652,18 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
static void flush_dirty_session_caps(struct ceph_mds_session *s)
{
struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct inode *inode;
- dout("flush_dirty_caps\n");
+ doutc(cl, "begin\n");
spin_lock(&mdsc->cap_dirty_lock);
while (!list_empty(&s->s_cap_dirty)) {
ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
i_dirty_item);
inode = &ci->netfs.inode;
ihold(inode);
- dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
spin_unlock(&mdsc->cap_dirty_lock);
ceph_wait_on_async_create(inode);
ceph_check_caps(ci, CHECK_CAPS_FLUSH);
@@ -4567,7 +4671,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
spin_lock(&mdsc->cap_dirty_lock);
}
spin_unlock(&mdsc->cap_dirty_lock);
- dout("flush_dirty_caps done\n");
+ doutc(cl, "done\n");
}
void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
@@ -4672,7 +4776,7 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
if (__ceph_caps_dirty(ci)) {
struct ceph_mds_client *mdsc =
- ceph_inode_to_client(inode)->mdsc;
+ ceph_inode_to_fs_client(inode)->mdsc;
__cap_delay_requeue_front(mdsc, ci);
}
}
@@ -4692,6 +4796,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
int mds, int drop, int unless, int force)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap *cap;
struct ceph_mds_request_release *rel = *p;
int used, dirty;
@@ -4701,9 +4806,9 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
- dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
- inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
- ceph_cap_string(unless));
+ doutc(cl, "%p %llx.%llx mds%d used|dirty %s drop %s unless %s\n",
+ inode, ceph_vinop(inode), mds, ceph_cap_string(used|dirty),
+ ceph_cap_string(drop), ceph_cap_string(unless));
/* only drop unused, clean caps */
drop &= ~(used | dirty);
@@ -4725,12 +4830,13 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
if (force || (cap->issued & drop)) {
if (cap->issued & drop) {
int wanted = __ceph_caps_wanted(ci);
- dout("encode_inode_release %p cap %p "
- "%s -> %s, wanted %s -> %s\n", inode, cap,
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & ~drop),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(wanted));
+ doutc(cl, "%p %llx.%llx cap %p %s -> %s, "
+ "wanted %s -> %s\n", inode,
+ ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & ~drop),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(wanted));
cap->issued &= ~drop;
cap->implemented &= ~drop;
@@ -4739,9 +4845,9 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
!(wanted & CEPH_CAP_ANY_FILE_WR))
ci->i_requested_max_size = 0;
} else {
- dout("encode_inode_release %p cap %p %s"
- " (force)\n", inode, cap,
- ceph_cap_string(cap->issued));
+ doutc(cl, "%p %llx.%llx cap %p %s (force)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued));
}
rel->ino = cpu_to_le64(ceph_ino(inode));
@@ -4756,8 +4862,9 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
*p += sizeof(*rel);
ret = 1;
} else {
- dout("encode_inode_release %p cap %p %s (noop)\n",
- inode, cap, ceph_cap_string(cap->issued));
+ doutc(cl, "%p %llx.%llx cap %p %s (noop)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued));
}
}
spin_unlock(&ci->i_ceph_lock);
@@ -4783,6 +4890,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
struct dentry *parent = NULL;
struct ceph_mds_request_release *rel = *p;
struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct ceph_client *cl;
int force = 0;
int ret;
@@ -4804,10 +4912,11 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
dput(parent);
+ cl = ceph_inode_to_client(dir);
spin_lock(&dentry->d_lock);
if (ret && di->lease_session && di->lease_session->s_mds == mds) {
- dout("encode_dentry_release %p mds%d seq %d\n",
- dentry, mds, (int)di->lease_seq);
+ doutc(cl, "%p mds%d seq %d\n", dentry, mds,
+ (int)di->lease_seq);
rel->dname_seq = cpu_to_le32(di->lease_seq);
__ceph_mdsc_drop_dentry_lease(dentry);
spin_unlock(&dentry->d_lock);
@@ -4833,12 +4942,14 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap_snap *capsnap;
int capsnap_release = 0;
lockdep_assert_held(&ci->i_ceph_lock);
- dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode);
+ doutc(cl, "removing capsnaps, ci is %p, %p %llx.%llx\n",
+ ci, inode, ceph_vinop(inode));
while (!list_empty(&ci->i_cap_snaps)) {
capsnap = list_first_entry(&ci->i_cap_snaps,
@@ -4855,8 +4966,9 @@ static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_client *cl = fsc->client;
struct ceph_inode_info *ci = ceph_inode(inode);
bool is_auth;
bool dirty_dropped = false;
@@ -4864,8 +4976,8 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali
lockdep_assert_held(&ci->i_ceph_lock);
- dout("removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->netfs.inode);
+ doutc(cl, "removing cap %p, ci is %p, %p %llx.%llx\n",
+ cap, ci, inode, ceph_vinop(inode));
is_auth = (cap == ci->i_auth_cap);
__ceph_remove_cap(cap, false);
@@ -4892,19 +5004,19 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali
}
if (!list_empty(&ci->i_dirty_item)) {
- pr_warn_ratelimited(
- " dropping dirty %s state for %p %lld\n",
+ pr_warn_ratelimited_client(cl,
+ " dropping dirty %s state for %p %llx.%llx\n",
ceph_cap_string(ci->i_dirty_caps),
- inode, ceph_ino(inode));
+ inode, ceph_vinop(inode));
ci->i_dirty_caps = 0;
list_del_init(&ci->i_dirty_item);
dirty_dropped = true;
}
if (!list_empty(&ci->i_flushing_item)) {
- pr_warn_ratelimited(
- " dropping dirty+flushing %s state for %p %lld\n",
+ pr_warn_ratelimited_client(cl,
+ " dropping dirty+flushing %s state for %p %llx.%llx\n",
ceph_cap_string(ci->i_flushing_caps),
- inode, ceph_ino(inode));
+ inode, ceph_vinop(inode));
ci->i_flushing_caps = 0;
list_del_init(&ci->i_flushing_item);
mdsc->num_cap_flushing--;
@@ -4927,8 +5039,9 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali
if (atomic_read(&ci->i_filelock_ref) > 0) {
/* make further file lock syscall return -EIO */
ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
- pr_warn_ratelimited(" dropping file locks for %p %lld\n",
- inode, ceph_ino(inode));
+ pr_warn_ratelimited_client(cl,
+ " dropping file locks for %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
}
if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index e3b1c3fab412..3b3c4d8d401e 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -113,7 +113,7 @@ static int ceph_crypt_set_context(struct inode *inode, const void *ctx,
cia.fscrypt_auth = cfa;
- ret = __ceph_setattr(inode, &attr, &cia);
+ ret = __ceph_setattr(&nop_mnt_idmap, inode, &attr, &cia);
if (ret == 0)
inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
kfree(cia.fscrypt_auth);
@@ -129,7 +129,7 @@ static bool ceph_crypt_empty_dir(struct inode *inode)
static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb)
{
- return ceph_sb_to_client(sb)->fsc_dummy_enc_policy.policy;
+ return ceph_sb_to_fs_client(sb)->fsc_dummy_enc_policy.policy;
}
static struct fscrypt_operations ceph_fscrypt_ops = {
@@ -212,6 +212,7 @@ void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req,
static struct inode *parse_longname(const struct inode *parent,
const char *name, int *name_len)
{
+ struct ceph_client *cl = ceph_inode_to_client(parent);
struct inode *dir = NULL;
struct ceph_vino vino = { .snap = CEPH_NOSNAP };
char *inode_number;
@@ -223,12 +224,12 @@ static struct inode *parse_longname(const struct inode *parent,
name++;
name_end = strrchr(name, '_');
if (!name_end) {
- dout("Failed to parse long snapshot name: %s\n", name);
+ doutc(cl, "failed to parse long snapshot name: %s\n", name);
return ERR_PTR(-EIO);
}
*name_len = (name_end - name);
if (*name_len <= 0) {
- pr_err("Failed to parse long snapshot name\n");
+ pr_err_client(cl, "failed to parse long snapshot name\n");
return ERR_PTR(-EIO);
}
@@ -240,7 +241,7 @@ static struct inode *parse_longname(const struct inode *parent,
return ERR_PTR(-ENOMEM);
ret = kstrtou64(inode_number, 10, &vino.ino);
if (ret) {
- dout("Failed to parse inode number: %s\n", name);
+ doutc(cl, "failed to parse inode number: %s\n", name);
dir = ERR_PTR(ret);
goto out;
}
@@ -251,7 +252,7 @@ static struct inode *parse_longname(const struct inode *parent,
/* This can happen if we're not mounting cephfs on the root */
dir = ceph_get_inode(parent->i_sb, vino, NULL);
if (IS_ERR(dir))
- dout("Can't find inode %s (%s)\n", inode_number, name);
+ doutc(cl, "can't find inode %s (%s)\n", inode_number, name);
}
out:
@@ -262,6 +263,7 @@ out:
int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
char *buf)
{
+ struct ceph_client *cl = ceph_inode_to_client(parent);
struct inode *dir = parent;
struct qstr iname;
u32 len;
@@ -330,7 +332,7 @@ int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
/* base64 encode the encrypted name */
elen = ceph_base64_encode(cryptbuf, len, buf);
- dout("base64-encoded ciphertext name = %.*s\n", elen, buf);
+ doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, buf);
/* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */
WARN_ON(elen > 240);
@@ -505,7 +507,10 @@ int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
struct page *page, unsigned int len,
unsigned int offs, u64 lblk_num)
{
- dout("%s: len %u offs %u blk %llu\n", __func__, len, offs, lblk_num);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
+ doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode,
+ ceph_vinop(inode), len, offs, lblk_num);
return fscrypt_decrypt_block_inplace(inode, page, len, offs, lblk_num);
}
@@ -514,7 +519,10 @@ int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
unsigned int offs, u64 lblk_num,
gfp_t gfp_flags)
{
- dout("%s: len %u offs %u blk %llu\n", __func__, len, offs, lblk_num);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
+ doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode,
+ ceph_vinop(inode), len, offs, lblk_num);
return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num,
gfp_flags);
}
@@ -583,6 +591,7 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
u64 off, struct ceph_sparse_extent *map,
u32 ext_cnt)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int i, ret = 0;
struct ceph_inode_info *ci = ceph_inode(inode);
u64 objno, objoff;
@@ -590,7 +599,8 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
/* Nothing to do for empty array */
if (ext_cnt == 0) {
- dout("%s: empty array, ret 0\n", __func__);
+ doutc(cl, "%p %llx.%llx empty array, ret 0\n", inode,
+ ceph_vinop(inode));
return 0;
}
@@ -604,14 +614,17 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
int fret;
if ((ext->off | ext->len) & ~CEPH_FSCRYPT_BLOCK_MASK) {
- pr_warn("%s: bad encrypted sparse extent idx %d off %llx len %llx\n",
- __func__, i, ext->off, ext->len);
+ pr_warn_client(cl,
+ "%p %llx.%llx bad encrypted sparse extent "
+ "idx %d off %llx len %llx\n",
+ inode, ceph_vinop(inode), i, ext->off,
+ ext->len);
return -EIO;
}
fret = ceph_fscrypt_decrypt_pages(inode, &page[pgidx],
off + pgsoff, ext->len);
- dout("%s: [%d] 0x%llx~0x%llx fret %d\n", __func__, i,
- ext->off, ext->len, fret);
+ doutc(cl, "%p %llx.%llx [%d] 0x%llx~0x%llx fret %d\n", inode,
+ ceph_vinop(inode), i, ext->off, ext->len, fret);
if (fret < 0) {
if (ret == 0)
ret = fret;
@@ -619,7 +632,7 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
}
ret = pgsoff + fret;
}
- dout("%s: ret %d\n", __func__, ret);
+ doutc(cl, "ret %d\n", ret);
return ret;
}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 3904333fa6c3..24c08078f5aa 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -81,7 +81,7 @@ static int mdsc_show(struct seq_file *s, void *p)
if (req->r_inode) {
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
} else if (req->r_dentry) {
- path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
&pathbase, 0);
if (IS_ERR(path))
path = NULL;
@@ -100,7 +100,7 @@ static int mdsc_show(struct seq_file *s, void *p)
}
if (req->r_old_dentry) {
- path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
+ path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen,
&pathbase, 0);
if (IS_ERR(path))
path = NULL;
@@ -398,7 +398,7 @@ DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
{
- dout("ceph_fs_debugfs_cleanup\n");
+ doutc(fsc->client, "begin\n");
debugfs_remove(fsc->debugfs_bdi);
debugfs_remove(fsc->debugfs_congestion_kb);
debugfs_remove(fsc->debugfs_mdsmap);
@@ -407,13 +407,14 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
debugfs_remove(fsc->debugfs_status);
debugfs_remove(fsc->debugfs_mdsc);
debugfs_remove_recursive(fsc->debugfs_metrics_dir);
+ doutc(fsc->client, "done\n");
}
void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{
char name[100];
- dout("ceph_fs_debugfs_init\n");
+ doutc(fsc->client, "begin\n");
fsc->debugfs_congestion_kb =
debugfs_create_file("writeback_congestion_kb",
0600,
@@ -469,6 +470,7 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
&metrics_size_fops);
debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc,
&metrics_caps_fops);
+ doutc(fsc->client, "done\n");
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 854cbdd66661..91709934c8b1 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -109,7 +109,9 @@ static int fpos_cmp(loff_t l, loff_t r)
* regardless of what dir changes take place on the
* server.
*/
-static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name,
+static int note_last_dentry(struct ceph_fs_client *fsc,
+ struct ceph_dir_file_info *dfi,
+ const char *name,
int len, unsigned next_offset)
{
char *buf = kmalloc(len+1, GFP_KERNEL);
@@ -120,7 +122,7 @@ static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name,
memcpy(dfi->last_name, name, len);
dfi->last_name[len] = 0;
dfi->next_offset = next_offset;
- dout("note_last_dentry '%s'\n", dfi->last_name);
+ doutc(fsc->client, "'%s'\n", dfi->last_name);
return 0;
}
@@ -130,6 +132,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
struct ceph_readdir_cache_control *cache_ctl)
{
struct inode *dir = d_inode(parent);
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct dentry *dentry;
unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
loff_t ptr_pos = idx * sizeof(struct dentry *);
@@ -142,7 +145,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
ceph_readdir_cache_release(cache_ctl);
cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
if (!cache_ctl->page) {
- dout(" page %lu not found\n", ptr_pgoff);
+ doutc(cl, " page %lu not found\n", ptr_pgoff);
return ERR_PTR(-EAGAIN);
}
/* reading/filling the cache are serialized by
@@ -185,13 +188,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
struct ceph_dir_file_info *dfi = file->private_data;
struct dentry *parent = file->f_path.dentry;
struct inode *dir = d_inode(parent);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(dir);
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct dentry *dentry, *last = NULL;
struct ceph_dentry_info *di;
struct ceph_readdir_cache_control cache_ctl = {};
u64 idx = 0;
int err = 0;
- dout("__dcache_readdir %p v%u at %llx\n", dir, (unsigned)shared_gen, ctx->pos);
+ doutc(cl, "%p %llx.%llx v%u at %llx\n", dir, ceph_vinop(dir),
+ (unsigned)shared_gen, ctx->pos);
/* search start position */
if (ctx->pos > 2) {
@@ -221,7 +227,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
dput(dentry);
}
- dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
+ doutc(cl, "%p %llx.%llx cache idx %llu\n", dir,
+ ceph_vinop(dir), idx);
}
@@ -257,8 +264,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
spin_unlock(&dentry->d_lock);
if (emit_dentry) {
- dout(" %llx dentry %p %pd %p\n", di->offset,
- dentry, dentry, d_inode(dentry));
+ doutc(cl, " %llx dentry %p %pd %p\n", di->offset,
+ dentry, dentry, d_inode(dentry));
ctx->pos = di->offset;
if (!dir_emit(ctx, dentry->d_name.name,
dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
@@ -281,7 +288,8 @@ out:
if (last) {
int ret;
di = ceph_dentry(last);
- ret = note_last_dentry(dfi, last->d_name.name, last->d_name.len,
+ ret = note_last_dentry(fsc, dfi, last->d_name.name,
+ last->d_name.len,
fpos_off(di->offset) + 1);
if (ret < 0)
err = ret;
@@ -310,20 +318,23 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
struct ceph_dir_file_info *dfi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_client *cl = fsc->client;
int i;
int err;
unsigned frag = -1;
struct ceph_mds_reply_info_parsed *rinfo;
- dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
+ doutc(cl, "%p %llx.%llx file %p pos %llx\n", inode,
+ ceph_vinop(inode), file, ctx->pos);
if (dfi->file_info.flags & CEPH_F_ATEND)
return 0;
/* always start with . and .. */
if (ctx->pos == 0) {
- dout("readdir off 0 -> '.'\n");
+ doutc(cl, "%p %llx.%llx off 0 -> '.'\n", inode,
+ ceph_vinop(inode));
if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
inode->i_mode >> 12))
return 0;
@@ -337,7 +348,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
ino = ceph_present_inode(dentry->d_parent->d_inode);
spin_unlock(&dentry->d_lock);
- dout("readdir off 1 -> '..'\n");
+ doutc(cl, "%p %llx.%llx off 1 -> '..'\n", inode,
+ ceph_vinop(inode));
if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
return 0;
ctx->pos = 2;
@@ -391,8 +403,8 @@ more:
frag = fpos_frag(ctx->pos);
}
- dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
- ceph_vinop(inode), frag, dfi->last_name);
+ doutc(cl, "fetching %p %llx.%llx frag %x offset '%s'\n",
+ inode, ceph_vinop(inode), frag, dfi->last_name);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -446,12 +458,12 @@ more:
ceph_mdsc_put_request(req);
return err;
}
- dout("readdir got and parsed readdir result=%d on "
- "frag %x, end=%d, complete=%d, hash_order=%d\n",
- err, frag,
- (int)req->r_reply_info.dir_end,
- (int)req->r_reply_info.dir_complete,
- (int)req->r_reply_info.hash_order);
+ doutc(cl, "%p %llx.%llx got and parsed readdir result=%d"
+ "on frag %x, end=%d, complete=%d, hash_order=%d\n",
+ inode, ceph_vinop(inode), err, frag,
+ (int)req->r_reply_info.dir_end,
+ (int)req->r_reply_info.dir_complete,
+ (int)req->r_reply_info.hash_order);
rinfo = &req->r_reply_info;
if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
@@ -481,7 +493,8 @@ more:
dfi->dir_ordered_count = req->r_dir_ordered_cnt;
}
} else {
- dout("readdir !did_prepopulate\n");
+ doutc(cl, "%p %llx.%llx !did_prepopulate\n", inode,
+ ceph_vinop(inode));
/* disable readdir cache */
dfi->readdir_cache_idx = -1;
/* preclude from marking dir complete */
@@ -494,8 +507,8 @@ more:
rinfo->dir_entries + (rinfo->dir_nr-1);
unsigned next_offset = req->r_reply_info.dir_end ?
2 : (fpos_off(rde->offset) + 1);
- err = note_last_dentry(dfi, rde->name, rde->name_len,
- next_offset);
+ err = note_last_dentry(fsc, dfi, rde->name,
+ rde->name_len, next_offset);
if (err) {
ceph_mdsc_put_request(dfi->last_readdir);
dfi->last_readdir = NULL;
@@ -508,9 +521,9 @@ more:
}
rinfo = &dfi->last_readdir->r_reply_info;
- dout("readdir frag %x num %d pos %llx chunk first %llx\n",
- dfi->frag, rinfo->dir_nr, ctx->pos,
- rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
+ doutc(cl, "%p %llx.%llx frag %x num %d pos %llx chunk first %llx\n",
+ inode, ceph_vinop(inode), dfi->frag, rinfo->dir_nr, ctx->pos,
+ rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
i = 0;
/* search start position */
@@ -530,8 +543,9 @@ more:
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
if (rde->offset < ctx->pos) {
- pr_warn("%s: rde->offset 0x%llx ctx->pos 0x%llx\n",
- __func__, rde->offset, ctx->pos);
+ pr_warn_client(cl,
+ "%p %llx.%llx rde->offset 0x%llx ctx->pos 0x%llx\n",
+ inode, ceph_vinop(inode), rde->offset, ctx->pos);
return -EIO;
}
@@ -539,9 +553,9 @@ more:
return -EIO;
ctx->pos = rde->offset;
- dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
- i, rinfo->dir_nr, ctx->pos,
- rde->name_len, rde->name, &rde->inode.in);
+ doutc(cl, "%p %llx.%llx (%d/%d) -> %llx '%.*s' %p\n", inode,
+ ceph_vinop(inode), i, rinfo->dir_nr, ctx->pos,
+ rde->name_len, rde->name, &rde->inode.in);
if (!dir_emit(ctx, rde->name, rde->name_len,
ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
@@ -552,7 +566,7 @@ more:
* doesn't have enough memory, etc. So for next readdir
* it will continue.
*/
- dout("filldir stopping us...\n");
+ doutc(cl, "filldir stopping us...\n");
return 0;
}
@@ -583,7 +597,8 @@ more:
kfree(dfi->last_name);
dfi->last_name = NULL;
}
- dout("readdir next frag is %x\n", frag);
+ doutc(cl, "%p %llx.%llx next frag is %x\n", inode,
+ ceph_vinop(inode), frag);
goto more;
}
dfi->file_info.flags |= CEPH_F_ATEND;
@@ -598,20 +613,23 @@ more:
spin_lock(&ci->i_ceph_lock);
if (dfi->dir_ordered_count ==
atomic64_read(&ci->i_ordered_count)) {
- dout(" marking %p complete and ordered\n", inode);
+ doutc(cl, " marking %p %llx.%llx complete and ordered\n",
+ inode, ceph_vinop(inode));
/* use i_size to track number of entries in
* readdir cache */
BUG_ON(dfi->readdir_cache_idx < 0);
i_size_write(inode, dfi->readdir_cache_idx *
sizeof(struct dentry*));
} else {
- dout(" marking %p complete\n", inode);
+ doutc(cl, " marking %llx.%llx complete\n",
+ ceph_vinop(inode));
}
__ceph_dir_set_complete(ci, dfi->dir_release_count,
dfi->dir_ordered_count);
spin_unlock(&ci->i_ceph_lock);
}
- dout("readdir %p file %p done.\n", inode, file);
+ doutc(cl, "%p %llx.%llx file %p done.\n", inode, ceph_vinop(inode),
+ file);
return 0;
}
@@ -657,6 +675,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct ceph_dir_file_info *dfi = file->private_data;
struct inode *inode = file->f_mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
loff_t retval;
inode_lock(inode);
@@ -676,7 +695,8 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
if (offset >= 0) {
if (need_reset_readdir(dfi, offset)) {
- dout("dir_llseek dropping %p content\n", file);
+ doutc(cl, "%p %llx.%llx dropping %p content\n",
+ inode, ceph_vinop(inode), file);
reset_readdir(dfi);
} else if (is_hash_order(offset) && offset > file->f_pos) {
/* for hash offset, we don't know if a forward seek
@@ -703,8 +723,9 @@ out:
struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *dentry)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */
+ struct ceph_client *cl = ceph_inode_to_client(parent);
/* .snap dir? */
if (ceph_snap(parent) == CEPH_NOSNAP &&
@@ -713,8 +734,9 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
struct inode *inode = ceph_get_snapdir(parent);
res = d_splice_alias(inode, dentry);
- dout("ENOENT on snapdir %p '%pd', linking to snapdir %p. Spliced dentry %p\n",
- dentry, dentry, inode, res);
+ doutc(cl, "ENOENT on snapdir %p '%pd', linking to "
+ "snapdir %p %llx.%llx. Spliced dentry %p\n",
+ dentry, dentry, inode, ceph_vinop(inode), res);
if (res)
dentry = res;
}
@@ -735,12 +757,15 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err)
{
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
+
if (err == -ENOENT) {
/* no trace? */
err = 0;
if (!req->r_reply_info.head->is_dentry) {
- dout("ENOENT and no trace, dentry %p inode %p\n",
- dentry, d_inode(dentry));
+ doutc(cl,
+ "ENOENT and no trace, dentry %p inode %llx.%llx\n",
+ dentry, ceph_vinop(d_inode(dentry)));
if (d_really_is_positive(dentry)) {
d_drop(dentry);
err = -ENOENT;
@@ -771,15 +796,16 @@ static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb);
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_request *req;
int op;
int mask;
int err;
- dout("lookup %p dentry %p '%pd'\n",
- dir, dentry, dentry);
+ doutc(cl, "%p %llx.%llx/'%pd' dentry %p\n", dir, ceph_vinop(dir),
+ dentry, dentry);
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
@@ -802,7 +828,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct ceph_dentry_info *di = ceph_dentry(dentry);
spin_lock(&ci->i_ceph_lock);
- dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
+ doutc(cl, " dir %llx.%llx flags are 0x%lx\n",
+ ceph_vinop(dir), ci->i_ceph_flags);
if (strncmp(dentry->d_name.name,
fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
@@ -812,7 +839,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
__ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) {
__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
spin_unlock(&ci->i_ceph_lock);
- dout(" dir %p complete, -ENOENT\n", dir);
+ doutc(cl, " dir %llx.%llx complete, -ENOENT\n",
+ ceph_vinop(dir));
d_add(dentry, NULL);
di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
return NULL;
@@ -850,7 +878,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
}
dentry = ceph_finish_lookup(req, dentry, err);
ceph_mdsc_put_request(req); /* will dput(dentry) */
- dout("lookup result=%p\n", dentry);
+ doutc(cl, "result=%p\n", dentry);
return dentry;
}
@@ -885,6 +913,7 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err;
@@ -901,8 +930,8 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir,
goto out;
}
- dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
- dir, dentry, mode, rdev);
+ doutc(cl, "%p %llx.%llx/'%pd' dentry %p mode 0%ho rdev %d\n",
+ dir, ceph_vinop(dir), dentry, dentry, mode, rdev);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -924,6 +953,7 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir,
req->r_parent = dir;
ihold(dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
req->r_args.mknod.mode = cpu_to_le32(mode);
req->r_args.mknod.rdev = cpu_to_le32(rdev);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
@@ -993,6 +1023,7 @@ static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *dest)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
umode_t mode = S_IFLNK | 0777;
@@ -1010,7 +1041,8 @@ static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir,
goto out;
}
- dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
+ doutc(cl, "%p %llx.%llx/'%pd' to '%s'\n", dir, ceph_vinop(dir), dentry,
+ dest);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -1040,6 +1072,7 @@ static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir,
}
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
@@ -1064,6 +1097,7 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err;
@@ -1076,10 +1110,11 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* mkdir .snap/foo is a MKSNAP */
op = CEPH_MDS_OP_MKSNAP;
- dout("mksnap dir %p snap '%pd' dn %p\n", dir,
- dentry, dentry);
+ doutc(cl, "mksnap %llx.%llx/'%pd' dentry %p\n",
+ ceph_vinop(dir), dentry, dentry);
} else if (ceph_snap(dir) == CEPH_NOSNAP) {
- dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
+ doutc(cl, "mkdir %llx.%llx/'%pd' dentry %p mode 0%ho\n",
+ ceph_vinop(dir), dentry, dentry, mode);
op = CEPH_MDS_OP_MKDIR;
} else {
err = -EROFS;
@@ -1117,6 +1152,8 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
req->r_parent = dir;
ihold(dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ if (op == CEPH_MDS_OP_MKDIR)
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
req->r_args.mkdir.mode = cpu_to_le32(mode);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
CEPH_CAP_XATTR_EXCL;
@@ -1144,6 +1181,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
int err;
@@ -1161,8 +1199,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
if (err)
return err;
- dout("link in dir %p %llx.%llx old_dentry %p:'%pd' dentry %p:'%pd'\n",
- dir, ceph_vinop(dir), old_dentry, old_dentry, dentry, dentry);
+ doutc(cl, "%p %llx.%llx/'%pd' to '%pd'\n", dir, ceph_vinop(dir),
+ old_dentry, dentry);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
if (IS_ERR(req)) {
d_drop(dentry);
@@ -1199,14 +1237,16 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
struct dentry *dentry = req->r_dentry;
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_dentry_info *di = ceph_dentry(dentry);
int result = req->r_err ? req->r_err :
le32_to_cpu(req->r_reply_info.head->result);
if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
- pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
- __func__, dentry, dentry);
+ pr_warn_client(cl,
+ "dentry %p:%pd async unlink bit is not set\n",
+ dentry, dentry);
spin_lock(&fsc->async_unlink_conflict_lock);
hash_del_rcu(&di->hnode);
@@ -1226,7 +1266,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
if (result) {
int pathlen = 0;
u64 base = 0;
- char *path = ceph_mdsc_build_path(dentry, &pathlen,
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen,
&base, 0);
/* mark error on parent + clear complete */
@@ -1240,8 +1280,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
/* mark inode itself for an error (since metadata is bogus) */
mapping_set_error(req->r_old_inode->i_mapping, result);
- pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
+ pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n",
+ base, IS_ERR(path) ? "<<bad>>" : path, result);
ceph_mdsc_free_path(path, pathlen);
}
out:
@@ -1290,7 +1330,8 @@ static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry)
*/
static int ceph_unlink(struct inode *dir, struct dentry *dentry)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = d_inode(dentry);
struct ceph_mds_request *req;
@@ -1300,11 +1341,12 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* rmdir .snap/foo is RMSNAP */
- dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry);
+ doutc(cl, "rmsnap %llx.%llx/'%pd' dn\n", ceph_vinop(dir),
+ dentry);
op = CEPH_MDS_OP_RMSNAP;
} else if (ceph_snap(dir) == CEPH_NOSNAP) {
- dout("unlink/rmdir dir %p dn %p inode %p\n",
- dir, dentry, inode);
+ doutc(cl, "unlink/rmdir %llx.%llx/'%pd' inode %llx.%llx\n",
+ ceph_vinop(dir), dentry, ceph_vinop(inode));
op = d_is_dir(dentry) ?
CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
} else
@@ -1327,9 +1369,9 @@ retry:
(req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
struct ceph_dentry_info *di = ceph_dentry(dentry);
- dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
- dentry->d_name.len, dentry->d_name.name,
- ceph_cap_string(req->r_dir_caps));
+ doutc(cl, "async unlink on %llx.%llx/'%pd' caps=%s",
+ ceph_vinop(dir), dentry,
+ ceph_cap_string(req->r_dir_caps));
set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
req->r_callback = ceph_async_unlink_cb;
req->r_old_inode = d_inode(dentry);
@@ -1384,6 +1426,7 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct dentry *new_dentry, unsigned int flags)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
int op = CEPH_MDS_OP_RENAME;
int err;
@@ -1413,8 +1456,9 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (err)
return err;
- dout("rename dir %p dentry %p to dir %p dentry %p\n",
- old_dir, old_dentry, new_dir, new_dentry);
+ doutc(cl, "%llx.%llx/'%pd' to %llx.%llx/'%pd'\n",
+ ceph_vinop(old_dir), old_dentry, ceph_vinop(new_dir),
+ new_dentry);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1459,9 +1503,10 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir,
void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
{
struct dentry *dn = di->dentry;
- struct ceph_mds_client *mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dn->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
- dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn);
+ doutc(cl, "%p %p '%pd'\n", di, dn, dn);
di->flags |= CEPH_DENTRY_LEASE_LIST;
if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
@@ -1469,7 +1514,6 @@ void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
return;
}
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_list_lock);
list_move_tail(&di->lease_list, &mdsc->dentry_leases);
spin_unlock(&mdsc->dentry_list_lock);
@@ -1493,10 +1537,10 @@ static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc,
void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
{
struct dentry *dn = di->dentry;
- struct ceph_mds_client *mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dn->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
- dout("dentry_dir_lease_touch %p %p '%pd' (offset 0x%llx)\n",
- di, dn, dn, di->offset);
+ doutc(cl, "%p %p '%pd' (offset 0x%llx)\n", di, dn, dn, di->offset);
if (!list_empty(&di->lease_list)) {
if (di->flags & CEPH_DENTRY_LEASE_LIST) {
@@ -1516,7 +1560,6 @@ void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
return;
}
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_list_lock);
__dentry_dir_lease_touch(mdsc, di),
spin_unlock(&mdsc->dentry_list_lock);
@@ -1530,7 +1573,7 @@ static void __dentry_lease_unlist(struct ceph_dentry_info *di)
if (list_empty(&di->lease_list))
return;
- mdsc = ceph_sb_to_client(di->dentry->d_sb)->mdsc;
+ mdsc = ceph_sb_to_fs_client(di->dentry->d_sb)->mdsc;
spin_lock(&mdsc->dentry_list_lock);
list_del_init(&di->lease_list);
spin_unlock(&mdsc->dentry_list_lock);
@@ -1757,6 +1800,8 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags)
{
struct ceph_dentry_info *di;
struct ceph_mds_session *session = NULL;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
u32 seq = 0;
int valid = 0;
@@ -1789,7 +1834,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags)
CEPH_MDS_LEASE_RENEW, seq);
ceph_put_mds_session(session);
}
- dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
+ doutc(cl, "dentry %p = %d\n", dentry, valid);
return valid;
}
@@ -1832,6 +1877,7 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
struct ceph_mds_client *mdsc)
{
struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_client *cl = mdsc->fsc->client;
int valid;
int shared_gen;
@@ -1853,8 +1899,9 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
valid = 0;
spin_unlock(&dentry->d_lock);
}
- dout("dir_lease_is_valid dir %p v%u dentry %p = %d\n",
- dir, (unsigned)atomic_read(&ci->i_shared_gen), dentry, valid);
+ doutc(cl, "dir %p %llx.%llx v%u dentry %p '%pd' = %d\n", dir,
+ ceph_vinop(dir), (unsigned)atomic_read(&ci->i_shared_gen),
+ dentry, dentry, valid);
return valid;
}
@@ -1863,10 +1910,11 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
*/
static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
int valid = 0;
struct dentry *parent;
struct inode *dir, *inode;
- struct ceph_mds_client *mdsc;
valid = fscrypt_d_revalidate(dentry, flags);
if (valid <= 0)
@@ -1884,16 +1932,16 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
inode = d_inode(dentry);
}
- dout("d_revalidate %p '%pd' inode %p offset 0x%llx nokey %d\n", dentry,
- dentry, inode, ceph_dentry(dentry)->offset,
- !!(dentry->d_flags & DCACHE_NOKEY_NAME));
+ doutc(cl, "%p '%pd' inode %p offset 0x%llx nokey %d\n",
+ dentry, dentry, inode, ceph_dentry(dentry)->offset,
+ !!(dentry->d_flags & DCACHE_NOKEY_NAME));
- mdsc = ceph_sb_to_client(dir->i_sb)->mdsc;
+ mdsc = ceph_sb_to_fs_client(dir->i_sb)->mdsc;
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
- dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
- dentry, inode);
+ doutc(cl, "%p '%pd' inode %p is SNAPPED\n", dentry,
+ dentry, inode);
valid = 1;
} else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
valid = 1;
@@ -1948,14 +1996,14 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
break;
}
ceph_mdsc_put_request(req);
- dout("d_revalidate %p lookup result=%d\n",
- dentry, err);
+ doutc(cl, "%p '%pd', lookup result=%d\n", dentry,
+ dentry, err);
}
} else {
percpu_counter_inc(&mdsc->metric.d_lease_hit);
}
- dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
+ doutc(cl, "%p '%pd' %s\n", dentry, dentry, valid ? "valid" : "invalid");
if (!valid)
ceph_dir_clear_complete(dir);
@@ -1995,9 +2043,9 @@ static int ceph_d_delete(const struct dentry *dentry)
static void ceph_d_release(struct dentry *dentry)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
- dout("d_release %p\n", dentry);
+ doutc(fsc->client, "dentry %p '%pd'\n", dentry, dentry);
atomic64_dec(&fsc->mdsc->metric.total_dentries);
@@ -2018,10 +2066,12 @@ static void ceph_d_release(struct dentry *dentry)
*/
static void ceph_d_prune(struct dentry *dentry)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *dir_ci;
struct ceph_dentry_info *di;
- dout("ceph_d_prune %pd %p\n", dentry, dentry);
+ doutc(cl, "dentry %p '%pd'\n", dentry, dentry);
/* do we have a valid parent? */
if (IS_ROOT(dentry))
@@ -2064,7 +2114,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
int left;
const int bufsize = 1024;
- if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+ if (!ceph_test_mount_opt(ceph_sb_to_fs_client(inode->i_sb), DIRSTAT))
return -EISDIR;
if (!dfi->dir_info) {
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 8559990a59a5..726af69d4d62 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -36,6 +36,7 @@ struct ceph_nfs_snapfh {
static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
static const int snap_handle_length =
sizeof(struct ceph_nfs_snapfh) >> 2;
struct ceph_nfs_snapfh *sfh = (void *)rawfh;
@@ -79,13 +80,14 @@ static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
*max_len = snap_handle_length;
ret = FILEID_BTRFS_WITH_PARENT;
out:
- dout("encode_snapfh %llx.%llx ret=%d\n", ceph_vinop(inode), ret);
+ doutc(cl, "%p %llx.%llx ret=%d\n", inode, ceph_vinop(inode), ret);
return ret;
}
static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
static const int handle_length =
sizeof(struct ceph_nfs_fh) >> 2;
static const int connected_handle_length =
@@ -105,15 +107,15 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
if (parent_inode) {
struct ceph_nfs_confh *cfh = (void *)rawfh;
- dout("encode_fh %llx with parent %llx\n",
- ceph_ino(inode), ceph_ino(parent_inode));
+ doutc(cl, "%p %llx.%llx with parent %p %llx.%llx\n", inode,
+ ceph_vinop(inode), parent_inode, ceph_vinop(parent_inode));
cfh->ino = ceph_ino(inode);
cfh->parent_ino = ceph_ino(parent_inode);
*max_len = connected_handle_length;
type = FILEID_INO32_GEN_PARENT;
} else {
struct ceph_nfs_fh *fh = (void *)rawfh;
- dout("encode_fh %llx\n", ceph_ino(inode));
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
fh->ino = ceph_ino(inode);
*max_len = handle_length;
type = FILEID_INO32_GEN;
@@ -123,7 +125,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
static struct inode *__lookup_inode(struct super_block *sb, u64 ino)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc;
struct inode *inode;
struct ceph_vino vino;
int err;
@@ -205,7 +207,8 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
struct ceph_nfs_snapfh *sfh,
bool want_parent)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct inode *inode;
struct ceph_vino vino;
@@ -278,11 +281,10 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
ceph_mdsc_put_request(req);
if (want_parent) {
- dout("snapfh_to_parent %llx.%llx\n err=%d\n",
- vino.ino, vino.snap, err);
+ doutc(cl, "%llx.%llx\n err=%d\n", vino.ino, vino.snap, err);
} else {
- dout("snapfh_to_dentry %llx.%llx parent %llx hash %x err=%d",
- vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err);
+ doutc(cl, "%llx.%llx parent %llx hash %x err=%d", vino.ino,
+ vino.snap, sfh->parent_ino, sfh->hash, err);
}
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -297,6 +299,7 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
struct fid *fid,
int fh_len, int fh_type)
{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_nfs_fh *fh = (void *)fid->raw;
if (fh_type == FILEID_BTRFS_WITH_PARENT) {
@@ -310,14 +313,14 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
if (fh_len < sizeof(*fh) / 4)
return NULL;
- dout("fh_to_dentry %llx\n", fh->ino);
+ doutc(fsc->client, "%llx\n", fh->ino);
return __fh_to_dentry(sb, fh->ino);
}
static struct dentry *__get_parent(struct super_block *sb,
struct dentry *child, u64 ino)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc;
struct ceph_mds_request *req;
struct inode *inode;
int mask;
@@ -363,6 +366,7 @@ static struct dentry *__get_parent(struct super_block *sb,
static struct dentry *ceph_get_parent(struct dentry *child)
{
struct inode *inode = d_inode(child);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct dentry *dn;
if (ceph_snap(inode) != CEPH_NOSNAP) {
@@ -402,8 +406,8 @@ static struct dentry *ceph_get_parent(struct dentry *child)
dn = __get_parent(child->d_sb, child, 0);
}
out:
- dout("get_parent %p ino %llx.%llx err=%ld\n",
- child, ceph_vinop(inode), (long)PTR_ERR_OR_ZERO(dn));
+ doutc(cl, "child %p %p %llx.%llx err=%ld\n", child, inode,
+ ceph_vinop(inode), (long)PTR_ERR_OR_ZERO(dn));
return dn;
}
@@ -414,6 +418,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
struct fid *fid,
int fh_len, int fh_type)
{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_nfs_confh *cfh = (void *)fid->raw;
struct dentry *dentry;
@@ -427,7 +432,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
if (fh_len < sizeof(*cfh) / 4)
return NULL;
- dout("fh_to_parent %llx\n", cfh->parent_ino);
+ doutc(fsc->client, "%llx\n", cfh->parent_ino);
dentry = __get_parent(sb, NULL, cfh->ino);
if (unlikely(dentry == ERR_PTR(-ENOENT)))
dentry = __fh_to_dentry(sb, cfh->parent_ino);
@@ -439,7 +444,7 @@ static int __get_snap_name(struct dentry *parent, char *name,
{
struct inode *inode = d_inode(child);
struct inode *dir = d_inode(parent);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_mds_request *req = NULL;
char *last_name = NULL;
unsigned next_offset = 2;
@@ -526,8 +531,8 @@ out:
if (req)
ceph_mdsc_put_request(req);
kfree(last_name);
- dout("get_snap_name %p ino %llx.%llx err=%d\n",
- child, ceph_vinop(inode), err);
+ doutc(fsc->client, "child dentry %p %p %llx.%llx err=%d\n", child,
+ inode, ceph_vinop(inode), err);
return err;
}
@@ -544,7 +549,7 @@ static int ceph_get_name(struct dentry *parent, char *name,
if (ceph_snap(inode) != CEPH_NOSNAP)
return __get_snap_name(parent, name, child);
- mdsc = ceph_inode_to_client(inode)->mdsc;
+ mdsc = ceph_inode_to_fs_client(inode)->mdsc;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
USE_ANY_MDS);
if (IS_ERR(req))
@@ -588,9 +593,9 @@ static int ceph_get_name(struct dentry *parent, char *name,
ceph_fname_free_buffer(dir, &oname);
}
out:
- dout("get_name %p ino %llx.%llx err %d %s%s\n",
- child, ceph_vinop(inode), err,
- err ? "" : "name ", err ? "" : name);
+ doutc(mdsc->fsc->client, "child dentry %p %p %llx.%llx err %d %s%s\n",
+ child, inode, ceph_vinop(inode), err, err ? "" : "name ",
+ err ? "" : name);
ceph_mdsc_put_request(req);
return err;
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 649600d0a7b6..3b5aae29e944 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -19,8 +19,9 @@
#include "io.h"
#include "metric.h"
-static __le32 ceph_flags_sys2wire(u32 flags)
+static __le32 ceph_flags_sys2wire(struct ceph_mds_client *mdsc, u32 flags)
{
+ struct ceph_client *cl = mdsc->fsc->client;
u32 wire_flags = 0;
switch (flags & O_ACCMODE) {
@@ -48,7 +49,7 @@ static __le32 ceph_flags_sys2wire(u32 flags)
#undef ceph_sys2wire
if (flags)
- dout("unused open flags: %x\n", flags);
+ doutc(cl, "unused open flags: %x\n", flags);
return cpu_to_le32(wire_flags);
}
@@ -189,7 +190,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode)
if (IS_ERR(req))
goto out;
req->r_fmode = ceph_flags_to_mode(flags);
- req->r_args.open.flags = ceph_flags_sys2wire(flags);
+ req->r_args.open.flags = ceph_flags_sys2wire(mdsc, flags);
req->r_args.open.mode = cpu_to_le32(create_mode);
out:
return req;
@@ -200,12 +201,13 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mount_options *opt =
- ceph_inode_to_client(&ci->netfs.inode)->mount_options;
+ ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_file_info *fi;
int ret;
- dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
- inode->i_mode, isdir ? "dir" : "regular");
+ doutc(cl, "%p %llx.%llx %p 0%o (%s)\n", inode, ceph_vinop(inode),
+ file, inode->i_mode, isdir ? "dir" : "regular");
BUG_ON(inode->i_fop->release != ceph_release);
if (isdir) {
@@ -234,7 +236,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
spin_lock_init(&fi->rw_contexts_lock);
INIT_LIST_HEAD(&fi->rw_contexts);
- fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
+ fi->filp_gen = READ_ONCE(ceph_inode_to_fs_client(inode)->filp_gen);
if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) {
ret = ceph_uninline_data(file);
@@ -259,6 +261,7 @@ error:
*/
static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int ret = 0;
switch (inode->i_mode & S_IFMT) {
@@ -271,13 +274,13 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
break;
case S_IFLNK:
- dout("init_file %p %p 0%o (symlink)\n", inode, file,
- inode->i_mode);
+ doutc(cl, "%p %llx.%llx %p 0%o (symlink)\n", inode,
+ ceph_vinop(inode), file, inode->i_mode);
break;
default:
- dout("init_file %p %p 0%o (special)\n", inode, file,
- inode->i_mode);
+ doutc(cl, "%p %llx.%llx %p 0%o (special)\n", inode,
+ ceph_vinop(inode), file, inode->i_mode);
/*
* we need to drop the open ref now, since we don't
* have .release set to ceph_release.
@@ -296,6 +299,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
int ceph_renew_caps(struct inode *inode, int fmode)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
int err, flags, wanted;
@@ -307,8 +311,9 @@ int ceph_renew_caps(struct inode *inode, int fmode)
(!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
int issued = __ceph_caps_issued(ci, NULL);
spin_unlock(&ci->i_ceph_lock);
- dout("renew caps %p want %s issued %s updating mds_wanted\n",
- inode, ceph_cap_string(wanted), ceph_cap_string(issued));
+ doutc(cl, "%p %llx.%llx want %s issued %s updating mds_wanted\n",
+ inode, ceph_vinop(inode), ceph_cap_string(wanted),
+ ceph_cap_string(issued));
ceph_check_caps(ci, 0);
return 0;
}
@@ -339,7 +344,8 @@ int ceph_renew_caps(struct inode *inode, int fmode)
err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
out:
- dout("renew caps %p open result=%d\n", inode, err);
+ doutc(cl, "%p %llx.%llx open result=%d\n", inode, ceph_vinop(inode),
+ err);
return err < 0 ? err : 0;
}
@@ -352,7 +358,8 @@ out:
int ceph_open(struct inode *inode, struct file *file)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct ceph_file_info *fi = file->private_data;
@@ -360,7 +367,7 @@ int ceph_open(struct inode *inode, struct file *file)
int flags, fmode, wanted;
if (fi) {
- dout("open file %p is already opened\n", file);
+ doutc(cl, "file %p is already opened\n", file);
return 0;
}
@@ -374,8 +381,8 @@ int ceph_open(struct inode *inode, struct file *file)
return err;
}
- dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
- ceph_vinop(inode), file, flags, file->f_flags);
+ doutc(cl, "%p %llx.%llx file %p flags %d (%d)\n", inode,
+ ceph_vinop(inode), file, flags, file->f_flags);
fmode = ceph_flags_to_mode(flags);
wanted = ceph_caps_for_mode(fmode);
@@ -399,9 +406,9 @@ int ceph_open(struct inode *inode, struct file *file)
int mds_wanted = __ceph_caps_mds_wanted(ci, true);
int issued = __ceph_caps_issued(ci, NULL);
- dout("open %p fmode %d want %s issued %s using existing\n",
- inode, fmode, ceph_cap_string(wanted),
- ceph_cap_string(issued));
+ doutc(cl, "open %p fmode %d want %s issued %s using existing\n",
+ inode, fmode, ceph_cap_string(wanted),
+ ceph_cap_string(issued));
__ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
@@ -421,7 +428,7 @@ int ceph_open(struct inode *inode, struct file *file)
spin_unlock(&ci->i_ceph_lock);
- dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
+ doutc(cl, "open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
req = prepare_open_request(inode->i_sb, flags, 0);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -435,7 +442,7 @@ int ceph_open(struct inode *inode, struct file *file)
if (!err)
err = ceph_init_file(inode, file, req->r_fmode);
ceph_mdsc_put_request(req);
- dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
+ doutc(cl, "open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
out:
return err;
}
@@ -515,6 +522,7 @@ no_async:
static void restore_deleg_ino(struct inode *dir, u64 ino)
{
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct ceph_inode_info *ci = ceph_inode(dir);
struct ceph_mds_session *s = NULL;
@@ -525,7 +533,8 @@ static void restore_deleg_ino(struct inode *dir, u64 ino)
if (s) {
int err = ceph_restore_deleg_ino(s, ino);
if (err)
- pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n",
+ pr_warn_client(cl,
+ "unable to restore delegated ino 0x%llx to session: %d\n",
ino, err);
ceph_put_mds_session(s);
}
@@ -557,6 +566,7 @@ static void wake_async_create_waiters(struct inode *inode,
static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct dentry *dentry = req->r_dentry;
struct inode *dinode = d_inode(dentry);
struct inode *tinode = req->r_target_inode;
@@ -574,10 +584,11 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
if (result) {
int pathlen = 0;
u64 base = 0;
- char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
&base, 0);
- pr_warn("async create failure path=(%llx)%s result=%d!\n",
+ pr_warn_client(cl,
+ "async create failure path=(%llx)%s result=%d!\n",
base, IS_ERR(path) ? "<<bad>>" : path, result);
ceph_mdsc_free_path(path, pathlen);
@@ -596,14 +607,15 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
u64 ino = ceph_vino(tinode).ino;
if (req->r_deleg_ino != ino)
- pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
- __func__, req->r_err, req->r_deleg_ino, ino);
+ pr_warn_client(cl,
+ "inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
+ req->r_err, req->r_deleg_ino, ino);
mapping_set_error(tinode->i_mapping, result);
wake_async_create_waiters(tinode, req->r_session);
} else if (!result) {
- pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
- req->r_deleg_ino);
+ pr_warn_client(cl, "no req->r_target_inode for 0x%llx\n",
+ req->r_deleg_ino);
}
out:
ceph_mdsc_release_dir_caps(req);
@@ -625,6 +637,7 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
struct timespec64 now;
struct ceph_string *pool_ns;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_vino vino = { .ino = req->r_deleg_ino,
.snap = CEPH_NOSNAP };
@@ -655,7 +668,9 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
in.truncate_seq = cpu_to_le32(1);
in.truncate_size = cpu_to_le64(-1ULL);
in.xattr_version = cpu_to_le64(1);
- in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
+ in.uid = cpu_to_le32(from_kuid(&init_user_ns,
+ mapped_fsuid(req->r_mnt_idmap,
+ &init_user_ns)));
if (dir->i_mode & S_ISGID) {
in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid));
@@ -663,7 +678,9 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
if (S_ISDIR(mode))
mode |= S_ISGID;
} else {
- in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid()));
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns,
+ mapped_fsgid(req->r_mnt_idmap,
+ &init_user_ns)));
}
in.mode = cpu_to_le32((u32)mode);
@@ -683,7 +700,7 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
req->r_fmode, NULL);
up_read(&mdsc->snap_rwsem);
if (ret) {
- dout("%s failed to fill inode: %d\n", __func__, ret);
+ doutc(cl, "failed to fill inode: %d\n", ret);
ceph_dir_clear_complete(dir);
if (!d_unhashed(dentry))
d_drop(dentry);
@@ -691,8 +708,8 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
} else {
struct dentry *dn;
- dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
- vino.ino, ceph_ino(dir), dentry->d_name.name);
+ doutc(cl, "d_adding new inode 0x%llx to 0x%llx/%s\n",
+ vino.ino, ceph_ino(dir), dentry->d_name.name);
ceph_dir_clear_ordered(dir);
ceph_init_inode_acls(inode, as_ctx);
if (inode->i_state & I_NEW) {
@@ -730,7 +747,9 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct mnt_idmap *idmap = file_mnt_idmap(file);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct inode *new_inode = NULL;
@@ -740,9 +759,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
int mask;
int err;
- dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
- dir, dentry, dentry,
- d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
+ doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n",
+ dir, ceph_vinop(dir), dentry, dentry,
+ d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
if (dentry->d_name.len > NAME_MAX)
return -ENAMETOOLONG;
@@ -788,6 +807,8 @@ retry:
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.open.mask = cpu_to_le32(mask);
req->r_parent = dir;
+ if (req->r_op == CEPH_MDS_OP_CREATE)
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
ihold(dir);
if (IS_ENCRYPTED(dir)) {
set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
@@ -880,17 +901,18 @@ retry:
goto out_req;
if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
/* make vfs retry on splice, ENOENT, or symlink */
- dout("atomic_open finish_no_open on dn %p\n", dn);
+ doutc(cl, "finish_no_open on dn %p\n", dn);
err = finish_no_open(file, dn);
} else {
if (IS_ENCRYPTED(dir) &&
!fscrypt_has_permitted_context(dir, d_inode(dentry))) {
- pr_warn("Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
+ pr_warn_client(cl,
+ "Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
ceph_vinop(dir), ceph_vinop(d_inode(dentry)));
goto out_req;
}
- dout("atomic_open finish_open on dn %p\n", dn);
+ doutc(cl, "finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
struct inode *newino = d_inode(dentry);
@@ -905,17 +927,19 @@ out_req:
iput(new_inode);
out_ctx:
ceph_release_acl_sec_ctx(&as_ctx);
- dout("atomic_open result=%d\n", err);
+ doutc(cl, "result=%d\n", err);
return err;
}
int ceph_release(struct inode *inode, struct file *file)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
if (S_ISDIR(inode->i_mode)) {
struct ceph_dir_file_info *dfi = file->private_data;
- dout("release inode %p dir file %p\n", inode, file);
+ doutc(cl, "%p %llx.%llx dir file %p\n", inode,
+ ceph_vinop(inode), file);
WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
ceph_put_fmode(ci, dfi->file_info.fmode, 1);
@@ -927,7 +951,8 @@ int ceph_release(struct inode *inode, struct file *file)
kmem_cache_free(ceph_dir_file_cachep, dfi);
} else {
struct ceph_file_info *fi = file->private_data;
- dout("release inode %p regular file %p\n", inode, file);
+ doutc(cl, "%p %llx.%llx regular file %p\n", inode,
+ ceph_vinop(inode), file);
WARN_ON(!list_empty(&fi->rw_contexts));
ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
@@ -962,7 +987,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
u64 *last_objver)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_client *osdc = &fsc->client->osdc;
ssize_t ret;
u64 off = *ki_pos;
@@ -971,7 +997,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
u64 objver = 0;
- dout("sync_read on inode %p %llx~%llx\n", inode, *ki_pos, len);
+ doutc(cl, "on inode %p %llx.%llx %llx~%llx\n", inode,
+ ceph_vinop(inode), *ki_pos, len);
if (ceph_inode_is_shutdown(inode))
return -EIO;
@@ -1005,8 +1032,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
/* determine new offset/length if encrypted */
ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len);
- dout("sync_read orig %llu~%llu reading %llu~%llu",
- off, len, read_off, read_len);
+ doutc(cl, "orig %llu~%llu reading %llu~%llu", off, len,
+ read_off, read_len);
req = ceph_osdc_new_request(osdc, &ci->i_layout,
ci->i_vino, read_off, &read_len, 0, 1,
@@ -1059,8 +1086,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
objver = req->r_version;
i_size = i_size_read(inode);
- dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
- off, len, ret, i_size, (more ? " MORE" : ""));
+ doutc(cl, "%llu~%llu got %zd i_size %llu%s\n", off, len,
+ ret, i_size, (more ? " MORE" : ""));
/* Fix it to go to end of extent map */
if (sparse && ret >= 0)
@@ -1101,8 +1128,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
int zlen = min(len - ret, i_size - off - ret);
int zoff = page_off + ret;
- dout("sync_read zero gap %llu~%llu\n",
- off + ret, off + ret + zlen);
+ doutc(cl, "zero gap %llu~%llu\n", off + ret,
+ off + ret + zlen);
ceph_zero_page_vector_range(zoff, zlen, pages);
ret += zlen;
}
@@ -1151,7 +1178,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
if (last_objver)
*last_objver = objver;
}
- dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
+ doutc(cl, "result %zd retry_op %d\n", ret, *retry_op);
return ret;
}
@@ -1160,9 +1187,11 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
- dout("sync_read on file %p %llx~%zx %s\n", file, iocb->ki_pos,
- iov_iter_count(to), (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+ doutc(cl, "on file %p %llx~%zx %s\n", file, iocb->ki_pos,
+ iov_iter_count(to),
+ (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
return __ceph_sync_read(inode, &iocb->ki_pos, to, retry_op, NULL);
}
@@ -1190,6 +1219,7 @@ static void ceph_aio_retry_work(struct work_struct *work);
static void ceph_aio_complete(struct inode *inode,
struct ceph_aio_request *aio_req)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
int ret;
@@ -1203,7 +1233,7 @@ static void ceph_aio_complete(struct inode *inode,
if (!ret)
ret = aio_req->total_len;
- dout("ceph_aio_complete %p rc %d\n", inode, ret);
+ doutc(cl, "%p %llx.%llx rc %d\n", inode, ceph_vinop(inode), ret);
if (ret >= 0 && aio_req->write) {
int dirty;
@@ -1242,11 +1272,13 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
unsigned int len = osd_data->bvec_pos.iter.bi_size;
bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
BUG_ON(!osd_data->num_bvecs);
- dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, len);
+ doutc(cl, "req %p inode %p %llx.%llx, rc %d bytes %u\n", req,
+ inode, ceph_vinop(inode), rc, len);
if (rc == -EOLDSNAPC) {
struct ceph_aio_work *aio_work;
@@ -1256,7 +1288,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
if (aio_work) {
INIT_WORK(&aio_work->work, ceph_aio_retry_work);
aio_work->req = req;
- queue_work(ceph_inode_to_client(inode)->inode_wq,
+ queue_work(ceph_inode_to_fs_client(inode)->inode_wq,
&aio_work->work);
return;
}
@@ -1386,7 +1418,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_client_metric *metric = &fsc->mdsc->metric;
struct ceph_vino vino;
struct ceph_osd_request *req;
@@ -1405,9 +1438,9 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
- dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
- (write ? "write" : "read"), file, pos, (unsigned)count,
- snapc, snapc ? snapc->seq : 0);
+ doutc(cl, "sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
+ (write ? "write" : "read"), file, pos, (unsigned)count,
+ snapc, snapc ? snapc->seq : 0);
if (write) {
int ret2;
@@ -1418,7 +1451,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
pos >> PAGE_SHIFT,
(pos + count - 1) >> PAGE_SHIFT);
if (ret2 < 0)
- dout("invalidate_inode_pages2_range returned %d\n", ret2);
+ doutc(cl, "invalidate_inode_pages2_range returned %d\n",
+ ret2);
flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
} else {
@@ -1610,7 +1644,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
struct page **pages;
@@ -1625,8 +1660,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
- dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
- file, pos, (unsigned)count, snapc, snapc->seq);
+ doutc(cl, "on file %p %lld~%u snapc %p seq %lld\n", file, pos,
+ (unsigned)count, snapc, snapc->seq);
ret = filemap_write_and_wait_range(inode->i_mapping,
pos, pos + count - 1);
@@ -1670,9 +1705,9 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
last = (pos + len) != (write_pos + write_len);
rmw = first || last;
- dout("sync_write ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n",
- ci->i_vino.ino, pos, len, write_pos, write_len,
- rmw ? "" : "no ");
+ doutc(cl, "ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n",
+ ci->i_vino.ino, pos, len, write_pos, write_len,
+ rmw ? "" : "no ");
/*
* The data is emplaced into the page as it would be if it were
@@ -1881,7 +1916,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
left -= ret;
}
if (ret < 0) {
- dout("sync_write write failed with %d\n", ret);
+ doutc(cl, "write failed with %d\n", ret);
ceph_release_page_vector(pages, num_pages);
break;
}
@@ -1891,7 +1926,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
write_pos, write_len,
GFP_KERNEL);
if (ret < 0) {
- dout("encryption failed with %d\n", ret);
+ doutc(cl, "encryption failed with %d\n", ret);
ceph_release_page_vector(pages, num_pages);
break;
}
@@ -1910,7 +1945,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
break;
}
- dout("sync_write write op %lld~%llu\n", write_pos, write_len);
+ doutc(cl, "write op %lld~%llu\n", write_pos, write_len);
osd_req_op_extent_osd_data_pages(req, rmw ? 1 : 0, pages, write_len,
offset_in_page(write_pos), false,
true);
@@ -1941,7 +1976,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
req->r_end_latency, len, ret);
ceph_osdc_put_request(req);
if (ret != 0) {
- dout("sync_write osd write returned %d\n", ret);
+ doutc(cl, "osd write returned %d\n", ret);
/* Version changed! Must re-do the rmw cycle */
if ((assert_ver && (ret == -ERANGE || ret == -EOVERFLOW)) ||
(!assert_ver && ret == -EEXIST)) {
@@ -1971,13 +2006,13 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
pos >> PAGE_SHIFT,
(pos + len - 1) >> PAGE_SHIFT);
if (ret < 0) {
- dout("invalidate_inode_pages2_range returned %d\n",
- ret);
+ doutc(cl, "invalidate_inode_pages2_range returned %d\n",
+ ret);
ret = 0;
}
pos += len;
written += len;
- dout("sync_write written %d\n", written);
+ doutc(cl, "written %d\n", written);
if (pos > i_size_read(inode)) {
check_caps = ceph_inode_set_size(inode, pos);
if (check_caps)
@@ -1991,7 +2026,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
ret = written;
iocb->ki_pos = pos;
}
- dout("sync_write returning %d\n", ret);
+ doutc(cl, "returning %d\n", ret);
return ret;
}
@@ -2010,13 +2045,14 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
ssize_t ret;
int want = 0, got = 0;
int retry_op = 0, read = 0;
again:
- dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
- inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
+ doutc(cl, "%llu~%u trying to get caps on %p %llx.%llx\n",
+ iocb->ki_pos, (unsigned)len, inode, ceph_vinop(inode));
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
@@ -2044,9 +2080,9 @@ again:
(iocb->ki_flags & IOCB_DIRECT) ||
(fi->flags & CEPH_F_SYNC)) {
- dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
- ceph_cap_string(got));
+ doutc(cl, "sync %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
if (!ceph_has_inline_data(ci)) {
if (!retry_op &&
@@ -2064,16 +2100,16 @@ again:
}
} else {
CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
- dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
- ceph_cap_string(got));
+ doutc(cl, "async %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
ceph_add_rw_context(fi, &rw_ctx);
ret = generic_file_read_iter(iocb, to);
ceph_del_rw_context(fi, &rw_ctx);
}
- dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
- inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
+ doutc(cl, "%p %llx.%llx dropping cap refs on %s = %d\n",
+ inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
ceph_put_cap_refs(ci, got);
if (direct_lock)
@@ -2133,8 +2169,8 @@ again:
/* hit EOF or hole? */
if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
ret < len) {
- dout("sync_read hit hole, ppos %lld < size %lld"
- ", reading more\n", iocb->ki_pos, i_size);
+ doutc(cl, "hit hole, ppos %lld < size %lld, reading more\n",
+ iocb->ki_pos, i_size);
read += ret;
len -= ret;
@@ -2228,7 +2264,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0;
@@ -2296,8 +2333,9 @@ retry_snap:
if (err)
goto out;
- dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
- inode, ceph_vinop(inode), pos, count, i_size_read(inode));
+ doutc(cl, "%p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
+ inode, ceph_vinop(inode), pos, count,
+ i_size_read(inode));
if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
want |= CEPH_CAP_FILE_BUFFER;
if (fi->fmode & CEPH_FILE_MODE_LAZY)
@@ -2313,8 +2351,8 @@ retry_snap:
inode_inc_iversion_raw(inode);
- dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
+ doutc(cl, "%p %llx.%llx %llu~%zd got cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) ||
@@ -2374,14 +2412,14 @@ retry_snap:
ceph_check_caps(ci, CHECK_CAPS_FLUSH);
}
- dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)count,
- ceph_cap_string(got));
+ doutc(cl, "%p %llx.%llx %llu~%u dropping cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, (unsigned)count,
+ ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
if (written == -EOLDSNAPC) {
- dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n",
- inode, ceph_vinop(inode), pos, (unsigned)count);
+ doutc(cl, "%p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n",
+ inode, ceph_vinop(inode), pos, (unsigned)count);
goto retry_snap;
}
@@ -2462,7 +2500,7 @@ static int ceph_zero_partial_object(struct inode *inode,
loff_t offset, loff_t *length)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_osd_request *req;
int ret = 0;
loff_t zero = 0;
@@ -2553,14 +2591,15 @@ static long ceph_fallocate(struct file *file, int mode,
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap_flush *prealloc_cf;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int want, got = 0;
int dirty;
int ret = 0;
loff_t endoff = 0;
loff_t size;
- dout("%s %p %llx.%llx mode %x, offset %llu length %llu\n", __func__,
- inode, ceph_vinop(inode), mode, offset, length);
+ doutc(cl, "%p %llx.%llx mode %x, offset %llu length %llu\n",
+ inode, ceph_vinop(inode), mode, offset, length);
if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
@@ -2689,6 +2728,7 @@ static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got,
static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
loff_t src_off, loff_t dst_off, size_t len)
{
+ struct ceph_client *cl = ceph_inode_to_client(src_inode);
loff_t size, endoff;
size = i_size_read(src_inode);
@@ -2699,8 +2739,8 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
* inode.
*/
if (src_off + len > size) {
- dout("Copy beyond EOF (%llu + %zu > %llu)\n",
- src_off, len, size);
+ doutc(cl, "Copy beyond EOF (%llu + %zu > %llu)\n", src_off,
+ len, size);
return -EOPNOTSUPP;
}
size = i_size_read(dst_inode);
@@ -2776,6 +2816,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
u32 src_objlen, dst_objlen;
u32 object_size = src_ci->i_layout.object_size;
+ struct ceph_client *cl = fsc->client;
int ret;
src_oloc.pool = src_ci->i_layout.pool_id;
@@ -2817,9 +2858,10 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
if (ret) {
if (ret == -EOPNOTSUPP) {
fsc->have_copy_from2 = false;
- pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
+ pr_notice_client(cl,
+ "OSDs don't support copy-from2; disabling copy offload\n");
}
- dout("ceph_osdc_copy_from returned %d\n", ret);
+ doutc(cl, "returned %d\n", ret);
if (!bytes)
bytes = ret;
goto out;
@@ -2845,7 +2887,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct ceph_inode_info *src_ci = ceph_inode(src_inode);
struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
struct ceph_cap_flush *prealloc_cf;
- struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
+ struct ceph_fs_client *src_fsc = ceph_inode_to_fs_client(src_inode);
+ struct ceph_client *cl = src_fsc->client;
loff_t size;
ssize_t ret = -EIO, bytes;
u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
@@ -2853,7 +2896,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
int src_got = 0, dst_got = 0, err, dirty;
if (src_inode->i_sb != dst_inode->i_sb) {
- struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
+ struct ceph_fs_client *dst_fsc = ceph_inode_to_fs_client(dst_inode);
if (ceph_fsid_compare(&src_fsc->client->fsid,
&dst_fsc->client->fsid)) {
@@ -2888,7 +2931,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
(src_ci->i_layout.stripe_count != 1) ||
(dst_ci->i_layout.stripe_count != 1) ||
(src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) {
- dout("Invalid src/dst files layout\n");
+ doutc(cl, "Invalid src/dst files layout\n");
return -EOPNOTSUPP;
}
@@ -2906,12 +2949,12 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
/* Start by sync'ing the source and destination files */
ret = file_write_and_wait_range(src_file, src_off, (src_off + len));
if (ret < 0) {
- dout("failed to write src file (%zd)\n", ret);
+ doutc(cl, "failed to write src file (%zd)\n", ret);
goto out;
}
ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len));
if (ret < 0) {
- dout("failed to write dst file (%zd)\n", ret);
+ doutc(cl, "failed to write dst file (%zd)\n", ret);
goto out;
}
@@ -2923,7 +2966,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
err = get_rd_wr_caps(src_file, &src_got,
dst_file, (dst_off + len), &dst_got);
if (err < 0) {
- dout("get_rd_wr_caps returned %d\n", err);
+ doutc(cl, "get_rd_wr_caps returned %d\n", err);
ret = -EOPNOTSUPP;
goto out;
}
@@ -2938,7 +2981,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
dst_off >> PAGE_SHIFT,
(dst_off + len) >> PAGE_SHIFT);
if (ret < 0) {
- dout("Failed to invalidate inode pages (%zd)\n", ret);
+ doutc(cl, "Failed to invalidate inode pages (%zd)\n",
+ ret);
ret = 0; /* XXX */
}
ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
@@ -2959,7 +3003,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
* starting at the src_off
*/
if (src_objoff) {
- dout("Initial partial copy of %u bytes\n", src_objlen);
+ doutc(cl, "Initial partial copy of %u bytes\n", src_objlen);
/*
* we need to temporarily drop all caps as we'll be calling
@@ -2970,7 +3014,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
&dst_off, src_objlen, flags);
/* Abort on short copies or on error */
if (ret < (long)src_objlen) {
- dout("Failed partial copy (%zd)\n", ret);
+ doutc(cl, "Failed partial copy (%zd)\n", ret);
goto out;
}
len -= ret;
@@ -2992,7 +3036,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
ret = bytes;
goto out_caps;
}
- dout("Copied %zu bytes out of %zu\n", bytes, len);
+ doutc(cl, "Copied %zu bytes out of %zu\n", bytes, len);
len -= bytes;
ret += bytes;
@@ -3020,13 +3064,13 @@ out_caps:
* there were errors in remote object copies (len >= object_size).
*/
if (len && (len < src_ci->i_layout.object_size)) {
- dout("Final partial copy of %zu bytes\n", len);
+ doutc(cl, "Final partial copy of %zu bytes\n", len);
bytes = do_splice_direct(src_file, &src_off, dst_file,
&dst_off, len, flags);
if (bytes > 0)
ret += bytes;
else
- dout("Failed partial copy (%zd)\n", bytes);
+ doutc(cl, "Failed partial copy (%zd)\n", bytes);
}
out:
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 2e2a303b9e64..0679240f06db 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -129,6 +129,8 @@ void ceph_as_ctx_to_req(struct ceph_mds_request *req,
struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
struct inode *newino)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct inode *inode;
if (ceph_vino_is_reserved(vino))
@@ -145,12 +147,13 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
}
if (!inode) {
- dout("No inode found for %llx.%llx\n", vino.ino, vino.snap);
+ doutc(cl, "no inode found for %llx.%llx\n", vino.ino, vino.snap);
return ERR_PTR(-ENOMEM);
}
- dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
- ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
+ doutc(cl, "on %llx=%llx.%llx got %p new %d\n",
+ ceph_present_inode(inode), ceph_vinop(inode), inode,
+ !!(inode->i_state & I_NEW));
return inode;
}
@@ -159,6 +162,7 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
*/
struct inode *ceph_get_snapdir(struct inode *parent)
{
+ struct ceph_client *cl = ceph_inode_to_client(parent);
struct ceph_vino vino = {
.ino = ceph_ino(parent),
.snap = CEPH_SNAPDIR,
@@ -171,14 +175,14 @@ struct inode *ceph_get_snapdir(struct inode *parent)
return inode;
if (!S_ISDIR(parent->i_mode)) {
- pr_warn_once("bad snapdir parent type (mode=0%o)\n",
- parent->i_mode);
+ pr_warn_once_client(cl, "bad snapdir parent type (mode=0%o)\n",
+ parent->i_mode);
goto err;
}
if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
- pr_warn_once("bad snapdir inode type (mode=0%o)\n",
- inode->i_mode);
+ pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n",
+ inode->i_mode);
goto err;
}
@@ -203,7 +207,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_flags |= S_ENCRYPTED;
ci->fscrypt_auth_len = pci->fscrypt_auth_len;
} else {
- dout("Failed to alloc snapdir fscrypt_auth\n");
+ doutc(cl, "Failed to alloc snapdir fscrypt_auth\n");
ret = -ENOMEM;
goto err;
}
@@ -249,6 +253,8 @@ const struct inode_operations ceph_file_iops = {
static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
u32 f)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct rb_node **p;
struct rb_node *parent = NULL;
struct ceph_inode_frag *frag;
@@ -279,8 +285,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
rb_link_node(&frag->node, parent, p);
rb_insert_color(&frag->node, &ci->i_fragtree);
- dout("get_or_create_frag added %llx.%llx frag %x\n",
- ceph_vinop(&ci->netfs.inode), f);
+ doutc(cl, "added %p %llx.%llx frag %x\n", inode, ceph_vinop(inode), f);
return frag;
}
@@ -313,6 +318,7 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
struct ceph_inode_frag *pfrag, int *found)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
u32 t = ceph_frag_make(0, 0);
struct ceph_inode_frag *frag;
unsigned nway, i;
@@ -336,8 +342,8 @@ static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
/* choose child */
nway = 1 << frag->split_by;
- dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
- frag->split_by, nway);
+ doutc(cl, "frag(%x) %x splits by %d (%d ways)\n", v, t,
+ frag->split_by, nway);
for (i = 0; i < nway; i++) {
n = ceph_frag_make_child(t, frag->split_by, i);
if (ceph_frag_contains_value(n, v)) {
@@ -347,7 +353,7 @@ static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
}
BUG_ON(i == nway);
}
- dout("choose_frag(%x) = %x\n", v, t);
+ doutc(cl, "frag(%x) = %x\n", v, t);
return t;
}
@@ -371,6 +377,7 @@ static int ceph_fill_dirfrag(struct inode *inode,
struct ceph_mds_reply_dirfrag *dirinfo)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_frag *frag;
u32 id = le32_to_cpu(dirinfo->frag);
int mds = le32_to_cpu(dirinfo->auth);
@@ -395,14 +402,14 @@ static int ceph_fill_dirfrag(struct inode *inode,
goto out;
if (frag->split_by == 0) {
/* tree leaf, remove */
- dout("fill_dirfrag removed %llx.%llx frag %x"
- " (no ref)\n", ceph_vinop(inode), id);
+ doutc(cl, "removed %p %llx.%llx frag %x (no ref)\n",
+ inode, ceph_vinop(inode), id);
rb_erase(&frag->node, &ci->i_fragtree);
kfree(frag);
} else {
/* tree branch, keep and clear */
- dout("fill_dirfrag cleared %llx.%llx frag %x"
- " referral\n", ceph_vinop(inode), id);
+ doutc(cl, "cleared %p %llx.%llx frag %x referral\n",
+ inode, ceph_vinop(inode), id);
frag->mds = -1;
frag->ndist = 0;
}
@@ -415,8 +422,9 @@ static int ceph_fill_dirfrag(struct inode *inode,
if (IS_ERR(frag)) {
/* this is not the end of the world; we can continue
with bad/inaccurate delegation info */
- pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
- ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
+ pr_err_client(cl, "ENOMEM on mds ref %p %llx.%llx fg %x\n",
+ inode, ceph_vinop(inode),
+ le32_to_cpu(dirinfo->frag));
err = -ENOMEM;
goto out;
}
@@ -425,8 +433,8 @@ static int ceph_fill_dirfrag(struct inode *inode,
frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
for (i = 0; i < frag->ndist; i++)
frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
- dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
- ceph_vinop(inode), frag->frag, frag->ndist);
+ doutc(cl, "%p %llx.%llx frag %x ndist=%d\n", inode,
+ ceph_vinop(inode), frag->frag, frag->ndist);
out:
mutex_unlock(&ci->i_fragtree_mutex);
@@ -454,6 +462,7 @@ static int ceph_fill_fragtree(struct inode *inode,
struct ceph_frag_tree_head *fragtree,
struct ceph_mds_reply_dirfrag *dirinfo)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_inode_frag *frag, *prev_frag = NULL;
struct rb_node *rb_node;
@@ -489,15 +498,15 @@ static int ceph_fill_fragtree(struct inode *inode,
frag_tree_split_cmp, NULL);
}
- dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
rb_node = rb_first(&ci->i_fragtree);
for (i = 0; i < nsplits; i++) {
id = le32_to_cpu(fragtree->splits[i].frag);
split_by = le32_to_cpu(fragtree->splits[i].by);
if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
- pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
- "frag %x split by %d\n", ceph_vinop(inode),
- i, nsplits, id, split_by);
+ pr_err_client(cl, "%p %llx.%llx invalid split %d/%u, "
+ "frag %x split by %d\n", inode,
+ ceph_vinop(inode), i, nsplits, id, split_by);
continue;
}
frag = NULL;
@@ -529,7 +538,7 @@ static int ceph_fill_fragtree(struct inode *inode,
if (frag->split_by == 0)
ci->i_fragtree_nsplits++;
frag->split_by = split_by;
- dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+ doutc(cl, " frag %x split by %d\n", frag->frag, frag->split_by);
prev_frag = frag;
}
while (rb_node) {
@@ -554,6 +563,7 @@ out_unlock:
*/
struct inode *ceph_alloc_inode(struct super_block *sb)
{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_inode_info *ci;
int i;
@@ -561,7 +571,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
if (!ci)
return NULL;
- dout("alloc_inode %p\n", &ci->netfs.inode);
+ doutc(fsc->client, "%p\n", &ci->netfs.inode);
/* Set parameters for the netfs library */
netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
@@ -675,10 +685,11 @@ void ceph_evict_inode(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_frag *frag;
struct rb_node *n;
- dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+ doutc(cl, "%p ino %llx.%llx\n", inode, ceph_vinop(inode));
percpu_counter_dec(&mdsc->metric.total_inodes);
@@ -701,8 +712,8 @@ void ceph_evict_inode(struct inode *inode)
*/
if (ci->i_snap_realm) {
if (ceph_snap(inode) == CEPH_NOSNAP) {
- dout(" dropping residual ref to snap realm %p\n",
- ci->i_snap_realm);
+ doutc(cl, " dropping residual ref to snap realm %p\n",
+ ci->i_snap_realm);
ceph_change_snap_realm(inode, NULL);
} else {
ceph_put_snapid_map(mdsc, ci->i_snapid_map);
@@ -743,15 +754,16 @@ static inline blkcnt_t calc_inode_blocks(u64 size)
int ceph_fill_file_size(struct inode *inode, int issued,
u32 truncate_seq, u64 truncate_size, u64 size)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
int queue_trunc = 0;
loff_t isize = i_size_read(inode);
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
(truncate_seq == ci->i_truncate_seq && size > isize)) {
- dout("size %lld -> %llu\n", isize, size);
+ doutc(cl, "size %lld -> %llu\n", isize, size);
if (size > 0 && S_ISDIR(inode->i_mode)) {
- pr_err("fill_file_size non-zero size for directory\n");
+ pr_err_client(cl, "non-zero size for directory\n");
size = 0;
}
i_size_write(inode, size);
@@ -764,8 +776,8 @@ int ceph_fill_file_size(struct inode *inode, int issued,
ceph_fscache_update(inode);
ci->i_reported_size = size;
if (truncate_seq != ci->i_truncate_seq) {
- dout("%s truncate_seq %u -> %u\n", __func__,
- ci->i_truncate_seq, truncate_seq);
+ doutc(cl, "truncate_seq %u -> %u\n",
+ ci->i_truncate_seq, truncate_seq);
ci->i_truncate_seq = truncate_seq;
/* the MDS should have revoked these caps */
@@ -794,14 +806,15 @@ int ceph_fill_file_size(struct inode *inode, int issued,
* anyway.
*/
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) {
- dout("%s truncate_size %lld -> %llu, encrypted %d\n", __func__,
- ci->i_truncate_size, truncate_size, !!IS_ENCRYPTED(inode));
+ doutc(cl, "truncate_size %lld -> %llu, encrypted %d\n",
+ ci->i_truncate_size, truncate_size,
+ !!IS_ENCRYPTED(inode));
ci->i_truncate_size = truncate_size;
if (IS_ENCRYPTED(inode)) {
- dout("%s truncate_pagecache_size %lld -> %llu\n",
- __func__, ci->i_truncate_pagecache_size, size);
+ doutc(cl, "truncate_pagecache_size %lld -> %llu\n",
+ ci->i_truncate_pagecache_size, size);
ci->i_truncate_pagecache_size = size;
} else {
ci->i_truncate_pagecache_size = truncate_size;
@@ -814,6 +827,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
u64 time_warp_seq, struct timespec64 *ctime,
struct timespec64 *mtime, struct timespec64 *atime)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct timespec64 ictime = inode_get_ctime(inode);
int warn = 0;
@@ -825,7 +839,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
CEPH_CAP_XATTR_EXCL)) {
if (ci->i_version == 0 ||
timespec64_compare(ctime, &ictime) > 0) {
- dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
+ doutc(cl, "ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
ictime.tv_sec, ictime.tv_nsec,
ctime->tv_sec, ctime->tv_nsec);
inode_set_ctime_to_ts(inode, *ctime);
@@ -833,8 +847,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
if (ci->i_version == 0 ||
ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
/* the MDS did a utimes() */
- dout("mtime %lld.%09ld -> %lld.%09ld "
- "tw %d -> %d\n",
+ doutc(cl, "mtime %lld.%09ld -> %lld.%09ld tw %d -> %d\n",
inode_get_mtime_sec(inode),
inode_get_mtime_nsec(inode),
mtime->tv_sec, mtime->tv_nsec,
@@ -849,14 +862,14 @@ void ceph_fill_file_time(struct inode *inode, int issued,
/* nobody did utimes(); take the max */
ts = inode_get_mtime(inode);
if (timespec64_compare(mtime, &ts) > 0) {
- dout("mtime %lld.%09ld -> %lld.%09ld inc\n",
+ doutc(cl, "mtime %lld.%09ld -> %lld.%09ld inc\n",
ts.tv_sec, ts.tv_nsec,
mtime->tv_sec, mtime->tv_nsec);
inode_set_mtime_to_ts(inode, *mtime);
}
ts = inode_get_atime(inode);
if (timespec64_compare(atime, &ts) > 0) {
- dout("atime %lld.%09ld -> %lld.%09ld inc\n",
+ doutc(cl, "atime %lld.%09ld -> %lld.%09ld inc\n",
ts.tv_sec, ts.tv_nsec,
atime->tv_sec, atime->tv_nsec);
inode_set_atime_to_ts(inode, *atime);
@@ -878,13 +891,16 @@ void ceph_fill_file_time(struct inode *inode, int issued,
}
}
if (warn) /* time_warp_seq shouldn't go backwards */
- dout("%p mds time_warp_seq %llu < %u\n",
- inode, time_warp_seq, ci->i_time_warp_seq);
+ doutc(cl, "%p mds time_warp_seq %llu < %u\n", inode,
+ time_warp_seq, ci->i_time_warp_seq);
}
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
-static int decode_encrypted_symlink(const char *encsym, int enclen, u8 **decsym)
+static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
+ const char *encsym,
+ int enclen, u8 **decsym)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int declen;
u8 *sym;
@@ -894,8 +910,9 @@ static int decode_encrypted_symlink(const char *encsym, int enclen, u8 **decsym)
declen = ceph_base64_decode(encsym, enclen, sym);
if (declen < 0) {
- pr_err("%s: can't decode symlink (%d). Content: %.*s\n",
- __func__, declen, enclen, encsym);
+ pr_err_client(cl,
+ "can't decode symlink (%d). Content: %.*s\n",
+ declen, enclen, encsym);
kfree(sym);
return -EIO;
}
@@ -904,7 +921,9 @@ static int decode_encrypted_symlink(const char *encsym, int enclen, u8 **decsym)
return declen;
}
#else
-static int decode_encrypted_symlink(const char *encsym, int symlen, u8 **decsym)
+static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
+ const char *encsym,
+ int symlen, u8 **decsym)
{
return -EOPNOTSUPP;
}
@@ -921,6 +940,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
struct ceph_cap_reservation *caps_reservation)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_reply_inode *info = iinfo->in;
struct ceph_inode_info *ci = ceph_inode(inode);
int issued, new_issued, info_caps;
@@ -939,25 +959,26 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
lockdep_assert_held(&mdsc->snap_rwsem);
- dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
- inode, ceph_vinop(inode), le64_to_cpu(info->version),
- ci->i_version);
+ doutc(cl, "%p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode),
+ le64_to_cpu(info->version), ci->i_version);
/* Once I_NEW is cleared, we can't change type or dev numbers */
if (inode->i_state & I_NEW) {
inode->i_mode = mode;
} else {
if (inode_wrong_type(inode, mode)) {
- pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
- ceph_vinop(inode), inode->i_mode, mode);
+ pr_warn_once_client(cl,
+ "inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
+ ceph_vinop(inode), inode->i_mode, mode);
return -ESTALE;
}
if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) {
- pr_warn_once("dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
- ceph_vinop(inode), MAJOR(inode->i_rdev),
- MINOR(inode->i_rdev), MAJOR(rdev),
- MINOR(rdev));
+ pr_warn_once_client(cl,
+ "dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
+ ceph_vinop(inode), MAJOR(inode->i_rdev),
+ MINOR(inode->i_rdev), MAJOR(rdev),
+ MINOR(rdev));
return -ESTALE;
}
}
@@ -979,8 +1000,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
if (iinfo->xattr_len > 4) {
xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
if (!xattr_blob)
- pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
- iinfo->xattr_len);
+ pr_err_client(cl, "ENOMEM xattr blob %d bytes\n",
+ iinfo->xattr_len);
}
if (iinfo->pool_ns_len > 0)
@@ -1034,9 +1055,10 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
inode->i_mode = mode;
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
- dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- from_kuid(&init_user_ns, inode->i_uid),
- from_kgid(&init_user_ns, inode->i_gid));
+ doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode,
+ ceph_vinop(inode), inode->i_mode,
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
ceph_decode_timespec64(&ci->i_btime, &iinfo->btime);
ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
}
@@ -1092,7 +1114,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) {
size = fsize;
} else {
- pr_warn("fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
+ pr_warn_client(cl,
+ "fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
info->size, size);
}
}
@@ -1104,8 +1127,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
/* only update max_size on auth cap */
if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
ci->i_max_size != le64_to_cpu(info->max_size)) {
- dout("max_size %lld -> %llu\n", ci->i_max_size,
- le64_to_cpu(info->max_size));
+ doutc(cl, "max_size %lld -> %llu\n",
+ ci->i_max_size, le64_to_cpu(info->max_size));
ci->i_max_size = le64_to_cpu(info->max_size);
}
}
@@ -1168,15 +1191,17 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
if (IS_ENCRYPTED(inode)) {
if (symlen != i_size_read(inode))
- pr_err("%s %llx.%llx BAD symlink size %lld\n",
- __func__, ceph_vinop(inode),
+ pr_err_client(cl,
+ "%p %llx.%llx BAD symlink size %lld\n",
+ inode, ceph_vinop(inode),
i_size_read(inode));
- err = decode_encrypted_symlink(iinfo->symlink,
+ err = decode_encrypted_symlink(mdsc, iinfo->symlink,
symlen, (u8 **)&sym);
if (err < 0) {
- pr_err("%s decoding encrypted symlink failed: %d\n",
- __func__, err);
+ pr_err_client(cl,
+ "decoding encrypted symlink failed: %d\n",
+ err);
goto out;
}
symlen = err;
@@ -1184,8 +1209,9 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
inode->i_blocks = calc_inode_blocks(symlen);
} else {
if (symlen != i_size_read(inode)) {
- pr_err("%s %llx.%llx BAD symlink size %lld\n",
- __func__, ceph_vinop(inode),
+ pr_err_client(cl,
+ "%p %llx.%llx BAD symlink size %lld\n",
+ inode, ceph_vinop(inode),
i_size_read(inode));
i_size_write(inode, symlen);
inode->i_blocks = calc_inode_blocks(symlen);
@@ -1220,8 +1246,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
inode->i_fop = &ceph_dir_fops;
break;
default:
- pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
- ceph_vinop(inode), inode->i_mode);
+ pr_err_client(cl, "%p %llx.%llx BAD mode 0%o\n", inode,
+ ceph_vinop(inode), inode->i_mode);
}
/* were we issued a capability? */
@@ -1242,7 +1268,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
(info_caps & CEPH_CAP_FILE_SHARED) &&
(issued & CEPH_CAP_FILE_EXCL) == 0 &&
!__ceph_dir_is_complete(ci)) {
- dout(" marking %p complete (empty)\n", inode);
+ doutc(cl, " marking %p complete (empty)\n",
+ inode);
i_size_write(inode, 0);
__ceph_dir_set_complete(ci,
atomic64_read(&ci->i_release_count),
@@ -1251,8 +1278,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
wake = true;
} else {
- dout(" %p got snap_caps %s\n", inode,
- ceph_cap_string(info_caps));
+ doutc(cl, " %p got snap_caps %s\n", inode,
+ ceph_cap_string(info_caps));
ci->i_snap_caps |= info_caps;
}
}
@@ -1268,8 +1295,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
if (cap_fmode >= 0) {
if (!info_caps)
- pr_warn("mds issued no caps on %llx.%llx\n",
- ceph_vinop(inode));
+ pr_warn_client(cl, "mds issued no caps on %llx.%llx\n",
+ ceph_vinop(inode));
__ceph_touch_fmode(ci, mdsc, cap_fmode);
}
@@ -1315,14 +1342,14 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
unsigned long from_time,
struct ceph_mds_session **old_lease_session)
{
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct ceph_dentry_info *di = ceph_dentry(dentry);
unsigned mask = le16_to_cpu(lease->mask);
long unsigned duration = le32_to_cpu(lease->duration_ms);
long unsigned ttl = from_time + (duration * HZ) / 1000;
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
- dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
- dentry, duration, ttl);
+ doutc(cl, "%p duration %lu ms ttl %lu\n", dentry, duration, ttl);
/* only track leases on regular dentries */
if (ceph_snap(dir) != CEPH_NOSNAP)
@@ -1423,6 +1450,7 @@ out_unlock:
*/
static int splice_dentry(struct dentry **pdn, struct inode *in)
{
+ struct ceph_client *cl = ceph_inode_to_client(in);
struct dentry *dn = *pdn;
struct dentry *realdn;
@@ -1454,23 +1482,21 @@ static int splice_dentry(struct dentry **pdn, struct inode *in)
d_drop(dn);
realdn = d_splice_alias(in, dn);
if (IS_ERR(realdn)) {
- pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
- PTR_ERR(realdn), dn, in, ceph_vinop(in));
+ pr_err_client(cl, "error %ld %p inode %p ino %llx.%llx\n",
+ PTR_ERR(realdn), dn, in, ceph_vinop(in));
return PTR_ERR(realdn);
}
if (realdn) {
- dout("dn %p (%d) spliced with %p (%d) "
- "inode %p ino %llx.%llx\n",
- dn, d_count(dn),
- realdn, d_count(realdn),
- d_inode(realdn), ceph_vinop(d_inode(realdn)));
+ doutc(cl, "dn %p (%d) spliced with %p (%d) inode %p ino %llx.%llx\n",
+ dn, d_count(dn), realdn, d_count(realdn),
+ d_inode(realdn), ceph_vinop(d_inode(realdn)));
dput(dn);
*pdn = realdn;
} else {
BUG_ON(!ceph_dentry(dn));
- dout("dn %p attached to %p ino %llx.%llx\n",
- dn, d_inode(dn), ceph_vinop(d_inode(dn)));
+ doutc(cl, "dn %p attached to %p ino %llx.%llx\n", dn,
+ d_inode(dn), ceph_vinop(d_inode(dn)));
}
return 0;
}
@@ -1492,14 +1518,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct inode *in = NULL;
struct ceph_vino tvino, dvino;
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+ struct ceph_client *cl = fsc->client;
int err = 0;
- dout("fill_trace %p is_dentry %d is_target %d\n", req,
- rinfo->head->is_dentry, rinfo->head->is_target);
+ doutc(cl, "%p is_dentry %d is_target %d\n", req,
+ rinfo->head->is_dentry, rinfo->head->is_target);
if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
- dout("fill_trace reply is empty!\n");
+ doutc(cl, "reply is empty!\n");
if (rinfo->head->result == 0 && req->r_parent)
ceph_invalidate_dir_request(req);
return 0;
@@ -1556,13 +1583,13 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
retry_lookup:
dn = d_lookup(parent, &dname);
- dout("d_lookup on parent=%p name=%.*s got %p\n",
- parent, dname.len, dname.name, dn);
+ doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
if (!dn) {
dn = d_alloc(parent, &dname);
- dout("d_alloc %p '%.*s' = %p\n", parent,
- dname.len, dname.name, dn);
+ doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
if (!dn) {
dput(parent);
ceph_fname_free_buffer(dir, &oname);
@@ -1578,8 +1605,8 @@ retry_lookup:
} else if (d_really_is_positive(dn) &&
(ceph_ino(d_inode(dn)) != tvino.ino ||
ceph_snap(d_inode(dn)) != tvino.snap)) {
- dout(" dn %p points to wrong inode %p\n",
- dn, d_inode(dn));
+ doutc(cl, " dn %p points to wrong inode %p\n",
+ dn, d_inode(dn));
ceph_dir_clear_ordered(dir);
d_delete(dn);
dput(dn);
@@ -1604,8 +1631,8 @@ retry_lookup:
rinfo->head->result == 0) ? req->r_fmode : -1,
&req->r_caps_reservation);
if (err < 0) {
- pr_err("ceph_fill_inode badness %p %llx.%llx\n",
- in, ceph_vinop(in));
+ pr_err_client(cl, "badness %p %llx.%llx\n", in,
+ ceph_vinop(in));
req->r_target_inode = NULL;
if (in->i_state & I_NEW)
discard_new_inode(in);
@@ -1655,36 +1682,32 @@ retry_lookup:
have_lease = have_dir_cap ||
le32_to_cpu(rinfo->dlease->duration_ms);
if (!have_lease)
- dout("fill_trace no dentry lease or dir cap\n");
+ doutc(cl, "no dentry lease or dir cap\n");
/* rename? */
if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
struct inode *olddir = req->r_old_dentry_dir;
BUG_ON(!olddir);
- dout(" src %p '%pd' dst %p '%pd'\n",
- req->r_old_dentry,
- req->r_old_dentry,
- dn, dn);
- dout("fill_trace doing d_move %p -> %p\n",
- req->r_old_dentry, dn);
+ doutc(cl, " src %p '%pd' dst %p '%pd'\n",
+ req->r_old_dentry, req->r_old_dentry, dn, dn);
+ doutc(cl, "doing d_move %p -> %p\n", req->r_old_dentry, dn);
/* d_move screws up sibling dentries' offsets */
ceph_dir_clear_ordered(dir);
ceph_dir_clear_ordered(olddir);
d_move(req->r_old_dentry, dn);
- dout(" src %p '%pd' dst %p '%pd'\n",
- req->r_old_dentry,
- req->r_old_dentry,
- dn, dn);
+ doutc(cl, " src %p '%pd' dst %p '%pd'\n",
+ req->r_old_dentry, req->r_old_dentry, dn, dn);
/* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn);
- dout("dn %p gets new offset %lld\n", req->r_old_dentry,
- ceph_dentry(req->r_old_dentry)->offset);
+ doutc(cl, "dn %p gets new offset %lld\n",
+ req->r_old_dentry,
+ ceph_dentry(req->r_old_dentry)->offset);
/* swap r_dentry and r_old_dentry in case that
* splice_dentry() gets called later. This is safe
@@ -1696,9 +1719,9 @@ retry_lookup:
/* null dentry? */
if (!rinfo->head->is_target) {
- dout("fill_trace null dentry\n");
+ doutc(cl, "null dentry\n");
if (d_really_is_positive(dn)) {
- dout("d_delete %p\n", dn);
+ doutc(cl, "d_delete %p\n", dn);
ceph_dir_clear_ordered(dir);
d_delete(dn);
} else if (have_lease) {
@@ -1722,9 +1745,9 @@ retry_lookup:
goto done;
dn = req->r_dentry; /* may have spliced */
} else if (d_really_is_positive(dn) && d_inode(dn) != in) {
- dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
- dn, d_inode(dn), ceph_vinop(d_inode(dn)),
- ceph_vinop(in));
+ doutc(cl, " %p links to %p %llx.%llx, not %llx.%llx\n",
+ dn, d_inode(dn), ceph_vinop(d_inode(dn)),
+ ceph_vinop(in));
d_invalidate(dn);
have_lease = false;
}
@@ -1734,7 +1757,7 @@ retry_lookup:
rinfo->dlease, session,
req->r_request_started);
}
- dout(" final dn %p\n", dn);
+ doutc(cl, " final dn %p\n", dn);
} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
req->r_op == CEPH_MDS_OP_MKSNAP) &&
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
@@ -1745,7 +1768,8 @@ retry_lookup:
BUG_ON(!dir);
BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
BUG_ON(!req->r_dentry);
- dout(" linking snapped dir %p to dn %p\n", in, req->r_dentry);
+ doutc(cl, " linking snapped dir %p to dn %p\n", in,
+ req->r_dentry);
ceph_dir_clear_ordered(dir);
ihold(in);
err = splice_dentry(&req->r_dentry, in);
@@ -1767,7 +1791,7 @@ retry_lookup:
&dvino, ptvino);
}
done:
- dout("fill_trace done err=%d\n", err);
+ doutc(cl, "done err=%d\n", err);
return err;
}
@@ -1778,6 +1802,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
struct ceph_mds_session *session)
{
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
int i, err = 0;
for (i = 0; i < rinfo->dir_nr; i++) {
@@ -1792,14 +1817,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL);
if (IS_ERR(in)) {
err = PTR_ERR(in);
- dout("new_inode badness got %d\n", err);
+ doutc(cl, "badness got %d\n", err);
continue;
}
rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
-1, &req->r_caps_reservation);
if (rc < 0) {
- pr_err("ceph_fill_inode badness on %p got %d\n",
- in, rc);
+ pr_err_client(cl, "inode badness on %p got %d\n", in,
+ rc);
err = rc;
if (in->i_state & I_NEW) {
ihold(in);
@@ -1828,6 +1853,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
struct ceph_readdir_cache_control *ctl,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct ceph_inode_info *ci = ceph_inode(dir);
unsigned nsize = PAGE_SIZE / sizeof(struct dentry*);
unsigned idx = ctl->index % nsize;
@@ -1853,11 +1879,11 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
- dout("readdir cache dn %p idx %d\n", dn, ctl->index);
+ doutc(cl, "dn %p idx %d\n", dn, ctl->index);
ctl->dentries[idx] = dn;
ctl->index++;
} else {
- dout("disable readdir cache\n");
+ doutc(cl, "disable readdir cache\n");
ctl->index = -1;
}
return 0;
@@ -1870,6 +1896,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct inode *inode = d_inode(parent);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct qstr dname;
struct dentry *dn;
struct inode *in;
@@ -1897,19 +1924,18 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
if (rinfo->dir_dir &&
le32_to_cpu(rinfo->dir_dir->frag) != frag) {
- dout("readdir_prepopulate got new frag %x -> %x\n",
- frag, le32_to_cpu(rinfo->dir_dir->frag));
+ doutc(cl, "got new frag %x -> %x\n", frag,
+ le32_to_cpu(rinfo->dir_dir->frag));
frag = le32_to_cpu(rinfo->dir_dir->frag);
if (!rinfo->hash_order)
req->r_readdir_offset = 2;
}
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
- dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
- rinfo->dir_nr, parent);
+ doutc(cl, "%d items under SNAPDIR dn %p\n",
+ rinfo->dir_nr, parent);
} else {
- dout("readdir_prepopulate %d items under dn %p\n",
- rinfo->dir_nr, parent);
+ doutc(cl, "%d items under dn %p\n", rinfo->dir_nr, parent);
if (rinfo->dir_dir)
ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
@@ -1953,15 +1979,15 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
retry_lookup:
dn = d_lookup(parent, &dname);
- dout("d_lookup on parent=%p name=%.*s got %p\n",
- parent, dname.len, dname.name, dn);
+ doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
if (!dn) {
dn = d_alloc(parent, &dname);
- dout("d_alloc %p '%.*s' = %p\n", parent,
- dname.len, dname.name, dn);
+ doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
if (!dn) {
- dout("d_alloc badness\n");
+ doutc(cl, "d_alloc badness\n");
err = -ENOMEM;
goto out;
}
@@ -1974,8 +2000,8 @@ retry_lookup:
(ceph_ino(d_inode(dn)) != tvino.ino ||
ceph_snap(d_inode(dn)) != tvino.snap)) {
struct ceph_dentry_info *di = ceph_dentry(dn);
- dout(" dn %p points to wrong inode %p\n",
- dn, d_inode(dn));
+ doutc(cl, " dn %p points to wrong inode %p\n",
+ dn, d_inode(dn));
spin_lock(&dn->d_lock);
if (di->offset > 0 &&
@@ -1997,7 +2023,7 @@ retry_lookup:
} else {
in = ceph_get_inode(parent->d_sb, tvino, NULL);
if (IS_ERR(in)) {
- dout("new_inode badness\n");
+ doutc(cl, "new_inode badness\n");
d_drop(dn);
dput(dn);
err = PTR_ERR(in);
@@ -2008,7 +2034,8 @@ retry_lookup:
ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
-1, &req->r_caps_reservation);
if (ret < 0) {
- pr_err("ceph_fill_inode badness on %p\n", in);
+ pr_err_client(cl, "badness on %p %llx.%llx\n", in,
+ ceph_vinop(in));
if (d_really_is_negative(dn)) {
if (in->i_state & I_NEW) {
ihold(in);
@@ -2025,8 +2052,8 @@ retry_lookup:
if (d_really_is_negative(dn)) {
if (ceph_security_xattr_deadlock(in)) {
- dout(" skip splicing dn %p to inode %p"
- " (security xattr deadlock)\n", dn, in);
+ doutc(cl, " skip splicing dn %p to inode %p"
+ " (security xattr deadlock)\n", dn, in);
iput(in);
skipped++;
goto next_item;
@@ -2058,17 +2085,18 @@ out:
req->r_readdir_cache_idx = cache_ctl.index;
}
ceph_readdir_cache_release(&cache_ctl);
- dout("readdir_prepopulate done\n");
+ doutc(cl, "done\n");
return err;
}
bool ceph_inode_set_size(struct inode *inode, loff_t size)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
bool ret;
spin_lock(&ci->i_ceph_lock);
- dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
+ doutc(cl, "set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
i_size_write(inode, size);
ceph_fscache_update(inode);
inode->i_blocks = calc_inode_blocks(size);
@@ -2082,22 +2110,25 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size)
void ceph_queue_inode_work(struct inode *inode, int work_bit)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_inode_info *ci = ceph_inode(inode);
set_bit(work_bit, &ci->i_work_mask);
ihold(inode);
if (queue_work(fsc->inode_wq, &ci->i_work)) {
- dout("queue_inode_work %p, mask=%lx\n", inode, ci->i_work_mask);
+ doutc(cl, "%p %llx.%llx mask=%lx\n", inode,
+ ceph_vinop(inode), ci->i_work_mask);
} else {
- dout("queue_inode_work %p already queued, mask=%lx\n",
- inode, ci->i_work_mask);
+ doutc(cl, "%p %llx.%llx already queued, mask=%lx\n",
+ inode, ceph_vinop(inode), ci->i_work_mask);
iput(inode);
}
}
static void ceph_do_invalidate_pages(struct inode *inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u32 orig_gen;
int check = 0;
@@ -2107,8 +2138,9 @@ static void ceph_do_invalidate_pages(struct inode *inode)
mutex_lock(&ci->i_truncate_mutex);
if (ceph_inode_is_shutdown(inode)) {
- pr_warn_ratelimited("%s: inode %llx.%llx is shut down\n",
- __func__, ceph_vinop(inode));
+ pr_warn_ratelimited_client(cl,
+ "%p %llx.%llx is shut down\n", inode,
+ ceph_vinop(inode));
mapping_set_error(inode->i_mapping, -EIO);
truncate_pagecache(inode, 0);
mutex_unlock(&ci->i_truncate_mutex);
@@ -2116,8 +2148,8 @@ static void ceph_do_invalidate_pages(struct inode *inode)
}
spin_lock(&ci->i_ceph_lock);
- dout("invalidate_pages %p gen %d revoking %d\n", inode,
- ci->i_rdcache_gen, ci->i_rdcache_revoking);
+ doutc(cl, "%p %llx.%llx gen %d revoking %d\n", inode,
+ ceph_vinop(inode), ci->i_rdcache_gen, ci->i_rdcache_revoking);
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
check = 1;
@@ -2129,21 +2161,21 @@ static void ceph_do_invalidate_pages(struct inode *inode)
spin_unlock(&ci->i_ceph_lock);
if (invalidate_inode_pages2(inode->i_mapping) < 0) {
- pr_err("invalidate_inode_pages2 %llx.%llx failed\n",
- ceph_vinop(inode));
+ pr_err_client(cl, "invalidate_inode_pages2 %llx.%llx failed\n",
+ ceph_vinop(inode));
}
spin_lock(&ci->i_ceph_lock);
if (orig_gen == ci->i_rdcache_gen &&
orig_gen == ci->i_rdcache_revoking) {
- dout("invalidate_pages %p gen %d successful\n", inode,
- ci->i_rdcache_gen);
+ doutc(cl, "%p %llx.%llx gen %d successful\n", inode,
+ ceph_vinop(inode), ci->i_rdcache_gen);
ci->i_rdcache_revoking--;
check = 1;
} else {
- dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
- inode, orig_gen, ci->i_rdcache_gen,
- ci->i_rdcache_revoking);
+ doutc(cl, "%p %llx.%llx gen %d raced, now %d revoking %d\n",
+ inode, ceph_vinop(inode), orig_gen, ci->i_rdcache_gen,
+ ci->i_rdcache_revoking);
if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
check = 1;
}
@@ -2160,6 +2192,7 @@ out:
*/
void __ceph_do_pending_vmtruncate(struct inode *inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 to;
int wrbuffer_refs, finish = 0;
@@ -2168,7 +2201,8 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
retry:
spin_lock(&ci->i_ceph_lock);
if (ci->i_truncate_pending == 0) {
- dout("%s %p none pending\n", __func__, inode);
+ doutc(cl, "%p %llx.%llx none pending\n", inode,
+ ceph_vinop(inode));
spin_unlock(&ci->i_ceph_lock);
mutex_unlock(&ci->i_truncate_mutex);
return;
@@ -2180,7 +2214,8 @@ retry:
*/
if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
spin_unlock(&ci->i_ceph_lock);
- dout("%s %p flushing snaps first\n", __func__, inode);
+ doutc(cl, "%p %llx.%llx flushing snaps first\n", inode,
+ ceph_vinop(inode));
filemap_write_and_wait_range(&inode->i_data, 0,
inode->i_sb->s_maxbytes);
goto retry;
@@ -2191,8 +2226,8 @@ retry:
to = ci->i_truncate_pagecache_size;
wrbuffer_refs = ci->i_wrbuffer_ref;
- dout("%s %p (%d) to %lld\n", __func__, inode,
- ci->i_truncate_pending, to);
+ doutc(cl, "%p %llx.%llx (%d) to %lld\n", inode, ceph_vinop(inode),
+ ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
ceph_fscache_resize(inode, to);
@@ -2220,9 +2255,10 @@ static void ceph_inode_work(struct work_struct *work)
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
i_work);
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
- dout("writeback %p\n", inode);
+ doutc(cl, "writeback %p %llx.%llx\n", inode, ceph_vinop(inode));
filemap_fdatawrite(&inode->i_data);
}
if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask))
@@ -2294,6 +2330,7 @@ static int fill_fscrypt_truncate(struct inode *inode,
struct ceph_mds_request *req,
struct iattr *attr)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE;
loff_t pos, orig_pos = round_down(attr->ia_size,
@@ -2316,9 +2353,9 @@ static int fill_fscrypt_truncate(struct inode *inode,
issued = __ceph_caps_issued(ci, NULL);
- dout("%s size %lld -> %lld got cap refs on %s, issued %s\n", __func__,
- i_size, attr->ia_size, ceph_cap_string(got),
- ceph_cap_string(issued));
+ doutc(cl, "size %lld -> %lld got cap refs on %s, issued %s\n",
+ i_size, attr->ia_size, ceph_cap_string(got),
+ ceph_cap_string(issued));
/* Try to writeback the dirty pagecaches */
if (issued & (CEPH_CAP_FILE_BUFFER)) {
@@ -2373,8 +2410,7 @@ static int fill_fscrypt_truncate(struct inode *inode,
* If the Rados object doesn't exist, it will be set to 0.
*/
if (!objver) {
- dout("%s hit hole, ppos %lld < size %lld\n", __func__,
- pos, i_size);
+ doutc(cl, "hit hole, ppos %lld < size %lld\n", pos, i_size);
header.data_len = cpu_to_le32(8 + 8 + 4);
header.file_offset = 0;
@@ -2383,8 +2419,8 @@ static int fill_fscrypt_truncate(struct inode *inode,
header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE);
header.file_offset = cpu_to_le64(orig_pos);
- dout("%s encrypt block boff/bsize %d/%lu\n", __func__,
- boff, CEPH_FSCRYPT_BLOCK_SIZE);
+ doutc(cl, "encrypt block boff/bsize %d/%lu\n", boff,
+ CEPH_FSCRYPT_BLOCK_SIZE);
/* truncate and zero out the extra contents for the last block */
memset(iov.iov_base + boff, 0, PAGE_SIZE - boff);
@@ -2412,8 +2448,8 @@ static int fill_fscrypt_truncate(struct inode *inode,
}
req->r_pagelist = pagelist;
out:
- dout("%s %p size dropping cap refs on %s\n", __func__,
- inode, ceph_cap_string(got));
+ doutc(cl, "%p %llx.%llx size dropping cap refs on %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
if (iov.iov_base)
kunmap_local(iov.iov_base);
@@ -2424,13 +2460,14 @@ out:
return ret;
}
-int __ceph_setattr(struct inode *inode, struct iattr *attr,
- struct ceph_iattr *cia)
+int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
+ struct iattr *attr, struct ceph_iattr *cia)
{
struct ceph_inode_info *ci = ceph_inode(inode);
unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap_flush *prealloc_cf;
loff_t isize = i_size_read(inode);
int issued;
@@ -2469,7 +2506,8 @@ retry:
}
}
- dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
+ doutc(cl, "%p %llx.%llx issued %s\n", inode, ceph_vinop(inode),
+ ceph_cap_string(issued));
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
if (cia && cia->fscrypt_auth) {
u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth);
@@ -2480,8 +2518,8 @@ retry:
goto out;
}
- dout("setattr %llx:%llx fscrypt_auth len %u to %u)\n",
- ceph_vinop(inode), ci->fscrypt_auth_len, len);
+ doutc(cl, "%p %llx.%llx fscrypt_auth len %u to %u)\n", inode,
+ ceph_vinop(inode), ci->fscrypt_auth_len, len);
/* It should never be re-set once set */
WARN_ON_ONCE(ci->fscrypt_auth);
@@ -2509,38 +2547,44 @@ retry:
#endif /* CONFIG_FS_ENCRYPTION */
if (ia_valid & ATTR_UID) {
- dout("setattr %p uid %d -> %d\n", inode,
- from_kuid(&init_user_ns, inode->i_uid),
- from_kuid(&init_user_ns, attr->ia_uid));
+ kuid_t fsuid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid);
+
+ doutc(cl, "%p %llx.%llx uid %d -> %d\n", inode,
+ ceph_vinop(inode),
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kuid(&init_user_ns, attr->ia_uid));
if (issued & CEPH_CAP_AUTH_EXCL) {
- inode->i_uid = attr->ia_uid;
+ inode->i_uid = fsuid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- !uid_eq(attr->ia_uid, inode->i_uid)) {
+ !uid_eq(fsuid, inode->i_uid)) {
req->r_args.setattr.uid = cpu_to_le32(
- from_kuid(&init_user_ns, attr->ia_uid));
+ from_kuid(&init_user_ns, fsuid));
mask |= CEPH_SETATTR_UID;
release |= CEPH_CAP_AUTH_SHARED;
}
}
if (ia_valid & ATTR_GID) {
- dout("setattr %p gid %d -> %d\n", inode,
- from_kgid(&init_user_ns, inode->i_gid),
- from_kgid(&init_user_ns, attr->ia_gid));
+ kgid_t fsgid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid);
+
+ doutc(cl, "%p %llx.%llx gid %d -> %d\n", inode,
+ ceph_vinop(inode),
+ from_kgid(&init_user_ns, inode->i_gid),
+ from_kgid(&init_user_ns, attr->ia_gid));
if (issued & CEPH_CAP_AUTH_EXCL) {
- inode->i_gid = attr->ia_gid;
+ inode->i_gid = fsgid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- !gid_eq(attr->ia_gid, inode->i_gid)) {
+ !gid_eq(fsgid, inode->i_gid)) {
req->r_args.setattr.gid = cpu_to_le32(
- from_kgid(&init_user_ns, attr->ia_gid));
+ from_kgid(&init_user_ns, fsgid));
mask |= CEPH_SETATTR_GID;
release |= CEPH_CAP_AUTH_SHARED;
}
}
if (ia_valid & ATTR_MODE) {
- dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
- attr->ia_mode);
+ doutc(cl, "%p %llx.%llx mode 0%o -> 0%o\n", inode,
+ ceph_vinop(inode), inode->i_mode, attr->ia_mode);
if (issued & CEPH_CAP_AUTH_EXCL) {
inode->i_mode = attr->ia_mode;
dirtied |= CEPH_CAP_AUTH_EXCL;
@@ -2556,9 +2600,10 @@ retry:
if (ia_valid & ATTR_ATIME) {
struct timespec64 atime = inode_get_atime(inode);
- dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode,
- atime.tv_sec, atime.tv_nsec,
- attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
+ doutc(cl, "%p %llx.%llx atime %lld.%09ld -> %lld.%09ld\n",
+ inode, ceph_vinop(inode),
+ atime.tv_sec, atime.tv_nsec,
+ attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
if (issued & CEPH_CAP_FILE_EXCL) {
ci->i_time_warp_seq++;
inode_set_atime_to_ts(inode, attr->ia_atime);
@@ -2578,7 +2623,8 @@ retry:
}
}
if (ia_valid & ATTR_SIZE) {
- dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
+ doutc(cl, "%p %llx.%llx size %lld -> %lld\n", inode,
+ ceph_vinop(inode), isize, attr->ia_size);
/*
* Only when the new size is smaller and not aligned to
* CEPH_FSCRYPT_BLOCK_SIZE will the RMW is needed.
@@ -2631,9 +2677,10 @@ retry:
if (ia_valid & ATTR_MTIME) {
struct timespec64 mtime = inode_get_mtime(inode);
- dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
- mtime.tv_sec, mtime.tv_nsec,
- attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
+ doutc(cl, "%p %llx.%llx mtime %lld.%09ld -> %lld.%09ld\n",
+ inode, ceph_vinop(inode),
+ mtime.tv_sec, mtime.tv_nsec,
+ attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
if (issued & CEPH_CAP_FILE_EXCL) {
ci->i_time_warp_seq++;
inode_set_mtime_to_ts(inode, attr->ia_mtime);
@@ -2656,11 +2703,12 @@ retry:
if (ia_valid & ATTR_CTIME) {
bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
- dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode,
- inode_get_ctime_sec(inode),
- inode_get_ctime_nsec(inode),
- attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
- only ? "ctime only" : "ignored");
+ doutc(cl, "%p %llx.%llx ctime %lld.%09ld -> %lld.%09ld (%s)\n",
+ inode, ceph_vinop(inode),
+ inode_get_ctime_sec(inode),
+ inode_get_ctime_nsec(inode),
+ attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
+ only ? "ctime only" : "ignored");
if (only) {
/*
* if kernel wants to dirty ctime but nothing else,
@@ -2678,7 +2726,8 @@ retry:
}
}
if (ia_valid & ATTR_FILE)
- dout("setattr %p ATTR_FILE ... hrm!\n", inode);
+ doutc(cl, "%p %llx.%llx ATTR_FILE ... hrm!\n", inode,
+ ceph_vinop(inode));
if (dirtied) {
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
@@ -2719,16 +2768,17 @@ retry:
*/
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err == -EAGAIN && truncate_retry--) {
- dout("setattr %p result=%d (%s locally, %d remote), retry it!\n",
- inode, err, ceph_cap_string(dirtied), mask);
+ doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote), retry it!\n",
+ inode, ceph_vinop(inode), err,
+ ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req);
ceph_free_cap_flush(prealloc_cf);
goto retry;
}
}
out:
- dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
- ceph_cap_string(dirtied), mask);
+ doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote)\n", inode,
+ ceph_vinop(inode), err, ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req);
ceph_free_cap_flush(prealloc_cf);
@@ -2746,7 +2796,7 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
int err;
if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -2759,7 +2809,7 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (err)
return err;
- err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
+ err = setattr_prepare(idmap, dentry, attr);
if (err != 0)
return err;
@@ -2771,10 +2821,10 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
return -EDQUOT;
- err = __ceph_setattr(inode, attr, NULL);
+ err = __ceph_setattr(idmap, inode, attr, NULL);
if (err >= 0 && (attr->ia_valid & ATTR_MODE))
- err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode);
+ err = posix_acl_chmod(idmap, dentry, attr->ia_mode);
return err;
}
@@ -2816,19 +2866,21 @@ int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
int mask, bool force)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int mode;
int err;
if (ceph_snap(inode) == CEPH_SNAPDIR) {
- dout("do_getattr inode %p SNAPDIR\n", inode);
+ doutc(cl, "inode %p %llx.%llx SNAPDIR\n", inode,
+ ceph_vinop(inode));
return 0;
}
- dout("do_getattr inode %p mask %s mode 0%o\n",
- inode, ceph_cap_string(mask), inode->i_mode);
+ doutc(cl, "inode %p %llx.%llx mask %s mode 0%o\n", inode,
+ ceph_vinop(inode), ceph_cap_string(mask), inode->i_mode);
if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
return 0;
@@ -2855,14 +2907,15 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
}
}
ceph_mdsc_put_request(req);
- dout("do_getattr result=%d\n", err);
+ doutc(cl, "result=%d\n", err);
return err;
}
int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int mode = USE_AUTH_MDS;
@@ -2892,7 +2945,7 @@ int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
xattr_value = req->r_reply_info.xattr_info.xattr_value;
xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
- dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
+ doutc(cl, "xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
err = (int)xattr_value_len;
if (size == 0)
@@ -2907,7 +2960,7 @@ int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
put:
ceph_mdsc_put_request(req);
out:
- dout("do_getvxattr result=%d\n", err);
+ doutc(cl, "result=%d\n", err);
return err;
}
@@ -2927,7 +2980,7 @@ int ceph_permission(struct mnt_idmap *idmap, struct inode *inode,
err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
if (!err)
- err = generic_permission(&nop_mnt_idmap, inode, mask);
+ err = generic_permission(idmap, inode, mask);
return err;
}
@@ -2984,7 +3037,7 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
return err;
}
- generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
stat->ino = ceph_present_inode(inode);
/*
@@ -3007,7 +3060,7 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
if (S_ISDIR(inode->i_mode)) {
- if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) {
+ if (ceph_test_mount_opt(ceph_sb_to_fs_client(sb), RBYTES)) {
stat->size = ci->i_rbytes;
} else if (ceph_snap(inode) == CEPH_SNAPDIR) {
struct ceph_inode_info *pci;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 91a84917d203..e861de3c79b9 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -65,7 +65,7 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
struct ceph_inode_info *ci = ceph_inode(file_inode(file));
@@ -140,7 +140,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
int err;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
/* copy and validate */
if (copy_from_user(&l, arg, sizeof(l)))
@@ -183,7 +183,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
- &ceph_sb_to_client(inode->i_sb)->client->osdc;
+ &ceph_sb_to_fs_client(inode->i_sb)->client->osdc;
struct ceph_object_locator oloc;
CEPH_DEFINE_OID_ONSTACK(oid);
u32 xlen;
@@ -244,7 +244,8 @@ static long ceph_ioctl_lazyio(struct file *file)
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
spin_lock(&ci->i_ceph_lock);
@@ -252,11 +253,13 @@ static long ceph_ioctl_lazyio(struct file *file)
ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
__ceph_touch_fmode(ci, mdsc, fi->fmode);
spin_unlock(&ci->i_ceph_lock);
- dout("ioctl_layzio: file %p marked lazy\n", file);
+ doutc(cl, "file %p %p %llx.%llx marked lazy\n", file, inode,
+ ceph_vinop(inode));
ceph_check_caps(ci, 0);
} else {
- dout("ioctl_layzio: file %p already lazy\n", file);
+ doutc(cl, "file %p %p %llx.%llx already lazy\n", file, inode,
+ ceph_vinop(inode));
}
return 0;
}
@@ -355,10 +358,12 @@ static const char *ceph_ioctl_cmd_name(const unsigned int cmd)
long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
+ struct inode *inode = file_inode(file);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
int ret;
- dout("ioctl file %p cmd %s arg %lu\n", file,
- ceph_ioctl_cmd_name(cmd), arg);
+ doutc(fsc->client, "file %p %p %llx.%llx cmd %s arg %lu\n", file,
+ inode, ceph_vinop(inode), ceph_ioctl_cmd_name(cmd), arg);
switch (cmd) {
case CEPH_IOC_GET_LAYOUT:
return ceph_ioctl_get_layout(file, (void __user *)arg);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index cb51c7e9c8e2..e07ad29ff8b9 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -77,6 +77,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
int cmd, u8 wait, struct file_lock *fl)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
int err;
u64 length = 0;
@@ -111,10 +112,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
owner = secure_addr(fl->fl_owner);
- dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
- "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type,
- (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
- wait, fl->fl_type);
+ doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, "
+ "start: %llu, length: %llu, wait: %d, type: %d\n",
+ (int)lock_type, (int)operation, owner, (u64)fl->fl_pid,
+ fl->fl_start, length, wait, fl->fl_type);
req->r_args.filelock_change.rule = lock_type;
req->r_args.filelock_change.type = cmd;
@@ -147,16 +148,17 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
}
ceph_mdsc_put_request(req);
- dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
- "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type,
- (int)operation, (u64)fl->fl_pid, fl->fl_start,
- length, wait, fl->fl_type, err);
+ doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, "
+ "length: %llu, wait: %d, type: %d, err code %d\n",
+ (int)lock_type, (int)operation, (u64)fl->fl_pid,
+ fl->fl_start, length, wait, fl->fl_type, err);
return err;
}
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *intr_req;
struct inode *inode = req->r_inode;
int err, lock_type;
@@ -174,8 +176,7 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
if (!err)
return 0;
- dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
- req->r_tid);
+ doutc(cl, "request %llu was interrupted\n", req->r_tid);
mutex_lock(&mdsc->mutex);
if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
@@ -246,6 +247,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int err = 0;
u16 op = CEPH_MDS_OP_SETFILELOCK;
u8 wait = 0;
@@ -257,7 +259,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
- dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
+ doutc(cl, "fl_owner: %p\n", fl->fl_owner);
/* set wait bit as appropriate, then make command as Ceph expects it*/
if (IS_GETLK(cmd))
@@ -292,7 +294,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
if (!err) {
if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) {
- dout("mds locked, locking locally\n");
+ doutc(cl, "locking locally\n");
err = posix_lock_file(file, fl, NULL);
if (err) {
/* undo! This should only happen if
@@ -300,8 +302,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
* deadlock. */
ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on posix_lock_file, undid lock\n",
- err);
+ doutc(cl, "got %d on posix_lock_file, undid lock\n",
+ err);
}
}
}
@@ -312,6 +314,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int err = 0;
u8 wait = 0;
u8 lock_cmd;
@@ -322,7 +325,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
- dout("ceph_flock, fl_file: %p\n", fl->fl_file);
+ doutc(cl, "fl_file: %p\n", fl->fl_file);
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
@@ -359,7 +362,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
ceph_lock_message(CEPH_LOCK_FLOCK,
CEPH_MDS_OP_SETFILELOCK,
inode, CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on locks_lock_file_wait, undid lock\n", err);
+ doutc(cl, "got %d on locks_lock_file_wait, undid lock\n",
+ err);
}
}
return err;
@@ -371,6 +375,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
*/
void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct file_lock *lock;
struct file_lock_context *ctx;
@@ -386,17 +391,20 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
++(*flock_count);
spin_unlock(&ctx->flc_lock);
}
- dout("counted %d flock locks and %d fcntl locks\n",
- *flock_count, *fcntl_count);
+ doutc(cl, "counted %d flock locks and %d fcntl locks\n",
+ *flock_count, *fcntl_count);
}
/*
* Given a pointer to a lock, convert it to a ceph filelock
*/
-static int lock_to_ceph_filelock(struct file_lock *lock,
+static int lock_to_ceph_filelock(struct inode *inode,
+ struct file_lock *lock,
struct ceph_filelock *cephlock)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int err = 0;
+
cephlock->start = cpu_to_le64(lock->fl_start);
cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
cephlock->client = cpu_to_le64(0);
@@ -414,7 +422,7 @@ static int lock_to_ceph_filelock(struct file_lock *lock,
cephlock->type = CEPH_LOCK_UNLOCK;
break;
default:
- dout("Have unknown lock type %d\n", lock->fl_type);
+ doutc(cl, "Have unknown lock type %d\n", lock->fl_type);
err = -EINVAL;
}
@@ -432,13 +440,14 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
{
struct file_lock *lock;
struct file_lock_context *ctx = locks_inode_context(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int err = 0;
int seen_fcntl = 0;
int seen_flock = 0;
int l = 0;
- dout("encoding %d flock and %d fcntl locks\n", num_flock_locks,
- num_fcntl_locks);
+ doutc(cl, "encoding %d flock and %d fcntl locks\n", num_flock_locks,
+ num_fcntl_locks);
if (!ctx)
return 0;
@@ -450,7 +459,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
err = -ENOSPC;
goto fail;
}
- err = lock_to_ceph_filelock(lock, &flocks[l]);
+ err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
if (err)
goto fail;
++l;
@@ -461,7 +470,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
err = -ENOSPC;
goto fail;
}
- err = lock_to_ceph_filelock(lock, &flocks[l]);
+ err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
if (err)
goto fail;
++l;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index de798444bb97..d95eb525519a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -12,6 +12,7 @@
#include <linux/bits.h>
#include <linux/ktime.h>
#include <linux/bitmap.h>
+#include <linux/mnt_idmapping.h>
#include "super.h"
#include "mds_client.h"
@@ -411,6 +412,7 @@ static int parse_reply_info_readdir(void **p, void *end,
u64 features)
{
struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
u32 num, i = 0;
int err;
@@ -433,7 +435,7 @@ static int parse_reply_info_readdir(void **p, void *end,
BUG_ON(!info->dir_entries);
if ((unsigned long)(info->dir_entries + num) >
(unsigned long)info->dir_entries + info->dir_buf_size) {
- pr_err("dir contents are larger than expected\n");
+ pr_err_client(cl, "dir contents are larger than expected\n");
WARN_ON(1);
goto bad;
}
@@ -454,7 +456,7 @@ static int parse_reply_info_readdir(void **p, void *end,
ceph_decode_need(p, end, _name_len, bad);
_name = *p;
*p += _name_len;
- dout("parsed dir dname '%.*s'\n", _name_len, _name);
+ doutc(cl, "parsed dir dname '%.*s'\n", _name_len, _name);
if (info->hash_order)
rde->raw_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
@@ -514,8 +516,8 @@ static int parse_reply_info_readdir(void **p, void *end,
rde->is_nokey = false;
err = ceph_fname_to_usr(&fname, &tname, &oname, &rde->is_nokey);
if (err) {
- pr_err("%s unable to decode %.*s, got %d\n", __func__,
- _name_len, _name, err);
+ pr_err_client(cl, "unable to decode %.*s, got %d\n",
+ _name_len, _name, err);
goto out_bad;
}
rde->name = oname.name;
@@ -539,7 +541,7 @@ done:
bad:
err = -EIO;
out_bad:
- pr_err("problem parsing dir contents %d\n", err);
+ pr_err_client(cl, "problem parsing dir contents %d\n", err);
return err;
}
@@ -570,10 +572,11 @@ bad:
static int ceph_parse_deleg_inos(void **p, void *end,
struct ceph_mds_session *s)
{
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
u32 sets;
ceph_decode_32_safe(p, end, sets, bad);
- dout("got %u sets of delegated inodes\n", sets);
+ doutc(cl, "got %u sets of delegated inodes\n", sets);
while (sets--) {
u64 start, len;
@@ -582,8 +585,9 @@ static int ceph_parse_deleg_inos(void **p, void *end,
/* Don't accept a delegation of system inodes */
if (start < CEPH_INO_SYSTEM_BASE) {
- pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",
- start, len);
+ pr_warn_ratelimited_client(cl,
+ "ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",
+ start, len);
continue;
}
while (len--) {
@@ -591,10 +595,10 @@ static int ceph_parse_deleg_inos(void **p, void *end,
DELEGATED_INO_AVAILABLE,
GFP_KERNEL);
if (!err) {
- dout("added delegated inode 0x%llx\n",
- start - 1);
+ doutc(cl, "added delegated inode 0x%llx\n", start - 1);
} else if (err == -EBUSY) {
- pr_warn("MDS delegated inode 0x%llx more than once.\n",
+ pr_warn_client(cl,
+ "MDS delegated inode 0x%llx more than once.\n",
start - 1);
} else {
return err;
@@ -744,6 +748,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
struct ceph_mds_request *req, u64 features)
{
struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
void *p, *end;
u32 len;
int err;
@@ -783,7 +788,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
bad:
err = -EIO;
out_bad:
- pr_err("mds parse_reply err %d\n", err);
+ pr_err_client(cl, "mds parse_reply err %d\n", err);
ceph_msg_dump(msg);
return err;
}
@@ -830,7 +835,8 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
*/
int ceph_wait_on_conflict_unlink(struct dentry *dentry)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
+ struct ceph_client *cl = fsc->client;
struct dentry *pdentry = dentry->d_parent;
struct dentry *udentry, *found = NULL;
struct ceph_dentry_info *di;
@@ -855,8 +861,8 @@ int ceph_wait_on_conflict_unlink(struct dentry *dentry)
goto next;
if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
- pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
- __func__, dentry, dentry);
+ pr_warn_client(cl, "dentry %p:%pd async unlink bit is not set\n",
+ dentry, dentry);
if (!d_same_name(udentry, pdentry, &dname))
goto next;
@@ -872,8 +878,8 @@ next:
if (likely(!found))
return 0;
- dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__,
- dentry, dentry, found, found);
+ doutc(cl, "dentry %p:%pd conflict with old %p:%pd\n", dentry, dentry,
+ found, found);
err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
TASK_KILLABLE);
@@ -957,6 +963,7 @@ static int __verify_registered_session(struct ceph_mds_client *mdsc,
static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
int mds)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_session *s;
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
@@ -973,7 +980,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
int newmax = 1 << get_count_order(mds + 1);
struct ceph_mds_session **sa;
- dout("%s: realloc to %d\n", __func__, newmax);
+ doutc(cl, "realloc to %d\n", newmax);
sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
if (!sa)
goto fail_realloc;
@@ -986,7 +993,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
mdsc->max_sessions = newmax;
}
- dout("%s: mds%d\n", __func__, mds);
+ doutc(cl, "mds%d\n", mds);
s->s_mdsc = mdsc;
s->s_mds = mds;
s->s_state = CEPH_MDS_SESSION_NEW;
@@ -1029,7 +1036,7 @@ fail_realloc:
static void __unregister_session(struct ceph_mds_client *mdsc,
struct ceph_mds_session *s)
{
- dout("__unregister_session mds%d %p\n", s->s_mds, s);
+ doutc(mdsc->fsc->client, "mds%d %p\n", s->s_mds, s);
BUG_ON(mdsc->sessions[s->s_mds] != s);
mdsc->sessions[s->s_mds] = NULL;
ceph_con_close(&s->s_con);
@@ -1116,6 +1123,8 @@ void ceph_mdsc_release_request(struct kref *kref)
kfree(req->r_path1);
kfree(req->r_path2);
put_cred(req->r_cred);
+ if (req->r_mnt_idmap)
+ mnt_idmap_put(req->r_mnt_idmap);
if (req->r_pagelist)
ceph_pagelist_release(req->r_pagelist);
kfree(req->r_fscrypt_auth);
@@ -1155,6 +1164,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req,
struct inode *dir)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int ret = 0;
req->r_tid = ++mdsc->last_tid;
@@ -1162,18 +1172,20 @@ static void __register_request(struct ceph_mds_client *mdsc,
ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation,
req->r_num_caps);
if (ret < 0) {
- pr_err("__register_request %p "
- "failed to reserve caps: %d\n", req, ret);
+ pr_err_client(cl, "%p failed to reserve caps: %d\n",
+ req, ret);
/* set req->r_err to fail early from __do_request */
req->r_err = ret;
return;
}
}
- dout("__register_request %p tid %lld\n", req, req->r_tid);
+ doutc(cl, "%p tid %lld\n", req, req->r_tid);
ceph_mdsc_get_request(req);
insert_request(&mdsc->request_tree, req);
req->r_cred = get_current_cred();
+ if (!req->r_mnt_idmap)
+ req->r_mnt_idmap = &nop_mnt_idmap;
if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
mdsc->oldest_tid = req->r_tid;
@@ -1192,7 +1204,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
static void __unregister_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
- dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+ doutc(mdsc->fsc->client, "%p tid %lld\n", req, req->r_tid);
/* Never leave an unregistered request on an unsafe list! */
list_del_init(&req->r_unsafe_item);
@@ -1278,6 +1290,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
int mds = -1;
u32 hash = req->r_direct_hash;
bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
+ struct ceph_client *cl = mdsc->fsc->client;
if (random)
*random = false;
@@ -1289,8 +1302,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
if (req->r_resend_mds >= 0 &&
(__have_session(mdsc, req->r_resend_mds) ||
ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
- dout("%s using resend_mds mds%d\n", __func__,
- req->r_resend_mds);
+ doutc(cl, "using resend_mds mds%d\n", req->r_resend_mds);
return req->r_resend_mds;
}
@@ -1307,7 +1319,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
rcu_read_lock();
inode = get_nonsnap_parent(req->r_dentry);
rcu_read_unlock();
- dout("%s using snapdir's parent %p\n", __func__, inode);
+ doutc(cl, "using snapdir's parent %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
}
} else if (req->r_dentry) {
/* ignore race with rename; old or new d_parent is okay */
@@ -1327,7 +1340,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
/* direct snapped/virtual snapdir requests
* based on parent dir inode */
inode = get_nonsnap_parent(parent);
- dout("%s using nonsnap parent %p\n", __func__, inode);
+ doutc(cl, "using nonsnap parent %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
} else {
/* dentry target */
inode = d_inode(req->r_dentry);
@@ -1343,10 +1357,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
rcu_read_unlock();
}
- dout("%s %p is_hash=%d (0x%x) mode %d\n", __func__, inode, (int)is_hash,
- hash, mode);
if (!inode)
goto random;
+
+ doutc(cl, "%p %llx.%llx is_hash=%d (0x%x) mode %d\n", inode,
+ ceph_vinop(inode), (int)is_hash, hash, mode);
ci = ceph_inode(inode);
if (is_hash && S_ISDIR(inode->i_mode)) {
@@ -1362,9 +1377,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
get_random_bytes(&r, 1);
r %= frag.ndist;
mds = frag.dist[r];
- dout("%s %p %llx.%llx frag %u mds%d (%d/%d)\n",
- __func__, inode, ceph_vinop(inode),
- frag.frag, mds, (int)r, frag.ndist);
+ doutc(cl, "%p %llx.%llx frag %u mds%d (%d/%d)\n",
+ inode, ceph_vinop(inode), frag.frag,
+ mds, (int)r, frag.ndist);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
CEPH_MDS_STATE_ACTIVE &&
!ceph_mdsmap_is_laggy(mdsc->mdsmap, mds))
@@ -1377,9 +1392,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
if (frag.mds >= 0) {
/* choose auth mds */
mds = frag.mds;
- dout("%s %p %llx.%llx frag %u mds%d (auth)\n",
- __func__, inode, ceph_vinop(inode),
- frag.frag, mds);
+ doutc(cl, "%p %llx.%llx frag %u mds%d (auth)\n",
+ inode, ceph_vinop(inode), frag.frag, mds);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
CEPH_MDS_STATE_ACTIVE) {
if (!ceph_mdsmap_is_laggy(mdsc->mdsmap,
@@ -1403,9 +1417,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
goto random;
}
mds = cap->session->s_mds;
- dout("%s %p %llx.%llx mds%d (%scap %p)\n", __func__,
- inode, ceph_vinop(inode), mds,
- cap == ci->i_auth_cap ? "auth " : "", cap);
+ doutc(cl, "%p %llx.%llx mds%d (%scap %p)\n", inode,
+ ceph_vinop(inode), mds,
+ cap == ci->i_auth_cap ? "auth " : "", cap);
spin_unlock(&ci->i_ceph_lock);
out:
iput(inode);
@@ -1416,7 +1430,7 @@ random:
*random = true;
mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
- dout("%s chose random mds%d\n", __func__, mds);
+ doutc(cl, "chose random mds%d\n", mds);
return mds;
}
@@ -1529,6 +1543,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
int metadata_key_count = 0;
struct ceph_options *opt = mdsc->fsc->client->options;
struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
+ struct ceph_client *cl = mdsc->fsc->client;
size_t size, count;
void *p, *end;
int ret;
@@ -1567,7 +1582,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
GFP_NOFS, false);
if (!msg) {
- pr_err("ENOMEM creating session open msg\n");
+ pr_err_client(cl, "ENOMEM creating session open msg\n");
return ERR_PTR(-ENOMEM);
}
p = msg->front.iov_base;
@@ -1607,14 +1622,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
ret = encode_supported_features(&p, end);
if (ret) {
- pr_err("encode_supported_features failed!\n");
+ pr_err_client(cl, "encode_supported_features failed!\n");
ceph_msg_put(msg);
return ERR_PTR(ret);
}
ret = encode_metric_spec(&p, end);
if (ret) {
- pr_err("encode_metric_spec failed!\n");
+ pr_err_client(cl, "encode_metric_spec failed!\n");
ceph_msg_put(msg);
return ERR_PTR(ret);
}
@@ -1642,8 +1657,8 @@ static int __open_session(struct ceph_mds_client *mdsc,
/* wait for mds to go active? */
mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
- dout("open_session to mds%d (%s)\n", mds,
- ceph_mds_state_name(mstate));
+ doutc(mdsc->fsc->client, "open_session to mds%d (%s)\n", mds,
+ ceph_mds_state_name(mstate));
session->s_state = CEPH_MDS_SESSION_OPENING;
session->s_renew_requested = jiffies;
@@ -1686,8 +1701,9 @@ struct ceph_mds_session *
ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
{
struct ceph_mds_session *session;
+ struct ceph_client *cl = mdsc->fsc->client;
- dout("open_export_target_session to mds%d\n", target);
+ doutc(cl, "to mds%d\n", target);
mutex_lock(&mdsc->mutex);
session = __open_export_target_session(mdsc, target);
@@ -1702,13 +1718,14 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_info *mi;
struct ceph_mds_session *ts;
int i, mds = session->s_mds;
+ struct ceph_client *cl = mdsc->fsc->client;
if (mds >= mdsc->mdsmap->possible_max_rank)
return;
mi = &mdsc->mdsmap->m_info[mds];
- dout("open_export_target_sessions for mds%d (%d targets)\n",
- session->s_mds, mi->num_export_targets);
+ doutc(cl, "for mds%d (%d targets)\n", session->s_mds,
+ mi->num_export_targets);
for (i = 0; i < mi->num_export_targets; i++) {
ts = __open_export_target_session(mdsc, mi->export_targets[i]);
@@ -1731,11 +1748,13 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
static void detach_cap_releases(struct ceph_mds_session *session,
struct list_head *target)
{
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
+
lockdep_assert_held(&session->s_cap_lock);
list_splice_init(&session->s_cap_releases, target);
session->s_num_cap_releases = 0;
- dout("dispose_cap_releases mds%d\n", session->s_mds);
+ doutc(cl, "mds%d\n", session->s_mds);
}
static void dispose_cap_releases(struct ceph_mds_client *mdsc,
@@ -1753,16 +1772,17 @@ static void dispose_cap_releases(struct ceph_mds_client *mdsc,
static void cleanup_session_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct rb_node *p;
- dout("cleanup_session_requests mds%d\n", session->s_mds);
+ doutc(cl, "mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex);
while (!list_empty(&session->s_unsafe)) {
req = list_first_entry(&session->s_unsafe,
struct ceph_mds_request, r_unsafe_item);
- pr_warn_ratelimited(" dropping unsafe request %llu\n",
- req->r_tid);
+ pr_warn_ratelimited_client(cl, " dropping unsafe request %llu\n",
+ req->r_tid);
if (req->r_target_inode)
mapping_set_error(req->r_target_inode->i_mapping, -EIO);
if (req->r_unsafe_dir)
@@ -1791,13 +1811,14 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
int (*cb)(struct inode *, int mds, void *),
void *arg)
{
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct list_head *p;
struct ceph_cap *cap;
struct inode *inode, *last_inode = NULL;
struct ceph_cap *old_cap = NULL;
int ret;
- dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
+ doutc(cl, "%p mds%d\n", session, session->s_mds);
spin_lock(&session->s_cap_lock);
p = session->s_caps.next;
while (p != &session->s_caps) {
@@ -1828,8 +1849,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
spin_lock(&session->s_cap_lock);
p = p->next;
if (!cap->ci) {
- dout("iterate_session_caps finishing cap %p removal\n",
- cap);
+ doutc(cl, "finishing cap %p removal\n", cap);
BUG_ON(cap->session != session);
cap->session = NULL;
list_del_init(&cap->session_caps);
@@ -1858,6 +1878,7 @@ out:
static int remove_session_caps_cb(struct inode *inode, int mds, void *arg)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
bool invalidate = false;
struct ceph_cap *cap;
int iputs = 0;
@@ -1865,8 +1886,8 @@ static int remove_session_caps_cb(struct inode *inode, int mds, void *arg)
spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (cap) {
- dout(" removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->netfs.inode);
+ doutc(cl, " removing cap %p, ci is %p, inode is %p\n",
+ cap, ci, &ci->netfs.inode);
iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
}
@@ -1890,7 +1911,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
struct super_block *sb = fsc->sb;
LIST_HEAD(dispose);
- dout("remove_session_caps on %p\n", session);
+ doutc(fsc->client, "on %p\n", session);
ceph_iterate_session_caps(session, remove_session_caps_cb, fsc);
wake_up_all(&fsc->mdsc->cap_flushing_wq);
@@ -1971,7 +1992,9 @@ static int wake_up_session_cb(struct inode *inode, int mds, void *arg)
static void wake_up_session_caps(struct ceph_mds_session *session, int ev)
{
- dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
+
+ doutc(cl, "session %p mds%d\n", session, session->s_mds);
ceph_iterate_session_caps(session, wake_up_session_cb,
(void *)(unsigned long)ev);
}
@@ -1985,25 +2008,26 @@ static void wake_up_session_caps(struct ceph_mds_session *session, int ev)
static int send_renew_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
int state;
if (time_after_eq(jiffies, session->s_cap_ttl) &&
time_after_eq(session->s_cap_ttl, session->s_renew_requested))
- pr_info("mds%d caps stale\n", session->s_mds);
+ pr_info_client(cl, "mds%d caps stale\n", session->s_mds);
session->s_renew_requested = jiffies;
/* do not try to renew caps until a recovering mds has reconnected
* with its clients. */
state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
if (state < CEPH_MDS_STATE_RECONNECT) {
- dout("send_renew_caps ignoring mds%d (%s)\n",
- session->s_mds, ceph_mds_state_name(state));
+ doutc(cl, "ignoring mds%d (%s)\n", session->s_mds,
+ ceph_mds_state_name(state));
return 0;
}
- dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
- ceph_mds_state_name(state));
+ doutc(cl, "to mds%d (%s)\n", session->s_mds,
+ ceph_mds_state_name(state));
msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
++session->s_renew_seq);
if (!msg)
@@ -2015,10 +2039,11 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session, u64 seq)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
- dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
- session->s_mds, ceph_session_state_name(session->s_state), seq);
+ doutc(cl, "to mds%d (%s)s seq %lld\n", session->s_mds,
+ ceph_session_state_name(session->s_state), seq);
msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
if (!msg)
return -ENOMEM;
@@ -2035,6 +2060,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
static void renewed_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session, int is_renew)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int was_stale;
int wake = 0;
@@ -2046,15 +2072,17 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
if (was_stale) {
if (time_before(jiffies, session->s_cap_ttl)) {
- pr_info("mds%d caps renewed\n", session->s_mds);
+ pr_info_client(cl, "mds%d caps renewed\n",
+ session->s_mds);
wake = 1;
} else {
- pr_info("mds%d caps still stale\n", session->s_mds);
+ pr_info_client(cl, "mds%d caps still stale\n",
+ session->s_mds);
}
}
- dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
- session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
- time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
+ doutc(cl, "mds%d ttl now %lu, was %s, now %s\n", session->s_mds,
+ session->s_cap_ttl, was_stale ? "stale" : "fresh",
+ time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
spin_unlock(&session->s_cap_lock);
if (wake)
@@ -2066,11 +2094,11 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
*/
static int request_close_session(struct ceph_mds_session *session)
{
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct ceph_msg *msg;
- dout("request_close_session mds%d state %s seq %lld\n",
- session->s_mds, ceph_session_state_name(session->s_state),
- session->s_seq);
+ doutc(cl, "mds%d state %s seq %lld\n", session->s_mds,
+ ceph_session_state_name(session->s_state), session->s_seq);
msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
session->s_seq);
if (!msg)
@@ -2126,6 +2154,8 @@ out:
*/
static int trim_caps_cb(struct inode *inode, int mds, void *arg)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
int *remaining = arg;
struct ceph_inode_info *ci = ceph_inode(inode);
int used, wanted, oissued, mine;
@@ -2145,9 +2175,10 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
wanted = __ceph_caps_file_wanted(ci);
oissued = __ceph_caps_issued_other(ci, cap);
- dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
- inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
- ceph_cap_string(used), ceph_cap_string(wanted));
+ doutc(cl, "%p %llx.%llx cap %p mine %s oissued %s used %s wanted %s\n",
+ inode, ceph_vinop(inode), cap, ceph_cap_string(mine),
+ ceph_cap_string(oissued), ceph_cap_string(used),
+ ceph_cap_string(wanted));
if (cap == ci->i_auth_cap) {
if (ci->i_dirty_caps || ci->i_flushing_caps ||
!list_empty(&ci->i_cap_snaps))
@@ -2173,7 +2204,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
if (oissued) {
/* we aren't the only cap.. just remove us */
- ceph_remove_cap(cap, true);
+ ceph_remove_cap(mdsc, cap, true);
(*remaining)--;
} else {
struct dentry *dentry;
@@ -2187,8 +2218,8 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
count = atomic_read(&inode->i_count);
if (count == 1)
(*remaining)--;
- dout("trim_caps_cb %p cap %p pruned, count now %d\n",
- inode, cap, count);
+ doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n",
+ inode, ceph_vinop(inode), cap, count);
} else {
dput(dentry);
}
@@ -2207,17 +2238,18 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
int max_caps)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int trim_caps = session->s_nr_caps - max_caps;
- dout("trim_caps mds%d start: %d / %d, trim %d\n",
- session->s_mds, session->s_nr_caps, max_caps, trim_caps);
+ doutc(cl, "mds%d start: %d / %d, trim %d\n", session->s_mds,
+ session->s_nr_caps, max_caps, trim_caps);
if (trim_caps > 0) {
int remaining = trim_caps;
ceph_iterate_session_caps(session, trim_caps_cb, &remaining);
- dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
- session->s_mds, session->s_nr_caps, max_caps,
- trim_caps - remaining);
+ doutc(cl, "mds%d done: %d / %d, trimmed %d\n",
+ session->s_mds, session->s_nr_caps, max_caps,
+ trim_caps - remaining);
}
ceph_flush_cap_releases(mdsc, session);
@@ -2227,6 +2259,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
static int check_caps_flush(struct ceph_mds_client *mdsc,
u64 want_flush_tid)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int ret = 1;
spin_lock(&mdsc->cap_dirty_lock);
@@ -2235,8 +2268,8 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
list_first_entry(&mdsc->cap_flush_list,
struct ceph_cap_flush, g_list);
if (cf->tid <= want_flush_tid) {
- dout("check_caps_flush still flushing tid "
- "%llu <= %llu\n", cf->tid, want_flush_tid);
+ doutc(cl, "still flushing tid %llu <= %llu\n",
+ cf->tid, want_flush_tid);
ret = 0;
}
}
@@ -2252,12 +2285,14 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
static void wait_caps_flush(struct ceph_mds_client *mdsc,
u64 want_flush_tid)
{
- dout("check_caps_flush want %llu\n", want_flush_tid);
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "want %llu\n", want_flush_tid);
wait_event(mdsc->cap_flushing_wq,
check_caps_flush(mdsc, want_flush_tid));
- dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
+ doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);
}
/*
@@ -2266,6 +2301,7 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc,
static void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg = NULL;
struct ceph_mds_cap_release *head;
struct ceph_mds_cap_item *item;
@@ -2324,7 +2360,7 @@ again:
msg->front.iov_len += sizeof(*cap_barrier);
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+ doutc(cl, "mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg);
msg = NULL;
}
@@ -2344,13 +2380,13 @@ again:
msg->front.iov_len += sizeof(*cap_barrier);
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+ doutc(cl, "mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg);
}
return;
out_err:
- pr_err("send_cap_releases mds%d, failed to allocate message\n",
- session->s_mds);
+ pr_err_client(cl, "mds%d, failed to allocate message\n",
+ session->s_mds);
spin_lock(&session->s_cap_lock);
list_splice(&tmp_list, &session->s_cap_releases);
session->s_num_cap_releases += num_cap_releases;
@@ -2373,16 +2409,17 @@ static void ceph_cap_release_work(struct work_struct *work)
void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
if (mdsc->stopping)
return;
ceph_get_mds_session(session);
if (queue_work(mdsc->fsc->cap_wq,
&session->s_cap_release_work)) {
- dout("cap release work queued\n");
+ doutc(cl, "cap release work queued\n");
} else {
ceph_put_mds_session(session);
- dout("failed to queue cap release work\n");
+ doutc(cl, "failed to queue cap release work\n");
}
}
@@ -2410,13 +2447,14 @@ static void ceph_cap_reclaim_work(struct work_struct *work)
void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
if (mdsc->stopping)
return;
if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) {
- dout("caps reclaim work queued\n");
+ doutc(cl, "caps reclaim work queued\n");
} else {
- dout("failed to queue caps release work\n");
+ doutc(cl, "failed to queue caps release work\n");
}
}
@@ -2588,6 +2626,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
/**
* ceph_mdsc_build_path - build a path string to a given dentry
+ * @mdsc: mds client
* @dentry: dentry to which path should be built
* @plen: returned length of string
* @pbase: returned base inode number
@@ -2607,9 +2646,10 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* Encode hidden .snap dirs as a double /, i.e.
* foo/.snap/bar -> foo//bar
*/
-char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase,
- int for_wire)
+char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
+ int *plen, u64 *pbase, int for_wire)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct dentry *cur;
struct inode *inode;
char *path;
@@ -2635,8 +2675,7 @@ retry:
spin_lock(&cur->d_lock);
inode = d_inode(cur);
if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
- dout("build_path path+%d: %p SNAPDIR\n",
- pos, cur);
+ doutc(cl, "path+%d: %p SNAPDIR\n", pos, cur);
spin_unlock(&cur->d_lock);
parent = dget_parent(cur);
} else if (for_wire && inode && dentry != cur &&
@@ -2714,21 +2753,21 @@ retry:
* A rename didn't occur, but somehow we didn't end up where
* we thought we would. Throw a warning and try again.
*/
- pr_warn("build_path did not end path lookup where expected (pos = %d)\n",
- pos);
+ pr_warn_client(cl, "did not end path lookup where expected (pos = %d)\n",
+ pos);
goto retry;
}
*pbase = base;
*plen = PATH_MAX - 1 - pos;
- dout("build_path on %p %d built %llx '%.*s'\n",
- dentry, d_count(dentry), base, *plen, path + pos);
+ doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry),
+ base, *plen, path + pos);
return path + pos;
}
-static int build_dentry_path(struct dentry *dentry, struct inode *dir,
- const char **ppath, int *ppathlen, u64 *pino,
- bool *pfreepath, bool parent_locked)
+static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
+ struct inode *dir, const char **ppath, int *ppathlen,
+ u64 *pino, bool *pfreepath, bool parent_locked)
{
char *path;
@@ -2744,7 +2783,7 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
return 0;
}
rcu_read_unlock();
- path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
if (IS_ERR(path))
return PTR_ERR(path);
*ppath = path;
@@ -2756,6 +2795,7 @@ static int build_inode_path(struct inode *inode,
const char **ppath, int *ppathlen, u64 *pino,
bool *pfreepath)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct dentry *dentry;
char *path;
@@ -2765,7 +2805,7 @@ static int build_inode_path(struct inode *inode,
return 0;
}
dentry = d_find_alias(inode);
- path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
dput(dentry);
if (IS_ERR(path))
return PTR_ERR(path);
@@ -2778,27 +2818,28 @@ static int build_inode_path(struct inode *inode,
* request arguments may be specified via an inode *, a dentry *, or
* an explicit ino+path.
*/
-static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
- struct inode *rdiri, const char *rpath,
- u64 rino, const char **ppath, int *pathlen,
- u64 *ino, bool *freepath, bool parent_locked)
+static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
+ struct dentry *rdentry, struct inode *rdiri,
+ const char *rpath, u64 rino, const char **ppath,
+ int *pathlen, u64 *ino, bool *freepath,
+ bool parent_locked)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int r = 0;
if (rinode) {
r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
- dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
- ceph_snap(rinode));
+ doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
+ ceph_snap(rinode));
} else if (rdentry) {
- r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
+ r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
freepath, parent_locked);
- dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
- *ppath);
+ doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath);
} else if (rpath || rino) {
*ino = rino;
*ppath = rpath;
*pathlen = rpath ? strlen(rpath) : 0;
- dout(" path %.*s\n", *pathlen, rpath);
+ doutc(cl, " path %.*s\n", *pathlen, rpath);
}
return r;
@@ -2840,6 +2881,17 @@ static void encode_mclientrequest_tail(void **p,
}
}
+static inline u16 mds_supported_head_version(struct ceph_mds_session *session)
+{
+ if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features))
+ return 1;
+
+ if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features))
+ return 2;
+
+ return CEPH_MDS_REQUEST_HEAD_VERSION;
+}
+
static struct ceph_mds_request_head_legacy *
find_legacy_request_head(void *p, u64 features)
{
@@ -2861,6 +2913,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
{
int mds = session->s_mds;
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
struct ceph_mds_request_head_legacy *lhead;
const char *path1 = NULL;
@@ -2874,10 +2927,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
void *p, *end;
int ret;
bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
- bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
- &session->s_features);
+ u16 request_head_version = mds_supported_head_version(session);
+ kuid_t caller_fsuid = req->r_cred->fsuid;
+ kgid_t caller_fsgid = req->r_cred->fsgid;
- ret = set_request_path_attr(req->r_inode, req->r_dentry,
+ ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
req->r_parent, req->r_path1, req->r_ino1.ino,
&path1, &pathlen1, &ino1, &freepath1,
test_bit(CEPH_MDS_R_PARENT_LOCKED,
@@ -2891,7 +2945,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
if (req->r_old_dentry &&
!(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
old_dentry = req->r_old_dentry;
- ret = set_request_path_attr(NULL, old_dentry,
+ ret = set_request_path_attr(mdsc, NULL, old_dentry,
req->r_old_dentry_dir,
req->r_path2, req->r_ino2.ino,
&path2, &pathlen2, &ino2, &freepath2, true);
@@ -2916,8 +2970,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
*/
if (legacy)
len = sizeof(struct ceph_mds_request_head_legacy);
- else if (old_version)
+ else if (request_head_version == 1)
len = sizeof(struct ceph_mds_request_head_old);
+ else if (request_head_version == 2)
+ len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
else
len = sizeof(struct ceph_mds_request_head);
@@ -2967,6 +3023,30 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
lhead = find_legacy_request_head(msg->front.iov_base,
session->s_con.peer_features);
+ if ((req->r_mnt_idmap != &nop_mnt_idmap) &&
+ !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {
+ WARN_ON_ONCE(!IS_CEPH_MDS_OP_NEWINODE(req->r_op));
+
+ if (enable_unsafe_idmap) {
+ pr_warn_once_client(cl,
+ "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
+ " is not supported by MDS. UID/GID-based restrictions may"
+ " not work properly.\n");
+
+ caller_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
+ VFSUIDT_INIT(req->r_cred->fsuid));
+ caller_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
+ VFSGIDT_INIT(req->r_cred->fsgid));
+ } else {
+ pr_err_ratelimited_client(cl,
+ "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
+ " is not supported by MDS. Fail request with -EIO.\n");
+
+ ret = -EIO;
+ goto out_err;
+ }
+ }
+
/*
* The ceph_mds_request_head_legacy didn't contain a version field, and
* one was added when we moved the message version from 3->4.
@@ -2974,17 +3054,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
if (legacy) {
msg->hdr.version = cpu_to_le16(3);
p = msg->front.iov_base + sizeof(*lhead);
- } else if (old_version) {
+ } else if (request_head_version == 1) {
struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
msg->hdr.version = cpu_to_le16(4);
ohead->version = cpu_to_le16(1);
p = msg->front.iov_base + sizeof(*ohead);
+ } else if (request_head_version == 2) {
+ struct ceph_mds_request_head *nhead = msg->front.iov_base;
+
+ msg->hdr.version = cpu_to_le16(6);
+ nhead->version = cpu_to_le16(2);
+
+ p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);
} else {
struct ceph_mds_request_head *nhead = msg->front.iov_base;
+ kuid_t owner_fsuid;
+ kgid_t owner_fsgid;
msg->hdr.version = cpu_to_le16(6);
nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
+ nhead->struct_len = cpu_to_le32(sizeof(struct ceph_mds_request_head));
+
+ if (IS_CEPH_MDS_OP_NEWINODE(req->r_op)) {
+ owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
+ VFSUIDT_INIT(req->r_cred->fsuid));
+ owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
+ VFSGIDT_INIT(req->r_cred->fsgid));
+ nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));
+ nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));
+ } else {
+ nhead->owner_uid = cpu_to_le32(-1);
+ nhead->owner_gid = cpu_to_le32(-1);
+ }
+
p = msg->front.iov_base + sizeof(*nhead);
}
@@ -2993,9 +3096,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
lhead->op = cpu_to_le32(req->r_op);
lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
- req->r_cred->fsuid));
+ caller_fsuid));
lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
- req->r_cred->fsgid));
+ caller_fsgid));
lhead->ino = cpu_to_le64(req->r_deleg_ino);
lhead->args = req->r_args;
@@ -3099,6 +3202,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
{
int mds = session->s_mds;
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request_head_legacy *lhead;
struct ceph_mds_request_head *nhead;
struct ceph_msg *msg;
@@ -3117,8 +3221,8 @@ static int __prepare_send_request(struct ceph_mds_session *session,
old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
if ((old_version && req->r_attempts >= old_max_retry) ||
((uint32_t)req->r_attempts >= U32_MAX)) {
- pr_warn_ratelimited("%s request tid %llu seq overflow\n",
- __func__, req->r_tid);
+ pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n",
+ req->r_tid);
return -EMULTIHOP;
}
}
@@ -3133,8 +3237,8 @@ static int __prepare_send_request(struct ceph_mds_session *session,
else
req->r_sent_on_mseq = -1;
}
- dout("%s %p tid %lld %s (attempt %d)\n", __func__, req,
- req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
+ doutc(cl, "%p tid %lld %s (attempt %d)\n", req, req->r_tid,
+ ceph_mds_op_name(req->r_op), req->r_attempts);
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
void *p;
@@ -3202,7 +3306,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
}
- dout(" r_parent = %p\n", req->r_parent);
+ doutc(cl, " r_parent = %p\n", req->r_parent);
return 0;
}
@@ -3230,6 +3334,7 @@ static int __send_request(struct ceph_mds_session *session,
static void __do_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_session *session = NULL;
int mds = -1;
int err = 0;
@@ -3242,29 +3347,29 @@ static void __do_request(struct ceph_mds_client *mdsc,
}
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) {
- dout("do_request metadata corrupted\n");
+ doutc(cl, "metadata corrupted\n");
err = -EIO;
goto finish;
}
if (req->r_timeout &&
time_after_eq(jiffies, req->r_started + req->r_timeout)) {
- dout("do_request timed out\n");
+ doutc(cl, "timed out\n");
err = -ETIMEDOUT;
goto finish;
}
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
- dout("do_request forced umount\n");
+ doutc(cl, "forced umount\n");
err = -EIO;
goto finish;
}
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
if (mdsc->mdsmap_err) {
err = mdsc->mdsmap_err;
- dout("do_request mdsmap err %d\n", err);
+ doutc(cl, "mdsmap err %d\n", err);
goto finish;
}
if (mdsc->mdsmap->m_epoch == 0) {
- dout("do_request no mdsmap, waiting for map\n");
+ doutc(cl, "no mdsmap, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
return;
}
@@ -3285,7 +3390,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
err = -EJUKEBOX;
goto finish;
}
- dout("do_request no mds or not active, waiting for map\n");
+ doutc(cl, "no mds or not active, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
return;
}
@@ -3301,8 +3406,8 @@ static void __do_request(struct ceph_mds_client *mdsc,
}
req->r_session = ceph_get_mds_session(session);
- dout("do_request mds%d session %p state %s\n", mds, session,
- ceph_session_state_name(session->s_state));
+ doutc(cl, "mds%d session %p state %s\n", mds, session,
+ ceph_session_state_name(session->s_state));
/*
* The old ceph will crash the MDSs when see unknown OPs
@@ -3393,8 +3498,8 @@ static void __do_request(struct ceph_mds_client *mdsc,
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) {
- dout("do_request session changed for auth cap %d -> %d\n",
- cap->session->s_mds, session->s_mds);
+ doutc(cl, "session changed for auth cap %d -> %d\n",
+ cap->session->s_mds, session->s_mds);
/* Remove the auth cap from old session */
spin_lock(&cap->session->s_cap_lock);
@@ -3421,7 +3526,7 @@ out_session:
ceph_put_mds_session(session);
finish:
if (err) {
- dout("__do_request early error %d\n", err);
+ doutc(cl, "early error %d\n", err);
req->r_err = err;
complete_request(mdsc, req);
__unregister_request(mdsc, req);
@@ -3435,6 +3540,7 @@ finish:
static void __wake_requests(struct ceph_mds_client *mdsc,
struct list_head *head)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
LIST_HEAD(tmp_list);
@@ -3444,7 +3550,8 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
req = list_entry(tmp_list.next,
struct ceph_mds_request, r_wait);
list_del_init(&req->r_wait);
- dout(" wake request %p tid %llu\n", req, req->r_tid);
+ doutc(cl, " wake request %p tid %llu\n", req,
+ req->r_tid);
__do_request(mdsc, req);
}
}
@@ -3455,10 +3562,11 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
*/
static void kick_requests(struct ceph_mds_client *mdsc, int mds)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct rb_node *p = rb_first(&mdsc->request_tree);
- dout("kick_requests mds%d\n", mds);
+ doutc(cl, "kick_requests mds%d\n", mds);
while (p) {
req = rb_entry(p, struct ceph_mds_request, r_node);
p = rb_next(p);
@@ -3468,7 +3576,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
continue; /* only new requests */
if (req->r_session &&
req->r_session->s_mds == mds) {
- dout(" kicking tid %llu\n", req->r_tid);
+ doutc(cl, " kicking tid %llu\n", req->r_tid);
list_del_init(&req->r_wait);
__do_request(mdsc, req);
}
@@ -3478,6 +3586,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int err = 0;
/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
@@ -3499,8 +3608,7 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
if (req->r_inode) {
err = ceph_wait_on_async_create(req->r_inode);
if (err) {
- dout("%s: wait for async create returned: %d\n",
- __func__, err);
+ doutc(cl, "wait for async create returned: %d\n", err);
return err;
}
}
@@ -3508,13 +3616,12 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
if (!err && req->r_old_inode) {
err = ceph_wait_on_async_create(req->r_old_inode);
if (err) {
- dout("%s: wait for async create returned: %d\n",
- __func__, err);
+ doutc(cl, "wait for async create returned: %d\n", err);
return err;
}
}
- dout("submit_request on %p for inode %p\n", req, dir);
+ doutc(cl, "submit_request on %p for inode %p\n", req, dir);
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
__do_request(mdsc, req);
@@ -3527,10 +3634,11 @@ int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req,
ceph_mds_request_wait_callback_t wait_func)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int err;
/* wait */
- dout("do_request waiting\n");
+ doutc(cl, "do_request waiting\n");
if (wait_func) {
err = wait_func(mdsc, req);
} else {
@@ -3544,14 +3652,14 @@ int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
else
err = timeleft; /* killed */
}
- dout("do_request waited, got %d\n", err);
+ doutc(cl, "do_request waited, got %d\n", err);
mutex_lock(&mdsc->mutex);
/* only abort if we didn't race with a real reply */
if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
err = le32_to_cpu(req->r_reply_info.head->result);
} else if (err < 0) {
- dout("aborted request %lld with %d\n", req->r_tid, err);
+ doutc(cl, "aborted request %lld with %d\n", req->r_tid, err);
/*
* ensure we aren't running concurrently with
@@ -3582,15 +3690,16 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int err;
- dout("do_request on %p\n", req);
+ doutc(cl, "do_request on %p\n", req);
/* issue */
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err)
err = ceph_mdsc_wait_request(mdsc, req, NULL);
- dout("do_request %p done, result %d\n", req, err);
+ doutc(cl, "do_request %p done, result %d\n", req, err);
return err;
}
@@ -3602,8 +3711,10 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req)
{
struct inode *dir = req->r_parent;
struct inode *old_dir = req->r_old_dentry_dir;
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
- dout("invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir);
+ doutc(cl, "invalidate_dir_request %p %p (complete, lease(s))\n",
+ dir, old_dir);
ceph_dir_clear_complete(dir);
if (old_dir)
@@ -3624,6 +3735,7 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req)
static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_mds_reply_head *head = msg->front.iov_base;
struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
@@ -3634,7 +3746,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
bool close_sessions = false;
if (msg->front.iov_len < sizeof(*head)) {
- pr_err("mdsc_handle_reply got corrupt (short) reply\n");
+ pr_err_client(cl, "got corrupt (short) reply\n");
ceph_msg_dump(msg);
return;
}
@@ -3644,17 +3756,17 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_lock(&mdsc->mutex);
req = lookup_get_request(mdsc, tid);
if (!req) {
- dout("handle_reply on unknown tid %llu\n", tid);
+ doutc(cl, "on unknown tid %llu\n", tid);
mutex_unlock(&mdsc->mutex);
return;
}
- dout("handle_reply %p\n", req);
+ doutc(cl, "handle_reply %p\n", req);
/* correct session? */
if (req->r_session != session) {
- pr_err("mdsc_handle_reply got %llu on session mds%d"
- " not mds%d\n", tid, session->s_mds,
- req->r_session ? req->r_session->s_mds : -1);
+ pr_err_client(cl, "got %llu on session mds%d not mds%d\n",
+ tid, session->s_mds,
+ req->r_session ? req->r_session->s_mds : -1);
mutex_unlock(&mdsc->mutex);
goto out;
}
@@ -3662,14 +3774,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* dup? */
if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||
(test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
- pr_warn("got a dup %s reply on %llu from mds%d\n",
- head->safe ? "safe" : "unsafe", tid, mds);
+ pr_warn_client(cl, "got a dup %s reply on %llu from mds%d\n",
+ head->safe ? "safe" : "unsafe", tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
}
if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
- pr_warn("got unsafe after safe on %llu from mds%d\n",
- tid, mds);
+ pr_warn_client(cl, "got unsafe after safe on %llu from mds%d\n",
+ tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
}
@@ -3692,7 +3804,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
* response. And even if it did, there is nothing
* useful we could do with a revised return value.
*/
- dout("got safe reply %llu, mds%d\n", tid, mds);
+ doutc(cl, "got safe reply %llu, mds%d\n", tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
@@ -3702,7 +3814,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
}
- dout("handle_reply tid %lld result %d\n", tid, result);
+ doutc(cl, "tid %lld result %d\n", tid, result);
if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
err = parse_reply_info(session, msg, req, (u64)-1);
else
@@ -3742,7 +3854,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_lock(&session->s_mutex);
if (err < 0) {
- pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
+ pr_err_client(cl, "got corrupt reply mds%d(tid:%lld)\n",
+ mds, tid);
ceph_msg_dump(msg);
goto out_err;
}
@@ -3806,7 +3919,7 @@ out_err:
set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
}
} else {
- dout("reply arrived after request %lld was aborted\n", tid);
+ doutc(cl, "reply arrived after request %lld was aborted\n", tid);
}
mutex_unlock(&mdsc->mutex);
@@ -3835,6 +3948,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
u64 tid = le64_to_cpu(msg->hdr.tid);
u32 next_mds;
@@ -3852,12 +3966,12 @@ static void handle_forward(struct ceph_mds_client *mdsc,
req = lookup_get_request(mdsc, tid);
if (!req) {
mutex_unlock(&mdsc->mutex);
- dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
+ doutc(cl, "forward tid %llu to mds%d - req dne\n", tid, next_mds);
return; /* dup reply? */
}
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
- dout("forward tid %llu aborted, unregistering\n", tid);
+ doutc(cl, "forward tid %llu aborted, unregistering\n", tid);
__unregister_request(mdsc, req);
} else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
/*
@@ -3873,10 +3987,11 @@ static void handle_forward(struct ceph_mds_client *mdsc,
set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
mutex_unlock(&req->r_fill_mutex);
aborted = true;
- pr_warn_ratelimited("forward tid %llu seq overflow\n", tid);
+ pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n",
+ tid);
} else {
/* resend. forward race not possible; mds would drop */
- dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
+ doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds);
BUG_ON(req->r_err);
BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
req->r_attempts = 0;
@@ -3894,7 +4009,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
return;
bad:
- pr_err("mdsc_handle_forward decode error err=%d\n", err);
+ pr_err_client(cl, "decode error err=%d\n", err);
ceph_msg_dump(msg);
}
@@ -3933,6 +4048,7 @@ static void handle_session(struct ceph_mds_session *session,
struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
int mds = session->s_mds;
int msg_version = le16_to_cpu(msg->hdr.version);
void *p = msg->front.iov_base;
@@ -3980,7 +4096,8 @@ static void handle_session(struct ceph_mds_session *session,
/* version >= 5, flags */
ceph_decode_32_safe(&p, end, flags, bad);
if (flags & CEPH_SESSION_BLOCKLISTED) {
- pr_warn("mds%d session blocklisted\n", session->s_mds);
+ pr_warn_client(cl, "mds%d session blocklisted\n",
+ session->s_mds);
blocklisted = true;
}
}
@@ -3996,22 +4113,24 @@ static void handle_session(struct ceph_mds_session *session,
mutex_lock(&session->s_mutex);
- dout("handle_session mds%d %s %p state %s seq %llu\n",
- mds, ceph_session_op_name(op), session,
- ceph_session_state_name(session->s_state), seq);
+ doutc(cl, "mds%d %s %p state %s seq %llu\n", mds,
+ ceph_session_op_name(op), session,
+ ceph_session_state_name(session->s_state), seq);
if (session->s_state == CEPH_MDS_SESSION_HUNG) {
session->s_state = CEPH_MDS_SESSION_OPEN;
- pr_info("mds%d came back\n", session->s_mds);
+ pr_info_client(cl, "mds%d came back\n", session->s_mds);
}
switch (op) {
case CEPH_SESSION_OPEN:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
- pr_info("mds%d reconnect success\n", session->s_mds);
+ pr_info_client(cl, "mds%d reconnect success\n",
+ session->s_mds);
if (session->s_state == CEPH_MDS_SESSION_OPEN) {
- pr_notice("mds%d is already opened\n", session->s_mds);
+ pr_notice_client(cl, "mds%d is already opened\n",
+ session->s_mds);
} else {
session->s_state = CEPH_MDS_SESSION_OPEN;
session->s_features = features;
@@ -4041,7 +4160,8 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_CLOSE:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
- pr_info("mds%d reconnect denied\n", session->s_mds);
+ pr_info_client(cl, "mds%d reconnect denied\n",
+ session->s_mds);
session->s_state = CEPH_MDS_SESSION_CLOSED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
@@ -4050,8 +4170,8 @@ static void handle_session(struct ceph_mds_session *session,
break;
case CEPH_SESSION_STALE:
- pr_info("mds%d caps went stale, renewing\n",
- session->s_mds);
+ pr_info_client(cl, "mds%d caps went stale, renewing\n",
+ session->s_mds);
atomic_inc(&session->s_cap_gen);
session->s_cap_ttl = jiffies - 1;
send_renew_caps(mdsc, session);
@@ -4072,7 +4192,7 @@ static void handle_session(struct ceph_mds_session *session,
break;
case CEPH_SESSION_FORCE_RO:
- dout("force_session_readonly %p\n", session);
+ doutc(cl, "force_session_readonly %p\n", session);
spin_lock(&session->s_cap_lock);
session->s_readonly = true;
spin_unlock(&session->s_cap_lock);
@@ -4081,7 +4201,8 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_REJECT:
WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
- pr_info("mds%d rejected session\n", session->s_mds);
+ pr_info_client(cl, "mds%d rejected session\n",
+ session->s_mds);
session->s_state = CEPH_MDS_SESSION_REJECTED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
@@ -4091,7 +4212,7 @@ static void handle_session(struct ceph_mds_session *session,
break;
default:
- pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
+ pr_err_client(cl, "bad op %d mds%d\n", op, mds);
WARN_ON(1);
}
@@ -4108,30 +4229,32 @@ static void handle_session(struct ceph_mds_session *session,
return;
bad:
- pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
- (int)msg->front.iov_len);
+ pr_err_client(cl, "corrupt message mds%d len %d\n", mds,
+ (int)msg->front.iov_len);
ceph_msg_dump(msg);
return;
}
void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
{
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
int dcaps;
dcaps = xchg(&req->r_dir_caps, 0);
if (dcaps) {
- dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
+ doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
}
}
void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req)
{
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
int dcaps;
dcaps = xchg(&req->r_dir_caps, 0);
if (dcaps) {
- dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
+ doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent),
dcaps);
}
@@ -4146,7 +4269,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req, *nreq;
struct rb_node *p;
- dout("replay_unsafe_requests mds%d\n", session->s_mds);
+ doutc(mdsc->fsc->client, "mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex);
list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item)
@@ -4290,6 +4413,8 @@ out_unlock:
*/
static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
union {
struct ceph_mds_cap_reconnect v2;
struct ceph_mds_cap_reconnect_v1 v1;
@@ -4307,7 +4432,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
dentry = d_find_primary(inode);
if (dentry) {
/* set pathbase to parent dir when msg_version >= 2 */
- path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
+ path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
recon_state->msg_version >= 2);
dput(dentry);
if (IS_ERR(path)) {
@@ -4326,9 +4451,9 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
err = 0;
goto out_err;
}
- dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
- inode, ceph_vinop(inode), cap, cap->cap_id,
- ceph_cap_string(cap->issued));
+ doutc(cl, " adding %p ino %llx.%llx cap %p %lld %s\n", inode,
+ ceph_vinop(inode), cap, cap->cap_id,
+ ceph_cap_string(cap->issued));
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
@@ -4482,6 +4607,7 @@ static int encode_snap_realms(struct ceph_mds_client *mdsc,
{
struct rb_node *p;
struct ceph_pagelist *pagelist = recon_state->pagelist;
+ struct ceph_client *cl = mdsc->fsc->client;
int err = 0;
if (recon_state->msg_version >= 4) {
@@ -4520,8 +4646,8 @@ static int encode_snap_realms(struct ceph_mds_client *mdsc,
ceph_pagelist_encode_32(pagelist, sizeof(sr_rec));
}
- dout(" adding snap realm %llx seq %lld parent %llx\n",
- realm->ino, realm->seq, realm->parent_ino);
+ doutc(cl, " adding snap realm %llx seq %lld parent %llx\n",
+ realm->ino, realm->seq, realm->parent_ino);
sr_rec.ino = cpu_to_le64(realm->ino);
sr_rec.seq = cpu_to_le64(realm->seq);
sr_rec.parent = cpu_to_le64(realm->parent_ino);
@@ -4550,6 +4676,7 @@ fail:
static void send_mds_reconnect(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *reply;
int mds = session->s_mds;
int err = -ENOMEM;
@@ -4558,7 +4685,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
};
LIST_HEAD(dispose);
- pr_info("mds%d reconnect start\n", mds);
+ pr_info_client(cl, "mds%d reconnect start\n", mds);
recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!recon_state.pagelist)
@@ -4574,8 +4701,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
session->s_state = CEPH_MDS_SESSION_RECONNECTING;
session->s_seq = 0;
- dout("session %p state %s\n", session,
- ceph_session_state_name(session->s_state));
+ doutc(cl, "session %p state %s\n", session,
+ ceph_session_state_name(session->s_state));
atomic_inc(&session->s_cap_gen);
@@ -4709,7 +4836,8 @@ fail:
fail_nomsg:
ceph_pagelist_release(recon_state.pagelist);
fail_nopagelist:
- pr_err("error %d preparing reconnect for mds%d\n", err, mds);
+ pr_err_client(cl, "error %d preparing reconnect for mds%d\n",
+ err, mds);
return;
}
@@ -4728,9 +4856,9 @@ static void check_new_map(struct ceph_mds_client *mdsc,
int oldstate, newstate;
struct ceph_mds_session *s;
unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0};
+ struct ceph_client *cl = mdsc->fsc->client;
- dout("check_new_map new %u old %u\n",
- newmap->m_epoch, oldmap->m_epoch);
+ doutc(cl, "new %u old %u\n", newmap->m_epoch, oldmap->m_epoch);
if (newmap->m_info) {
for (i = 0; i < newmap->possible_max_rank; i++) {
@@ -4746,12 +4874,12 @@ static void check_new_map(struct ceph_mds_client *mdsc,
oldstate = ceph_mdsmap_get_state(oldmap, i);
newstate = ceph_mdsmap_get_state(newmap, i);
- dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
- i, ceph_mds_state_name(oldstate),
- ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
- ceph_mds_state_name(newstate),
- ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
- ceph_session_state_name(s->s_state));
+ doutc(cl, "mds%d state %s%s -> %s%s (session %s)\n",
+ i, ceph_mds_state_name(oldstate),
+ ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
+ ceph_mds_state_name(newstate),
+ ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
+ ceph_session_state_name(s->s_state));
if (i >= newmap->possible_max_rank) {
/* force close session for stopped mds */
@@ -4804,7 +4932,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,
newstate >= CEPH_MDS_STATE_ACTIVE) {
if (oldstate != CEPH_MDS_STATE_CREATING &&
oldstate != CEPH_MDS_STATE_STARTING)
- pr_info("mds%d recovery completed\n", s->s_mds);
+ pr_info_client(cl, "mds%d recovery completed\n",
+ s->s_mds);
kick_requests(mdsc, i);
mutex_unlock(&mdsc->mutex);
mutex_lock(&s->s_mutex);
@@ -4848,12 +4977,13 @@ static void check_new_map(struct ceph_mds_client *mdsc,
s = __open_export_target_session(mdsc, i);
if (IS_ERR(s)) {
err = PTR_ERR(s);
- pr_err("failed to open export target session, err %d\n",
- err);
+ pr_err_client(cl,
+ "failed to open export target session, err %d\n",
+ err);
continue;
}
}
- dout("send reconnect to export target mds.%d\n", i);
+ doutc(cl, "send reconnect to export target mds.%d\n", i);
mutex_unlock(&mdsc->mutex);
send_mds_reconnect(mdsc, s);
ceph_put_mds_session(s);
@@ -4869,8 +4999,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
if (s->s_state == CEPH_MDS_SESSION_OPEN ||
s->s_state == CEPH_MDS_SESSION_HUNG ||
s->s_state == CEPH_MDS_SESSION_CLOSING) {
- dout(" connecting to export targets of laggy mds%d\n",
- i);
+ doutc(cl, " connecting to export targets of laggy mds%d\n", i);
__open_export_target_sessions(mdsc, s);
}
}
@@ -4897,6 +5026,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct super_block *sb = mdsc->fsc->sb;
struct inode *inode;
struct dentry *parent, *dentry;
@@ -4908,7 +5038,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
struct qstr dname;
int release = 0;
- dout("handle_lease from mds%d\n", mds);
+ doutc(cl, "from mds%d\n", mds);
if (!ceph_inc_mds_stopping_blocker(mdsc, session))
return;
@@ -4926,20 +5056,19 @@ static void handle_lease(struct ceph_mds_client *mdsc,
/* lookup inode */
inode = ceph_find_inode(sb, vino);
- dout("handle_lease %s, ino %llx %p %.*s\n",
- ceph_lease_op_name(h->action), vino.ino, inode,
- dname.len, dname.name);
+ doutc(cl, "%s, ino %llx %p %.*s\n", ceph_lease_op_name(h->action),
+ vino.ino, inode, dname.len, dname.name);
mutex_lock(&session->s_mutex);
if (!inode) {
- dout("handle_lease no inode %llx\n", vino.ino);
+ doutc(cl, "no inode %llx\n", vino.ino);
goto release;
}
/* dentry */
parent = d_find_alias(inode);
if (!parent) {
- dout("no parent dentry on inode %p\n", inode);
+ doutc(cl, "no parent dentry on inode %p\n", inode);
WARN_ON(1);
goto release; /* hrm... */
}
@@ -4999,7 +5128,7 @@ out:
bad:
ceph_dec_mds_stopping_blocker(mdsc);
- pr_err("corrupt lease message\n");
+ pr_err_client(cl, "corrupt lease message\n");
ceph_msg_dump(msg);
}
@@ -5007,13 +5136,14 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
struct dentry *dentry, char action,
u32 seq)
{
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct ceph_msg *msg;
struct ceph_mds_lease *lease;
struct inode *dir;
int len = sizeof(*lease) + sizeof(u32) + NAME_MAX;
- dout("lease_send_msg identry %p %s to mds%d\n",
- dentry, ceph_lease_op_name(action), session->s_mds);
+ doutc(cl, "identry %p %s to mds%d\n", dentry, ceph_lease_op_name(action),
+ session->s_mds);
msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
if (!msg)
@@ -5046,6 +5176,7 @@ static void lock_unlock_session(struct ceph_mds_session *s)
static void maybe_recover_session(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_fs_client *fsc = mdsc->fsc;
if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
@@ -5057,17 +5188,19 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
if (!READ_ONCE(fsc->blocklisted))
return;
- pr_info("auto reconnect after blocklisted\n");
+ pr_info_client(cl, "auto reconnect after blocklisted\n");
ceph_force_reconnect(fsc->sb);
}
bool check_session_state(struct ceph_mds_session *s)
{
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
+
switch (s->s_state) {
case CEPH_MDS_SESSION_OPEN:
if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
s->s_state = CEPH_MDS_SESSION_HUNG;
- pr_info("mds%d hung\n", s->s_mds);
+ pr_info_client(cl, "mds%d hung\n", s->s_mds);
}
break;
case CEPH_MDS_SESSION_CLOSING:
@@ -5087,6 +5220,8 @@ bool check_session_state(struct ceph_mds_session *s)
*/
void inc_session_sequence(struct ceph_mds_session *s)
{
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
+
lockdep_assert_held(&s->s_mutex);
s->s_seq++;
@@ -5094,11 +5229,11 @@ void inc_session_sequence(struct ceph_mds_session *s)
if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
int ret;
- dout("resending session close request for mds%d\n", s->s_mds);
+ doutc(cl, "resending session close request for mds%d\n", s->s_mds);
ret = request_close_session(s);
if (ret < 0)
- pr_err("unable to close session to mds%d: %d\n",
- s->s_mds, ret);
+ pr_err_client(cl, "unable to close session to mds%d: %d\n",
+ s->s_mds, ret);
}
}
@@ -5127,7 +5262,7 @@ static void delayed_work(struct work_struct *work)
int renew_caps;
int i;
- dout("mdsc delayed_work\n");
+ doutc(mdsc->fsc->client, "mdsc delayed_work\n");
if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
return;
@@ -5256,6 +5391,7 @@ err_mdsc:
*/
static void wait_requests(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_options *opts = mdsc->fsc->client->options;
struct ceph_mds_request *req;
@@ -5263,25 +5399,25 @@ static void wait_requests(struct ceph_mds_client *mdsc)
if (__get_oldest_req(mdsc)) {
mutex_unlock(&mdsc->mutex);
- dout("wait_requests waiting for requests\n");
+ doutc(cl, "waiting for requests\n");
wait_for_completion_timeout(&mdsc->safe_umount_waiters,
ceph_timeout_jiffies(opts->mount_timeout));
/* tear down remaining requests */
mutex_lock(&mdsc->mutex);
while ((req = __get_oldest_req(mdsc))) {
- dout("wait_requests timed out on tid %llu\n",
- req->r_tid);
+ doutc(cl, "timed out on tid %llu\n", req->r_tid);
list_del_init(&req->r_wait);
__unregister_request(mdsc, req);
}
}
mutex_unlock(&mdsc->mutex);
- dout("wait_requests done\n");
+ doutc(cl, "done\n");
}
void send_flush_mdlog(struct ceph_mds_session *s)
{
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
struct ceph_msg *msg;
/*
@@ -5291,13 +5427,13 @@ void send_flush_mdlog(struct ceph_mds_session *s)
return;
mutex_lock(&s->s_mutex);
- dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds,
- ceph_session_state_name(s->s_state), s->s_seq);
+ doutc(cl, "request mdlog flush to mds%d (%s)s seq %lld\n",
+ s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG,
s->s_seq);
if (!msg) {
- pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n",
- s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
+ pr_err_client(cl, "failed to request mdlog flush to mds%d (%s) seq %lld\n",
+ s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
} else {
ceph_con_send(&s->s_con, msg);
}
@@ -5310,7 +5446,7 @@ void send_flush_mdlog(struct ceph_mds_session *s)
*/
void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
{
- dout("pre_umount\n");
+ doutc(mdsc->fsc->client, "begin\n");
mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;
ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
@@ -5325,6 +5461,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
ceph_msgr_flush();
ceph_cleanup_quotarealms_inodes(mdsc);
+ doutc(mdsc->fsc->client, "done\n");
}
/*
@@ -5333,12 +5470,13 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
u64 want_tid)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req = NULL, *nextreq;
struct ceph_mds_session *last_session = NULL;
struct rb_node *n;
mutex_lock(&mdsc->mutex);
- dout("%s want %lld\n", __func__, want_tid);
+ doutc(cl, "want %lld\n", want_tid);
restart:
req = __get_oldest_req(mdsc);
while (req && req->r_tid <= want_tid) {
@@ -5372,8 +5510,8 @@ restart:
} else {
ceph_put_mds_session(s);
}
- dout("%s wait on %llu (want %llu)\n", __func__,
- req->r_tid, want_tid);
+ doutc(cl, "wait on %llu (want %llu)\n",
+ req->r_tid, want_tid);
wait_for_completion(&req->r_safe_completion);
mutex_lock(&mdsc->mutex);
@@ -5391,17 +5529,18 @@ restart:
}
mutex_unlock(&mdsc->mutex);
ceph_put_mds_session(last_session);
- dout("%s done\n", __func__);
+ doutc(cl, "done\n");
}
void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
u64 want_tid, want_flush;
if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
return;
- dout("sync\n");
+ doutc(cl, "sync\n");
mutex_lock(&mdsc->mutex);
want_tid = mdsc->last_tid;
mutex_unlock(&mdsc->mutex);
@@ -5417,8 +5556,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
}
spin_unlock(&mdsc->cap_dirty_lock);
- dout("sync want tid %lld flush_seq %lld\n",
- want_tid, want_flush);
+ doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
wait_caps_flush(mdsc, want_flush);
@@ -5440,11 +5578,12 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
{
struct ceph_options *opts = mdsc->fsc->client->options;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_session *session;
int i;
int skipped = 0;
- dout("close_sessions\n");
+ doutc(cl, "begin\n");
/* close sessions */
mutex_lock(&mdsc->mutex);
@@ -5462,7 +5601,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
}
mutex_unlock(&mdsc->mutex);
- dout("waiting for sessions to close\n");
+ doutc(cl, "waiting for sessions to close\n");
wait_event_timeout(mdsc->session_close_wq,
done_closing_sessions(mdsc, skipped),
ceph_timeout_jiffies(opts->mount_timeout));
@@ -5490,7 +5629,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
cancel_work_sync(&mdsc->cap_reclaim_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
- dout("stopped\n");
+ doutc(cl, "done\n");
}
void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
@@ -5498,7 +5637,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
struct ceph_mds_session *session;
int mds;
- dout("force umount\n");
+ doutc(mdsc->fsc->client, "force umount\n");
mutex_lock(&mdsc->mutex);
for (mds = 0; mds < mdsc->max_sessions; mds++) {
@@ -5529,7 +5668,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
{
- dout("stop\n");
+ doutc(mdsc->fsc->client, "stop\n");
/*
* Make sure the delayed work stopped before releasing
* the resources.
@@ -5550,7 +5689,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
{
struct ceph_mds_client *mdsc = fsc->mdsc;
- dout("mdsc_destroy %p\n", mdsc);
+ doutc(fsc->client, "%p\n", mdsc);
if (!mdsc)
return;
@@ -5564,12 +5703,13 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
fsc->mdsc = NULL;
kfree(mdsc);
- dout("mdsc_destroy %p done\n", mdsc);
+ doutc(fsc->client, "%p done\n", mdsc);
}
void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
{
struct ceph_fs_client *fsc = mdsc->fsc;
+ struct ceph_client *cl = fsc->client;
const char *mds_namespace = fsc->mount_options->mds_namespace;
void *p = msg->front.iov_base;
void *end = p + msg->front.iov_len;
@@ -5581,7 +5721,7 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
ceph_decode_need(&p, end, sizeof(u32), bad);
epoch = ceph_decode_32(&p);
- dout("handle_fsmap epoch %u\n", epoch);
+ doutc(cl, "epoch %u\n", epoch);
/* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */
ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad);
@@ -5626,7 +5766,8 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
return;
bad:
- pr_err("error decoding fsmap %d. Shutting down mount.\n", err);
+ pr_err_client(cl, "error decoding fsmap %d. Shutting down mount.\n",
+ err);
ceph_umount_begin(mdsc->fsc->sb);
ceph_msg_dump(msg);
err_out:
@@ -5641,6 +5782,7 @@ err_out:
*/
void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
u32 epoch;
u32 maplen;
void *p = msg->front.iov_base;
@@ -5655,18 +5797,17 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
return;
epoch = ceph_decode_32(&p);
maplen = ceph_decode_32(&p);
- dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
+ doutc(cl, "epoch %u len %d\n", epoch, (int)maplen);
/* do we need it? */
mutex_lock(&mdsc->mutex);
if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
- dout("handle_map epoch %u <= our %u\n",
- epoch, mdsc->mdsmap->m_epoch);
+ doutc(cl, "epoch %u <= our %u\n", epoch, mdsc->mdsmap->m_epoch);
mutex_unlock(&mdsc->mutex);
return;
}
- newmap = ceph_mdsmap_decode(&p, end, ceph_msgr2(mdsc->fsc->client));
+ newmap = ceph_mdsmap_decode(mdsc, &p, end, ceph_msgr2(mdsc->fsc->client));
if (IS_ERR(newmap)) {
err = PTR_ERR(newmap);
goto bad_unlock;
@@ -5695,7 +5836,8 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
bad_unlock:
mutex_unlock(&mdsc->mutex);
bad:
- pr_err("error decoding mdsmap %d. Shutting down mount.\n", err);
+ pr_err_client(cl, "error decoding mdsmap %d. Shutting down mount.\n",
+ err);
ceph_umount_begin(mdsc->fsc->sb);
ceph_msg_dump(msg);
return;
@@ -5726,7 +5868,8 @@ static void mds_peer_reset(struct ceph_connection *con)
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
- pr_warn("mds%d closed our session\n", s->s_mds);
+ pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n",
+ s->s_mds);
if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO)
send_mds_reconnect(mdsc, s);
}
@@ -5735,6 +5878,7 @@ static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
int type = le16_to_cpu(msg->hdr.type);
mutex_lock(&mdsc->mutex);
@@ -5774,8 +5918,8 @@ static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
break;
default:
- pr_err("received unknown message type %d %s\n", type,
- ceph_msg_type_name(type));
+ pr_err_client(cl, "received unknown message type %d %s\n",
+ type, ceph_msg_type_name(type));
}
out:
ceph_msg_put(msg);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 5a3714bdd64a..2e6ddaa13d72 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -14,9 +14,9 @@
#include <linux/ceph/types.h>
#include <linux/ceph/messenger.h>
-#include <linux/ceph/mdsmap.h>
#include <linux/ceph/auth.h>
+#include "mdsmap.h"
#include "metric.h"
#include "super.h"
@@ -33,8 +33,10 @@ enum ceph_feature_type {
CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
CEPHFS_FEATURE_OP_GETVXATTR,
CEPHFS_FEATURE_32BITS_RETRY_FWD,
+ CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
+ CEPHFS_FEATURE_HAS_OWNER_UIDGID,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
};
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
@@ -49,6 +51,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
CEPHFS_FEATURE_OP_GETVXATTR, \
CEPHFS_FEATURE_32BITS_RETRY_FWD, \
+ CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
}
/*
@@ -300,6 +303,7 @@ struct ceph_mds_request {
int r_fmode; /* file mode, if expecting cap */
int r_request_release_offset;
const struct cred *r_cred;
+ struct mnt_idmap *r_mnt_idmap;
struct timespec64 r_stamp;
/* for choosing which mds to send this request to */
@@ -581,7 +585,8 @@ static inline void ceph_mdsc_free_path(char *path, int len)
__putname(path - (PATH_MAX - 1 - len));
}
-extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
+ struct dentry *dentry, int *plen, u64 *base,
int for_wire);
extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
@@ -614,4 +619,6 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
extern int ceph_wait_on_conflict_unlink(struct dentry *dentry);
extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
+
+extern bool enable_unsafe_idmap;
#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 7dac21ee6ce7..fae97c25ce58 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -7,10 +7,11 @@
#include <linux/slab.h>
#include <linux/types.h>
-#include <linux/ceph/mdsmap.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/decode.h>
+#include "mdsmap.h"
+#include "mds_client.h"
#include "super.h"
#define CEPH_MDS_IS_READY(i, ignore_laggy) \
@@ -114,8 +115,10 @@ bad:
* Ignore any fields we don't care about (there are quite a few of
* them).
*/
-struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
+struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
+ void *end, bool msgr2)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mdsmap *m;
const void *start = *p;
int i, j, n;
@@ -233,20 +236,18 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
*p = info_end;
}
- dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n",
- i+1, n, global_id, mds, inc,
- ceph_pr_addr(&addr),
- ceph_mds_state_name(state),
- laggy ? "(laggy)" : "");
+ doutc(cl, "%d/%d %lld mds%d.%d %s %s%s\n", i+1, n, global_id,
+ mds, inc, ceph_pr_addr(&addr),
+ ceph_mds_state_name(state), laggy ? "(laggy)" : "");
if (mds < 0 || mds >= m->possible_max_rank) {
- pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds);
+ pr_warn_client(cl, "got incorrect mds(%d)\n", mds);
continue;
}
if (state <= 0) {
- dout("mdsmap_decode got incorrect state(%s)\n",
- ceph_mds_state_name(state));
+ doutc(cl, "got incorrect state(%s)\n",
+ ceph_mds_state_name(state));
continue;
}
@@ -385,16 +386,16 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
m->m_max_xattr_size = 0;
}
bad_ext:
- dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
- !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
+ doutc(cl, "m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
+ !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
*p = end;
- dout("mdsmap_decode success epoch %u\n", m->m_epoch);
+ doutc(cl, "success epoch %u\n", m->m_epoch);
return m;
nomem:
err = -ENOMEM;
goto out_err;
corrupt:
- pr_err("corrupt mdsmap\n");
+ pr_err_client(cl, "corrupt mdsmap\n");
print_hex_dump(KERN_DEBUG, "mdsmap: ",
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..89f1931f1ba6
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_MDSMAP_H
+#define _FS_CEPH_MDSMAP_H
+
+#include <linux/bug.h>
+#include <linux/ceph/types.h>
+
+struct ceph_mds_client;
+
+/*
+ * mds map - describe servers in the mds cluster.
+ *
+ * we limit fields to those the client actually xcares about
+ */
+struct ceph_mds_info {
+ u64 global_id;
+ struct ceph_entity_addr addr;
+ s32 state;
+ int num_export_targets;
+ bool laggy;
+ u32 *export_targets;
+};
+
+struct ceph_mdsmap {
+ u32 m_epoch, m_client_epoch, m_last_failure;
+ u32 m_root;
+ u32 m_session_timeout; /* seconds */
+ u32 m_session_autoclose; /* seconds */
+ u64 m_max_file_size;
+ u64 m_max_xattr_size; /* maximum size for xattrs blob */
+ u32 m_max_mds; /* expected up:active mds number */
+ u32 m_num_active_mds; /* actual up:active mds number */
+ u32 possible_max_rank; /* possible max rank index */
+ struct ceph_mds_info *m_info;
+
+ /* which object pools file data can be stored in */
+ int m_num_data_pg_pools;
+ u64 *m_data_pg_pools;
+ u64 m_cas_pg_pool;
+
+ bool m_enabled;
+ bool m_damaged;
+ int m_num_laggy;
+};
+
+static inline struct ceph_entity_addr *
+ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
+{
+ if (w >= m->possible_max_rank)
+ return NULL;
+ return &m->m_info[w].addr;
+}
+
+static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
+{
+ BUG_ON(w < 0);
+ if (w >= m->possible_max_rank)
+ return CEPH_MDS_STATE_DNE;
+ return m->m_info[w].state;
+}
+
+static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
+{
+ if (w >= 0 && w < m->possible_max_rank)
+ return m->m_info[w].laggy;
+ return false;
+}
+
+extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
+struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
+ void *end, bool msgr2);
+extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
+extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m);
+
+#endif
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 6d3584f16f9a..871c1090e520 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -31,6 +31,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
struct ceph_client_metric *m = &mdsc->metric;
u64 nr_caps = atomic64_read(&m->total_caps);
u32 header_len = sizeof(struct ceph_metric_header);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
s64 sum;
s32 items = 0;
@@ -51,8 +52,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
if (!msg) {
- pr_err("send metrics to mds%d, failed to allocate message\n",
- s->s_mds);
+ pr_err_client(cl, "to mds%d, failed to allocate message\n",
+ s->s_mds);
return false;
}
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index f7fcf7f08ec6..9d36c3532de1 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -43,6 +43,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
{
struct super_block *sb = mdsc->fsc->sb;
struct ceph_mds_quota *h = msg->front.iov_base;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_vino vino;
struct inode *inode;
struct ceph_inode_info *ci;
@@ -51,8 +52,8 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
return;
if (msg->front.iov_len < sizeof(*h)) {
- pr_err("%s corrupt message mds%d len %d\n", __func__,
- session->s_mds, (int)msg->front.iov_len);
+ pr_err_client(cl, "corrupt message mds%d len %d\n",
+ session->s_mds, (int)msg->front.iov_len);
ceph_msg_dump(msg);
goto out;
}
@@ -62,7 +63,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
vino.snap = CEPH_NOSNAP;
inode = ceph_find_inode(sb, vino);
if (!inode) {
- pr_warn("Failed to find inode %llu\n", vino.ino);
+ pr_warn_client(cl, "failed to find inode %llx\n", vino.ino);
goto out;
}
ci = ceph_inode(inode);
@@ -85,6 +86,7 @@ find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino)
{
struct ceph_quotarealm_inode *qri = NULL;
struct rb_node **node, *parent = NULL;
+ struct ceph_client *cl = mdsc->fsc->client;
mutex_lock(&mdsc->quotarealms_inodes_mutex);
node = &(mdsc->quotarealms_inodes.rb_node);
@@ -110,7 +112,7 @@ find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino)
rb_link_node(&qri->node, parent, node);
rb_insert_color(&qri->node, &mdsc->quotarealms_inodes);
} else
- pr_warn("Failed to alloc quotarealms_inode\n");
+ pr_warn_client(cl, "Failed to alloc quotarealms_inode\n");
}
mutex_unlock(&mdsc->quotarealms_inodes_mutex);
@@ -129,6 +131,7 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
struct super_block *sb,
struct ceph_snap_realm *realm)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_quotarealm_inode *qri;
struct inode *in;
@@ -161,8 +164,8 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
}
if (IS_ERR(in)) {
- dout("Can't lookup inode %llx (err: %ld)\n",
- realm->ino, PTR_ERR(in));
+ doutc(cl, "Can't lookup inode %llx (err: %ld)\n", realm->ino,
+ PTR_ERR(in));
qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */
} else {
qri->timeout = 0;
@@ -213,6 +216,7 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
enum quota_get_realm which_quota,
bool retry)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci = NULL;
struct ceph_snap_realm *realm, *next;
struct inode *in;
@@ -226,8 +230,9 @@ restart:
if (realm)
ceph_get_snap_realm(mdsc, realm);
else
- pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) "
- "null i_snap_realm\n", ceph_vinop(inode));
+ pr_err_ratelimited_client(cl,
+ "%p %llx.%llx null i_snap_realm\n",
+ inode, ceph_vinop(inode));
while (realm) {
bool has_inode;
@@ -317,6 +322,7 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
loff_t delta)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct ceph_snap_realm *realm, *next;
struct inode *in;
@@ -332,8 +338,9 @@ restart:
if (realm)
ceph_get_snap_realm(mdsc, realm);
else
- pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) "
- "null i_snap_realm\n", ceph_vinop(inode));
+ pr_err_ratelimited_client(cl,
+ "%p %llx.%llx null i_snap_realm\n",
+ inode, ceph_vinop(inode));
while (realm) {
bool has_inode;
@@ -383,7 +390,7 @@ restart:
break;
default:
/* Shouldn't happen */
- pr_warn("Invalid quota check op (%d)\n", op);
+ pr_warn_client(cl, "Invalid quota check op (%d)\n", op);
exceeded = true; /* Just break the loop */
}
iput(in);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 6732e1ea97d9..c65f2b202b2b 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -138,7 +138,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
__insert_snap_realm(&mdsc->snap_realms, realm);
mdsc->num_snap_realms++;
- dout("%s %llx %p\n", __func__, realm->ino, realm);
+ doutc(mdsc->fsc->client, "%llx %p\n", realm->ino, realm);
return realm;
}
@@ -150,6 +150,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
u64 ino)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct rb_node *n = mdsc->snap_realms.rb_node;
struct ceph_snap_realm *r;
@@ -162,7 +163,7 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
else if (ino > r->ino)
n = n->rb_right;
else {
- dout("%s %llx %p\n", __func__, r->ino, r);
+ doutc(cl, "%llx %p\n", r->ino, r);
return r;
}
}
@@ -188,9 +189,10 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc,
static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
+ struct ceph_client *cl = mdsc->fsc->client;
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("%s %p %llx\n", __func__, realm, realm->ino);
+ doutc(cl, "%p %llx\n", realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
mdsc->num_snap_realms--;
@@ -290,6 +292,7 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm,
u64 parentino)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snap_realm *parent;
lockdep_assert_held_write(&mdsc->snap_rwsem);
@@ -303,8 +306,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
if (IS_ERR(parent))
return PTR_ERR(parent);
}
- dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino,
- realm, realm->parent_ino, realm->parent, parentino, parent);
+ doutc(cl, "%llx %p: %llx %p -> %llx %p\n", realm->ino, realm,
+ realm->parent_ino, realm->parent, parentino, parent);
if (realm->parent) {
list_del_init(&realm->child_item);
ceph_put_snap_realm(mdsc, realm->parent);
@@ -329,10 +332,12 @@ static int cmpu64_rev(const void *a, const void *b)
/*
* build the snap context for a given realm.
*/
-static int build_snap_context(struct ceph_snap_realm *realm,
+static int build_snap_context(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm,
struct list_head *realm_queue,
struct list_head *dirty_realms)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snap_realm *parent = realm->parent;
struct ceph_snap_context *snapc;
int err = 0;
@@ -360,10 +365,10 @@ static int build_snap_context(struct ceph_snap_realm *realm,
realm->cached_context->seq == realm->seq &&
(!parent ||
realm->cached_context->seq >= parent->cached_context->seq)) {
- dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n",
- __func__, realm->ino, realm, realm->cached_context,
- realm->cached_context->seq,
- (unsigned int)realm->cached_context->num_snaps);
+ doutc(cl, "%llx %p: %p seq %lld (%u snaps) (unchanged)\n",
+ realm->ino, realm, realm->cached_context,
+ realm->cached_context->seq,
+ (unsigned int)realm->cached_context->num_snaps);
return 0;
}
@@ -400,8 +405,8 @@ static int build_snap_context(struct ceph_snap_realm *realm,
sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
snapc->num_snaps = num;
- dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino,
- realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps);
+ doutc(cl, "%llx %p: %p seq %lld (%u snaps)\n", realm->ino, realm,
+ snapc, snapc->seq, (unsigned int) snapc->num_snaps);
ceph_put_snap_context(realm->cached_context);
realm->cached_context = snapc;
@@ -418,16 +423,18 @@ fail:
ceph_put_snap_context(realm->cached_context);
realm->cached_context = NULL;
}
- pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err);
+ pr_err_client(cl, "%llx %p fail %d\n", realm->ino, realm, err);
return err;
}
/*
* rebuild snap context for the given realm and all of its children.
*/
-static void rebuild_snap_realms(struct ceph_snap_realm *realm,
+static void rebuild_snap_realms(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm,
struct list_head *dirty_realms)
{
+ struct ceph_client *cl = mdsc->fsc->client;
LIST_HEAD(realm_queue);
int last = 0;
bool skip = false;
@@ -451,9 +458,10 @@ static void rebuild_snap_realms(struct ceph_snap_realm *realm,
continue;
}
- last = build_snap_context(_realm, &realm_queue, dirty_realms);
- dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm,
- last > 0 ? "is deferred" : !last ? "succeeded" : "failed");
+ last = build_snap_context(mdsc, _realm, &realm_queue,
+ dirty_realms);
+ doutc(cl, "%llx %p, %s\n", realm->ino, realm,
+ last > 0 ? "is deferred" : !last ? "succeeded" : "failed");
/* is any child in the list ? */
list_for_each_entry(child, &_realm->children, child_item) {
@@ -523,6 +531,7 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap **pcapsnap)
{
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_snap_context *old_snapc, *new_snapc;
struct ceph_cap_snap *capsnap = *pcapsnap;
struct ceph_buffer *old_blob = NULL;
@@ -548,14 +557,14 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
as no new writes are allowed to start when pending, so any
writes in progress now were started before the previous
cap_snap. lucky us. */
- dout("%s %p %llx.%llx already pending\n",
- __func__, inode, ceph_vinop(inode));
+ doutc(cl, "%p %llx.%llx already pending\n", inode,
+ ceph_vinop(inode));
goto update_snapc;
}
if (ci->i_wrbuffer_ref_head == 0 &&
!(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
- dout("%s %p %llx.%llx nothing dirty|writing\n",
- __func__, inode, ceph_vinop(inode));
+ doutc(cl, "%p %llx.%llx nothing dirty|writing\n", inode,
+ ceph_vinop(inode));
goto update_snapc;
}
@@ -575,15 +584,15 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
} else {
if (!(used & CEPH_CAP_FILE_WR) &&
ci->i_wrbuffer_ref_head == 0) {
- dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n",
- __func__, inode, ceph_vinop(inode));
+ doutc(cl, "%p %llx.%llx no new_snap|dirty_page|writing\n",
+ inode, ceph_vinop(inode));
goto update_snapc;
}
}
- dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n",
- __func__, inode, ceph_vinop(inode), capsnap, old_snapc,
- ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush");
+ doutc(cl, "%p %llx.%llx cap_snap %p queuing under %p %s %s\n",
+ inode, ceph_vinop(inode), capsnap, old_snapc,
+ ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush");
ihold(inode);
capsnap->follows = old_snapc->seq;
@@ -615,9 +624,9 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
if (used & CEPH_CAP_FILE_WR) {
- dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR,"
- " now pending\n", __func__, inode, ceph_vinop(inode),
- capsnap, old_snapc, old_snapc->seq);
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p seq %llu used WR,"
+ " now pending\n", inode, ceph_vinop(inode), capsnap,
+ old_snapc, old_snapc->seq);
capsnap->writing = 1;
} else {
/* note mtime, size NOW. */
@@ -634,7 +643,7 @@ update_snapc:
ci->i_head_snapc = NULL;
} else {
ci->i_head_snapc = ceph_get_snap_context(new_snapc);
- dout(" new snapc is %p\n", new_snapc);
+ doutc(cl, " new snapc is %p\n", new_snapc);
}
spin_unlock(&ci->i_ceph_lock);
@@ -655,6 +664,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
{
struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
BUG_ON(capsnap->writing);
capsnap->size = i_size_read(inode);
@@ -667,11 +677,12 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->truncate_size = ci->i_truncate_size;
capsnap->truncate_seq = ci->i_truncate_seq;
if (capsnap->dirty_pages) {
- dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
- "still has %d dirty pages\n", __func__, inode,
- ceph_vinop(inode), capsnap, capsnap->context,
- capsnap->context->seq, ceph_cap_string(capsnap->dirty),
- capsnap->size, capsnap->dirty_pages);
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s "
+ "s=%llu still has %d dirty pages\n", inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq,
+ ceph_cap_string(capsnap->dirty),
+ capsnap->size, capsnap->dirty_pages);
return 0;
}
@@ -680,20 +691,20 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
* And trigger to flush the buffer immediately.
*/
if (ci->i_wrbuffer_ref) {
- dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
- "used WRBUFFER, delaying\n", __func__, inode,
- ceph_vinop(inode), capsnap, capsnap->context,
- capsnap->context->seq, ceph_cap_string(capsnap->dirty),
- capsnap->size);
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s "
+ "s=%llu used WRBUFFER, delaying\n", inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size);
ceph_queue_writeback(inode);
return 0;
}
ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
- dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n",
- __func__, inode, ceph_vinop(inode), capsnap, capsnap->context,
- capsnap->context->seq, ceph_cap_string(capsnap->dirty),
- capsnap->size);
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n",
+ inode, ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size);
spin_lock(&mdsc->snap_flush_lock);
if (list_empty(&ci->i_snap_flush_item)) {
@@ -708,13 +719,15 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
* Queue cap_snaps for snap writeback for this realm and its children.
* Called under snap_rwsem, so realm topology won't change.
*/
-static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
+static void queue_realm_cap_snaps(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct inode *lastinode = NULL;
struct ceph_cap_snap *capsnap = NULL;
- dout("%s %p %llx inode\n", __func__, realm, realm->ino);
+ doutc(cl, "%p %llx inode\n", realm, realm->ino);
spin_lock(&realm->inodes_with_caps_lock);
list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) {
@@ -733,8 +746,9 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
if (!capsnap) {
capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS);
if (!capsnap) {
- pr_err("ENOMEM allocating ceph_cap_snap on %p\n",
- inode);
+ pr_err_client(cl,
+ "ENOMEM allocating ceph_cap_snap on %p\n",
+ inode);
return;
}
}
@@ -752,7 +766,7 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
if (capsnap)
kmem_cache_free(ceph_cap_snap_cachep, capsnap);
- dout("%s %p %llx done\n", __func__, realm, realm->ino);
+ doutc(cl, "%p %llx done\n", realm, realm->ino);
}
/*
@@ -766,6 +780,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
void *p, void *e, bool deletion,
struct ceph_snap_realm **realm_ret)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_snap_realm *ri; /* encoded */
__le64 *snaps; /* encoded */
__le64 *prior_parent_snaps; /* encoded */
@@ -780,7 +795,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("%s deletion=%d\n", __func__, deletion);
+ doutc(cl, "deletion=%d\n", deletion);
more:
realm = NULL;
rebuild_snapcs = 0;
@@ -810,8 +825,8 @@ more:
rebuild_snapcs += err;
if (le64_to_cpu(ri->seq) > realm->seq) {
- dout("%s updating %llx %p %lld -> %lld\n", __func__,
- realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
+ doutc(cl, "updating %llx %p %lld -> %lld\n", realm->ino,
+ realm, realm->seq, le64_to_cpu(ri->seq));
/* update realm parameters, snap lists */
realm->seq = le64_to_cpu(ri->seq);
realm->created = le64_to_cpu(ri->created);
@@ -834,16 +849,16 @@ more:
rebuild_snapcs = 1;
} else if (!realm->cached_context) {
- dout("%s %llx %p seq %lld new\n", __func__,
- realm->ino, realm, realm->seq);
+ doutc(cl, "%llx %p seq %lld new\n", realm->ino, realm,
+ realm->seq);
rebuild_snapcs = 1;
} else {
- dout("%s %llx %p seq %lld unchanged\n", __func__,
- realm->ino, realm, realm->seq);
+ doutc(cl, "%llx %p seq %lld unchanged\n", realm->ino, realm,
+ realm->seq);
}
- dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
- realm, rebuild_snapcs, p, e);
+ doutc(cl, "done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
+ realm, rebuild_snapcs, p, e);
/*
* this will always track the uppest parent realm from which
@@ -855,7 +870,7 @@ more:
/* rebuild_snapcs when we reach the _end_ (root) of the trace */
if (realm_to_rebuild && p >= e)
- rebuild_snap_realms(realm_to_rebuild, &dirty_realms);
+ rebuild_snap_realms(mdsc, realm_to_rebuild, &dirty_realms);
if (!first_realm)
first_realm = realm;
@@ -873,7 +888,7 @@ more:
realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
dirty_item);
list_del_init(&realm->dirty_item);
- queue_realm_cap_snaps(realm);
+ queue_realm_cap_snaps(mdsc, realm);
}
if (realm_ret)
@@ -891,7 +906,7 @@ fail:
ceph_put_snap_realm(mdsc, realm);
if (first_realm)
ceph_put_snap_realm(mdsc, first_realm);
- pr_err("%s error %d\n", __func__, err);
+ pr_err_client(cl, "error %d\n", err);
/*
* When receiving a corrupted snap trace we don't know what
@@ -905,11 +920,12 @@ fail:
WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO);
ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr);
if (ret)
- pr_err("%s failed to blocklist %s: %d\n", __func__,
- ceph_pr_addr(&client->msgr.inst.addr), ret);
+ pr_err_client(cl, "failed to blocklist %s: %d\n",
+ ceph_pr_addr(&client->msgr.inst.addr), ret);
- WARN(1, "%s: %s%sdo remount to continue%s",
- __func__, ret ? "" : ceph_pr_addr(&client->msgr.inst.addr),
+ WARN(1, "[client.%lld] %s %s%sdo remount to continue%s",
+ client->monc.auth->global_id, __func__,
+ ret ? "" : ceph_pr_addr(&client->msgr.inst.addr),
ret ? "" : " was blocklisted, ",
err == -EIO ? " after corrupted snaptrace is fixed" : "");
@@ -925,11 +941,12 @@ fail:
*/
static void flush_snaps(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct inode *inode;
struct ceph_mds_session *session = NULL;
- dout("%s\n", __func__);
+ doutc(cl, "begin\n");
spin_lock(&mdsc->snap_flush_lock);
while (!list_empty(&mdsc->snap_flush_list)) {
ci = list_first_entry(&mdsc->snap_flush_list,
@@ -944,7 +961,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->snap_flush_lock);
ceph_put_mds_session(session);
- dout("%s done\n", __func__);
+ doutc(cl, "done\n");
}
/**
@@ -960,7 +977,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
lockdep_assert_held(&ci->i_ceph_lock);
@@ -1000,6 +1017,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct super_block *sb = mdsc->fsc->sb;
int mds = session->s_mds;
u64 split;
@@ -1030,8 +1048,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
trace_len = le32_to_cpu(h->trace_len);
p += sizeof(*h);
- dout("%s from mds%d op %s split %llx tracelen %d\n", __func__,
- mds, ceph_snap_op_name(op), split, trace_len);
+ doutc(cl, "from mds%d op %s split %llx tracelen %d\n", mds,
+ ceph_snap_op_name(op), split, trace_len);
down_write(&mdsc->snap_rwsem);
locked_rwsem = 1;
@@ -1062,7 +1080,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
goto out;
}
- dout("splitting snap_realm %llx %p\n", realm->ino, realm);
+ doutc(cl, "splitting snap_realm %llx %p\n", realm->ino, realm);
for (i = 0; i < num_split_inos; i++) {
struct ceph_vino vino = {
.ino = le64_to_cpu(split_inos[i]),
@@ -1087,13 +1105,13 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
*/
if (ci->i_snap_realm->created >
le64_to_cpu(ri->created)) {
- dout(" leaving %p %llx.%llx in newer realm %llx %p\n",
- inode, ceph_vinop(inode), ci->i_snap_realm->ino,
- ci->i_snap_realm);
+ doutc(cl, " leaving %p %llx.%llx in newer realm %llx %p\n",
+ inode, ceph_vinop(inode), ci->i_snap_realm->ino,
+ ci->i_snap_realm);
goto skip_inode;
}
- dout(" will move %p %llx.%llx to split realm %llx %p\n",
- inode, ceph_vinop(inode), realm->ino, realm);
+ doutc(cl, " will move %p %llx.%llx to split realm %llx %p\n",
+ inode, ceph_vinop(inode), realm->ino, realm);
ceph_get_snap_realm(mdsc, realm);
ceph_change_snap_realm(inode, realm);
@@ -1154,7 +1172,7 @@ skip_inode:
return;
bad:
- pr_err("%s corrupt snap message from mds%d\n", __func__, mds);
+ pr_err_client(cl, "corrupt snap message from mds%d\n", mds);
ceph_msg_dump(msg);
out:
if (locked_rwsem)
@@ -1170,6 +1188,7 @@ out:
struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
u64 snap)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snapid_map *sm, *exist;
struct rb_node **p, *parent;
int ret;
@@ -1192,8 +1211,8 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
}
spin_unlock(&mdsc->snapid_map_lock);
if (exist) {
- dout("%s found snapid map %llx -> %x\n", __func__,
- exist->snap, exist->dev);
+ doutc(cl, "found snapid map %llx -> %x\n", exist->snap,
+ exist->dev);
return exist;
}
@@ -1237,13 +1256,12 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
if (exist) {
free_anon_bdev(sm->dev);
kfree(sm);
- dout("%s found snapid map %llx -> %x\n", __func__,
- exist->snap, exist->dev);
+ doutc(cl, "found snapid map %llx -> %x\n", exist->snap,
+ exist->dev);
return exist;
}
- dout("%s create snapid map %llx -> %x\n", __func__,
- sm->snap, sm->dev);
+ doutc(cl, "create snapid map %llx -> %x\n", sm->snap, sm->dev);
return sm;
}
@@ -1268,6 +1286,7 @@ void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snapid_map *sm;
unsigned long now;
LIST_HEAD(to_free);
@@ -1289,7 +1308,7 @@ void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
while (!list_empty(&to_free)) {
sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
list_del(&sm->lru);
- dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
+ doutc(cl, "trim snapid map %llx -> %x\n", sm->snap, sm->dev);
free_anon_bdev(sm->dev);
kfree(sm);
}
@@ -1297,6 +1316,7 @@ void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snapid_map *sm;
struct rb_node *p;
LIST_HEAD(to_free);
@@ -1315,8 +1335,8 @@ void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
list_del(&sm->lru);
free_anon_bdev(sm->dev);
if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
- pr_err("snapid map %llx -> %x still in use\n",
- sm->snap, sm->dev);
+ pr_err_client(cl, "snapid map %llx -> %x still in use\n",
+ sm->snap, sm->dev);
}
kfree(sm);
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 2d7f5a8d4a92..5ec102f6b1ac 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -44,28 +44,29 @@ static LIST_HEAD(ceph_fsc_list);
*/
static void ceph_put_super(struct super_block *s)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s);
- dout("put_super\n");
+ doutc(fsc->client, "begin\n");
ceph_fscrypt_free_dummy_policy(fsc);
ceph_mdsc_close_sessions(fsc->mdsc);
+ doutc(fsc->client, "done\n");
}
static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry));
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(d_inode(dentry));
struct ceph_mon_client *monc = &fsc->client->monc;
struct ceph_statfs st;
int i, err;
u64 data_pool;
+ doutc(fsc->client, "begin\n");
if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) {
data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0];
} else {
data_pool = CEPH_NOPOOL;
}
- dout("statfs\n");
err = ceph_monc_do_statfs(monc, data_pool, &st);
if (err < 0)
return err;
@@ -113,24 +114,26 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
/* fold the fs_cluster_id into the upper bits */
buf->f_fsid.val[1] = monc->fs_cluster_id;
+ doutc(fsc->client, "done\n");
return 0;
}
static int ceph_sync_fs(struct super_block *sb, int wait)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+ struct ceph_client *cl = fsc->client;
if (!wait) {
- dout("sync_fs (non-blocking)\n");
+ doutc(cl, "(non-blocking)\n");
ceph_flush_dirty_caps(fsc->mdsc);
- dout("sync_fs (non-blocking) done\n");
+ doutc(cl, "(non-blocking) done\n");
return 0;
}
- dout("sync_fs (blocking)\n");
+ doutc(cl, "(blocking)\n");
ceph_osdc_sync(&fsc->client->osdc);
ceph_mdsc_sync(fsc->mdsc);
- dout("sync_fs (blocking) done\n");
+ doutc(cl, "(blocking) done\n");
return 0;
}
@@ -341,7 +344,7 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc)
char *dev_name = param->string, *dev_name_end;
int ret;
- dout("%s '%s'\n", __func__, dev_name);
+ dout("'%s'\n", dev_name);
if (!dev_name || !*dev_name)
return invalfc(fc, "Empty source");
@@ -413,7 +416,7 @@ static int ceph_parse_mount_param(struct fs_context *fc,
return ret;
token = fs_parse(fc, ceph_mount_parameters, param, &result);
- dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
+ dout("%s: fs_parse '%s' token %d\n",__func__, param->key, token);
if (token < 0)
return token;
@@ -684,7 +687,7 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
*/
static int ceph_show_options(struct seq_file *m, struct dentry *root)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(root->d_sb);
struct ceph_mount_options *fsopt = fsc->mount_options;
size_t pos;
int ret;
@@ -881,7 +884,7 @@ static void flush_fs_workqueues(struct ceph_fs_client *fsc)
static void destroy_fs_client(struct ceph_fs_client *fsc)
{
- dout("destroy_fs_client %p\n", fsc);
+ doutc(fsc->client, "%p\n", fsc);
spin_lock(&ceph_fsc_lock);
list_del(&fsc->metric_wakeup);
@@ -896,7 +899,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
ceph_destroy_client(fsc->client);
kfree(fsc);
- dout("destroy_fs_client %p done\n", fsc);
+ dout("%s: %p done\n", __func__, fsc);
}
/*
@@ -1015,9 +1018,9 @@ static void __ceph_umount_begin(struct ceph_fs_client *fsc)
*/
void ceph_umount_begin(struct super_block *sb)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
- dout("ceph_umount_begin - starting forced umount\n");
+ doutc(fsc->client, "starting forced umount\n");
if (!fsc)
return;
fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
@@ -1045,13 +1048,14 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
const char *path,
unsigned long started)
{
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req = NULL;
int err;
struct dentry *root;
/* open dir */
- dout("open_root_inode opening '%s'\n", path);
+ doutc(cl, "opening '%s'\n", path);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
if (IS_ERR(req))
return ERR_CAST(req);
@@ -1071,13 +1075,13 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
if (err == 0) {
struct inode *inode = req->r_target_inode;
req->r_target_inode = NULL;
- dout("open_root_inode success\n");
+ doutc(cl, "success\n");
root = d_make_root(inode);
if (!root) {
root = ERR_PTR(-ENOMEM);
goto out;
}
- dout("open_root_inode success, root dentry is %p\n", root);
+ doutc(cl, "success, root dentry is %p\n", root);
} else {
root = ERR_PTR(err);
}
@@ -1136,11 +1140,12 @@ static int ceph_apply_test_dummy_encryption(struct super_block *sb,
static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
struct fs_context *fc)
{
+ struct ceph_client *cl = fsc->client;
int err;
unsigned long started = jiffies; /* note the start time */
struct dentry *root;
- dout("mount start %p\n", fsc);
+ doutc(cl, "mount start %p\n", fsc);
mutex_lock(&fsc->client->mount_mutex);
if (!fsc->sb->s_root) {
@@ -1163,7 +1168,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
if (err)
goto out;
- dout("mount opening path '%s'\n", path);
+ doutc(cl, "mount opening path '%s'\n", path);
ceph_fs_debugfs_init(fsc);
@@ -1178,7 +1183,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
}
fsc->mount_state = CEPH_MOUNT_MOUNTED;
- dout("mount success\n");
+ doutc(cl, "mount success\n");
mutex_unlock(&fsc->client->mount_mutex);
return root;
@@ -1191,9 +1196,10 @@ out:
static int ceph_set_super(struct super_block *s, struct fs_context *fc)
{
struct ceph_fs_client *fsc = s->s_fs_info;
+ struct ceph_client *cl = fsc->client;
int ret;
- dout("set_super %p\n", s);
+ doutc(cl, "%p\n", s);
s->s_maxbytes = MAX_LFS_FILESIZE;
@@ -1226,31 +1232,32 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc)
struct ceph_fs_client *new = fc->s_fs_info;
struct ceph_mount_options *fsopt = new->mount_options;
struct ceph_options *opt = new->client->options;
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+ struct ceph_client *cl = fsc->client;
- dout("ceph_compare_super %p\n", sb);
+ doutc(cl, "%p\n", sb);
if (compare_mount_options(fsopt, opt, fsc)) {
- dout("monitor(s)/mount options don't match\n");
+ doutc(cl, "monitor(s)/mount options don't match\n");
return 0;
}
if ((opt->flags & CEPH_OPT_FSID) &&
ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) {
- dout("fsid doesn't match\n");
+ doutc(cl, "fsid doesn't match\n");
return 0;
}
if (fc->sb_flags != (sb->s_flags & ~SB_BORN)) {
- dout("flags differ\n");
+ doutc(cl, "flags differ\n");
return 0;
}
if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) {
- dout("client is blocklisted (and CLEANRECOVER is not set)\n");
+ doutc(cl, "client is blocklisted (and CLEANRECOVER is not set)\n");
return 0;
}
if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
- dout("client has been forcibly unmounted\n");
+ doutc(cl, "client has been forcibly unmounted\n");
return 0;
}
@@ -1322,9 +1329,9 @@ static int ceph_get_tree(struct fs_context *fc)
goto out;
}
- if (ceph_sb_to_client(sb) != fsc) {
+ if (ceph_sb_to_fs_client(sb) != fsc) {
destroy_fs_client(fsc);
- fsc = ceph_sb_to_client(sb);
+ fsc = ceph_sb_to_fs_client(sb);
dout("get_sb got existing client %p\n", fsc);
} else {
dout("get_sb using new client %p\n", fsc);
@@ -1338,8 +1345,9 @@ static int ceph_get_tree(struct fs_context *fc)
err = PTR_ERR(res);
goto out_splat;
}
- dout("root %p inode %p ino %llx.%llx\n", res,
- d_inode(res), ceph_vinop(d_inode(res)));
+
+ doutc(fsc->client, "root %p inode %p ino %llx.%llx\n", res,
+ d_inode(res), ceph_vinop(d_inode(res)));
fc->root = fsc->sb->s_root;
return 0;
@@ -1377,7 +1385,7 @@ static int ceph_reconfigure_fc(struct fs_context *fc)
struct ceph_parse_opts_ctx *pctx = fc->fs_private;
struct ceph_mount_options *fsopt = pctx->opts;
struct super_block *sb = fc->root->d_sb;
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
err = ceph_apply_test_dummy_encryption(sb, fc, fsopt);
if (err)
@@ -1397,7 +1405,8 @@ static int ceph_reconfigure_fc(struct fs_context *fc)
kfree(fsc->mount_options->mon_addr);
fsc->mount_options->mon_addr = fsopt->mon_addr;
fsopt->mon_addr = NULL;
- pr_notice("ceph: monitor addresses recorded, but not used for reconnection");
+ pr_notice_client(fsc->client,
+ "monitor addresses recorded, but not used for reconnection");
}
sync_filesystem(sb);
@@ -1516,11 +1525,12 @@ void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc)
static void ceph_kill_sb(struct super_block *s)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
bool wait;
- dout("kill_sb %p\n", s);
+ doutc(cl, "%p\n", s);
ceph_mdsc_pre_umount(mdsc);
flush_fs_workqueues(fsc);
@@ -1551,9 +1561,9 @@ static void ceph_kill_sb(struct super_block *s)
&mdsc->stopping_waiter,
fsc->client->options->mount_timeout);
if (!timeleft) /* timed out */
- pr_warn("umount timed out, %ld\n", timeleft);
+ pr_warn_client(cl, "umount timed out, %ld\n", timeleft);
else if (timeleft < 0) /* killed */
- pr_warn("umount was killed, %ld\n", timeleft);
+ pr_warn_client(cl, "umount was killed, %ld\n", timeleft);
}
mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
@@ -1572,13 +1582,13 @@ static struct file_system_type ceph_fs_type = {
.name = "ceph",
.init_fs_context = ceph_init_fs_context,
.kill_sb = ceph_kill_sb,
- .fs_flags = FS_RENAME_DOES_D_MOVE,
+ .fs_flags = FS_RENAME_DOES_D_MOVE | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("ceph");
int ceph_force_reconnect(struct super_block *sb)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
int err = 0;
fsc->mount_state = CEPH_MOUNT_RECOVER;
@@ -1671,6 +1681,11 @@ static const struct kernel_param_ops param_ops_mount_syntax = {
module_param_cb(mount_syntax_v1, &param_ops_mount_syntax, &mount_support, 0444);
module_param_cb(mount_syntax_v2, &param_ops_mount_syntax, &mount_support, 0444);
+bool enable_unsafe_idmap = false;
+module_param(enable_unsafe_idmap, bool, 0644);
+MODULE_PARM_DESC(enable_unsafe_idmap,
+ "Allow to use idmapped mounts with MDS without CEPHFS_FEATURE_HAS_OWNER_UIDGID");
+
module_init(init_ceph);
module_exit(exit_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 98844fc8a2f7..fe0f64a0acb2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -488,13 +488,13 @@ ceph_inode(const struct inode *inode)
}
static inline struct ceph_fs_client *
-ceph_inode_to_client(const struct inode *inode)
+ceph_inode_to_fs_client(const struct inode *inode)
{
return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
}
static inline struct ceph_fs_client *
-ceph_sb_to_client(const struct super_block *sb)
+ceph_sb_to_fs_client(const struct super_block *sb)
{
return (struct ceph_fs_client *)sb->s_fs_info;
}
@@ -502,7 +502,13 @@ ceph_sb_to_client(const struct super_block *sb)
static inline struct ceph_mds_client *
ceph_sb_to_mdsc(const struct super_block *sb)
{
- return (struct ceph_mds_client *)ceph_sb_to_client(sb)->mdsc;
+ return (struct ceph_mds_client *)ceph_sb_to_fs_client(sb)->mdsc;
+}
+
+static inline struct ceph_client *
+ceph_inode_to_client(const struct inode *inode)
+{
+ return (struct ceph_client *)ceph_inode_to_fs_client(inode)->client;
}
static inline struct ceph_vino
@@ -558,7 +564,7 @@ static inline u64 ceph_snap(struct inode *inode)
*/
static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
{
- if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
+ if (unlikely(ceph_test_mount_opt(ceph_sb_to_fs_client(sb), INO32)))
return ceph_ino_to_ino32(ino);
return ino;
}
@@ -1094,8 +1100,8 @@ struct ceph_iattr {
struct ceph_fscrypt_auth *fscrypt_auth;
};
-extern int __ceph_setattr(struct inode *inode, struct iattr *attr,
- struct ceph_iattr *cia);
+extern int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
+ struct iattr *attr, struct ceph_iattr *cia);
extern int ceph_setattr(struct mnt_idmap *idmap,
struct dentry *dentry, struct iattr *attr);
extern int ceph_getattr(struct mnt_idmap *idmap,
@@ -1106,7 +1112,7 @@ void ceph_inode_shutdown(struct inode *inode);
static inline bool ceph_inode_is_shutdown(struct inode *inode)
{
unsigned long flags = READ_ONCE(ceph_inode(inode)->i_ceph_flags);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
int state = READ_ONCE(fsc->mount_state);
return (flags & CEPH_I_SHUTDOWN) || state >= CEPH_MOUNT_SHUTDOWN;
@@ -1223,7 +1229,8 @@ extern void ceph_add_cap(struct inode *inode,
unsigned cap, unsigned seq, u64 realmino, int flags,
struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
-extern void ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+extern void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+ bool queue_release);
extern void __ceph_remove_caps(struct ceph_inode_info *ci);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 097ce7f74073..e066a556eccb 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -57,7 +57,8 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_string *pool_ns;
s64 pool = ci->i_layout.pool_id;
@@ -69,7 +70,7 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
- dout("ceph_vxattrcb_layout %p\n", &ci->netfs.inode);
+ doutc(cl, "%p\n", &ci->netfs.inode);
down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name) {
@@ -161,7 +162,7 @@ static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
char *val, size_t size)
{
ssize_t ret;
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
s64 pool = ci->i_layout.pool_id;
const char *pool_name;
@@ -313,7 +314,7 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
char *val, size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid);
}
@@ -321,7 +322,7 @@ static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci,
char *val, size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
return ceph_fmt_xattr(val, size, "client%lld",
ceph_client_gid(fsc->client));
@@ -570,6 +571,8 @@ static int __set_xattr(struct ceph_inode_info *ci,
int flags, int update_xattr,
struct ceph_inode_xattr **newxattr)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct rb_node **p;
struct rb_node *parent = NULL;
struct ceph_inode_xattr *xattr = NULL;
@@ -626,7 +629,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
xattr->should_free_name = update_xattr;
ci->i_xattrs.count++;
- dout("%s count=%d\n", __func__, ci->i_xattrs.count);
+ doutc(cl, "count=%d\n", ci->i_xattrs.count);
} else {
kfree(*newxattr);
*newxattr = NULL;
@@ -654,13 +657,13 @@ static int __set_xattr(struct ceph_inode_info *ci,
if (new) {
rb_link_node(&xattr->node, parent, p);
rb_insert_color(&xattr->node, &ci->i_xattrs.index);
- dout("%s p=%p\n", __func__, p);
+ doutc(cl, "p=%p\n", p);
}
- dout("%s added %llx.%llx xattr %p %.*s=%.*s%s\n", __func__,
- ceph_vinop(&ci->netfs.inode), xattr, name_len, name,
- min(val_len, MAX_XATTR_VAL_PRINT_LEN), val,
- val_len > MAX_XATTR_VAL_PRINT_LEN ? "..." : "");
+ doutc(cl, "added %p %llx.%llx xattr %p %.*s=%.*s%s\n", inode,
+ ceph_vinop(inode), xattr, name_len, name, min(val_len,
+ MAX_XATTR_VAL_PRINT_LEN), val,
+ val_len > MAX_XATTR_VAL_PRINT_LEN ? "..." : "");
return 0;
}
@@ -668,6 +671,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
const char *name)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
struct rb_node **p;
struct rb_node *parent = NULL;
struct ceph_inode_xattr *xattr = NULL;
@@ -688,13 +692,13 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
else {
int len = min(xattr->val_len, MAX_XATTR_VAL_PRINT_LEN);
- dout("%s %s: found %.*s%s\n", __func__, name, len,
- xattr->val, xattr->val_len > len ? "..." : "");
+ doutc(cl, "%s found %.*s%s\n", name, len, xattr->val,
+ xattr->val_len > len ? "..." : "");
return xattr;
}
}
- dout("%s %s: not found\n", __func__, name);
+ doutc(cl, "%s not found\n", name);
return NULL;
}
@@ -735,19 +739,20 @@ static int __remove_xattr(struct ceph_inode_info *ci,
static char *__copy_xattr_names(struct ceph_inode_info *ci,
char *dest)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
struct rb_node *p;
struct ceph_inode_xattr *xattr = NULL;
p = rb_first(&ci->i_xattrs.index);
- dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
+ doutc(cl, "count=%d\n", ci->i_xattrs.count);
while (p) {
xattr = rb_entry(p, struct ceph_inode_xattr, node);
memcpy(dest, xattr->name, xattr->name_len);
dest[xattr->name_len] = '\0';
- dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
- xattr->name_len, ci->i_xattrs.names_size);
+ doutc(cl, "dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
+ xattr->name_len, ci->i_xattrs.names_size);
dest += xattr->name_len + 1;
p = rb_next(p);
@@ -758,19 +763,19 @@ static char *__copy_xattr_names(struct ceph_inode_info *ci,
void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
struct rb_node *p, *tmp;
struct ceph_inode_xattr *xattr = NULL;
p = rb_first(&ci->i_xattrs.index);
- dout("__ceph_destroy_xattrs p=%p\n", p);
+ doutc(cl, "p=%p\n", p);
while (p) {
xattr = rb_entry(p, struct ceph_inode_xattr, node);
tmp = p;
p = rb_next(tmp);
- dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
- xattr->name_len, xattr->name);
+ doutc(cl, "next p=%p (%.*s)\n", p, xattr->name_len, xattr->name);
rb_erase(tmp, &ci->i_xattrs.index);
__free_xattr(xattr);
@@ -787,6 +792,7 @@ static int __build_xattrs(struct inode *inode)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
u32 namelen;
u32 numattr = 0;
void *p, *end;
@@ -798,8 +804,8 @@ static int __build_xattrs(struct inode *inode)
int err = 0;
int i;
- dout("__build_xattrs() len=%d\n",
- ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
+ doutc(cl, "len=%d\n",
+ ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
return 0; /* already built */
@@ -874,6 +880,8 @@ bad:
static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
int val_size)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
+
/*
* 4 bytes for the length, and additional 4 bytes per each xattr name,
* 4 bytes per each value
@@ -881,9 +889,8 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
int size = 4 + ci->i_xattrs.count*(4 + 4) +
ci->i_xattrs.names_size +
ci->i_xattrs.vals_size;
- dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
- ci->i_xattrs.count, ci->i_xattrs.names_size,
- ci->i_xattrs.vals_size);
+ doutc(cl, "c=%d names.size=%d vals.size=%d\n", ci->i_xattrs.count,
+ ci->i_xattrs.names_size, ci->i_xattrs.vals_size);
if (name_size)
size += 4 + 4 + name_size + val_size;
@@ -899,12 +906,14 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
*/
struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct rb_node *p;
struct ceph_inode_xattr *xattr = NULL;
struct ceph_buffer *old_blob = NULL;
void *dest;
- dout("__build_xattrs_blob %p\n", &ci->netfs.inode);
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
if (ci->i_xattrs.dirty) {
int need = __get_required_blob_size(ci, 0, 0);
@@ -962,6 +971,7 @@ static inline int __get_request_mask(struct inode *in) {
ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_inode_xattr *xattr;
struct ceph_vxattr *vxattr;
@@ -1000,8 +1010,9 @@ handle_non_vxattrs:
req_mask = __get_request_mask(inode);
spin_lock(&ci->i_ceph_lock);
- dout("getxattr %p name '%s' ver=%lld index_ver=%lld\n", inode, name,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
+ doutc(cl, "%p %llx.%llx name '%s' ver=%lld index_ver=%lld\n", inode,
+ ceph_vinop(inode), name, ci->i_xattrs.version,
+ ci->i_xattrs.index_version);
if (ci->i_xattrs.version == 0 ||
!((req_mask & CEPH_CAP_XATTR_SHARED) ||
@@ -1010,8 +1021,9 @@ handle_non_vxattrs:
/* security module gets xattr while filling trace */
if (current->journal_info) {
- pr_warn_ratelimited("sync getxattr %p "
- "during filling trace\n", inode);
+ pr_warn_ratelimited_client(cl,
+ "sync %p %llx.%llx during filling trace\n",
+ inode, ceph_vinop(inode));
return -EBUSY;
}
@@ -1053,14 +1065,16 @@ out:
ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
{
struct inode *inode = d_inode(dentry);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
bool len_only = (size == 0);
u32 namelen;
int err;
spin_lock(&ci->i_ceph_lock);
- dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
+ doutc(cl, "%p %llx.%llx ver=%lld index_ver=%lld\n", inode,
+ ceph_vinop(inode), ci->i_xattrs.version,
+ ci->i_xattrs.index_version);
if (ci->i_xattrs.version == 0 ||
!__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1)) {
@@ -1094,7 +1108,8 @@ out:
static int ceph_sync_setxattr(struct inode *inode, const char *name,
const char *value, size_t size, int flags)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1119,7 +1134,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
flags |= CEPH_XATTR_REMOVE;
}
- dout("setxattr value size: %zu\n", size);
+ doutc(cl, "name %s value size %zu\n", name, size);
/* do request */
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -1148,10 +1163,10 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
req->r_num_caps = 1;
req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
- dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
+ doutc(cl, "xattr.ver (before): %lld\n", ci->i_xattrs.version);
err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
- dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
+ doutc(cl, "xattr.ver (after): %lld\n", ci->i_xattrs.version);
out:
if (pagelist)
@@ -1162,9 +1177,10 @@ out:
int __ceph_setxattr(struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf = NULL;
struct ceph_buffer *old_blob = NULL;
int issued;
@@ -1220,9 +1236,9 @@ retry:
required_blob_size = __get_required_blob_size(ci, name_len, val_len);
if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) ||
(required_blob_size > mdsc->mdsmap->m_max_xattr_size)) {
- dout("%s do sync setxattr: version: %llu size: %d max: %llu\n",
- __func__, ci->i_xattrs.version, required_blob_size,
- mdsc->mdsmap->m_max_xattr_size);
+ doutc(cl, "sync version: %llu size: %d max: %llu\n",
+ ci->i_xattrs.version, required_blob_size,
+ mdsc->mdsmap->m_max_xattr_size);
goto do_sync;
}
@@ -1236,8 +1252,8 @@ retry:
}
}
- dout("setxattr %p name '%s' issued %s\n", inode, name,
- ceph_cap_string(issued));
+ doutc(cl, "%p %llx.%llx name '%s' issued %s\n", inode,
+ ceph_vinop(inode), name, ceph_cap_string(issued));
__build_xattrs(inode);
if (!ci->i_xattrs.prealloc_blob ||
@@ -1246,7 +1262,8 @@ retry:
spin_unlock(&ci->i_ceph_lock);
ceph_buffer_put(old_blob); /* Shouldn't be required */
- dout(" pre-allocating new blob size=%d\n", required_blob_size);
+ doutc(cl, " pre-allocating new blob size=%d\n",
+ required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob)
goto do_sync_unlocked;
@@ -1285,8 +1302,9 @@ do_sync_unlocked:
/* security module set xattr while filling trace */
if (current->journal_info) {
- pr_warn_ratelimited("sync setxattr %p "
- "during filling trace\n", inode);
+ pr_warn_ratelimited_client(cl,
+ "sync %p %llx.%llx during filling trace\n",
+ inode, ceph_vinop(inode));
err = -EBUSY;
} else {
err = ceph_sync_setxattr(inode, name, value, size, flags);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 6ba032442b39..57cc096c498a 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -25,7 +25,7 @@
#include "internal.h"
-static struct kobj_map *cdev_map;
+static struct kobj_map *cdev_map __ro_after_init;
static DEFINE_MUTEX(chrdevs_lock);
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index a10710bc8123..cf3b58ec32cc 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -20,8 +20,8 @@
* managed alongside the master keys in the filesystem-level keyring)
*/
-#include <crypto/algapi.h>
#include <crypto/skcipher.h>
+#include <crypto/utils.h>
#include <keys/user-type.h>
#include <linux/hashtable.h>
#include <linux/scatterlist.h>
diff --git a/fs/dax.c b/fs/dax.c
index 8fafecbe42b1..3380b43cb6bb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -412,23 +412,23 @@ static struct page *dax_busy_page(void *entry)
return NULL;
}
-/*
- * dax_lock_page - Lock the DAX entry corresponding to a page
- * @page: The page whose entry we want to lock
+/**
+ * dax_lock_folio - Lock the DAX entry corresponding to a folio
+ * @folio: The folio whose entry we want to lock
*
* Context: Process context.
- * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
+ * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
* not be locked.
*/
-dax_entry_t dax_lock_page(struct page *page)
+dax_entry_t dax_lock_folio(struct folio *folio)
{
XA_STATE(xas, NULL, 0);
void *entry;
- /* Ensure page->mapping isn't freed while we look at it */
+ /* Ensure folio->mapping isn't freed while we look at it */
rcu_read_lock();
for (;;) {
- struct address_space *mapping = READ_ONCE(page->mapping);
+ struct address_space *mapping = READ_ONCE(folio->mapping);
entry = NULL;
if (!mapping || !dax_mapping(mapping))
@@ -447,11 +447,11 @@ dax_entry_t dax_lock_page(struct page *page)
xas.xa = &mapping->i_pages;
xas_lock_irq(&xas);
- if (mapping != page->mapping) {
+ if (mapping != folio->mapping) {
xas_unlock_irq(&xas);
continue;
}
- xas_set(&xas, page->index);
+ xas_set(&xas, folio->index);
entry = xas_load(&xas);
if (dax_is_locked(entry)) {
rcu_read_unlock();
@@ -467,10 +467,10 @@ dax_entry_t dax_lock_page(struct page *page)
return (dax_entry_t)entry;
}
-void dax_unlock_page(struct page *page, dax_entry_t cookie)
+void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{
- struct address_space *mapping = page->mapping;
- XA_STATE(xas, &mapping->i_pages, page->index);
+ struct address_space *mapping = folio->mapping;
+ XA_STATE(xas, &mapping->i_pages, folio->index);
if (S_ISCHR(mapping->host->i_mode))
return;
diff --git a/fs/dcache.c b/fs/dcache.c
index 796e23761ba0..c82ae731df9a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -78,7 +78,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
EXPORT_SYMBOL(rename_lock);
-static struct kmem_cache *dentry_cache __read_mostly;
+static struct kmem_cache *dentry_cache __ro_after_init;
const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
@@ -96,9 +96,9 @@ EXPORT_SYMBOL(dotdot_name);
* information, yet avoid using a prime hash-size or similar.
*/
-static unsigned int d_hash_shift __read_mostly;
+static unsigned int d_hash_shift __ro_after_init;
-static struct hlist_bl_head *dentry_hashtable __read_mostly;
+static struct hlist_bl_head *dentry_hashtable __ro_after_init;
static inline struct hlist_bl_head *d_hash(unsigned int hash)
{
@@ -3332,7 +3332,7 @@ static void __init dcache_init(void)
}
/* SLAB cache for __getname() consumers */
-struct kmem_cache *names_cachep __read_mostly;
+struct kmem_cache *names_cachep __ro_after_init;
EXPORT_SYMBOL(names_cachep);
void __init vfs_caches_init_early(void)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 87b3753aa4b1..c45e8c2d62e1 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -939,7 +939,7 @@ static ssize_t debugfs_write_file_str(struct file *file, const char __user *user
new[pos + count] = '\0';
strim(new);
- rcu_assign_pointer(*(char **)file->private_data, new);
+ rcu_assign_pointer(*(char __rcu **)file->private_data, new);
synchronize_rcu();
kfree(old);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7bc494ee56b9..20533266ade6 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -151,7 +151,7 @@ struct dio {
};
} ____cacheline_aligned_in_smp;
-static struct kmem_cache *dio_cache __read_mostly;
+static struct kmem_cache *dio_cache __ro_after_init;
/*
* How many pages are in the queue?
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 76dd3c7295d9..91290fe4a70b 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -21,8 +21,12 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
dev_t dev, bool is_removable)
{
struct inode *inode = new_inode(sb);
+ struct efivarfs_fs_info *fsi = sb->s_fs_info;
+ struct efivarfs_mount_opts *opts = &fsi->mount_opts;
if (inode) {
+ inode->i_uid = opts->uid;
+ inode->i_gid = opts->gid;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
simple_inode_init_ts(inode);
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index 8ebf3a6a8aa2..c66647f5c0bd 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -9,6 +9,15 @@
#include <linux/list.h>
#include <linux/efi.h>
+struct efivarfs_mount_opts {
+ kuid_t uid;
+ kgid_t gid;
+};
+
+struct efivarfs_fs_info {
+ struct efivarfs_mount_opts mount_opts;
+};
+
struct efi_variable {
efi_char16_t VariableName[EFI_VAR_NAME_LEN/sizeof(efi_char16_t)];
efi_guid_t VendorGuid;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 996271473609..77240953a92e 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -8,6 +8,7 @@
#include <linux/efi.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/ucs2_string.h>
@@ -24,12 +25,28 @@ static void efivarfs_evict_inode(struct inode *inode)
clear_inode(inode);
}
+static int efivarfs_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct super_block *sb = root->d_sb;
+ struct efivarfs_fs_info *sbi = sb->s_fs_info;
+ struct efivarfs_mount_opts *opts = &sbi->mount_opts;
+
+ if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(&init_user_ns, opts->uid));
+ if (!gid_eq(opts->gid, GLOBAL_ROOT_GID))
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(&init_user_ns, opts->gid));
+ return 0;
+}
+
static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
const u32 attr = EFI_VARIABLE_NON_VOLATILE |
EFI_VARIABLE_BOOTSERVICE_ACCESS |
EFI_VARIABLE_RUNTIME_ACCESS;
u64 storage_space, remaining_space, max_variable_size;
+ u64 id = huge_encode_dev(dentry->d_sb->s_dev);
efi_status_t status;
/* Some UEFI firmware does not implement QueryVariableInfo() */
@@ -53,6 +70,7 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = storage_space;
buf->f_bfree = remaining_space;
buf->f_type = dentry->d_sb->s_magic;
+ buf->f_fsid = u64_to_fsid(id);
/*
* In f_bavail we declare the free space that the kernel will allow writing
@@ -70,6 +88,7 @@ static const struct super_operations efivarfs_ops = {
.statfs = efivarfs_statfs,
.drop_inode = generic_delete_inode,
.evict_inode = efivarfs_evict_inode,
+ .show_options = efivarfs_show_options,
};
/*
@@ -231,6 +250,45 @@ static int efivarfs_destroy(struct efivar_entry *entry, void *data)
return 0;
}
+enum {
+ Opt_uid, Opt_gid,
+};
+
+static const struct fs_parameter_spec efivarfs_parameters[] = {
+ fsparam_u32("uid", Opt_uid),
+ fsparam_u32("gid", Opt_gid),
+ {},
+};
+
+static int efivarfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct efivarfs_fs_info *sbi = fc->s_fs_info;
+ struct efivarfs_mount_opts *opts = &sbi->mount_opts;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, efivarfs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ opts->uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(opts->uid))
+ return -EINVAL;
+ break;
+ case Opt_gid:
+ opts->gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(opts->gid))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct inode *inode = NULL;
@@ -277,10 +335,21 @@ static int efivarfs_get_tree(struct fs_context *fc)
static const struct fs_context_operations efivarfs_context_ops = {
.get_tree = efivarfs_get_tree,
+ .parse_param = efivarfs_parse_param,
};
static int efivarfs_init_fs_context(struct fs_context *fc)
{
+ struct efivarfs_fs_info *sfi;
+
+ sfi = kzalloc(sizeof(*sfi), GFP_KERNEL);
+ if (!sfi)
+ return -ENOMEM;
+
+ sfi->mount_opts.uid = GLOBAL_ROOT_UID;
+ sfi->mount_opts.gid = GLOBAL_ROOT_GID;
+
+ fc->s_fs_info = sfi;
fc->ops = &efivarfs_context_ops;
return 0;
}
@@ -301,6 +370,7 @@ static struct file_system_type efivarfs_type = {
.name = "efivarfs",
.init_fs_context = efivarfs_init_fs_context,
.kill_sb = efivarfs_kill_sb,
+ .parameters = efivarfs_parameters,
};
static __init int efivarfs_init(void)
diff --git a/fs/efs/super.c b/fs/efs/super.c
index b287f47c165b..f17fdac76b2e 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -123,6 +123,7 @@ static const struct super_operations efs_superblock_operations = {
};
static const struct export_operations efs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = efs_fh_to_dentry,
.fh_to_parent = efs_fh_to_parent,
.get_parent = efs_get_parent,
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index e540648dedc2..1d318f85232d 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -21,7 +21,7 @@ config EROFS_FS
performance under extremely memory pressure without extra cost.
See the documentation at <file:Documentation/filesystems/erofs.rst>
- for more details.
+ and the web pages at <https://erofs.docs.kernel.org> for more details.
If unsure, say N.
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 029c761670bf..c98aeda8abb2 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -220,7 +220,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
up_read(&devs->rwsem);
return 0;
}
- map->m_bdev = dif->bdev_handle->bdev;
+ map->m_bdev = dif->bdev_handle ? dif->bdev_handle->bdev : NULL;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
map->m_fscache = dif->fscache;
@@ -238,7 +238,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
if (map->m_pa >= startoff &&
map->m_pa < startoff + length) {
map->m_pa -= startoff;
- map->m_bdev = dif->bdev_handle->bdev;
+ map->m_bdev = dif->bdev_handle ?
+ dif->bdev_handle->bdev : NULL;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
map->m_fscache = dif->fscache;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index b8ad05b4509d..14a79d3226ab 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -15,11 +15,11 @@ static void *erofs_read_inode(struct erofs_buf *buf,
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_inode *vi = EROFS_I(inode);
const erofs_off_t inode_loc = erofs_iloc(inode);
-
erofs_blk_t blkaddr, nblks = 0;
void *kaddr;
struct erofs_inode_compact *dic;
struct erofs_inode_extended *die, *copied = NULL;
+ union erofs_inode_i_u iu;
unsigned int ifmt;
int err;
@@ -35,9 +35,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
dic = kaddr + *ofs;
ifmt = le16_to_cpu(dic->i_format);
-
if (ifmt & ~EROFS_I_ALL) {
- erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu",
+ erofs_err(sb, "unsupported i_format %u of nid %llu",
ifmt, vi->nid);
err = -EOPNOTSUPP;
goto err_out;
@@ -45,7 +44,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->datalayout = erofs_inode_datalayout(ifmt);
if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) {
- erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu",
+ erofs_err(sb, "unsupported datalayout %u of nid %llu",
vi->datalayout, vi->nid);
err = -EOPNOTSUPP;
goto err_out;
@@ -82,40 +81,15 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
inode->i_mode = le16_to_cpu(die->i_mode);
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr);
- break;
- case S_IFCHR:
- case S_IFBLK:
- inode->i_rdev =
- new_decode_dev(le32_to_cpu(die->i_u.rdev));
- break;
- case S_IFIFO:
- case S_IFSOCK:
- inode->i_rdev = 0;
- break;
- default:
- goto bogusimode;
- }
+ iu = die->i_u;
i_uid_write(inode, le32_to_cpu(die->i_uid));
i_gid_write(inode, le32_to_cpu(die->i_gid));
set_nlink(inode, le32_to_cpu(die->i_nlink));
-
- /* extended inode has its own timestamp */
+ /* each extended inode has its own timestamp */
inode_set_ctime(inode, le64_to_cpu(die->i_mtime),
le32_to_cpu(die->i_mtime_nsec));
inode->i_size = le64_to_cpu(die->i_size);
-
- /* total blocks for compressed files */
- if (erofs_inode_is_data_compressed(vi->datalayout))
- nblks = le32_to_cpu(die->i_u.compressed_blocks);
- else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
- /* fill chunked inode summary info */
- vi->chunkformat = le16_to_cpu(die->i_u.c.format);
kfree(copied);
copied = NULL;
break;
@@ -125,49 +99,51 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount);
inode->i_mode = le16_to_cpu(dic->i_mode);
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr);
- break;
- case S_IFCHR:
- case S_IFBLK:
- inode->i_rdev =
- new_decode_dev(le32_to_cpu(dic->i_u.rdev));
- break;
- case S_IFIFO:
- case S_IFSOCK:
- inode->i_rdev = 0;
- break;
- default:
- goto bogusimode;
- }
+ iu = dic->i_u;
i_uid_write(inode, le16_to_cpu(dic->i_uid));
i_gid_write(inode, le16_to_cpu(dic->i_gid));
set_nlink(inode, le16_to_cpu(dic->i_nlink));
-
/* use build time for compact inodes */
inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec);
inode->i_size = le32_to_cpu(dic->i_size);
- if (erofs_inode_is_data_compressed(vi->datalayout))
- nblks = le32_to_cpu(dic->i_u.compressed_blocks);
- else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
- vi->chunkformat = le16_to_cpu(dic->i_u.c.format);
break;
default:
- erofs_err(inode->i_sb,
- "unsupported on-disk inode version %u of nid %llu",
+ erofs_err(sb, "unsupported on-disk inode version %u of nid %llu",
erofs_inode_version(ifmt), vi->nid);
err = -EOPNOTSUPP;
goto err_out;
}
- if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr);
+ break;
+ case S_IFCHR:
+ case S_IFBLK:
+ inode->i_rdev = new_decode_dev(le32_to_cpu(iu.rdev));
+ break;
+ case S_IFIFO:
+ case S_IFSOCK:
+ inode->i_rdev = 0;
+ break;
+ default:
+ erofs_err(sb, "bogus i_mode (%o) @ nid %llu", inode->i_mode,
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto err_out;
+ }
+
+ /* total blocks for compressed files */
+ if (erofs_inode_is_data_compressed(vi->datalayout)) {
+ nblks = le32_to_cpu(iu.compressed_blocks);
+ } else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
+ /* fill chunked inode summary info */
+ vi->chunkformat = le16_to_cpu(iu.c.format);
if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) {
- erofs_err(inode->i_sb,
- "unsupported chunk format %x of nid %llu",
+ erofs_err(sb, "unsupported chunk format %x of nid %llu",
vi->chunkformat, vi->nid);
err = -EOPNOTSUPP;
goto err_out;
@@ -191,10 +167,6 @@ static void *erofs_read_inode(struct erofs_buf *buf,
inode->i_blocks = nblks << (sb->s_blocksize_bits - 9);
return kaddr;
-bogusimode:
- erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu",
- inode->i_mode, vi->nid);
- err = -EFSCORRUPTED;
err_out:
DBG_BUGON(1);
kfree(copied);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 976dc39a88f7..3789d6224513 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -567,6 +567,7 @@ static struct dentry *erofs_get_parent(struct dentry *child)
}
static const struct export_operations erofs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = erofs_fh_to_dentry,
.fh_to_parent = erofs_fh_to_parent,
.get_parent = erofs_get_parent,
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 4256a85719a1..5dea308764b4 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -264,19 +264,24 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink,
return freed;
}
-static struct shrinker erofs_shrinker_info = {
- .scan_objects = erofs_shrink_scan,
- .count_objects = erofs_shrink_count,
- .seeks = DEFAULT_SEEKS,
-};
+static struct shrinker *erofs_shrinker_info;
int __init erofs_init_shrinker(void)
{
- return register_shrinker(&erofs_shrinker_info, "erofs-shrinker");
+ erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker");
+ if (!erofs_shrinker_info)
+ return -ENOMEM;
+
+ erofs_shrinker_info->count_objects = erofs_shrink_count;
+ erofs_shrinker_info->scan_objects = erofs_shrink_scan;
+
+ shrinker_register(erofs_shrinker_info);
+
+ return 0;
}
void erofs_exit_shrinker(void)
{
- unregister_shrinker(&erofs_shrinker_info);
+ shrinker_free(erofs_shrinker_info);
}
#endif /* !CONFIG_EROFS_FS_ZIP */
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1d9a71a0c4c1..2877cc01cff1 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -256,10 +256,10 @@ static u64 loop_check_gen = 0;
static struct eventpoll *inserting_into;
/* Slab cache used to allocate "struct epitem" */
-static struct kmem_cache *epi_cache __read_mostly;
+static struct kmem_cache *epi_cache __ro_after_init;
/* Slab cache used to allocate "struct eppoll_entry" */
-static struct kmem_cache *pwq_cache __read_mostly;
+static struct kmem_cache *pwq_cache __ro_after_init;
/*
* List of files with newly added links, where we may need to limit the number
@@ -271,7 +271,7 @@ struct epitems_head {
};
static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
-static struct kmem_cache *ephead_cache __read_mostly;
+static struct kmem_cache *ephead_cache __ro_after_init;
static inline void free_ephead(struct epitems_head *head)
{
diff --git a/fs/exec.c b/fs/exec.c
index 6518e33ea813..4aa19b24f281 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -713,7 +713,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* process cleanup to remove whatever mess we made.
*/
if (length != move_page_tables(vma, old_start,
- vma, new_start, length, false))
+ vma, new_start, length, false, true))
return -ENOMEM;
lru_add_drain();
@@ -986,8 +986,6 @@ static int exec_mmap(struct mm_struct *mm)
tsk = current;
old_mm = current->mm;
exec_mm_release(tsk, old_mm);
- if (old_mm)
- sync_mm_rss(old_mm);
ret = down_write_killable(&tsk->signal->exec_update_lock);
if (ret)
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 02c4e2937879..bfdfafe00993 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -295,6 +295,7 @@ int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (attr->ia_valid & ATTR_SIZE)
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ setattr_copy(&nop_mnt_idmap, inode, attr);
exfat_truncate_inode_atime(inode);
if (attr->ia_valid & ATTR_SIZE) {
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 875234179d1f..e7ff58b8e68c 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -56,18 +56,18 @@ int __exfat_write_inode(struct inode *inode, int sync)
&ep->dentry.file.create_time,
&ep->dentry.file.create_date,
&ep->dentry.file.create_time_cs);
+ ts = inode_get_mtime(inode);
exfat_set_entry_time(sbi, &ts,
&ep->dentry.file.modify_tz,
&ep->dentry.file.modify_time,
&ep->dentry.file.modify_date,
&ep->dentry.file.modify_time_cs);
- inode_set_mtime_to_ts(inode, ts);
+ ts = inode_get_atime(inode);
exfat_set_entry_time(sbi, &ts,
&ep->dentry.file.access_tz,
&ep->dentry.file.access_time,
&ep->dentry.file.access_date,
NULL);
- inode_set_atime_to_ts(inode, ts);
/* File size should be zero if there is no cluster allocated */
on_disk_size = i_size_read(inode);
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index c20704aa21b3..3ae0154c5680 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -342,43 +342,30 @@ out:
return error;
}
+#define FILEID_INO64_GEN_LEN 3
+
/**
- * export_encode_fh - default export_operations->encode_fh function
+ * exportfs_encode_ino64_fid - encode non-decodeable 64bit ino file id
* @inode: the object to encode
* @fid: where to store the file handle fragment
- * @max_len: maximum length to store there
- * @parent: parent directory inode, if wanted
+ * @max_len: maximum length to store there (in 4 byte units)
*
- * This default encode_fh function assumes that the 32 inode number
- * is suitable for locating an inode, and that the generation number
- * can be used to check that it is still valid. It places them in the
- * filehandle fragment where export_decode_fh expects to find them.
+ * This generic function is used to encode a non-decodeable file id for
+ * fanotify for filesystems that do not support NFS export.
*/
-static int export_encode_fh(struct inode *inode, struct fid *fid,
- int *max_len, struct inode *parent)
+static int exportfs_encode_ino64_fid(struct inode *inode, struct fid *fid,
+ int *max_len)
{
- int len = *max_len;
- int type = FILEID_INO32_GEN;
-
- if (parent && (len < 4)) {
- *max_len = 4;
- return FILEID_INVALID;
- } else if (len < 2) {
- *max_len = 2;
+ if (*max_len < FILEID_INO64_GEN_LEN) {
+ *max_len = FILEID_INO64_GEN_LEN;
return FILEID_INVALID;
}
- len = 2;
- fid->i32.ino = inode->i_ino;
- fid->i32.gen = inode->i_generation;
- if (parent) {
- fid->i32.parent_ino = parent->i_ino;
- fid->i32.parent_gen = parent->i_generation;
- len = 4;
- type = FILEID_INO32_GEN_PARENT;
- }
- *max_len = len;
- return type;
+ fid->i64.ino = inode->i_ino;
+ fid->i64.gen = inode->i_generation;
+ *max_len = FILEID_INO64_GEN_LEN;
+
+ return FILEID_INO64_GEN;
}
/**
@@ -396,17 +383,13 @@ int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
{
const struct export_operations *nop = inode->i_sb->s_export_op;
- /*
- * If a decodeable file handle was requested, we need to make sure that
- * filesystem can decode file handles.
- */
- if (nop && !(flags & EXPORT_FH_FID) && !nop->fh_to_dentry)
+ if (!exportfs_can_encode_fh(nop, flags))
return -EOPNOTSUPP;
- if (nop && nop->encode_fh)
- return nop->encode_fh(inode, fid->raw, max_len, parent);
+ if (!nop && (flags & EXPORT_FH_FID))
+ return exportfs_encode_ino64_fid(inode, fid, max_len);
- return export_encode_fh(inode, fid, max_len, parent);
+ return nop->encode_fh(inode, fid->raw, max_len, parent);
}
EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh);
@@ -456,7 +439,7 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
/*
* Try to get any dentry for the given file handle from the filesystem.
*/
- if (!nop || !nop->fh_to_dentry)
+ if (!exportfs_can_decode_fh(nop))
return ERR_PTR(-ESTALE);
result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
if (IS_ERR_OR_NULL(result))
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 645ee6142f69..01f9addc8b1f 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -397,6 +397,7 @@ static struct dentry *ext2_fh_to_parent(struct super_block *sb, struct fid *fid,
}
static const struct export_operations ext2_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = ext2_fh_to_dentry,
.fh_to_parent = ext2_fh_to_parent,
.get_parent = ext2_get_parent,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f16aa375c02b..a5d784872303 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1664,7 +1664,7 @@ struct ext4_sb_info {
__u32 s_csum_seed;
/* Reclaim extents from extent status tree */
- struct shrinker s_es_shrinker;
+ struct shrinker *s_es_shrinker;
struct list_head s_es_list; /* List of inodes with reclaimable extents */
long s_es_nr_inode;
struct ext4_es_stats s_es_stats;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index f4b50652f0cc..4a00e2f019d9 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1632,7 +1632,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
unsigned long nr;
struct ext4_sb_info *sbi;
- sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
+ sbi = shrink->private_data;
nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
return nr;
@@ -1641,8 +1641,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
static unsigned long ext4_es_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct ext4_sb_info *sbi = container_of(shrink,
- struct ext4_sb_info, s_es_shrinker);
+ struct ext4_sb_info *sbi = shrink->private_data;
int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk;
@@ -1726,13 +1725,17 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
if (err)
goto err3;
- sbi->s_es_shrinker.scan_objects = ext4_es_scan;
- sbi->s_es_shrinker.count_objects = ext4_es_count;
- sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
- err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s",
- sbi->s_sb->s_id);
- if (err)
+ sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id);
+ if (!sbi->s_es_shrinker) {
+ err = -ENOMEM;
goto err4;
+ }
+
+ sbi->s_es_shrinker->scan_objects = ext4_es_scan;
+ sbi->s_es_shrinker->count_objects = ext4_es_count;
+ sbi->s_es_shrinker->private_data = sbi;
+
+ shrinker_register(sbi->s_es_shrinker);
return 0;
err4:
@@ -1752,7 +1755,7 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
- unregister_shrinker(&sbi->s_es_shrinker);
+ shrinker_free(sbi->s_es_shrinker);
}
/*
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a6838f54ae91..61277f7f8722 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1032,10 +1032,8 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
BUG_ON(from > to);
head = folio_buffers(folio);
- if (!head) {
- create_empty_buffers(&folio->page, blocksize, 0);
- head = folio_buffers(folio);
- }
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
bbits = ilog2(blocksize);
block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
@@ -1165,7 +1163,7 @@ retry_grab:
* starting the handle.
*/
if (!folio_buffers(folio))
- create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0);
+ create_empty_buffers(folio, inode->i_sb->s_blocksize, 0);
folio_unlock(folio);
@@ -3655,10 +3653,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
bh = folio_buffers(folio);
- if (!bh) {
- create_empty_buffers(&folio->page, blocksize, 0);
- bh = folio_buffers(folio);
- }
+ if (!bh)
+ bh = create_empty_buffers(folio, blocksize, 0);
/* Find the buffer that contains "offset" */
pos = blocksize;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 18a9e7c47975..3aa57376d9c2 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -183,10 +183,8 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to)
blocksize = i_blocksize(inode);
head = folio_buffers(folio);
- if (!head) {
- create_empty_buffers(&folio->page, blocksize, 0);
- head = folio_buffers(folio);
- }
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits);
for (bh = head, block_start = 0; bh != head || !block_start;
@@ -380,9 +378,10 @@ data_copy:
}
/* Perform all necessary steps similar write_begin()/write_end()
* but keeping in mind that i_size will not change */
- if (!folio_buffers(folio[0]))
- create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0);
bh = folio_buffers(folio[0]);
+ if (!bh)
+ bh = create_empty_buffers(folio[0],
+ 1 << orig_inode->i_blkbits, 0);
for (i = 0; i < data_offset_in_page; i++)
bh = bh->b_this_page;
for (i = 0; i < block_len_in_page; i++) {
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 3e7d160f543f..21e8f0aebb3c 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -70,15 +70,8 @@ static void __read_end_io(struct bio *bio)
{
struct folio_iter fi;
- bio_for_each_folio_all(fi, bio) {
- struct folio *folio = fi.folio;
-
- if (bio->bi_status)
- folio_clear_uptodate(folio);
- else
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- }
+ bio_for_each_folio_all(fi, bio)
+ folio_end_read(fi.folio, bio->bi_status == 0);
if (bio->bi_private)
mempool_free(bio->bi_private, bio_post_read_ctx_pool);
bio_put(bio);
@@ -336,8 +329,7 @@ int ext4_mpage_readpages(struct inode *inode,
if (ext4_need_verity(inode, folio->index) &&
!fsverity_verify_folio(folio))
goto set_error_page;
- folio_mark_uptodate(folio);
- folio_unlock(folio);
+ folio_end_read(folio, true);
continue;
}
} else if (fully_mapped) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 77e2b694c7d5..c5fcf377ab1f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -244,18 +244,25 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
blk_opf_t op_flags)
{
- return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE);
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping,
+ ~__GFP_FS) | __GFP_MOVABLE;
+
+ return __ext4_sb_bread_gfp(sb, block, op_flags, gfp);
}
struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
sector_t block)
{
- return __ext4_sb_bread_gfp(sb, block, 0, 0);
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping,
+ ~__GFP_FS);
+
+ return __ext4_sb_bread_gfp(sb, block, 0, gfp);
}
void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
- struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
+ struct buffer_head *bh = bdev_getblk(sb->s_bdev, block,
+ sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN);
if (likely(bh)) {
if (trylock_buffer(bh))
@@ -1647,6 +1654,7 @@ static const struct super_operations ext4_sops = {
};
static const struct export_operations ext4_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = ext4_fh_to_dentry,
.fh_to_parent = ext4_fh_to_parent,
.get_parent = ext4_get_parent,
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index d820801f473e..36e5dab6baae 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -893,14 +893,15 @@ static bool cluster_has_invalid_data(struct compress_ctx *cc)
bool f2fs_sanity_check_cluster(struct dnode_of_data *dn)
{
+#ifdef CONFIG_F2FS_CHECK_FS
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
- bool compressed = dn->data_blkaddr == COMPRESS_ADDR;
int cluster_end = 0;
+ unsigned int count;
int i;
char *reason = "";
- if (!compressed)
+ if (dn->data_blkaddr != COMPRESS_ADDR)
return false;
/* [..., COMPR_ADDR, ...] */
@@ -909,7 +910,7 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn)
goto out;
}
- for (i = 1; i < cluster_size; i++) {
+ for (i = 1, count = 1; i < cluster_size; i++, count++) {
block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
dn->ofs_in_node + i);
@@ -929,19 +930,42 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn)
goto out;
}
}
+
+ f2fs_bug_on(F2FS_I_SB(dn->inode), count != cluster_size &&
+ !is_inode_flag_set(dn->inode, FI_COMPRESS_RELEASED));
+
return false;
out:
f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s",
dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason);
set_sbi_flag(sbi, SBI_NEED_FSCK);
return true;
+#else
+ return false;
+#endif
+}
+
+static int __f2fs_get_cluster_blocks(struct inode *inode,
+ struct dnode_of_data *dn)
+{
+ unsigned int cluster_size = F2FS_I(inode)->i_cluster_size;
+ int count, i;
+
+ for (i = 1, count = 1; i < cluster_size; i++) {
+ block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ dn->ofs_in_node + i);
+
+ if (__is_valid_data_blkaddr(blkaddr))
+ count++;
+ }
+
+ return count;
}
static int __f2fs_cluster_blocks(struct inode *inode,
- unsigned int cluster_idx, bool compr)
+ unsigned int cluster_idx, bool compr_blks)
{
struct dnode_of_data dn;
- unsigned int cluster_size = F2FS_I(inode)->i_cluster_size;
unsigned int start_idx = cluster_idx <<
F2FS_I(inode)->i_log_cluster_size;
int ret;
@@ -956,31 +980,14 @@ static int __f2fs_cluster_blocks(struct inode *inode,
if (f2fs_sanity_check_cluster(&dn)) {
ret = -EFSCORRUPTED;
- f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_CLUSTER);
goto fail;
}
if (dn.data_blkaddr == COMPRESS_ADDR) {
- int i;
-
- ret = 1;
- for (i = 1; i < cluster_size; i++) {
- block_t blkaddr;
-
- blkaddr = data_blkaddr(dn.inode,
- dn.node_page, dn.ofs_in_node + i);
- if (compr) {
- if (__is_valid_data_blkaddr(blkaddr))
- ret++;
- } else {
- if (blkaddr != NULL_ADDR)
- ret++;
- }
- }
-
- f2fs_bug_on(F2FS_I_SB(inode),
- !compr && ret != cluster_size &&
- !is_inode_flag_set(inode, FI_COMPRESS_RELEASED));
+ if (compr_blks)
+ ret = __f2fs_get_cluster_blocks(inode, &dn);
+ else
+ ret = 1;
}
fail:
f2fs_put_dnode(&dn);
@@ -993,7 +1000,7 @@ static int f2fs_compressed_blocks(struct compress_ctx *cc)
return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx, true);
}
-/* return # of valid blocks in compressed cluster */
+/* return whether cluster is compressed one or not */
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
{
return __f2fs_cluster_blocks(inode,
@@ -1976,7 +1983,7 @@ void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi)
int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi)
{
dev_t dev = sbi->sb->s_bdev->bd_dev;
- char slab_name[32];
+ char slab_name[35];
if (!f2fs_sb_has_compression(sbi))
return 0;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 916e317ac925..4e42b5f24deb 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1690,9 +1690,7 @@ next_block:
map->m_flags |= F2FS_MAP_NEW;
} else if (is_hole) {
if (f2fs_compressed_file(inode) &&
- f2fs_sanity_check_cluster(&dn) &&
- (flag != F2FS_GET_BLOCK_FIEMAP ||
- IS_ENABLED(CONFIG_F2FS_CHECK_FS))) {
+ f2fs_sanity_check_cluster(&dn)) {
err = -EFSCORRUPTED;
f2fs_handle_error(sbi,
ERROR_CORRUPTED_CLUSTER);
@@ -2344,8 +2342,10 @@ skip_reading_dnode:
f2fs_wait_on_block_writeback(inode, blkaddr);
if (f2fs_load_compressed_page(sbi, page, blkaddr)) {
- if (atomic_dec_and_test(&dic->remaining_pages))
+ if (atomic_dec_and_test(&dic->remaining_pages)) {
f2fs_decompress_cluster(dic, true);
+ break;
+ }
continue;
}
@@ -2665,6 +2665,11 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
return true;
if (f2fs_is_atomic_file(inode))
return true;
+ /* rewrite low ratio compress data w/ OPU mode to avoid fragmentation */
+ if (f2fs_compressed_file(inode) &&
+ F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER &&
+ is_inode_flag_set(inode, FI_ENABLE_COMPRESS))
+ return true;
/* swap file is migrating in aligned write mode */
if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
@@ -3023,7 +3028,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
{
int ret = 0;
int done = 0, retry = 0;
- struct page *pages[F2FS_ONSTACK_PAGES];
+ struct page *pages_local[F2FS_ONSTACK_PAGES];
+ struct page **pages = pages_local;
struct folio_batch fbatch;
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
struct bio *bio = NULL;
@@ -3047,6 +3053,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
#endif
int nr_folios, p, idx;
int nr_pages;
+ unsigned int max_pages = F2FS_ONSTACK_PAGES;
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
@@ -3056,6 +3063,15 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
int submitted = 0;
int i;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (f2fs_compressed_file(inode) &&
+ 1 << cc.log_cluster_size > F2FS_ONSTACK_PAGES) {
+ pages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
+ cc.log_cluster_size, GFP_NOFS | __GFP_NOFAIL);
+ max_pages = 1 << cc.log_cluster_size;
+ }
+#endif
+
folio_batch_init(&fbatch);
if (get_dirty_pages(mapping->host) <=
@@ -3101,7 +3117,7 @@ again:
add_more:
pages[nr_pages] = folio_page(folio, idx);
folio_get(folio);
- if (++nr_pages == F2FS_ONSTACK_PAGES) {
+ if (++nr_pages == max_pages) {
index = folio->index + idx + 1;
folio_batch_release(&fbatch);
goto write;
@@ -3283,6 +3299,11 @@ next:
if (bio)
f2fs_submit_merged_ipu_write(sbi, &bio, NULL);
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (pages != pages_local)
+ kfree(pages);
+#endif
+
return ret;
}
@@ -4055,7 +4076,7 @@ next:
sis->highest_bit = cur_lblock - 1;
out:
if (not_aligned)
- f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%u * N)",
+ f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%lu * N)",
not_aligned, blks_per_sec * F2FS_BLKSIZE);
return ret;
}
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 0e2d49140c07..ad8dfac73bd4 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -74,40 +74,14 @@ static void __set_extent_info(struct extent_info *ei,
}
}
-static bool __may_read_extent_tree(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-
- if (!test_opt(sbi, READ_EXTENT_CACHE))
- return false;
- if (is_inode_flag_set(inode, FI_NO_EXTENT))
- return false;
- if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) &&
- !f2fs_sb_has_readonly(sbi))
- return false;
- return S_ISREG(inode->i_mode);
-}
-
-static bool __may_age_extent_tree(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-
- if (!test_opt(sbi, AGE_EXTENT_CACHE))
- return false;
- if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
- return false;
- if (file_is_cold(inode))
- return false;
-
- return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode);
-}
-
static bool __init_may_extent_tree(struct inode *inode, enum extent_type type)
{
if (type == EX_READ)
- return __may_read_extent_tree(inode);
- else if (type == EX_BLOCK_AGE)
- return __may_age_extent_tree(inode);
+ return test_opt(F2FS_I_SB(inode), READ_EXTENT_CACHE) &&
+ S_ISREG(inode->i_mode);
+ if (type == EX_BLOCK_AGE)
+ return test_opt(F2FS_I_SB(inode), AGE_EXTENT_CACHE) &&
+ (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode));
return false;
}
@@ -120,7 +94,22 @@ static bool __may_extent_tree(struct inode *inode, enum extent_type type)
if (list_empty(&F2FS_I_SB(inode)->s_list))
return false;
- return __init_may_extent_tree(inode, type);
+ if (!__init_may_extent_tree(inode, type))
+ return false;
+
+ if (type == EX_READ) {
+ if (is_inode_flag_set(inode, FI_NO_EXTENT))
+ return false;
+ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) &&
+ !f2fs_sb_has_readonly(F2FS_I_SB(inode)))
+ return false;
+ } else if (type == EX_BLOCK_AGE) {
+ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
+ return false;
+ if (file_is_cold(inode))
+ return false;
+ }
+ return true;
}
static void __try_update_largest_extent(struct extent_tree *et,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index dd99abbb7186..e50363583f01 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -3258,11 +3258,12 @@ int f2fs_precache_extents(struct inode *inode)
return -EOPNOTSUPP;
map.m_lblk = 0;
+ map.m_pblk = 0;
map.m_next_pgofs = NULL;
map.m_next_extent = &m_next_extent;
map.m_seg_type = NO_CHECK_TYPE;
map.m_may_create = false;
- end = max_file_blocks(inode);
+ end = F2FS_BLK_ALIGN(i_size_read(inode));
while (map.m_lblk < end) {
map.m_len = end - map.m_lblk;
@@ -3270,7 +3271,7 @@ int f2fs_precache_extents(struct inode *inode)
f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRECACHE);
f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
- if (err)
+ if (err || !map.m_len)
return err;
map.m_lblk = m_next_extent;
@@ -4005,6 +4006,15 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg)
F2FS_I(inode)->i_compress_algorithm = option.algorithm;
F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size;
F2FS_I(inode)->i_cluster_size = BIT(option.log_cluster_size);
+ /* Set default level */
+ if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD)
+ F2FS_I(inode)->i_compress_level = F2FS_ZSTD_DEFAULT_CLEVEL;
+ else
+ F2FS_I(inode)->i_compress_level = 0;
+ /* Adjust mount option level */
+ if (option.algorithm == F2FS_OPTION(sbi).compress_algorithm &&
+ F2FS_OPTION(sbi).compress_level)
+ F2FS_I(inode)->i_compress_level = F2FS_OPTION(sbi).compress_level;
f2fs_mark_inode_dirty_sync(inode, true);
if (!f2fs_is_compress_backend_ready(inode))
@@ -4849,6 +4859,9 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len,
filp->f_mode &= ~FMODE_RANDOM;
spin_unlock(&filp->f_lock);
return 0;
+ } else if (advice == POSIX_FADV_WILLNEED && offset == 0) {
+ /* Load extent cache at the first readahead. */
+ f2fs_precache_extents(inode);
}
err = generic_fadvise(filp, offset, len, advice);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 5779c7edd49b..560bfcad1af2 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -315,7 +315,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
f2fs_has_inline_xattr(inode) &&
(!fi->i_inline_xattr_size ||
fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) {
- f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %zu",
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %lu",
__func__, inode->i_ino, fi->i_inline_xattr_size,
MAX_INLINE_XATTR_SIZE);
return false;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index ee2e1dd64f25..6c7f6a649d27 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -633,7 +633,7 @@ static void f2fs_ra_node_pages(struct page *parent, int start, int n)
/* Then, try readahead for siblings of the desired node */
end = start + n;
- end = min(end, NIDS_PER_BLOCK);
+ end = min(end, (int)NIDS_PER_BLOCK);
for (i = start; i < end; i++) {
nid = get_nid(parent, i, false);
f2fs_ra_node_page(sbi, nid);
@@ -1467,7 +1467,8 @@ page_hit:
ofs_of_node(page), cpver_of_node(page),
next_blkaddr_of_node(page));
set_sbi_flag(sbi, SBI_NEED_FSCK);
- err = -EINVAL;
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
+ err = -EFSCORRUPTED;
out_err:
ClearPageUptodate(page);
out_put_err:
@@ -2389,7 +2390,7 @@ static int scan_nat_page(struct f2fs_sb_info *sbi,
blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
if (blk_addr == NEW_ADDR)
- return -EINVAL;
+ return -EFSCORRUPTED;
if (blk_addr == NULL_ADDR) {
add_free_nid(sbi, start_nid, true, true);
@@ -2504,7 +2505,14 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
if (ret) {
f2fs_up_read(&nm_i->nat_tree_lock);
- f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
+
+ if (ret == -EFSCORRUPTED) {
+ f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi,
+ ERROR_INCONSISTENT_NAT);
+ }
+
return ret;
}
}
@@ -2743,7 +2751,9 @@ recover_xnid:
f2fs_update_inode_page(inode);
/* 3: update and set xattr node page dirty */
- memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE);
+ if (page)
+ memcpy(F2FS_NODE(xpage), F2FS_NODE(page),
+ VALID_XATTR_BLOCK_SIZE);
set_page_dirty(xpage);
f2fs_put_page(xpage, 1);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d05b41608fc0..727d016318f9 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -4910,22 +4910,31 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
}
/*
- * The write pointer matches with the valid blocks or
- * already points to the end of the zone.
+ * When safely unmounted in the previous mount, we can trust write
+ * pointers. Otherwise, finish zones.
*/
- if ((last_valid_block + 1 == wp_block) ||
- (zone->wp == zone->start + zone->len))
- return 0;
-
- if (last_valid_block + 1 == zone_block) {
+ if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
/*
- * If there is no valid block in the zone and if write pointer
- * is not at zone start, reset the write pointer.
+ * The write pointer matches with the valid blocks or
+ * already points to the end of the zone.
*/
- f2fs_notice(sbi,
- "Zone without valid block has non-zero write "
- "pointer. Reset the write pointer: wp[0x%x,0x%x]",
- wp_segno, wp_blkoff);
+ if ((last_valid_block + 1 == wp_block) ||
+ (zone->wp == zone->start + zone->len))
+ return 0;
+ }
+
+ if (last_valid_block + 1 == zone_block) {
+ if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+ /*
+ * If there is no valid block in the zone and if write
+ * pointer is not at zone start, reset the write
+ * pointer.
+ */
+ f2fs_notice(sbi,
+ "Zone without valid block has non-zero write "
+ "pointer. Reset the write pointer: wp[0x%x,0x%x]",
+ wp_segno, wp_blkoff);
+ }
ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block,
zone->len >> log_sectors_per_block);
if (ret)
@@ -4935,18 +4944,20 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
return ret;
}
- /*
- * If there are valid blocks and the write pointer doesn't
- * match with them, we need to report the inconsistency and
- * fill the zone till the end to close the zone. This inconsistency
- * does not cause write error because the zone will not be selected
- * for write operation until it get discarded.
- */
- f2fs_notice(sbi, "Valid blocks are not aligned with write pointer: "
- "valid block[0x%x,0x%x] wp[0x%x,0x%x]",
- GET_SEGNO(sbi, last_valid_block),
- GET_BLKOFF_FROM_SEG0(sbi, last_valid_block),
- wp_segno, wp_blkoff);
+ if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+ /*
+ * If there are valid blocks and the write pointer doesn't match
+ * with them, we need to report the inconsistency and fill
+ * the zone till the end to close the zone. This inconsistency
+ * does not cause write error because the zone will not be
+ * selected for write operation until it get discarded.
+ */
+ f2fs_notice(sbi, "Valid blocks are not aligned with write "
+ "pointer: valid block[0x%x,0x%x] wp[0x%x,0x%x]",
+ GET_SEGNO(sbi, last_valid_block),
+ GET_BLKOFF_FROM_SEG0(sbi, last_valid_block),
+ wp_segno, wp_blkoff);
+ }
ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
zone->start, zone->len, GFP_NOFS);
@@ -5020,18 +5031,27 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ)
return 0;
- wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block);
- wp_segno = GET_SEGNO(sbi, wp_block);
- wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
- wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0);
-
- if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff &&
- wp_sector_off == 0)
- return 0;
+ /*
+ * When safely unmounted in the previous mount, we could use current
+ * segments. Otherwise, allocate new sections.
+ */
+ if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+ wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block);
+ wp_segno = GET_SEGNO(sbi, wp_block);
+ wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
+ wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0);
+
+ if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff &&
+ wp_sector_off == 0)
+ return 0;
- f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: "
- "curseg[0x%x,0x%x] wp[0x%x,0x%x]",
- type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff);
+ f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: "
+ "curseg[0x%x,0x%x] wp[0x%x,0x%x]", type, cs->segno,
+ cs->next_blkoff, wp_segno, wp_blkoff);
+ } else {
+ f2fs_notice(sbi, "Not successfully unmounted in the previous "
+ "mount");
+ }
f2fs_notice(sbi, "Assign new section to curseg[%d]: "
"curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 2ca8fb5d0dc4..8129be788bd5 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -108,11 +108,11 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
((sbi)->segs_per_sec - ((sbi)->unusable_blocks_per_sec >>\
(sbi)->log_blocks_per_seg))
#define GET_SEC_FROM_SEG(sbi, segno) \
- (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec)
+ (((segno) == -1) ? -1 : (segno) / (sbi)->segs_per_sec)
#define GET_SEG_FROM_SEC(sbi, secno) \
((secno) * (sbi)->segs_per_sec)
#define GET_ZONE_FROM_SEC(sbi, secno) \
- (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone)
+ (((secno) == -1) ? -1 : (secno) / (sbi)->secs_per_zone)
#define GET_ZONE_FROM_SEG(sbi, segno) \
GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno))
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index be17d77513d5..033af907c3b1 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -83,11 +83,26 @@ void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
#endif
/* f2fs-wide shrinker description */
-static struct shrinker f2fs_shrinker_info = {
- .scan_objects = f2fs_shrink_scan,
- .count_objects = f2fs_shrink_count,
- .seeks = DEFAULT_SEEKS,
-};
+static struct shrinker *f2fs_shrinker_info;
+
+static int __init f2fs_init_shrinker(void)
+{
+ f2fs_shrinker_info = shrinker_alloc(0, "f2fs-shrinker");
+ if (!f2fs_shrinker_info)
+ return -ENOMEM;
+
+ f2fs_shrinker_info->count_objects = f2fs_shrink_count;
+ f2fs_shrinker_info->scan_objects = f2fs_shrink_scan;
+
+ shrinker_register(f2fs_shrinker_info);
+
+ return 0;
+}
+
+static void f2fs_exit_shrinker(void)
+{
+ shrinker_free(f2fs_shrinker_info);
+}
enum {
Opt_gc_background,
@@ -547,6 +562,29 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
+static bool is_compress_extension_exist(struct f2fs_sb_info *sbi,
+ const char *new_ext, bool is_ext)
+{
+ unsigned char (*ext)[F2FS_EXTENSION_LEN];
+ int ext_cnt;
+ int i;
+
+ if (is_ext) {
+ ext = F2FS_OPTION(sbi).extensions;
+ ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+ } else {
+ ext = F2FS_OPTION(sbi).noextensions;
+ ext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
+ }
+
+ for (i = 0; i < ext_cnt; i++) {
+ if (!strcasecmp(new_ext, ext[i]))
+ return true;
+ }
+
+ return false;
+}
+
/*
* 1. The same extension name cannot not appear in both compress and non-compress extension
* at the same time.
@@ -1149,6 +1187,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
return -EINVAL;
}
+ if (is_compress_extension_exist(sbi, name, true)) {
+ kfree(name);
+ break;
+ }
+
strcpy(ext[ext_cnt], name);
F2FS_OPTION(sbi).compress_ext_cnt++;
kfree(name);
@@ -1173,6 +1216,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
return -EINVAL;
}
+ if (is_compress_extension_exist(sbi, name, false)) {
+ kfree(name);
+ break;
+ }
+
strcpy(noext[noext_cnt], name);
F2FS_OPTION(sbi).nocompress_ext_cnt++;
kfree(name);
@@ -1629,7 +1677,7 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
- if (err) {
+ if (err || f2fs_cp_error(sbi)) {
truncate_inode_pages_final(NODE_MAPPING(sbi));
truncate_inode_pages_final(META_MAPPING(sbi));
}
@@ -2286,9 +2334,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
unsigned long old_sb_flags;
int err;
bool need_restart_gc = false, need_stop_gc = false;
- bool need_restart_ckpt = false, need_stop_ckpt = false;
bool need_restart_flush = false, need_stop_flush = false;
bool need_restart_discard = false, need_stop_discard = false;
+ bool need_enable_checkpoint = false, need_disable_checkpoint = false;
bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE);
bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE);
bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT);
@@ -2452,24 +2500,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
clear_sbi_flag(sbi, SBI_IS_CLOSE);
}
- if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) ||
- !test_opt(sbi, MERGE_CHECKPOINT)) {
- f2fs_stop_ckpt_thread(sbi);
- need_restart_ckpt = true;
- } else {
- /* Flush if the prevous checkpoint, if exists. */
- f2fs_flush_ckpt_thread(sbi);
-
- err = f2fs_start_ckpt_thread(sbi);
- if (err) {
- f2fs_err(sbi,
- "Failed to start F2FS issue_checkpoint_thread (%d)",
- err);
- goto restore_gc;
- }
- need_stop_ckpt = true;
- }
-
/*
* We stop issue flush thread if FS is mounted as RO
* or if flush_merge is not passed in mount option.
@@ -2481,7 +2511,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
} else {
err = f2fs_create_flush_cmd_control(sbi);
if (err)
- goto restore_ckpt;
+ goto restore_gc;
need_stop_flush = true;
}
@@ -2503,8 +2533,31 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
err = f2fs_disable_checkpoint(sbi);
if (err)
goto restore_discard;
+ need_enable_checkpoint = true;
} else {
f2fs_enable_checkpoint(sbi);
+ need_disable_checkpoint = true;
+ }
+ }
+
+ /*
+ * Place this routine at the end, since a new checkpoint would be
+ * triggered while remount and we need to take care of it before
+ * returning from remount.
+ */
+ if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) ||
+ !test_opt(sbi, MERGE_CHECKPOINT)) {
+ f2fs_stop_ckpt_thread(sbi);
+ } else {
+ /* Flush if the prevous checkpoint, if exists. */
+ f2fs_flush_ckpt_thread(sbi);
+
+ err = f2fs_start_ckpt_thread(sbi);
+ if (err) {
+ f2fs_err(sbi,
+ "Failed to start F2FS issue_checkpoint_thread (%d)",
+ err);
+ goto restore_checkpoint;
}
}
@@ -2522,6 +2575,13 @@ skip:
adjust_unusable_cap_perc(sbi);
*flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
return 0;
+restore_checkpoint:
+ if (need_enable_checkpoint) {
+ f2fs_enable_checkpoint(sbi);
+ } else if (need_disable_checkpoint) {
+ if (f2fs_disable_checkpoint(sbi))
+ f2fs_warn(sbi, "checkpoint has not been disabled");
+ }
restore_discard:
if (need_restart_discard) {
if (f2fs_start_discard_thread(sbi))
@@ -2537,13 +2597,6 @@ restore_flush:
clear_opt(sbi, FLUSH_MERGE);
f2fs_destroy_flush_cmd_control(sbi, false);
}
-restore_ckpt:
- if (need_restart_ckpt) {
- if (f2fs_start_ckpt_thread(sbi))
- f2fs_warn(sbi, "background ckpt thread has stopped");
- } else if (need_stop_ckpt) {
- f2fs_stop_ckpt_thread(sbi);
- }
restore_gc:
if (need_restart_gc) {
if (f2fs_start_gc_thread(sbi))
@@ -3277,6 +3330,7 @@ static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid,
}
static const struct export_operations f2fs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = f2fs_fh_to_dentry,
.fh_to_parent = f2fs_fh_to_parent,
.get_parent = f2fs_get_parent,
@@ -3464,7 +3518,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
return -EFSCORRUPTED;
}
- /* Currently, support 512/1024/2048/4096 bytes sector size */
+ /* Currently, support 512/1024/2048/4096/16K bytes sector size */
if (le32_to_cpu(raw_super->log_sectorsize) >
F2FS_MAX_LOG_SECTOR_SIZE ||
le32_to_cpu(raw_super->log_sectorsize) <
@@ -4911,7 +4965,7 @@ static int __init init_f2fs_fs(void)
int err;
if (PAGE_SIZE != F2FS_BLKSIZE) {
- printk("F2FS not supported on PAGE_SIZE(%lu) != %d\n",
+ printk("F2FS not supported on PAGE_SIZE(%lu) != BLOCK_SIZE(%lu)\n",
PAGE_SIZE, F2FS_BLKSIZE);
return -EINVAL;
}
@@ -4940,7 +4994,7 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_sysfs();
if (err)
goto free_garbage_collection_cache;
- err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker");
+ err = f2fs_init_shrinker();
if (err)
goto free_sysfs;
err = register_filesystem(&f2fs_fs_type);
@@ -4985,7 +5039,7 @@ free_root_stats:
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
free_shrinker:
- unregister_shrinker(&f2fs_shrinker_info);
+ f2fs_exit_shrinker();
free_sysfs:
f2fs_exit_sysfs();
free_garbage_collection_cache:
@@ -5017,7 +5071,7 @@ static void __exit exit_f2fs_fs(void)
f2fs_destroy_post_read_processing();
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
- unregister_shrinker(&f2fs_shrinker_info);
+ f2fs_exit_shrinker();
f2fs_exit_sysfs();
f2fs_destroy_garbage_collection_cache();
f2fs_destroy_extent_cache();
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 4314456854f6..47e88b4d4e7d 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -364,10 +364,10 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
*xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name);
if (!*xe) {
- f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
+ f2fs_err(F2FS_I_SB(inode), "lookup inode (%lu) has corrupted xattr",
inode->i_ino);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
- err = -EFSCORRUPTED;
+ err = -ENODATA;
f2fs_handle_error(F2FS_I_SB(inode),
ERROR_CORRUPTED_XATTR);
goto out;
@@ -584,13 +584,12 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if ((void *)(entry) + sizeof(__u32) > last_base_addr ||
(void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) {
- f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
+ f2fs_err(F2FS_I_SB(inode), "list inode (%lu) has corrupted xattr",
inode->i_ino);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
- error = -EFSCORRUPTED;
f2fs_handle_error(F2FS_I_SB(inode),
ERROR_CORRUPTED_XATTR);
- goto cleanup;
+ break;
}
if (!prefix)
@@ -650,7 +649,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (size > MAX_VALUE_LEN(inode))
return -E2BIG;
-
+retry:
error = read_all_xattrs(inode, ipage, &base_addr);
if (error)
return error;
@@ -660,7 +659,14 @@ static int __f2fs_setxattr(struct inode *inode, int index,
/* find entry with wanted name. */
here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name);
if (!here) {
- f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
+ if (!F2FS_I(inode)->i_xattr_nid) {
+ f2fs_notice(F2FS_I_SB(inode),
+ "recover xattr in inode (%lu)", inode->i_ino);
+ f2fs_recover_xattr_data(inode, NULL);
+ kfree(base_addr);
+ goto retry;
+ }
+ f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr",
inode->i_ino);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
error = -EFSCORRUPTED;
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index 3626eb585a98..c52e63e10d35 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -279,6 +279,7 @@ static struct dentry *fat_get_parent(struct dentry *child_dir)
}
const struct export_operations fat_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = fat_fh_to_dentry,
.fh_to_parent = fat_fh_to_parent,
.get_parent = fat_get_parent,
diff --git a/fs/fcntl.c b/fs/fcntl.c
index e871009f6c88..c80a6acad742 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -844,7 +844,7 @@ int send_sigurg(struct fown_struct *fown)
}
static DEFINE_SPINLOCK(fasync_lock);
-static struct kmem_cache *fasync_cache __read_mostly;
+static struct kmem_cache *fasync_cache __ro_after_init;
static void fasync_free_rcu(struct rcu_head *head)
{
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 6ea8d35a9382..18b3ba8dc8ea 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -26,12 +26,8 @@ static long do_sys_name_to_handle(const struct path *path,
/*
* We need to make sure whether the file system support decoding of
* the file handle if decodeable file handle was requested.
- * Otherwise, even empty export_operations are sufficient to opt-in
- * to encoding FIDs.
*/
- if (!path->dentry->d_sb->s_export_op ||
- (!(fh_flags & EXPORT_FH_FID) &&
- !path->dentry->d_sb->s_export_op->fh_to_dentry))
+ if (!exportfs_can_encode_fh(path->dentry->d_sb->s_export_op, fh_flags))
return -EOPNOTSUPP;
if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
diff --git a/fs/file_table.c b/fs/file_table.c
index fa92743ba6a9..de4a2915bfd4 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -40,7 +40,7 @@ static struct files_stat_struct files_stat = {
};
/* SLAB cache for file structures */
-static struct kmem_cache *filp_cachep __read_mostly;
+static struct kmem_cache *filp_cachep __ro_after_init;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 310d73e254df..e6e2a2185e7c 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -76,6 +76,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
{
struct vxfs_sb_info *infp = VXFS_SBI(dentry->d_sb);
struct vxfs_sb *raw_sb = infp->vsi_raw;
+ u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev);
bufp->f_type = VXFS_SUPER_MAGIC;
bufp->f_bsize = dentry->d_sb->s_blocksize;
@@ -84,6 +85,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
bufp->f_bavail = 0;
bufp->f_files = 0;
bufp->f_ffree = fs32_to_cpu(infp, raw_sb->vs_ifree);
+ bufp->f_fsid = u64_to_fsid(id);
bufp->f_namelen = VXFS_NAMELEN;
return 0;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index caa8121ad99c..74d4f09d5827 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -999,7 +999,7 @@ static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len,
}
*max_len = len;
- return parent ? 0x82 : 0x81;
+ return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN;
}
static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
@@ -1007,7 +1007,8 @@ static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
{
struct fuse_inode_handle handle;
- if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3)
+ if ((fh_type != FILEID_INO64_GEN &&
+ fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3)
return NULL;
handle.nodeid = (u64) fid->raw[0] << 32;
@@ -1021,7 +1022,7 @@ static struct dentry *fuse_fh_to_parent(struct super_block *sb,
{
struct fuse_inode_handle parent;
- if (fh_type != 0x82 || fh_len < 6)
+ if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6)
return NULL;
parent.nodeid = (u64) fid->raw[3] << 32;
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index d4deb2b19959..82f5b09c04e6 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -11,9 +11,9 @@
#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
-extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu);
-extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
- struct posix_acl *acl, int type);
+struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu);
+int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct posix_acl *acl, int type);
#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index c26d48355cc2..9611bfceda4b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -130,7 +130,7 @@ static int __gfs2_jdata_write_folio(struct folio *folio,
if (folio_test_checked(folio)) {
folio_clear_checked(folio);
if (!folio_buffers(folio)) {
- folio_create_empty_buffers(folio,
+ create_empty_buffers(folio,
inode->i_sb->s_blocksize,
BIT(BH_Dirty)|BIT(BH_Uptodate));
}
@@ -155,7 +155,7 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
+ if (gfs2_assert_withdraw(sdp, ip->i_gl->gl_state == LM_ST_EXCLUSIVE))
goto out;
if (folio_test_checked(folio) || current->journal_info)
goto out_ignore;
@@ -214,12 +214,12 @@ static int gfs2_write_jdata_batch(struct address_space *mapping,
unsigned nrblocks;
int i;
int ret;
- int nr_pages = 0;
+ size_t size = 0;
int nr_folios = folio_batch_count(fbatch);
for (i = 0; i < nr_folios; i++)
- nr_pages += folio_nr_pages(fbatch->folios[i]);
- nrblocks = nr_pages * (PAGE_SIZE >> inode->i_blkbits);
+ size += folio_size(fbatch->folios[i]);
+ nrblocks = size >> inode->i_blkbits;
ret = gfs2_trans_begin(sdp, nrblocks, nrblocks);
if (ret < 0)
@@ -403,27 +403,27 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
}
/**
- * stuffed_readpage - Fill in a Linux page with stuffed file data
+ * stuffed_readpage - Fill in a Linux folio with stuffed file data
* @ip: the inode
- * @page: the page
+ * @folio: the folio
*
* Returns: errno
*/
-static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
+static int stuffed_readpage(struct gfs2_inode *ip, struct folio *folio)
{
struct buffer_head *dibh;
- u64 dsize = i_size_read(&ip->i_inode);
- void *kaddr;
+ size_t i_size = i_size_read(&ip->i_inode);
+ void *data;
int error;
/*
* Due to the order of unstuffing files and ->fault(), we can be
- * asked for a zero page in the case of a stuffed file being extended,
+ * asked for a zero folio in the case of a stuffed file being extended,
* so we need to supply one here. It doesn't happen often.
*/
- if (unlikely(page->index)) {
- zero_user(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
+ if (unlikely(folio->index)) {
+ folio_zero_range(folio, 0, folio_size(folio));
+ folio_mark_uptodate(folio);
return 0;
}
@@ -431,13 +431,11 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
if (error)
return error;
- kaddr = kmap_local_page(page);
- memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
- memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
- kunmap_local(kaddr);
- flush_dcache_page(page);
+ data = dibh->b_data + sizeof(struct gfs2_dinode);
+ memcpy_to_folio(folio, 0, data, i_size);
+ folio_zero_range(folio, i_size, folio_size(folio) - i_size);
brelse(dibh);
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
return 0;
}
@@ -458,7 +456,7 @@ static int gfs2_read_folio(struct file *file, struct folio *folio)
(i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) {
error = iomap_read_folio(folio, &gfs2_iomap_ops);
} else if (gfs2_is_stuffed(ip)) {
- error = stuffed_readpage(ip, &folio->page);
+ error = stuffed_readpage(ip, folio);
folio_unlock(folio);
} else {
error = mpage_read_folio(folio, gfs2_block_map);
@@ -479,31 +477,29 @@ static int gfs2_read_folio(struct file *file, struct folio *folio)
*
*/
-int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
- unsigned size)
+ssize_t gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
+ size_t size)
{
struct address_space *mapping = ip->i_inode.i_mapping;
unsigned long index = *pos >> PAGE_SHIFT;
- unsigned offset = *pos & (PAGE_SIZE - 1);
- unsigned copied = 0;
- unsigned amt;
- struct page *page;
+ size_t copied = 0;
do {
- page = read_cache_page(mapping, index, gfs2_read_folio, NULL);
- if (IS_ERR(page)) {
- if (PTR_ERR(page) == -EINTR)
+ size_t offset, chunk;
+ struct folio *folio;
+
+ folio = read_cache_folio(mapping, index, gfs2_read_folio, NULL);
+ if (IS_ERR(folio)) {
+ if (PTR_ERR(folio) == -EINTR)
continue;
- return PTR_ERR(page);
+ return PTR_ERR(folio);
}
- amt = size - copied;
- if (offset + size > PAGE_SIZE)
- amt = PAGE_SIZE - offset;
- memcpy_from_page(buf + copied, page, offset, amt);
- put_page(page);
- copied += amt;
- index++;
- offset = 0;
+ offset = *pos + copied - folio_pos(folio);
+ chunk = min(size - copied, folio_size(folio) - offset);
+ memcpy_from_folio(buf + copied, folio, offset, chunk);
+ index = folio_next_index(folio);
+ folio_put(folio);
+ copied += chunk;
} while(copied < size);
(*pos) += size;
return size;
diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h
index f08322ef41cf..a10c4334d248 100644
--- a/fs/gfs2/aops.h
+++ b/fs/gfs2/aops.h
@@ -8,8 +8,8 @@
#include "incore.h"
-extern void adjust_fs_space(struct inode *inode);
-extern void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio,
- size_t from, size_t len);
+void adjust_fs_space(struct inode *inode);
+void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio,
+ size_t from, size_t len);
#endif /* __AOPS_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 011cd992e0e6..d9ccfd27e4f1 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -43,53 +43,51 @@ struct metapath {
static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
/**
- * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
+ * gfs2_unstuffer_folio - unstuff a stuffed inode into a block cached by a folio
* @ip: the inode
* @dibh: the dinode buffer
* @block: the block number that was allocated
- * @page: The (optional) page. This is looked up if @page is NULL
+ * @folio: The folio.
*
* Returns: errno
*/
-
-static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
- u64 block, struct page *page)
+static int gfs2_unstuffer_folio(struct gfs2_inode *ip, struct buffer_head *dibh,
+ u64 block, struct folio *folio)
{
struct inode *inode = &ip->i_inode;
- if (!PageUptodate(page)) {
- void *kaddr = kmap(page);
+ if (!folio_test_uptodate(folio)) {
+ void *kaddr = kmap_local_folio(folio, 0);
u64 dsize = i_size_read(inode);
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
- memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
- kunmap(page);
+ memset(kaddr + dsize, 0, folio_size(folio) - dsize);
+ kunmap_local(kaddr);
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
if (gfs2_is_jdata(ip)) {
- struct buffer_head *bh;
+ struct buffer_head *bh = folio_buffers(folio);
- if (!page_has_buffers(page))
- create_empty_buffers(page, BIT(inode->i_blkbits),
- BIT(BH_Uptodate));
+ if (!bh)
+ bh = create_empty_buffers(folio,
+ BIT(inode->i_blkbits), BIT(BH_Uptodate));
- bh = page_buffers(page);
if (!buffer_mapped(bh))
map_bh(bh, inode->i_sb, block);
set_buffer_uptodate(bh);
gfs2_trans_add_data(ip->i_gl, bh);
} else {
- set_page_dirty(page);
+ folio_mark_dirty(folio);
gfs2_ordered_add_inode(ip);
}
return 0;
}
-static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
+static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct folio *folio)
{
struct buffer_head *bh, *dibh;
struct gfs2_dinode *di;
@@ -106,7 +104,7 @@ static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
and write it out to disk */
unsigned int n = 1;
- error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+ error = gfs2_alloc_blocks(ip, &block, &n, 0);
if (error)
goto out_brelse;
if (isdir) {
@@ -118,7 +116,7 @@ static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
dibh, sizeof(struct gfs2_dinode));
brelse(bh);
} else {
- error = gfs2_unstuffer_page(ip, dibh, block, page);
+ error = gfs2_unstuffer_folio(ip, dibh, block, folio);
if (error)
goto out_brelse;
}
@@ -157,17 +155,17 @@ out_brelse:
int gfs2_unstuff_dinode(struct gfs2_inode *ip)
{
struct inode *inode = &ip->i_inode;
- struct page *page;
+ struct folio *folio;
int error;
down_write(&ip->i_rw_mutex);
- page = grab_cache_page(inode->i_mapping, 0);
- error = -ENOMEM;
- if (!page)
+ folio = filemap_grab_folio(inode->i_mapping, 0);
+ error = PTR_ERR(folio);
+ if (IS_ERR(folio))
goto out;
- error = __gfs2_unstuff_inode(ip, page);
- unlock_page(page);
- put_page(page);
+ error = __gfs2_unstuff_inode(ip, folio);
+ folio_unlock(folio);
+ folio_put(folio);
out:
up_write(&ip->i_rw_mutex);
return error;
@@ -317,6 +315,12 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
}
}
+static inline struct buffer_head *
+metapath_dibh(struct metapath *mp)
+{
+ return mp->mp_bh[0];
+}
+
static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
unsigned int x, unsigned int h)
{
@@ -415,13 +419,12 @@ static void release_metapath(struct metapath *mp)
* gfs2_extent_length - Returns length of an extent of blocks
* @bh: The metadata block
* @ptr: Current position in @bh
- * @limit: Max extent length to return
* @eob: Set to 1 if we hit "end of block"
*
* Returns: The length of the extent (minimum of one block)
*/
-static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
+static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, int *eob)
{
const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
const __be64 *first = ptr;
@@ -660,7 +663,7 @@ static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct buffer_head *dibh = mp->mp_bh[0];
+ struct buffer_head *dibh = metapath_dibh(mp);
u64 bn;
unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
size_t dblks = iomap->length >> inode->i_blkbits;
@@ -702,7 +705,7 @@ static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
i = mp->mp_aheight;
do {
n = blks - alloced;
- ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
+ ret = gfs2_alloc_blocks(ip, &bn, &n, 0);
if (ret)
goto out;
alloced += n;
@@ -913,7 +916,7 @@ unstuff:
goto do_alloc;
bh = mp->mp_bh[ip->i_height - 1];
- len = gfs2_extent_length(bh, ptr, len, &eob);
+ len = gfs2_extent_length(bh, ptr, &eob);
iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
iomap->length = len << inode->i_blkbits;
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index e5b7d17131ed..4e8b1e8ebdf3 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -46,24 +46,24 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
extern const struct iomap_ops gfs2_iomap_ops;
extern const struct iomap_writeback_ops gfs2_writeback_ops;
-extern int gfs2_unstuff_dinode(struct gfs2_inode *ip);
-extern int gfs2_block_map(struct inode *inode, sector_t lblock,
- struct buffer_head *bh, int create);
-extern int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
- struct iomap *iomap);
-extern int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
- struct iomap *iomap);
-extern int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
- unsigned int *extlen);
-extern int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
- unsigned *extlen, bool *new);
-extern int gfs2_setattr_size(struct inode *inode, u64 size);
-extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
-extern int gfs2_file_dealloc(struct gfs2_inode *ip);
-extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
- unsigned int len);
-extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd);
-extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd);
-extern int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length);
+int gfs2_unstuff_dinode(struct gfs2_inode *ip);
+int gfs2_block_map(struct inode *inode, sector_t lblock,
+ struct buffer_head *bh, int create);
+int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *iomap);
+int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *iomap);
+int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
+ unsigned int *extlen);
+int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
+ unsigned *extlen, bool *new);
+int gfs2_setattr_size(struct inode *inode, u64 size);
+int gfs2_truncatei_resume(struct gfs2_inode *ip);
+int gfs2_file_dealloc(struct gfs2_inode *ip);
+int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+ unsigned int len);
+int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd);
+void gfs2_free_journal_extents(struct gfs2_jdesc *jd);
+int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length);
#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 61ddd03ea111..560e4624c09f 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -868,7 +868,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
struct gfs2_dirent *dent;
struct timespec64 tv = current_time(inode);
- error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
+ error = gfs2_alloc_blocks(ip, &bn, &n, 0);
if (error)
return NULL;
bh = gfs2_meta_new(ip->i_gl, bn);
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 5b76480c17c9..25a857c78b53 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -23,32 +23,32 @@ struct gfs2_diradd {
int save_loc;
};
-extern struct inode *gfs2_dir_search(struct inode *dir,
- const struct qstr *filename,
- bool fail_on_exist);
-extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
- const struct gfs2_inode *ip);
-extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
- const struct gfs2_inode *ip, struct gfs2_diradd *da);
+struct inode *gfs2_dir_search(struct inode *dir,
+ const struct qstr *filename,
+ bool fail_on_exist);
+int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
+ const struct gfs2_inode *ip);
+int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
+ const struct gfs2_inode *ip, struct gfs2_diradd *da);
static inline void gfs2_dir_no_add(struct gfs2_diradd *da)
{
brelse(da->bh);
da->bh = NULL;
}
-extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
-extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
- struct file_ra_state *f_ra);
-extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
- const struct gfs2_inode *nip, unsigned int new_type);
+int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
+int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
+ struct file_ra_state *f_ra);
+int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+ const struct gfs2_inode *nip, unsigned int new_type);
-extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
+int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
-extern int gfs2_diradd_alloc_required(struct inode *dir,
- const struct qstr *filename,
- struct gfs2_diradd *da);
-extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
- struct buffer_head **bhp);
-extern void gfs2_dir_hash_inval(struct gfs2_inode *ip);
+int gfs2_diradd_alloc_required(struct inode *dir,
+ const struct qstr *filename,
+ struct gfs2_diradd *da);
+int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+ struct buffer_head **bhp);
+void gfs2_dir_hash_inval(struct gfs2_inode *ip);
static inline u32 gfs2_disk_hash(const char *data, int len)
{
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index f2700477a300..4b66efc1a82a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -418,7 +418,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
struct inode *inode = file_inode(vmf->vma->vm_file);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct gfs2_alloc_parms ap = { .aflags = 0, };
+ struct gfs2_alloc_parms ap = {};
u64 offset = page_offset(page);
unsigned int data_blocks, ind_blocks, rblocks;
vm_fault_t ret = VM_FAULT_LOCKED;
@@ -1120,14 +1120,16 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ret)
goto out_unlock;
- ret = file_update_time(file);
- if (ret)
- goto out_unlock;
-
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = file->f_mapping;
ssize_t buffered, ret2;
+ /*
+ * Note that under direct I/O, we don't allow and inode
+ * timestamp updates, so we're not calling file_update_time()
+ * here.
+ */
+
ret = gfs2_file_direct_write(iocb, from, &gh);
if (ret < 0 || !iov_iter_count(from))
goto out_unlock;
@@ -1154,6 +1156,10 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (!ret || ret2 > 0)
ret += ret2;
} else {
+ ret = file_update_time(file);
+ if (ret)
+ goto out_unlock;
+
ret = gfs2_file_buffered_write(iocb, from, &gh);
if (likely(ret > 0))
ret = generic_write_sync(iocb, ret);
@@ -1245,7 +1251,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
struct inode *inode = file_inode(file);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_alloc_parms ap = { .aflags = 0, };
+ struct gfs2_alloc_parms ap = {};
unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
loff_t bytes, max_bytes, max_blks;
int error;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 3772a5d9e85c..d6bf1f8c25dc 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1524,7 +1524,6 @@ fail:
return;
}
list_add_tail(&gh->gh_list, insert_pt);
- gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
spin_unlock(&gl->gl_lockref.lock);
if (sdp->sd_lockstruct.ls_ops->lm_cancel)
sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
@@ -2041,11 +2040,7 @@ static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink,
return vfs_pressure_ratio(atomic_read(&lru_count));
}
-static struct shrinker glock_shrinker = {
- .seeks = DEFAULT_SEEKS,
- .count_objects = gfs2_glock_shrink_count,
- .scan_objects = gfs2_glock_shrink_scan,
-};
+static struct shrinker *glock_shrinker;
/**
* glock_hash_walk - Call a function for glock in a hash bucket
@@ -2465,13 +2460,18 @@ int __init gfs2_glock_init(void)
return -ENOMEM;
}
- ret = register_shrinker(&glock_shrinker, "gfs2-glock");
- if (ret) {
+ glock_shrinker = shrinker_alloc(0, "gfs2-glock");
+ if (!glock_shrinker) {
destroy_workqueue(glock_workqueue);
rhashtable_destroy(&gl_hash_table);
- return ret;
+ return -ENOMEM;
}
+ glock_shrinker->count_objects = gfs2_glock_shrink_count;
+ glock_shrinker->scan_objects = gfs2_glock_shrink_scan;
+
+ shrinker_register(glock_shrinker);
+
for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++)
init_waitqueue_head(glock_wait_table + i);
@@ -2480,7 +2480,7 @@ int __init gfs2_glock_init(void)
void gfs2_glock_exit(void)
{
- unregister_shrinker(&glock_shrinker);
+ shrinker_free(glock_shrinker);
rhashtable_destroy(&gl_hash_table);
destroy_workqueue(glock_workqueue);
}
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c8685ca7d2a2..61197598abfd 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -156,21 +156,6 @@ out:
return gh;
}
-static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
-{
- return gl->gl_state == LM_ST_EXCLUSIVE;
-}
-
-static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
-{
- return gl->gl_state == LM_ST_DEFERRED;
-}
-
-static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
-{
- return gl->gl_state == LM_ST_SHARED;
-}
-
static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
{
if (gl->gl_ops->go_flags & GLOF_ASPACE) {
@@ -181,40 +166,40 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
return NULL;
}
-extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
- const struct gfs2_glock_operations *glops,
- int create, struct gfs2_glock **glp);
-extern struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl);
-extern void gfs2_glock_put(struct gfs2_glock *gl);
-extern void gfs2_glock_queue_put(struct gfs2_glock *gl);
+int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
+ const struct gfs2_glock_operations *glops,
+ int create, struct gfs2_glock **glp);
+struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl);
+void gfs2_glock_put(struct gfs2_glock *gl);
+void gfs2_glock_queue_put(struct gfs2_glock *gl);
-extern void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
- u16 flags, struct gfs2_holder *gh,
- unsigned long ip);
+void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
+ u16 flags, struct gfs2_holder *gh,
+ unsigned long ip);
static inline void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
u16 flags, struct gfs2_holder *gh) {
__gfs2_holder_init(gl, state, flags, gh, _RET_IP_);
}
-extern void gfs2_holder_reinit(unsigned int state, u16 flags,
- struct gfs2_holder *gh);
-extern void gfs2_holder_uninit(struct gfs2_holder *gh);
-extern int gfs2_glock_nq(struct gfs2_holder *gh);
-extern int gfs2_glock_poll(struct gfs2_holder *gh);
-extern int gfs2_instantiate(struct gfs2_holder *gh);
-extern int gfs2_glock_holder_ready(struct gfs2_holder *gh);
-extern int gfs2_glock_wait(struct gfs2_holder *gh);
-extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
-extern void gfs2_glock_dq(struct gfs2_holder *gh);
-extern void gfs2_glock_dq_wait(struct gfs2_holder *gh);
-extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
-extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
- const struct gfs2_glock_operations *glops,
- unsigned int state, u16 flags,
- struct gfs2_holder *gh);
-extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
-extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
-extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl,
+void gfs2_holder_reinit(unsigned int state, u16 flags,
+ struct gfs2_holder *gh);
+void gfs2_holder_uninit(struct gfs2_holder *gh);
+int gfs2_glock_nq(struct gfs2_holder *gh);
+int gfs2_glock_poll(struct gfs2_holder *gh);
+int gfs2_instantiate(struct gfs2_holder *gh);
+int gfs2_glock_holder_ready(struct gfs2_holder *gh);
+int gfs2_glock_wait(struct gfs2_holder *gh);
+int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_glock_dq(struct gfs2_holder *gh);
+void gfs2_glock_dq_wait(struct gfs2_holder *gh);
+void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
+int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
+ const struct gfs2_glock_operations *glops,
+ unsigned int state, u16 flags,
+ struct gfs2_holder *gh);
+int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl,
bool fsid);
#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { \
gfs2_dump_glock(NULL, gl, true); \
@@ -228,7 +213,7 @@ extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl,
gfs2_assert_withdraw((gl)->gl_name.ln_sbd, (x)); } } \
while (0)
-extern __printf(2, 3)
+__printf(2, 3)
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
/**
@@ -256,27 +241,27 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
return error;
}
-extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
-extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
-extern bool gfs2_queue_try_to_evict(struct gfs2_glock *gl);
-extern void gfs2_cancel_delete_work(struct gfs2_glock *gl);
-extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp);
-extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
-extern void gfs2_gl_dq_holders(struct gfs2_sbd *sdp);
-extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
-extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
-extern void gfs2_glock_free(struct gfs2_glock *gl);
+void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
+void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
+bool gfs2_queue_try_to_evict(struct gfs2_glock *gl);
+void gfs2_cancel_delete_work(struct gfs2_glock *gl);
+void gfs2_flush_delete_work(struct gfs2_sbd *sdp);
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
+void gfs2_gl_dq_holders(struct gfs2_sbd *sdp);
+void gfs2_glock_thaw(struct gfs2_sbd *sdp);
+void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
+void gfs2_glock_free(struct gfs2_glock *gl);
-extern int __init gfs2_glock_init(void);
-extern void gfs2_glock_exit(void);
+int __init gfs2_glock_init(void);
+void gfs2_glock_exit(void);
-extern void gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
-extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
-extern void gfs2_register_debugfs(void);
-extern void gfs2_unregister_debugfs(void);
+void gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
+void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
+void gfs2_register_debugfs(void);
+void gfs2_unregister_debugfs(void);
-extern void glock_set_object(struct gfs2_glock *gl, void *object);
-extern void glock_clear_object(struct gfs2_glock *gl, void *object);
+void glock_set_object(struct gfs2_glock *gl, void *object);
+void glock_clear_object(struct gfs2_glock *gl, void *object);
extern const struct lm_lockops gfs2_dlm_ops;
@@ -295,7 +280,7 @@ static inline bool gfs2_holder_queued(struct gfs2_holder *gh)
return !list_empty(&gh->gh_list);
}
-extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
-extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
+void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
+bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index e7d334c277a1..b41c78bd2cc0 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -615,18 +615,6 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl)
}
/**
- * freeze_go_demote_ok
- * @gl: the glock
- *
- * Always returns 0
- */
-
-static int freeze_go_demote_ok(const struct gfs2_glock *gl)
-{
- return 0;
-}
-
-/**
* iopen_go_callback - schedule the dcache entry for the inode to be deleted
* @gl: the glock
* @remote: true if this came from a different cluster node
@@ -745,7 +733,6 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
const struct gfs2_glock_operations gfs2_freeze_glops = {
.go_xmote_bh = freeze_go_xmote_bh,
- .go_demote_ok = freeze_go_demote_ok,
.go_callback = freeze_go_callback,
.go_type = LM_TYPE_NONDISK,
.go_flags = GLOF_NONDISK,
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index 695898afcaf1..9341423798df 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -22,7 +22,7 @@ extern const struct gfs2_glock_operations gfs2_quota_glops;
extern const struct gfs2_glock_operations gfs2_journal_glops;
extern const struct gfs2_glock_operations *gfs2_glops_list[];
-extern int gfs2_inode_metasync(struct gfs2_glock *gl);
-extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync);
+int gfs2_inode_metasync(struct gfs2_glock *gl);
+void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync);
#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a8c95c5293c6..95a334d64da2 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -863,7 +863,7 @@ static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which)
preempt_enable();
}
-extern struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl);
+struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl);
static inline unsigned gfs2_max_stuffed_size(const struct gfs2_inode *ip)
{
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7fe77bc771e5..1b95db2c3aac 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -266,17 +266,18 @@ fail_iput:
}
-struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
+/**
+ * gfs2_lookup_meta - Look up an inode in a metadata directory
+ * @dip: The directory
+ * @name: The name of the inode
+ */
+struct inode *gfs2_lookup_meta(struct inode *dip, const char *name)
{
struct qstr qstr;
struct inode *inode;
+
gfs2_str2qstr(&qstr, name);
inode = gfs2_lookupi(dip, &qstr, 1);
- /* gfs2_lookupi has inconsistent callers: vfs
- * related routines expect NULL for no entry found,
- * gfs2_lookup_simple callers expect ENOENT
- * and do not check for NULL.
- */
if (IS_ERR_OR_NULL(inode))
return inode ? inode : ERR_PTR(-ENOENT);
@@ -418,7 +419,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks)
if (error)
goto out_ipreserv;
- error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation);
+ error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1);
if (error)
goto out_trans_end;
@@ -1867,16 +1868,24 @@ out:
int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode,
int mask)
{
+ int may_not_block = mask & MAY_NOT_BLOCK;
struct gfs2_inode *ip;
struct gfs2_holder i_gh;
+ struct gfs2_glock *gl;
int error;
gfs2_holder_mark_uninitialized(&i_gh);
ip = GFS2_I(inode);
- if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
- if (mask & MAY_NOT_BLOCK)
+ gl = rcu_dereference_check(ip->i_gl, !may_not_block);
+ if (unlikely(!gl)) {
+ /* inode is getting torn down, must be RCU mode */
+ WARN_ON_ONCE(!may_not_block);
+ return -ECHILD;
+ }
+ if (gfs2_glock_is_locked_by_me(gl) == NULL) {
+ if (may_not_block)
return -ECHILD;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
if (error)
return error;
}
@@ -1921,7 +1930,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
kuid_t ouid, nuid;
kgid_t ogid, ngid;
int error;
- struct gfs2_alloc_parms ap;
+ struct gfs2_alloc_parms ap = {};
ouid = inode->i_uid;
ogid = inode->i_gid;
@@ -2154,7 +2163,7 @@ static int gfs2_update_time(struct inode *inode, int flags)
int error;
gh = gfs2_glock_is_locked_by_me(gl);
- if (gh && !gfs2_glock_is_held_excl(gl)) {
+ if (gh && gl->gl_state != LM_ST_EXCLUSIVE) {
gfs2_glock_dq(gh);
gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, gh);
error = gfs2_glock_nq(gh);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c8c5814e7295..fd15d1c6b6fb 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -13,9 +13,9 @@
#include "util.h"
bool gfs2_release_folio(struct folio *folio, gfp_t gfp_mask);
-extern int gfs2_internal_read(struct gfs2_inode *ip,
- char *buf, loff_t *pos, unsigned size);
-extern void gfs2_set_aops(struct inode *inode);
+ssize_t gfs2_internal_read(struct gfs2_inode *ip,
+ char *buf, loff_t *pos, size_t size);
+void gfs2_set_aops(struct inode *inode);
static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
{
@@ -44,19 +44,17 @@ static inline int gfs2_is_dir(const struct gfs2_inode *ip)
static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks)
{
- inode->i_blocks = blocks <<
- (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
+ inode->i_blocks = blocks << (inode->i_blkbits - 9);
}
static inline u64 gfs2_get_inode_blocks(const struct inode *inode)
{
- return inode->i_blocks >>
- (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
+ return inode->i_blocks >> (inode->i_blkbits - 9);
}
static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change)
{
- change <<= inode->i_blkbits - GFS2_BASIC_BLOCK_SHIFT;
+ change <<= inode->i_blkbits - 9;
gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks >= -change));
inode->i_blocks += change;
}
@@ -88,33 +86,33 @@ err:
return -EIO;
}
-extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
- u64 no_addr, u64 no_formal_ino,
- unsigned int blktype);
-extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
- u64 no_formal_ino,
- unsigned int blktype);
-
-extern int gfs2_inode_refresh(struct gfs2_inode *ip);
-
-extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
- int is_root);
-extern int gfs2_permission(struct mnt_idmap *idmap,
- struct inode *inode, int mask);
-extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
-extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
-extern int gfs2_open_common(struct inode *inode, struct file *file);
-extern loff_t gfs2_seek_data(struct file *file, loff_t offset);
-extern loff_t gfs2_seek_hole(struct file *file, loff_t offset);
+struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
+ u64 no_addr, u64 no_formal_ino,
+ unsigned int blktype);
+struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
+ u64 no_formal_ino,
+ unsigned int blktype);
+
+int gfs2_inode_refresh(struct gfs2_inode *ip);
+
+struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
+ int is_root);
+int gfs2_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask);
+struct inode *gfs2_lookup_meta(struct inode *dip, const char *name);
+void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
+int gfs2_open_common(struct inode *inode, struct file *file);
+loff_t gfs2_seek_data(struct file *file, loff_t offset);
+loff_t gfs2_seek_hole(struct file *file, loff_t offset);
extern const struct file_operations gfs2_file_fops_nolock;
extern const struct file_operations gfs2_dir_fops_nolock;
-extern int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
-extern int gfs2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
-extern void gfs2_set_inode_flags(struct inode *inode);
-
+int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int gfs2_fileattr_set(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct fileattr *fa);
+void gfs2_set_inode_flags(struct inode *inode);
+
#ifdef CONFIG_GFS2_FS_LOCKING_DLM
extern const struct file_operations gfs2_file_fops;
extern const struct file_operations gfs2_dir_fops;
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 653cffcbf869..c27b05099c1e 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -70,29 +70,29 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
}
}
-extern void gfs2_ordered_del_inode(struct gfs2_inode *ip);
-extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct);
-extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
-extern bool gfs2_log_is_empty(struct gfs2_sbd *sdp);
-extern void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes);
-extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
-extern bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
- unsigned int *extra_revokes);
-extern void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
- unsigned int *extra_revokes);
-extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
- u64 seq, u32 tail, u32 lblock, u32 flags,
- blk_opf_t op_flags);
-extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
- u32 type);
-extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
-extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
-extern void log_flush_wait(struct gfs2_sbd *sdp);
+void gfs2_ordered_del_inode(struct gfs2_inode *ip);
+unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct);
+void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
+bool gfs2_log_is_empty(struct gfs2_sbd *sdp);
+void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes);
+void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
+bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
+ unsigned int *extra_revokes);
+void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
+ unsigned int *extra_revokes);
+void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+ u64 seq, u32 tail, u32 lblock, u32 flags,
+ blk_opf_t op_flags);
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
+ u32 type);
+void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
+void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
+void log_flush_wait(struct gfs2_sbd *sdp);
-extern int gfs2_logd(void *data);
-extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
-extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl);
-extern void gfs2_flush_revokes(struct gfs2_sbd *sdp);
-extern void gfs2_ail_drain(struct gfs2_sbd *sdp);
+int gfs2_logd(void *data);
+void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+void gfs2_glock_remove_revoke(struct gfs2_glock *gl);
+void gfs2_flush_revokes(struct gfs2_sbd *sdp);
+void gfs2_ail_drain(struct gfs2_sbd *sdp);
#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 1412ffba1d44..07890c7b145d 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -11,16 +11,18 @@
#include "incore.h"
extern const struct gfs2_log_operations *gfs2_log_ops[];
-extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
-extern u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn);
-extern void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
- struct page *page, unsigned size, unsigned offset,
- u64 blkno);
-extern void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf);
-extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
-extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
- struct gfs2_log_header_host *head, bool keep_cache);
-extern void gfs2_drain_revokes(struct gfs2_sbd *sdp);
+
+void gfs2_log_incr_head(struct gfs2_sbd *sdp);
+u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn);
+void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+ struct page *page, unsigned size, unsigned offset,
+ u64 blkno);
+void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf);
+void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
+int gfs2_find_jhead(struct gfs2_jdesc *jd,
+ struct gfs2_log_header_host *head, bool keep_cache);
+void gfs2_drain_revokes(struct gfs2_sbd *sdp);
+
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
{
return sdp->sd_ldptrs;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 66eb98b690a2..79be0cdc730c 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -147,7 +147,7 @@ static int __init init_gfs2_fs(void)
if (!gfs2_trans_cachep)
goto fail_cachep8;
- error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd");
+ error = gfs2_qd_shrinker_init();
if (error)
goto fail_shrinker;
@@ -196,7 +196,7 @@ fail_wq3:
fail_wq2:
destroy_workqueue(gfs2_recovery_wq);
fail_wq1:
- unregister_shrinker(&gfs2_qd_shrinker);
+ gfs2_qd_shrinker_exit();
fail_shrinker:
kmem_cache_destroy(gfs2_trans_cachep);
fail_cachep8:
@@ -229,7 +229,7 @@ fail_lru:
static void __exit exit_gfs2_fs(void)
{
- unregister_shrinker(&gfs2_qd_shrinker);
+ gfs2_qd_shrinker_exit();
gfs2_glock_exit();
gfs2_unregister_debugfs();
unregister_filesystem(&gfs2_fs_type);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 924361fa510b..25ceb0805df2 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -115,7 +115,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
{
struct address_space *mapping = gfs2_glock2aspace(gl);
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct page *page;
+ struct folio *folio;
struct buffer_head *bh;
unsigned int shift;
unsigned long index;
@@ -129,36 +129,31 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
bufnum = blkno - (index << shift); /* block buf index within page */
if (create) {
- for (;;) {
- page = grab_cache_page(mapping, index);
- if (page)
- break;
- yield();
- }
- if (!page_has_buffers(page))
- create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping_gfp_mask(mapping) | __GFP_NOFAIL);
+ bh = folio_buffers(folio);
+ if (!bh)
+ bh = create_empty_buffers(folio,
+ sdp->sd_sb.sb_bsize, 0);
} else {
- page = find_get_page_flags(mapping, index,
- FGP_LOCK|FGP_ACCESSED);
- if (!page)
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED, 0);
+ if (IS_ERR(folio))
return NULL;
- if (!page_has_buffers(page)) {
- bh = NULL;
- goto out_unlock;
- }
+ bh = folio_buffers(folio);
}
- /* Locate header for our buffer within our page */
- for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
- /* Do nothing */;
- get_bh(bh);
+ if (!bh)
+ goto out_unlock;
+ bh = get_nth_bh(bh, bufnum);
if (!buffer_mapped(bh))
map_bh(bh, sdp->sd_vfs, blkno);
out_unlock:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return bh;
}
@@ -405,26 +400,20 @@ static struct buffer_head *gfs2_getjdatabuf(struct gfs2_inode *ip, u64 blkno)
{
struct address_space *mapping = ip->i_inode.i_mapping;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct page *page;
+ struct folio *folio;
struct buffer_head *bh;
unsigned int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift;
unsigned long index = blkno >> shift; /* convert block to page */
unsigned int bufnum = blkno - (index << shift);
- page = find_get_page_flags(mapping, index, FGP_LOCK|FGP_ACCESSED);
- if (!page)
+ folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED, 0);
+ if (IS_ERR(folio))
return NULL;
- if (!page_has_buffers(page)) {
- unlock_page(page);
- put_page(page);
- return NULL;
- }
- /* Locate header for our buffer within our page */
- for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
- /* Do nothing */;
- get_bh(bh);
- unlock_page(page);
- put_page(page);
+ bh = folio_buffers(folio);
+ if (bh)
+ bh = get_nth_bh(bh, bufnum);
+ folio_unlock(folio);
+ folio_put(folio);
return bh;
}
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index d0a58cdd433a..831d988c2ceb 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -50,21 +50,21 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
return inode->i_sb->s_fs_info;
}
-extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
-extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
- int rahead, struct buffer_head **bhp);
-extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
-extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
- int create);
+struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
+int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
+ int rahead, struct buffer_head **bhp);
+int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
+ int create);
enum {
REMOVE_JDATA = 0,
REMOVE_META = 1,
};
-extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta);
-extern void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
-extern int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num,
- struct buffer_head **bhp);
+void gfs2_remove_from_journal(struct buffer_head *bh, int meta);
+void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
+int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num,
+ struct buffer_head **bhp);
static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
struct buffer_head **bhp)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 33ca04733e93..b108c5d26839 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -292,8 +292,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
return error;
}
- sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
- GFS2_BASIC_BLOCK_SHIFT;
+ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9;
sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift);
sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
sizeof(struct gfs2_dinode)) / sizeof(u64);
@@ -648,7 +647,7 @@ static int init_statfs(struct gfs2_sbd *sdp)
struct gfs2_jdesc *jd;
struct gfs2_inode *ip;
- sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
+ sdp->sd_statfs_inode = gfs2_lookup_meta(master, "statfs");
if (IS_ERR(sdp->sd_statfs_inode)) {
error = PTR_ERR(sdp->sd_statfs_inode);
fs_err(sdp, "can't read in statfs inode: %d\n", error);
@@ -657,7 +656,7 @@ static int init_statfs(struct gfs2_sbd *sdp)
if (sdp->sd_args.ar_spectator)
goto out;
- pn = gfs2_lookup_simple(master, "per_node");
+ pn = gfs2_lookup_meta(master, "per_node");
if (IS_ERR(pn)) {
error = PTR_ERR(pn);
fs_err(sdp, "can't find per_node directory: %d\n", error);
@@ -674,7 +673,7 @@ static int init_statfs(struct gfs2_sbd *sdp)
goto free_local;
}
sprintf(buf, "statfs_change%u", jd->jd_jid);
- lsi->si_sc_inode = gfs2_lookup_simple(pn, buf);
+ lsi->si_sc_inode = gfs2_lookup_meta(pn, buf);
if (IS_ERR(lsi->si_sc_inode)) {
error = PTR_ERR(lsi->si_sc_inode);
fs_err(sdp, "can't find local \"sc\" file#%u: %d\n",
@@ -739,7 +738,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
if (undo)
goto fail_statfs;
- sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
+ sdp->sd_jindex = gfs2_lookup_meta(master, "jindex");
if (IS_ERR(sdp->sd_jindex)) {
fs_err(sdp, "can't lookup journal index: %d\n", error);
return PTR_ERR(sdp->sd_jindex);
@@ -888,7 +887,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
goto fail;
/* Read in the resource index inode */
- sdp->sd_rindex = gfs2_lookup_simple(master, "rindex");
+ sdp->sd_rindex = gfs2_lookup_meta(master, "rindex");
if (IS_ERR(sdp->sd_rindex)) {
error = PTR_ERR(sdp->sd_rindex);
fs_err(sdp, "can't get resource index inode: %d\n", error);
@@ -897,7 +896,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
sdp->sd_rindex_uptodate = 0;
/* Read in the quota inode */
- sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota");
+ sdp->sd_quota_inode = gfs2_lookup_meta(master, "quota");
if (IS_ERR(sdp->sd_quota_inode)) {
error = PTR_ERR(sdp->sd_quota_inode);
fs_err(sdp, "can't get quota file inode: %d\n", error);
@@ -941,7 +940,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
if (undo)
goto fail_qc_gh;
- pn = gfs2_lookup_simple(master, "per_node");
+ pn = gfs2_lookup_meta(master, "per_node");
if (IS_ERR(pn)) {
error = PTR_ERR(pn);
fs_err(sdp, "can't find per_node directory: %d\n", error);
@@ -949,7 +948,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
}
sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
- sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
+ sdp->sd_qc_inode = gfs2_lookup_meta(pn, buf);
if (IS_ERR(sdp->sd_qc_inode)) {
error = PTR_ERR(sdp->sd_qc_inode);
fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
@@ -1126,8 +1125,7 @@ static int init_threads(struct gfs2_sbd *sdp)
return 0;
fail:
- kthread_stop(sdp->sd_logd_process);
- put_task_struct(sdp->sd_logd_process);
+ kthread_stop_put(sdp->sd_logd_process);
sdp->sd_logd_process = NULL;
return error;
}
@@ -1135,13 +1133,11 @@ fail:
void gfs2_destroy_threads(struct gfs2_sbd *sdp)
{
if (sdp->sd_logd_process) {
- kthread_stop(sdp->sd_logd_process);
- put_task_struct(sdp->sd_logd_process);
+ kthread_stop_put(sdp->sd_logd_process);
sdp->sd_logd_process = NULL;
}
if (sdp->sd_quotad_process) {
- kthread_stop(sdp->sd_quotad_process);
- put_task_struct(sdp->sd_quotad_process);
+ kthread_stop_put(sdp->sd_quotad_process);
sdp->sd_quotad_process = NULL;
}
}
@@ -1190,10 +1186,9 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
/* Set up the buffer cache and fill in some fake block size values
to allow us to read-in the on-disk superblock. */
- sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
+ sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, 512);
sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
- sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
- GFS2_BASIC_BLOCK_SHIFT;
+ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9;
sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift);
sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
@@ -1281,10 +1276,8 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
if (!sb_rdonly(sb)) {
error = init_threads(sdp);
- if (error) {
- gfs2_withdraw_delayed(sdp);
+ if (error)
goto fail_per_node;
- }
}
error = gfs2_freeze_lock_shared(sdp);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index d9854aece15b..95dae7838b4e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -196,13 +196,26 @@ static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
}
-struct shrinker gfs2_qd_shrinker = {
- .count_objects = gfs2_qd_shrink_count,
- .scan_objects = gfs2_qd_shrink_scan,
- .seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_NUMA_AWARE,
-};
+static struct shrinker *gfs2_qd_shrinker;
+
+int __init gfs2_qd_shrinker_init(void)
+{
+ gfs2_qd_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "gfs2-qd");
+ if (!gfs2_qd_shrinker)
+ return -ENOMEM;
+
+ gfs2_qd_shrinker->count_objects = gfs2_qd_shrink_count;
+ gfs2_qd_shrinker->scan_objects = gfs2_qd_shrink_scan;
+
+ shrinker_register(gfs2_qd_shrinker);
+ return 0;
+}
+
+void gfs2_qd_shrinker_exit(void)
+{
+ shrinker_free(gfs2_qd_shrinker);
+}
static u64 qd2index(struct gfs2_quota_data *qd)
{
@@ -457,6 +470,17 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
(sync_gen && (qd->qd_sync_gen >= *sync_gen)))
return 0;
+ /*
+ * If qd_change is 0 it means a pending quota change was negated.
+ * We should not sync it, but we still have a qd reference and slot
+ * reference taken by gfs2_quota_change -> do_qc that need to be put.
+ */
+ if (!qd->qd_change && test_and_clear_bit(QDF_CHANGE, &qd->qd_flags)) {
+ slot_put(qd);
+ qd_put(qd);
+ return 0;
+ }
+
if (!lockref_get_not_dead(&qd->qd_lockref))
return 0;
@@ -736,7 +760,7 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index,
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
struct inode *inode = &ip->i_inode;
struct address_space *mapping = inode->i_mapping;
- struct page *page;
+ struct folio *folio;
struct buffer_head *bh;
u64 blk;
unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0;
@@ -745,15 +769,15 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index,
blk = index << (PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift);
boff = off % bsize;
- page = grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
- if (!page_has_buffers(page))
- create_empty_buffers(page, bsize, 0);
+ folio = filemap_grab_folio(mapping, index);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ bh = folio_buffers(folio);
+ if (!bh)
+ bh = create_empty_buffers(folio, bsize, 0);
- bh = page_buffers(page);
- for(;;) {
- /* Find the beginning block within the page */
+ for (;;) {
+ /* Find the beginning block within the folio */
if (pg_off >= ((bnum * bsize) + bsize)) {
bh = bh->b_this_page;
bnum++;
@@ -766,9 +790,10 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index,
goto unlock_out;
/* If it's a newly allocated disk block, zero it */
if (buffer_new(bh))
- zero_user(page, bnum * bsize, bh->b_size);
+ folio_zero_range(folio, bnum * bsize,
+ bh->b_size);
}
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
if (bh_read(bh, REQ_META | REQ_PRIO) < 0)
goto unlock_out;
@@ -784,17 +809,17 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index,
break;
}
- /* Write to the page, now that we have setup the buffer(s) */
- memcpy_to_page(page, off, buf, bytes);
- flush_dcache_page(page);
- unlock_page(page);
- put_page(page);
+ /* Write to the folio, now that we have setup the buffer(s) */
+ memcpy_to_folio(folio, off, buf, bytes);
+ flush_dcache_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return 0;
unlock_out:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return -EIO;
}
@@ -898,7 +923,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
{
struct gfs2_sbd *sdp = (*qda)->qd_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
- struct gfs2_alloc_parms ap = { .aflags = 0, };
+ struct gfs2_alloc_parms ap = {};
unsigned int data_blocks, ind_blocks;
struct gfs2_holder *ghs, i_gh;
unsigned int qx, x;
@@ -1072,8 +1097,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
u32 x;
int error;
- if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON &&
- sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
error = gfs2_quota_hold(ip, uid, gid);
@@ -1180,17 +1204,16 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
#define MAX_LINE 256
-static int print_message(struct gfs2_quota_data *qd, char *type)
+static void print_message(struct gfs2_quota_data *qd, char *type)
{
struct gfs2_sbd *sdp = qd->qd_sbd;
- if (sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
+ if (sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) {
fs_info(sdp, "quota %s for %s %u\n",
type,
(qd->qd_id.type == USRQUOTA) ? "user" : "group",
from_kqid(&init_user_ns, qd->qd_id));
-
- return 0;
+ }
}
/**
@@ -1260,7 +1283,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
* HZ)) {
quota_send_warning(qd->qd_id,
sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
- error = print_message(qd, "warning");
+ print_message(qd, "warning");
+ error = 0;
qd->qd_last_warn = jiffies;
}
}
@@ -1274,8 +1298,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
u32 x;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- if ((sdp->sd_args.ar_quota != GFS2_QUOTA_ON &&
- sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) ||
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ||
gfs2_assert_warn(sdp, change))
return;
if (ip->i_diskflags & GFS2_DIF_SYSTEM)
@@ -1732,7 +1755,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
if (gfs2_is_stuffed(ip))
alloc_required = 1;
if (alloc_required) {
- struct gfs2_alloc_parms ap = { .aflags = 0, };
+ struct gfs2_alloc_parms ap = {};
gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
&data_blocks, &ind_blocks);
blocks = 1 + data_blocks + ind_blocks;
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 1429945215a0..f462d9cb3087 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -15,27 +15,27 @@ struct gfs2_sbd;
#define NO_UID_QUOTA_CHANGE INVALID_UID
#define NO_GID_QUOTA_CHANGE INVALID_GID
-extern int gfs2_qa_get(struct gfs2_inode *ip);
-extern void gfs2_qa_put(struct gfs2_inode *ip);
-extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
-extern void gfs2_quota_unhold(struct gfs2_inode *ip);
+int gfs2_qa_get(struct gfs2_inode *ip);
+void gfs2_qa_put(struct gfs2_inode *ip);
+int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
+void gfs2_quota_unhold(struct gfs2_inode *ip);
-extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
-extern void gfs2_quota_unlock(struct gfs2_inode *ip);
+int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
+void gfs2_quota_unlock(struct gfs2_inode *ip);
-extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
- struct gfs2_alloc_parms *ap);
-extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
- kuid_t uid, kgid_t gid);
+int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
+ struct gfs2_alloc_parms *ap);
+void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
+ kuid_t uid, kgid_t gid);
-extern int gfs2_quota_sync(struct super_block *sb, int type);
-extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid);
+int gfs2_quota_sync(struct super_block *sb, int type);
+int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid);
-extern int gfs2_quota_init(struct gfs2_sbd *sdp);
-extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
-extern int gfs2_quotad(void *data);
+int gfs2_quota_init(struct gfs2_sbd *sdp);
+void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+int gfs2_quotad(void *data);
-extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
+void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
struct gfs2_alloc_parms *ap)
@@ -50,8 +50,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (ret)
return ret;
- if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON &&
- sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT)
return 0;
ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap);
if (ret)
@@ -60,8 +59,10 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
}
extern const struct quotactl_ops gfs2_quotactl_ops;
-extern struct shrinker gfs2_qd_shrinker;
+int __init gfs2_qd_shrinker_init(void);
+void gfs2_qd_shrinker_exit(void);
extern struct list_lru gfs2_qd_lru;
-extern void __init gfs2_quota_hash_init(void);
+
+void __init gfs2_quota_hash_init(void);
#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 7a0c9d0b7503..6a0fd42e1120 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -17,18 +17,18 @@ static inline void gfs2_replay_incr_blk(struct gfs2_jdesc *jd, u32 *blk)
*blk = 0;
}
-extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
+int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
struct buffer_head **bh);
-extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
-extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
-extern void gfs2_revoke_clean(struct gfs2_jdesc *jd);
+int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
+int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
+void gfs2_revoke_clean(struct gfs2_jdesc *jd);
-extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
-extern void gfs2_recover_func(struct work_struct *work);
-extern int __get_log_header(struct gfs2_sbd *sdp,
- const struct gfs2_log_header *lh, unsigned int blkno,
- struct gfs2_log_header_host *head);
+int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
+void gfs2_recover_func(struct work_struct *work);
+int __get_log_header(struct gfs2_sbd *sdp,
+ const struct gfs2_log_header *lh, unsigned int blkno,
+ struct gfs2_log_header_host *head);
#endif /* __RECOVERY_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 9308190895c8..c2060203b98a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -2411,13 +2411,12 @@ static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
* @bn: Used to return the starting block number
* @nblocks: requested number of blocks/extent length (value/result)
* @dinode: 1 if we're allocating a dinode block, else 0
- * @generation: the generation number of the inode
*
* Returns: 0 or error
*/
int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
- bool dinode, u64 *generation)
+ bool dinode)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct buffer_head *dibh;
@@ -2477,10 +2476,13 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
rbm.rgd->rd_free -= *nblocks;
spin_unlock(&rbm.rgd->rd_rsspin);
if (dinode) {
+ u64 generation;
+
rbm.rgd->rd_dinodes++;
- *generation = rbm.rgd->rd_igeneration++;
- if (*generation == 0)
- *generation = rbm.rgd->rd_igeneration++;
+ generation = rbm.rgd->rd_igeneration++;
+ if (generation == 0)
+ generation = rbm.rgd->rd_igeneration++;
+ ip->i_generation = generation;
}
gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 00b30cf893af..8d20e99385db 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -22,38 +22,38 @@ struct gfs2_rgrpd;
struct gfs2_sbd;
struct gfs2_holder;
-extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
+void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
-extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact);
-extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
-extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
+struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact);
+struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
+struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
-extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
-extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
-extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
-extern int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl);
-extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
+void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
+int gfs2_rindex_update(struct gfs2_sbd *sdp);
+void gfs2_free_clones(struct gfs2_rgrpd *rgd);
+int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl);
+void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
-extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
+struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
#define GFS2_AF_ORLOV 1
-extern int gfs2_inplace_reserve(struct gfs2_inode *ip,
- struct gfs2_alloc_parms *ap);
-extern void gfs2_inplace_release(struct gfs2_inode *ip);
-
-extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
- bool dinode, u64 *generation);
-
-extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rs_delete(struct gfs2_inode *ip);
-extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
- u64 bstart, u32 blen, int meta);
-extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
- u64 bstart, u32 blen);
-extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
-extern void gfs2_unlink_di(struct inode *inode);
-extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
- unsigned int type);
+int gfs2_inplace_reserve(struct gfs2_inode *ip,
+ struct gfs2_alloc_parms *ap);
+void gfs2_inplace_release(struct gfs2_inode *ip);
+
+int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
+ bool dinode);
+
+void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
+void gfs2_rs_delete(struct gfs2_inode *ip);
+void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+ u64 bstart, u32 blen, int meta);
+void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+ u64 bstart, u32 blen);
+void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
+void gfs2_unlink_di(struct inode *inode);
+int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
+ unsigned int type);
struct gfs2_rgrp_list {
unsigned int rl_rgrps;
@@ -62,18 +62,19 @@ struct gfs2_rgrp_list {
struct gfs2_holder *rl_ghs;
};
-extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
- u64 block);
-extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist,
- unsigned int state, u16 flags);
-extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
-extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
-extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
- const char *fs_id_buf);
-extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
- struct buffer_head *bh,
- const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
-extern int gfs2_fitrim(struct file *filp, void __user *argp);
+void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
+ u64 block);
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist,
+ unsigned int state, u16 flags);
+void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
+u64 gfs2_ri_total(struct gfs2_sbd *sdp);
+void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
+ const char *fs_id_buf);
+int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+ struct buffer_head *bh,
+ const struct gfs2_bitmap *bi, unsigned minlen,
+ u64 *ptrimmed);
+int gfs2_fitrim(struct file *filp, void __user *argp);
/* This is how to tell if a reservation is in the rgrp tree: */
static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
@@ -88,9 +89,9 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
return first <= block && block < last;
}
-extern void check_and_update_goal(struct gfs2_inode *ip);
+void check_and_update_goal(struct gfs2_inode *ip);
-extern void rgrp_lock_local(struct gfs2_rgrpd *rgd);
-extern void rgrp_unlock_local(struct gfs2_rgrpd *rgd);
+void rgrp_lock_local(struct gfs2_rgrpd *rgd);
+void rgrp_unlock_local(struct gfs2_rgrpd *rgd);
#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 52a878fa7139..d21c04a22d73 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -602,13 +602,15 @@ restart:
}
spin_unlock(&sdp->sd_jindex_spin);
- if (!sb_rdonly(sb)) {
+ if (!sb_rdonly(sb))
gfs2_make_fs_ro(sdp);
- }
- if (gfs2_withdrawn(sdp)) {
- gfs2_destroy_threads(sdp);
+ else {
+ if (gfs2_withdrawn(sdp))
+ gfs2_destroy_threads(sdp);
+
gfs2_quota_cleanup(sdp);
}
+
WARN_ON(gfs2_withdrawing(sdp));
/* At this point, we're through modifying the disk */
@@ -1006,6 +1008,7 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = sc.sc_dinodes + sc.sc_free;
buf->f_ffree = sc.sc_free;
buf->f_namelen = GFS2_FNAMESIZE;
+ buf->f_fsid = uuid_to_fsid(sb->s_uuid.b);
return 0;
}
@@ -1299,18 +1302,8 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode)
* As a last resort, if another node keeps holding the iopen glock
* without showing any activity on the inode glock, we will eventually
* time out and fail the iopen glock upgrade.
- *
- * Note that we're passing the LM_FLAG_TRY_1CB flag to the first
- * locking request as an optimization to notify lock holders as soon as
- * possible. Without that flag, they'd be notified implicitly by the
- * second locking request.
*/
- gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, gh);
- error = gfs2_glock_nq(gh);
- if (error != GLR_TRYFAILED)
- return !error;
-
gfs2_holder_reinit(LM_ST_EXCLUSIVE, GL_ASYNC | GL_NOCACHE, gh);
error = gfs2_glock_nq(gh);
if (error)
@@ -1550,7 +1543,7 @@ out:
wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
gfs2_glock_add_to_lru(ip->i_gl);
gfs2_glock_put_eventually(ip->i_gl);
- ip->i_gl = NULL;
+ rcu_assign_pointer(ip->i_gl, NULL);
}
}
@@ -1576,7 +1569,7 @@ static void gfs2_free_inode(struct inode *inode)
kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode));
}
-extern void free_local_statfs_inodes(struct gfs2_sbd *sdp)
+void free_local_statfs_inodes(struct gfs2_sbd *sdp)
{
struct local_statfs_inode *lsi, *safe;
@@ -1591,8 +1584,8 @@ extern void free_local_statfs_inodes(struct gfs2_sbd *sdp)
}
}
-extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
- unsigned int index)
+struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
+ unsigned int index)
{
struct local_statfs_inode *lsi;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index b4ddf6244586..b27a774d9580 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -15,7 +15,7 @@
#define GFS2_FS_FORMAT_MIN (1801)
#define GFS2_FS_FORMAT_MAX (1802)
-extern void gfs2_lm_unmount(struct gfs2_sbd *sdp);
+void gfs2_lm_unmount(struct gfs2_sbd *sdp);
static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
{
@@ -26,33 +26,33 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
return x;
}
-extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
+void gfs2_jindex_free(struct gfs2_sbd *sdp);
-extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
-extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
-extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
- struct gfs2_inode **ipp);
+struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
+int gfs2_jdesc_check(struct gfs2_jdesc *jd);
+int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
+ struct gfs2_inode **ipp);
-extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
-extern void gfs2_make_fs_ro(struct gfs2_sbd *sdp);
-extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
-extern void gfs2_destroy_threads(struct gfs2_sbd *sdp);
-extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
-extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
- s64 dinodes);
-extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
- const void *buf);
-extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc,
- void *buf);
-extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh);
-extern int gfs2_statfs_sync(struct super_block *sb, int type);
-extern void gfs2_freeze_func(struct work_struct *work);
-extern void gfs2_thaw_freeze_initiator(struct super_block *sb);
+int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
+void gfs2_make_fs_ro(struct gfs2_sbd *sdp);
+void gfs2_online_uevent(struct gfs2_sbd *sdp);
+void gfs2_destroy_threads(struct gfs2_sbd *sdp);
+int gfs2_statfs_init(struct gfs2_sbd *sdp);
+void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
+ s64 dinodes);
+void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
+ const void *buf);
+void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc,
+ void *buf);
+void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh);
+int gfs2_statfs_sync(struct super_block *sb, int type);
+void gfs2_freeze_func(struct work_struct *work);
+void gfs2_thaw_freeze_initiator(struct super_block *sb);
-extern void free_local_statfs_inodes(struct gfs2_sbd *sdp);
-extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
- unsigned int index);
-extern void free_sbd(struct gfs2_sbd *sdp);
+void free_local_statfs_inodes(struct gfs2_sbd *sdp);
+struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
+ unsigned int index);
+void free_sbd(struct gfs2_sbd *sdp);
extern struct file_system_type gfs2_fs_type;
extern struct file_system_type gfs2meta_fs_type;
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index c76ad9a4c75a..f8ce5302280d 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -34,17 +34,17 @@ static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned
return rgd->rd_length;
}
-extern int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp,
- unsigned int blocks, unsigned int revokes,
- unsigned long ip);
-extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
- unsigned int revokes);
-
-extern void gfs2_trans_end(struct gfs2_sbd *sdp);
-extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh);
-extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh);
-extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
-extern void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
-extern void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr);
+int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp,
+ unsigned int blocks, unsigned int revokes,
+ unsigned long ip);
+int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+ unsigned int revokes);
+
+void gfs2_trans_end(struct gfs2_sbd *sdp);
+void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh);
+void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh);
+void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
+void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr);
#endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index cdb839529175..11c9d59b6889 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -147,10 +147,10 @@ static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type,
int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
char *file, unsigned int line);
-extern int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
- bool verbose);
-extern int gfs2_freeze_lock_shared(struct gfs2_sbd *sdp);
-extern void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh);
+int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+ bool verbose);
+int gfs2_freeze_lock_shared(struct gfs2_sbd *sdp);
+void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh);
#define gfs2_io_error(sdp) \
gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__)
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 79d5c5559512..8c96ba6230d1 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -639,7 +639,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
u64 block;
int error;
- error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+ error = gfs2_alloc_blocks(ip, &block, &n, 0);
if (error)
return error;
gfs2_trans_remove_revoke(sdp, block, 1);
@@ -701,7 +701,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
int mh_size = sizeof(struct gfs2_meta_header);
unsigned int n = 1;
- error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+ error = gfs2_alloc_blocks(ip, &block, &n, 0);
if (error)
return error;
gfs2_trans_remove_revoke(sdp, block, 1);
@@ -1002,7 +1002,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
} else {
u64 blk;
unsigned int n = 1;
- error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL);
+ error = gfs2_alloc_blocks(ip, &blk, &n, 0);
if (error)
return error;
gfs2_trans_remove_revoke(sdp, blk, 1);
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index 2aed9d7d483d..eb12eb7e37c1 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -50,14 +50,14 @@ struct gfs2_ea_location {
struct gfs2_ea_header *el_prev;
};
-extern int __gfs2_xattr_set(struct inode *inode, const char *name,
- const void *value, size_t size,
- int flags, int type);
-extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
-extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
+int __gfs2_xattr_set(struct inode *inode, const char *name,
+ const void *value, size_t size,
+ int flags, int type);
+ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
+int gfs2_ea_dealloc(struct gfs2_inode *ip);
/* Exported to acl.c */
-extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
+int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
#endif /* __EATTR_DOT_H__ */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index da217eaba102..f757d4f7ad98 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -83,29 +83,6 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
{}
};
-#ifdef CONFIG_NUMA
-static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
- struct inode *inode, pgoff_t index)
-{
- vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
- index);
-}
-
-static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
-{
- mpol_cond_put(vma->vm_policy);
-}
-#else
-static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
- struct inode *inode, pgoff_t index)
-{
-}
-
-static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
-{
-}
-#endif
-
/*
* Mask used when checking the page offset value passed in via system
* calls. This value will be converted to a loff_t which is signed.
@@ -135,7 +112,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
vma->vm_ops = &hugetlb_vm_ops;
- ret = seal_check_future_write(info->seals, vma);
+ ret = seal_check_write(info->seals, vma);
if (ret)
return ret;
@@ -295,7 +272,7 @@ static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t byt
size_t res = 0;
/* First subpage to start the loop. */
- page += offset / PAGE_SIZE;
+ page = nth_page(page, offset / PAGE_SIZE);
offset %= PAGE_SIZE;
while (1) {
if (is_raw_hwpoison_page_in_hugepage(page))
@@ -309,7 +286,7 @@ static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t byt
break;
offset += n;
if (offset == PAGE_SIZE) {
- page++;
+ page = nth_page(page, 1);
offset = 0;
}
}
@@ -334,7 +311,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
ssize_t retval = 0;
while (iov_iter_count(to)) {
- struct page *page;
+ struct folio *folio;
size_t nr, copied, want;
/* nr is the maximum number of bytes to copy from this page */
@@ -352,18 +329,18 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
nr = nr - offset;
- /* Find the page */
- page = find_lock_page(mapping, index);
- if (unlikely(page == NULL)) {
+ /* Find the folio */
+ folio = filemap_lock_hugetlb_folio(h, mapping, index);
+ if (IS_ERR(folio)) {
/*
* We have a HOLE, zero out the user-buffer for the
* length of the hole or request.
*/
copied = iov_iter_zero(nr, to);
} else {
- unlock_page(page);
+ folio_unlock(folio);
- if (!PageHWPoison(page))
+ if (!folio_test_has_hwpoisoned(folio))
want = nr;
else {
/*
@@ -371,19 +348,19 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
* touching the 1st raw HWPOISON subpage after
* offset.
*/
- want = adjust_range_hwpoison(page, offset, nr);
+ want = adjust_range_hwpoison(&folio->page, offset, nr);
if (want == 0) {
- put_page(page);
+ folio_put(folio);
retval = -EIO;
break;
}
}
/*
- * We have the page, copy it to user space buffer.
+ * We have the folio, copy it to user space buffer.
*/
- copied = copy_page_to_iter(page, offset, want, to);
- put_page(page);
+ copied = copy_folio_to_iter(folio, offset, want, to);
+ folio_put(folio);
}
offset += copied;
retval += copied;
@@ -661,21 +638,20 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
- const pgoff_t start = lstart >> huge_page_shift(h);
- const pgoff_t end = lend >> huge_page_shift(h);
+ const pgoff_t end = lend >> PAGE_SHIFT;
struct folio_batch fbatch;
pgoff_t next, index;
int i, freed = 0;
bool truncate_op = (lend == LLONG_MAX);
folio_batch_init(&fbatch);
- next = start;
+ next = lstart >> PAGE_SHIFT;
while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
for (i = 0; i < folio_batch_count(&fbatch); ++i) {
struct folio *folio = fbatch.folios[i];
u32 hash = 0;
- index = folio->index;
+ index = folio->index >> huge_page_order(h);
hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -693,7 +669,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
}
if (truncate_op)
- (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
+ (void)hugetlb_unreserve_pages(inode,
+ lstart >> huge_page_shift(h),
+ LONG_MAX, freed);
}
static void hugetlbfs_evict_inode(struct inode *inode)
@@ -741,7 +719,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h,
pgoff_t idx = start >> huge_page_shift(h);
struct folio *folio;
- folio = filemap_lock_folio(mapping, idx);
+ folio = filemap_lock_hugetlb_folio(h, mapping, idx);
if (IS_ERR(folio))
return;
@@ -852,8 +830,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
/*
* Initialize a pseudo vma as this is required by the huge page
- * allocation routines. If NUMA is configured, use page index
- * as input to create an allocation policy.
+ * allocation routines.
*/
vma_init(&pseudo_vma, mm);
vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
@@ -886,7 +863,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/* See if already present in mapping to avoid alloc/free */
- folio = filemap_get_folio(mapping, index);
+ folio = filemap_get_folio(mapping, index << huge_page_order(h));
if (!IS_ERR(folio)) {
folio_put(folio);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -901,9 +878,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
* folios in these areas, we need to consume the reserves
* to keep reservation accounting consistent.
*/
- hugetlb_set_vma_policy(&pseudo_vma, inode, index);
folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0);
- hugetlb_drop_vma_policy(&pseudo_vma);
if (IS_ERR(folio)) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
error = PTR_ERR(folio);
@@ -1204,7 +1179,9 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
struct hstate *h = hstate_inode(d_inode(dentry));
+ u64 id = huge_encode_dev(dentry->d_sb->s_dev);
+ buf->f_fsid = u64_to_fsid(id);
buf->f_type = HUGETLBFS_MAGIC;
buf->f_bsize = huge_page_size(h);
if (sbinfo) {
@@ -1282,18 +1259,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
hugetlbfs_inc_free_inodes(sbinfo);
return NULL;
}
-
- /*
- * Any time after allocation, hugetlbfs_destroy_inode can be called
- * for the inode. mpol_free_shared_policy is unconditionally called
- * as part of hugetlbfs_destroy_inode. So, initialize policy here
- * in case of a quick call to destroy.
- *
- * Note that the policy is initialized even if we are creating a
- * private inode. This simplifies hugetlbfs_destroy_inode.
- */
- mpol_shared_policy_init(&p->policy, NULL);
-
return &p->vfs_inode;
}
@@ -1305,7 +1270,6 @@ static void hugetlbfs_free_inode(struct inode *inode)
static void hugetlbfs_destroy_inode(struct inode *inode)
{
hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
- mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
}
static const struct address_space_operations hugetlbfs_aops = {
diff --git a/fs/inode.c b/fs/inode.c
index 4f8984b97df0..edcd8a61975f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -54,9 +54,9 @@
* inode_hash_lock
*/
-static unsigned int i_hash_mask __read_mostly;
-static unsigned int i_hash_shift __read_mostly;
-static struct hlist_head *inode_hashtable __read_mostly;
+static unsigned int i_hash_mask __ro_after_init;
+static unsigned int i_hash_shift __ro_after_init;
+static struct hlist_head *inode_hashtable __ro_after_init;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
/*
@@ -70,7 +70,7 @@ EXPORT_SYMBOL(empty_aops);
static DEFINE_PER_CPU(unsigned long, nr_inodes);
static DEFINE_PER_CPU(unsigned long, nr_unused);
-static struct kmem_cache *inode_cachep __read_mostly;
+static struct kmem_cache *inode_cachep __ro_after_init;
static long get_nr_inodes(void)
{
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 2bc0aa23fde3..f72df2babe56 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -29,9 +29,9 @@ typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
* and I/O completions.
*/
struct iomap_folio_state {
- atomic_t read_bytes_pending;
- atomic_t write_bytes_pending;
spinlock_t state_lock;
+ unsigned int read_bytes_pending;
+ atomic_t write_bytes_pending;
/*
* Each block has two bits in this bitmap:
@@ -57,30 +57,32 @@ static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
return test_bit(block, ifs->state);
}
-static void ifs_set_range_uptodate(struct folio *folio,
+static bool ifs_set_range_uptodate(struct folio *folio,
struct iomap_folio_state *ifs, size_t off, size_t len)
{
struct inode *inode = folio->mapping->host;
unsigned int first_blk = off >> inode->i_blkbits;
unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
unsigned int nr_blks = last_blk - first_blk + 1;
- unsigned long flags;
- spin_lock_irqsave(&ifs->state_lock, flags);
bitmap_set(ifs->state, first_blk, nr_blks);
- if (ifs_is_fully_uptodate(folio, ifs))
- folio_mark_uptodate(folio);
- spin_unlock_irqrestore(&ifs->state_lock, flags);
+ return ifs_is_fully_uptodate(folio, ifs);
}
static void iomap_set_range_uptodate(struct folio *folio, size_t off,
size_t len)
{
struct iomap_folio_state *ifs = folio->private;
+ unsigned long flags;
+ bool uptodate = true;
- if (ifs)
- ifs_set_range_uptodate(folio, ifs, off, len);
- else
+ if (ifs) {
+ spin_lock_irqsave(&ifs->state_lock, flags);
+ uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
+ }
+
+ if (uptodate)
folio_mark_uptodate(folio);
}
@@ -181,7 +183,7 @@ static void ifs_free(struct folio *folio)
if (!ifs)
return;
- WARN_ON_ONCE(atomic_read(&ifs->read_bytes_pending));
+ WARN_ON_ONCE(ifs->read_bytes_pending != 0);
WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending));
WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) !=
folio_test_uptodate(folio));
@@ -248,20 +250,28 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
*lenp = plen;
}
-static void iomap_finish_folio_read(struct folio *folio, size_t offset,
+static void iomap_finish_folio_read(struct folio *folio, size_t off,
size_t len, int error)
{
struct iomap_folio_state *ifs = folio->private;
+ bool uptodate = !error;
+ bool finished = true;
- if (unlikely(error)) {
- folio_clear_uptodate(folio);
- folio_set_error(folio);
- } else {
- iomap_set_range_uptodate(folio, offset, len);
+ if (ifs) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ifs->state_lock, flags);
+ if (!error)
+ uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
+ ifs->read_bytes_pending -= len;
+ finished = !ifs->read_bytes_pending;
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
}
- if (!ifs || atomic_sub_and_test(len, &ifs->read_bytes_pending))
- folio_unlock(folio);
+ if (error)
+ folio_set_error(folio);
+ if (finished)
+ folio_end_read(folio, uptodate);
}
static void iomap_read_end_io(struct bio *bio)
@@ -358,8 +368,11 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
}
ctx->cur_folio_in_bio = true;
- if (ifs)
- atomic_add(plen, &ifs->read_bytes_pending);
+ if (ifs) {
+ spin_lock_irq(&ifs->state_lock);
+ ifs->read_bytes_pending += plen;
+ spin_unlock_irq(&ifs->state_lock);
+ }
sector = iomap_sector(iomap, pos);
if (!ctx->bio ||
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 30dec2bd2ecc..ed53188472f9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1290,7 +1290,7 @@ static int jbd2_min_tag_size(void)
static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+ journal_t *journal = shrink->private_data;
unsigned long nr_to_scan = sc->nr_to_scan;
unsigned long nr_shrunk;
unsigned long count;
@@ -1316,7 +1316,7 @@ static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+ journal_t *journal = shrink->private_data;
unsigned long count;
count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
@@ -1588,14 +1588,21 @@ static journal_t *journal_init_common(struct block_device *bdev,
goto err_cleanup;
journal->j_shrink_transaction = NULL;
- journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan;
- journal->j_shrinker.count_objects = jbd2_journal_shrink_count;
- journal->j_shrinker.seeks = DEFAULT_SEEKS;
- journal->j_shrinker.batch = journal->j_max_transaction_buffers;
- err = register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)",
- MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
- if (err)
+
+ journal->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)",
+ MAJOR(bdev->bd_dev),
+ MINOR(bdev->bd_dev));
+ if (!journal->j_shrinker) {
+ err = -ENOMEM;
goto err_cleanup;
+ }
+
+ journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan;
+ journal->j_shrinker->count_objects = jbd2_journal_shrink_count;
+ journal->j_shrinker->batch = journal->j_max_transaction_buffers;
+ journal->j_shrinker->private_data = journal;
+
+ shrinker_register(journal->j_shrinker);
return journal;
@@ -2172,9 +2179,9 @@ int jbd2_journal_destroy(journal_t *journal)
brelse(journal->j_sb_buffer);
}
- if (journal->j_shrinker.flags & SHRINKER_REGISTERED) {
+ if (journal->j_shrinker) {
percpu_counter_destroy(&journal->j_checkpoint_jh_count);
- unregister_shrinker(&journal->j_shrinker);
+ shrinker_free(journal->j_shrinker);
}
if (journal->j_proc_entry)
jbd2_stats_proc_exit(journal);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 7ea37f49f1e1..f99591a634b4 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -150,6 +150,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child)
}
static const struct export_operations jffs2_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.get_parent = jffs2_get_parent,
.fh_to_dentry = jffs2_fh_to_dentry,
.fh_to_parent = jffs2_fh_to_parent,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 966826c394ee..8d8e556bd610 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -896,6 +896,7 @@ static const struct super_operations jfs_super_operations = {
};
static const struct export_operations jfs_export_operations = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = jfs_fh_to_dentry,
.fh_to_parent = jfs_fh_to_parent,
.get_parent = jfs_get_parent,
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 180906c36f51..f0cb729e9a97 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -429,60 +429,11 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
return ret;
}
-#ifdef CONFIG_NUMA
-static int kernfs_vma_set_policy(struct vm_area_struct *vma,
- struct mempolicy *new)
-{
- struct file *file = vma->vm_file;
- struct kernfs_open_file *of = kernfs_of(file);
- int ret;
-
- if (!of->vm_ops)
- return 0;
-
- if (!kernfs_get_active(of->kn))
- return -EINVAL;
-
- ret = 0;
- if (of->vm_ops->set_policy)
- ret = of->vm_ops->set_policy(vma, new);
-
- kernfs_put_active(of->kn);
- return ret;
-}
-
-static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma,
- unsigned long addr)
-{
- struct file *file = vma->vm_file;
- struct kernfs_open_file *of = kernfs_of(file);
- struct mempolicy *pol;
-
- if (!of->vm_ops)
- return vma->vm_policy;
-
- if (!kernfs_get_active(of->kn))
- return vma->vm_policy;
-
- pol = vma->vm_policy;
- if (of->vm_ops->get_policy)
- pol = of->vm_ops->get_policy(vma, addr);
-
- kernfs_put_active(of->kn);
- return pol;
-}
-
-#endif
-
static const struct vm_operations_struct kernfs_vm_ops = {
.open = kernfs_vma_open,
.fault = kernfs_vma_fault,
.page_mkwrite = kernfs_vma_page_mkwrite,
.access = kernfs_vma_access,
-#ifdef CONFIG_NUMA
- .set_policy = kernfs_vma_set_policy,
- .get_policy = kernfs_vma_get_policy,
-#endif
};
static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
@@ -903,6 +854,33 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
return ret;
}
+static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct kernfs_open_file *of = kernfs_of(file);
+ const struct kernfs_ops *ops;
+ loff_t ret;
+
+ /*
+ * @of->mutex nests outside active ref and is primarily to ensure that
+ * the ops aren't called concurrently for the same open file.
+ */
+ mutex_lock(&of->mutex);
+ if (!kernfs_get_active(of->kn)) {
+ mutex_unlock(&of->mutex);
+ return -ENODEV;
+ }
+
+ ops = kernfs_ops(of->kn);
+ if (ops->llseek)
+ ret = ops->llseek(of, offset, whence);
+ else
+ ret = generic_file_llseek(file, offset, whence);
+
+ kernfs_put_active(of->kn);
+ mutex_unlock(&of->mutex);
+ return ret;
+}
+
static void kernfs_notify_workfn(struct work_struct *work)
{
struct kernfs_node *kn;
@@ -1005,7 +983,7 @@ EXPORT_SYMBOL_GPL(kernfs_notify);
const struct file_operations kernfs_file_fops = {
.read_iter = kernfs_fop_read_iter,
.write_iter = kernfs_fop_write_iter,
- .llseek = generic_file_llseek,
+ .llseek = kernfs_fop_llseek,
.mmap = kernfs_fop_mmap,
.open = kernfs_fop_open,
.release = kernfs_fop_release,
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index c4bf26142eec..4628edde2e7e 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -21,8 +21,9 @@
#include "kernfs-internal.h"
-struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
-struct kernfs_global_locks *kernfs_locks;
+struct kmem_cache *kernfs_node_cache __ro_after_init;
+struct kmem_cache *kernfs_iattrs_cache __ro_after_init;
+struct kernfs_global_locks *kernfs_locks __ro_after_init;
static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
{
@@ -265,7 +266,7 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k
sb->s_time_gran = 1;
/* sysfs dentries and inodes don't require IO to create */
- sb->s_shrink.seeks = 0;
+ sb->s_shrink->seeks = 0;
/* get root inode, initialize and unlock it */
down_read(&kf_root->kernfs_rwsem);
diff --git a/fs/libfs.c b/fs/libfs.c
index abe2b5a40ba1..e9440d55073c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -41,6 +41,9 @@ EXPORT_SYMBOL(simple_getattr);
int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
{
+ u64 id = huge_encode_dev(dentry->d_sb->s_dev);
+
+ buf->f_fsid = u64_to_fsid(id);
buf->f_type = dentry->d_sb->s_magic;
buf->f_bsize = PAGE_SIZE;
buf->f_namelen = NAME_MAX;
@@ -1310,6 +1313,47 @@ ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
EXPORT_SYMBOL_GPL(simple_attr_write_signed);
/**
+ * generic_encode_ino32_fh - generic export_operations->encode_fh function
+ * @inode: the object to encode
+ * @fh: where to store the file handle fragment
+ * @max_len: maximum length to store there (in 4 byte units)
+ * @parent: parent directory inode, if wanted
+ *
+ * This generic encode_fh function assumes that the 32 inode number
+ * is suitable for locating an inode, and that the generation number
+ * can be used to check that it is still valid. It places them in the
+ * filehandle fragment where export_decode_fh expects to find them.
+ */
+int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
+ struct inode *parent)
+{
+ struct fid *fid = (void *)fh;
+ int len = *max_len;
+ int type = FILEID_INO32_GEN;
+
+ if (parent && (len < 4)) {
+ *max_len = 4;
+ return FILEID_INVALID;
+ } else if (len < 2) {
+ *max_len = 2;
+ return FILEID_INVALID;
+ }
+
+ len = 2;
+ fid->i32.ino = inode->i_ino;
+ fid->i32.gen = inode->i_generation;
+ if (parent) {
+ fid->i32.parent_ino = parent->i_ino;
+ fid->i32.parent_gen = parent->i_generation;
+ len = 4;
+ type = FILEID_INO32_GEN_PARENT;
+ }
+ *max_len = len;
+ return type;
+}
+EXPORT_SYMBOL_GPL(generic_encode_ino32_fh);
+
+/**
* generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
* @sb: filesystem to do the file handle conversion on
* @fid: file handle to convert
diff --git a/fs/locks.c b/fs/locks.c
index d4e49a990a8d..46d88b9e222c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -167,8 +167,8 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
*/
static DEFINE_SPINLOCK(blocked_lock_lock);
-static struct kmem_cache *flctx_cache __read_mostly;
-static struct kmem_cache *filelock_cache __read_mostly;
+static struct kmem_cache *flctx_cache __ro_after_init;
+static struct kmem_cache *filelock_cache __ro_after_init;
static struct file_lock_context *
locks_get_lock_context(struct inode *inode, int type)
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 2a4b8b549e93..82aa7a35db26 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -37,7 +37,7 @@ struct mb_cache {
struct list_head c_list;
/* Number of entries in cache */
unsigned long c_entry_count;
- struct shrinker c_shrink;
+ struct shrinker *c_shrink;
/* Work for shrinking when the cache has too many entries */
struct work_struct c_shrink_work;
};
@@ -293,8 +293,7 @@ EXPORT_SYMBOL(mb_cache_entry_touch);
static unsigned long mb_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct mb_cache *cache = container_of(shrink, struct mb_cache,
- c_shrink);
+ struct mb_cache *cache = shrink->private_data;
return cache->c_entry_count;
}
@@ -333,8 +332,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
static unsigned long mb_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct mb_cache *cache = container_of(shrink, struct mb_cache,
- c_shrink);
+ struct mb_cache *cache = shrink->private_data;
return mb_cache_shrink(cache, sc->nr_to_scan);
}
@@ -377,15 +375,19 @@ struct mb_cache *mb_cache_create(int bucket_bits)
for (i = 0; i < bucket_count; i++)
INIT_HLIST_BL_HEAD(&cache->c_hash[i]);
- cache->c_shrink.count_objects = mb_cache_count;
- cache->c_shrink.scan_objects = mb_cache_scan;
- cache->c_shrink.seeks = DEFAULT_SEEKS;
- if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) {
+ cache->c_shrink = shrinker_alloc(0, "mbcache-shrinker");
+ if (!cache->c_shrink) {
kfree(cache->c_hash);
kfree(cache);
goto err_out;
}
+ cache->c_shrink->count_objects = mb_cache_count;
+ cache->c_shrink->scan_objects = mb_cache_scan;
+ cache->c_shrink->private_data = cache;
+
+ shrinker_register(cache->c_shrink);
+
INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker);
return cache;
@@ -406,7 +408,7 @@ void mb_cache_destroy(struct mb_cache *cache)
{
struct mb_cache_entry *entry, *next;
- unregister_shrinker(&cache->c_shrink);
+ shrinker_free(cache->c_shrink);
/*
* We don't bother with any locking. Cache must not be used at this
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 4905665c47d0..57d1dedf3f8f 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -256,6 +256,7 @@ struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap)
return idmap;
}
+EXPORT_SYMBOL_GPL(mnt_idmap_get);
/**
* mnt_idmap_put - put a reference to an idmapping
@@ -271,3 +272,4 @@ void mnt_idmap_put(struct mnt_idmap *idmap)
kfree(idmap);
}
}
+EXPORT_SYMBOL_GPL(mnt_idmap_put);
diff --git a/fs/mpage.c b/fs/mpage.c
index 242e213ee064..ffb064ed9d04 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -119,8 +119,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
folio_mark_uptodate(folio);
return;
}
- create_empty_buffers(&folio->page, i_blocksize(inode), 0);
- head = folio_buffers(folio);
+ head = create_empty_buffers(folio, i_blocksize(inode), 0);
}
page_bh = head;
diff --git a/fs/namespace.c b/fs/namespace.c
index 6bde71735efa..fbf0e596fcd3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -39,10 +39,10 @@
/* Maximum number of mounts in a mount namespace */
static unsigned int sysctl_mount_max __read_mostly = 100000;
-static unsigned int m_hash_mask __read_mostly;
-static unsigned int m_hash_shift __read_mostly;
-static unsigned int mp_hash_mask __read_mostly;
-static unsigned int mp_hash_shift __read_mostly;
+static unsigned int m_hash_mask __ro_after_init;
+static unsigned int m_hash_shift __ro_after_init;
+static unsigned int mp_hash_mask __ro_after_init;
+static unsigned int mp_hash_shift __ro_after_init;
static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
@@ -68,9 +68,9 @@ static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
-static struct hlist_head *mount_hashtable __read_mostly;
-static struct hlist_head *mountpoint_hashtable __read_mostly;
-static struct kmem_cache *mnt_cache __read_mostly;
+static struct hlist_head *mount_hashtable __ro_after_init;
+static struct hlist_head *mountpoint_hashtable __ro_after_init;
+static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
@@ -86,7 +86,7 @@ struct mount_kattr {
};
/* /sys/fs */
-struct kobject *fs_kobj;
+struct kobject *fs_kobj __ro_after_init;
EXPORT_SYMBOL_GPL(fs_kobj);
/*
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 7df2503cef6c..01ac733a6320 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -125,7 +125,7 @@ config PNFS_BLOCK
config PNFS_FLEXFILE_LAYOUT
tristate
- depends on NFS_V4_1 && NFS_V3
+ depends on NFS_V4_1
default NFS_V4
config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index cf7365581031..fa1a14def45c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -448,6 +448,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
delegation->cred = get_cred(cred);
delegation->inode = inode;
delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
+ delegation->test_gen = 0;
spin_lock_init(&delegation->lock);
spin_lock(&clp->cl_lock);
@@ -1294,6 +1295,8 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server,
struct inode *inode;
const struct cred *cred;
nfs4_stateid stateid;
+ unsigned long gen = ++server->delegation_gen;
+
restart:
rcu_read_lock();
restart_locked:
@@ -1303,7 +1306,8 @@ restart_locked:
test_bit(NFS_DELEGATION_RETURNING,
&delegation->flags) ||
test_bit(NFS_DELEGATION_TEST_EXPIRED,
- &delegation->flags) == 0)
+ &delegation->flags) == 0 ||
+ delegation->test_gen == gen)
continue;
inode = nfs_delegation_grab_inode(delegation);
if (inode == NULL)
@@ -1312,6 +1316,7 @@ restart_locked:
cred = get_cred_rcu(delegation->cred);
nfs4_stateid_copy(&stateid, &delegation->stateid);
spin_unlock(&delegation->lock);
+ delegation->test_gen = gen;
clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
rcu_read_unlock();
nfs_delegation_test_free_expired(inode, &stateid, cred);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 1c378992b7c0..a6f495d012cf 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -21,6 +21,7 @@ struct nfs_delegation {
fmode_t type;
unsigned long pagemod_limit;
__u64 change_attr;
+ unsigned long test_gen;
unsigned long flags;
refcount_t refcount;
spinlock_t lock;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e6a51fd94fea..13dffe4201e6 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2532,7 +2532,7 @@ EXPORT_SYMBOL_GPL(nfs_unlink);
int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
- struct page *page;
+ struct folio *folio;
char *kaddr;
struct iattr attr;
unsigned int pathlen = strlen(symname);
@@ -2547,24 +2547,24 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
attr.ia_mode = S_IFLNK | S_IRWXUGO;
attr.ia_valid = ATTR_MODE;
- page = alloc_page(GFP_USER);
- if (!page)
+ folio = folio_alloc(GFP_USER, 0);
+ if (!folio)
return -ENOMEM;
- kaddr = page_address(page);
+ kaddr = folio_address(folio);
memcpy(kaddr, symname, pathlen);
if (pathlen < PAGE_SIZE)
memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
trace_nfs_symlink_enter(dir, dentry);
- error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
+ error = NFS_PROTO(dir)->symlink(dir, dentry, folio, pathlen, &attr);
trace_nfs_symlink_exit(dir, dentry, error);
if (error != 0) {
dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n",
dir->i_sb->s_id, dir->i_ino,
dentry, symname, error);
d_drop(dentry);
- __free_page(page);
+ folio_put(folio);
return error;
}
@@ -2574,18 +2574,13 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
* No big deal if we can't add this page to the page cache here.
* READLINK will get the missing page from the server if needed.
*/
- if (!add_to_page_cache_lru(page, d_inode(dentry)->i_mapping, 0,
- GFP_KERNEL)) {
- SetPageUptodate(page);
- unlock_page(page);
- /*
- * add_to_page_cache_lru() grabs an extra page refcount.
- * Drop it here to avoid leaking this page later.
- */
- put_page(page);
- } else
- __free_page(page);
+ if (filemap_add_folio(d_inode(dentry)->i_mapping, folio, 0,
+ GFP_KERNEL) == 0) {
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ }
+ folio_put(folio);
return 0;
}
EXPORT_SYMBOL_GPL(nfs_symlink);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 4bf208a0a8e9..2de66e4e8280 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -543,9 +543,10 @@ out:
}
static int
-nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct folio *folio,
unsigned int len, struct iattr *sattr)
{
+ struct page *page = &folio->page;
struct nfs3_createdata *data;
struct dentry *d_alias;
int status = -ENOMEM;
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index 911f634ba3da..2ad66a8922f4 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -796,28 +796,9 @@ static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink,
static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink,
struct shrink_control *sc);
-static struct shrinker nfs4_xattr_cache_shrinker = {
- .count_objects = nfs4_xattr_cache_count,
- .scan_objects = nfs4_xattr_cache_scan,
- .seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_MEMCG_AWARE,
-};
-
-static struct shrinker nfs4_xattr_entry_shrinker = {
- .count_objects = nfs4_xattr_entry_count,
- .scan_objects = nfs4_xattr_entry_scan,
- .seeks = DEFAULT_SEEKS,
- .batch = 512,
- .flags = SHRINKER_MEMCG_AWARE,
-};
-
-static struct shrinker nfs4_xattr_large_entry_shrinker = {
- .count_objects = nfs4_xattr_entry_count,
- .scan_objects = nfs4_xattr_entry_scan,
- .seeks = 1,
- .batch = 512,
- .flags = SHRINKER_MEMCG_AWARE,
-};
+static struct shrinker *nfs4_xattr_cache_shrinker;
+static struct shrinker *nfs4_xattr_entry_shrinker;
+static struct shrinker *nfs4_xattr_large_entry_shrinker;
static enum lru_status
cache_lru_isolate(struct list_head *item,
@@ -943,7 +924,7 @@ nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc)
struct nfs4_xattr_entry *entry;
struct list_lru *lru;
- lru = (shrink == &nfs4_xattr_large_entry_shrinker) ?
+ lru = (shrink == nfs4_xattr_large_entry_shrinker) ?
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose);
@@ -971,7 +952,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc)
unsigned long count;
struct list_lru *lru;
- lru = (shrink == &nfs4_xattr_large_entry_shrinker) ?
+ lru = (shrink == nfs4_xattr_large_entry_shrinker) ?
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
count = list_lru_shrink_count(lru, sc);
@@ -991,18 +972,34 @@ static void nfs4_xattr_cache_init_once(void *p)
INIT_LIST_HEAD(&cache->dispose);
}
-static int nfs4_xattr_shrinker_init(struct shrinker *shrinker,
- struct list_lru *lru, const char *name)
+typedef unsigned long (*count_objects_cb)(struct shrinker *s,
+ struct shrink_control *sc);
+typedef unsigned long (*scan_objects_cb)(struct shrinker *s,
+ struct shrink_control *sc);
+
+static int __init nfs4_xattr_shrinker_init(struct shrinker **shrinker,
+ struct list_lru *lru, const char *name,
+ count_objects_cb count,
+ scan_objects_cb scan, long batch, int seeks)
{
- int ret = 0;
+ int ret;
- ret = register_shrinker(shrinker, name);
- if (ret)
+ *shrinker = shrinker_alloc(SHRINKER_MEMCG_AWARE, name);
+ if (!*shrinker)
+ return -ENOMEM;
+
+ ret = list_lru_init_memcg(lru, *shrinker);
+ if (ret) {
+ shrinker_free(*shrinker);
return ret;
+ }
- ret = list_lru_init_memcg(lru, shrinker);
- if (ret)
- unregister_shrinker(shrinker);
+ (*shrinker)->count_objects = count;
+ (*shrinker)->scan_objects = scan;
+ (*shrinker)->batch = batch;
+ (*shrinker)->seeks = seeks;
+
+ shrinker_register(*shrinker);
return ret;
}
@@ -1010,7 +1007,7 @@ static int nfs4_xattr_shrinker_init(struct shrinker *shrinker,
static void nfs4_xattr_shrinker_destroy(struct shrinker *shrinker,
struct list_lru *lru)
{
- unregister_shrinker(shrinker);
+ shrinker_free(shrinker);
list_lru_destroy(lru);
}
@@ -1026,27 +1023,31 @@ int __init nfs4_xattr_cache_init(void)
return -ENOMEM;
ret = nfs4_xattr_shrinker_init(&nfs4_xattr_cache_shrinker,
- &nfs4_xattr_cache_lru,
- "nfs-xattr_cache");
+ &nfs4_xattr_cache_lru, "nfs-xattr_cache",
+ nfs4_xattr_cache_count,
+ nfs4_xattr_cache_scan, 0, DEFAULT_SEEKS);
if (ret)
goto out1;
ret = nfs4_xattr_shrinker_init(&nfs4_xattr_entry_shrinker,
- &nfs4_xattr_entry_lru,
- "nfs-xattr_entry");
+ &nfs4_xattr_entry_lru, "nfs-xattr_entry",
+ nfs4_xattr_entry_count,
+ nfs4_xattr_entry_scan, 512, DEFAULT_SEEKS);
if (ret)
goto out2;
ret = nfs4_xattr_shrinker_init(&nfs4_xattr_large_entry_shrinker,
&nfs4_xattr_large_entry_lru,
- "nfs-xattr_large_entry");
+ "nfs-xattr_large_entry",
+ nfs4_xattr_entry_count,
+ nfs4_xattr_entry_scan, 512, 1);
if (!ret)
return 0;
- nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker,
+ nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker,
&nfs4_xattr_entry_lru);
out2:
- nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker,
+ nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker,
&nfs4_xattr_cache_lru);
out1:
kmem_cache_destroy(nfs4_xattr_cache_cachep);
@@ -1056,11 +1057,11 @@ out1:
void nfs4_xattr_cache_exit(void)
{
- nfs4_xattr_shrinker_destroy(&nfs4_xattr_large_entry_shrinker,
+ nfs4_xattr_shrinker_destroy(nfs4_xattr_large_entry_shrinker,
&nfs4_xattr_large_entry_lru);
- nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker,
+ nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker,
&nfs4_xattr_entry_lru);
- nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker,
+ nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker,
&nfs4_xattr_cache_lru);
kmem_cache_destroy(nfs4_xattr_cache_cachep);
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 827d00e2f094..581698f1b7b2 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -209,6 +209,7 @@ struct nfs4_exception {
struct inode *inode;
nfs4_stateid *stateid;
long timeout;
+ unsigned short retrans;
unsigned char task_is_privileged : 1;
unsigned char delay : 1,
recovering : 1,
@@ -546,6 +547,7 @@ extern unsigned short max_session_slots;
extern unsigned short max_session_cb_slots;
extern unsigned short send_implementation_id;
extern bool recover_lost_locks;
+extern short nfs_delay_retrans;
#define NFS4_CLIENT_ID_UNIQ_LEN (64)
extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN];
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a654d7234f51..8a943fffaad5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -585,6 +585,21 @@ wait_on_recovery:
return 0;
}
+/*
+ * Track the number of NFS4ERR_DELAY related retransmissions and return
+ * EAGAIN if the 'softerr' mount option is set, and we've exceeded the limit
+ * set by 'nfs_delay_retrans'.
+ */
+static int nfs4_exception_should_retrans(const struct nfs_server *server,
+ struct nfs4_exception *exception)
+{
+ if (server->flags & NFS_MOUNT_SOFTERR && nfs_delay_retrans >= 0) {
+ if (exception->retrans++ >= (unsigned short)nfs_delay_retrans)
+ return -EAGAIN;
+ }
+ return 0;
+}
+
/* This is the error handling routine for processes that are allowed
* to sleep.
*/
@@ -595,6 +610,11 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
ret = nfs4_do_handle_exception(server, errorcode, exception);
if (exception->delay) {
+ int ret2 = nfs4_exception_should_retrans(server, exception);
+ if (ret2 < 0) {
+ exception->retry = 0;
+ return ret2;
+ }
ret = nfs4_delay(&exception->timeout,
exception->interruptible);
goto out_retry;
@@ -623,6 +643,11 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,
ret = nfs4_do_handle_exception(server, errorcode, exception);
if (exception->delay) {
+ int ret2 = nfs4_exception_should_retrans(server, exception);
+ if (ret2 < 0) {
+ exception->retry = 0;
+ return ret2;
+ }
rpc_delay(task, nfs4_update_delay(&exception->timeout));
goto out_retry;
}
@@ -5011,9 +5036,10 @@ static void nfs4_free_createdata(struct nfs4_createdata *data)
}
static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
- struct page *page, unsigned int len, struct iattr *sattr,
+ struct folio *folio, unsigned int len, struct iattr *sattr,
struct nfs4_label *label)
{
+ struct page *page = &folio->page;
struct nfs4_createdata *data;
int status = -ENAMETOOLONG;
@@ -5038,7 +5064,7 @@ out:
}
static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
- struct page *page, unsigned int len, struct iattr *sattr)
+ struct folio *folio, unsigned int len, struct iattr *sattr)
{
struct nfs4_exception exception = {
.interruptible = true,
@@ -5049,7 +5075,7 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
label = nfs4_label_init_security(dir, dentry, sattr, &l);
do {
- err = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label);
+ err = _nfs4_proc_symlink(dir, dentry, folio, len, sattr, label);
trace_nfs4_symlink(dir, &dentry->d_name, err);
err = nfs4_handle_exception(NFS_SERVER(dir), err,
&exception);
@@ -5622,7 +5648,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
- nfs4_state_protect_write(server->nfs_client, clnt, msg, hdr);
+ nfs4_state_protect_write(hdr->ds_clp ? hdr->ds_clp : server->nfs_client, clnt, msg, hdr);
}
static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
@@ -5663,7 +5689,8 @@ static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess
data->res.server = server;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0);
- nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_COMMIT, clnt, msg);
+ nfs4_state_protect(data->ds_clp ? data->ds_clp : server->nfs_client,
+ NFS_SP4_MACH_CRED_COMMIT, clnt, msg);
}
static int _nfs4_proc_commit(struct file *dst, struct nfs_commitargs *args,
@@ -8934,6 +8961,7 @@ void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED);
+try_again:
/* Test connection for session trunking. Async exchange_id call */
task = nfs4_run_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
if (IS_ERR(task))
@@ -8946,11 +8974,15 @@ void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
if (status == 0)
rpc_clnt_xprt_switch_add_xprt(clnt, xprt);
- else if (rpc_clnt_xprt_switch_has_addr(clnt,
+ else if (status != -NFS4ERR_DELAY && rpc_clnt_xprt_switch_has_addr(clnt,
(struct sockaddr *)&xprt->addr))
rpc_clnt_xprt_switch_remove_xprt(clnt, xprt);
rpc_put_task(task);
+ if (status == -NFS4ERR_DELAY) {
+ ssleep(1);
+ goto try_again;
+ }
}
EXPORT_SYMBOL_GPL(nfs4_test_session_trunk);
@@ -9621,6 +9653,9 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
nfs4_sequence_free_slot(&lgp->res.seq_res);
+ exception->state = NULL;
+ exception->stateid = NULL;
+
switch (nfs4err) {
case 0:
goto out;
@@ -9716,7 +9751,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
};
struct pnfs_layout_segment *
-nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp,
+ struct nfs4_exception *exception)
{
struct inode *inode = lgp->args.inode;
struct nfs_server *server = NFS_SERVER(inode);
@@ -9736,13 +9772,10 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
RPC_TASK_MOVEABLE,
};
struct pnfs_layout_segment *lseg = NULL;
- struct nfs4_exception exception = {
- .inode = inode,
- .timeout = *timeout,
- };
int status = 0;
nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0);
+ exception->retry = 0;
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
@@ -9753,11 +9786,12 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
goto out;
if (task->tk_status < 0) {
- status = nfs4_layoutget_handle_exception(task, lgp, &exception);
- *timeout = exception.timeout;
+ exception->retry = 1;
+ status = nfs4_layoutget_handle_exception(task, lgp, exception);
} else if (lgp->res.layoutp->len == 0) {
+ exception->retry = 1;
status = -EAGAIN;
- *timeout = nfs4_update_delay(&exception.timeout);
+ nfs4_update_delay(&exception->timeout);
} else
lseg = pnfs_layout_process(lgp);
out:
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 84343aefbbd6..21a365357629 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1980,7 +1980,9 @@ pnfs_update_layout(struct inode *ino,
struct pnfs_layout_segment *lseg = NULL;
struct nfs4_layoutget *lgp;
nfs4_stateid stateid;
- long timeout = 0;
+ struct nfs4_exception exception = {
+ .inode = ino,
+ };
unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
bool first;
@@ -2144,7 +2146,7 @@ lookup_again:
lgp->lo = lo;
pnfs_get_layout_hdr(lo);
- lseg = nfs4_proc_layoutget(lgp, &timeout);
+ lseg = nfs4_proc_layoutget(lgp, &exception);
trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
nfs_layoutget_end(lo);
@@ -2171,6 +2173,8 @@ lookup_again:
goto out_put_layout_hdr;
}
if (lseg) {
+ if (!exception.retry)
+ goto out_put_layout_hdr;
if (first)
pnfs_clear_first_layoutget(lo);
trace_pnfs_update_layout(ino, pos, count,
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index d886c8226d8f..db57a85500ee 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -35,6 +35,7 @@
#include <linux/nfs_page.h>
#include <linux/workqueue.h>
+struct nfs4_exception;
struct nfs4_opendata;
enum {
@@ -245,7 +246,9 @@ extern size_t max_response_pages(struct nfs_server *server);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev,
const struct cred *cred);
-extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout);
+extern struct pnfs_layout_segment *
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp,
+ struct nfs4_exception *exception);
extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
/* pnfs.c */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index e3570c656b0f..ad3a321ae997 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -396,9 +396,10 @@ nfs_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
}
static int
-nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct folio *folio,
unsigned int len, struct iattr *sattr)
{
+ struct page *page = &folio->page;
struct nfs_fh *fh;
struct nfs_fattr *fattr;
struct nfs_symlinkargs arg = {
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 9b1cfca8112a..075b31c93f87 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -129,11 +129,7 @@ static void nfs_ssc_unregister_ops(void)
}
#endif /* CONFIG_NFS_V4_2 */
-static struct shrinker acl_shrinker = {
- .count_objects = nfs_access_cache_count,
- .scan_objects = nfs_access_cache_scan,
- .seeks = DEFAULT_SEEKS,
-};
+static struct shrinker *acl_shrinker;
/*
* Register the NFS filesystems
@@ -153,9 +149,18 @@ int __init register_nfs_fs(void)
ret = nfs_register_sysctl();
if (ret < 0)
goto error_2;
- ret = register_shrinker(&acl_shrinker, "nfs-acl");
- if (ret < 0)
+
+ acl_shrinker = shrinker_alloc(0, "nfs-acl");
+ if (!acl_shrinker) {
+ ret = -ENOMEM;
goto error_3;
+ }
+
+ acl_shrinker->count_objects = nfs_access_cache_count;
+ acl_shrinker->scan_objects = nfs_access_cache_scan;
+
+ shrinker_register(acl_shrinker);
+
#ifdef CONFIG_NFS_V4_2
nfs_ssc_register_ops();
#endif
@@ -175,7 +180,7 @@ error_0:
*/
void __exit unregister_nfs_fs(void)
{
- unregister_shrinker(&acl_shrinker);
+ shrinker_free(acl_shrinker);
nfs_unregister_sysctl();
unregister_nfs4_fs();
#ifdef CONFIG_NFS_V4_2
@@ -1366,6 +1371,7 @@ unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE;
unsigned short send_implementation_id = 1;
char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
bool recover_lost_locks = false;
+short nfs_delay_retrans = -1;
EXPORT_SYMBOL_GPL(nfs_callback_nr_threads);
EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
@@ -1376,6 +1382,7 @@ EXPORT_SYMBOL_GPL(max_session_cb_slots);
EXPORT_SYMBOL_GPL(send_implementation_id);
EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier);
EXPORT_SYMBOL_GPL(recover_lost_locks);
+EXPORT_SYMBOL_GPL(nfs_delay_retrans);
#define NFS_CALLBACK_MAXPORTNR (65535U)
@@ -1424,5 +1431,9 @@ MODULE_PARM_DESC(recover_lost_locks,
"If the server reports that a lock might be lost, "
"try to recover it risking data corruption.");
-
+module_param_named(delay_retrans, nfs_delay_retrans, short, 0644);
+MODULE_PARM_DESC(delay_retrans,
+ "Unless negative, specifies the number of times the NFSv4 "
+ "client retries a request before returning an EAGAIN error, "
+ "after a reply of NFS4ERR_DELAY from the server.");
#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9d82d50ce0b1..b664caea8b4e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -739,6 +739,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
&pgio);
pgio.pg_error = 0;
nfs_pageio_complete(&pgio);
+ if (err == -EAGAIN && mntflags & NFS_MOUNT_SOFTERR)
+ break;
} while (err < 0 && !nfs_error_is_fatal(err));
nfs_io_completion_put(ioc);
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 929248c6ca84..4cbe0434cbb8 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -84,8 +84,8 @@ int nfsd_net_reply_cache_init(struct nfsd_net *nn);
void nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
int nfsd_reply_cache_init(struct nfsd_net *);
void nfsd_reply_cache_shutdown(struct nfsd_net *);
-int nfsd_cache_lookup(struct svc_rqst *rqstp,
- struct nfsd_cacherep **cacherep);
+int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
+ unsigned int len, struct nfsd_cacherep **cacherep);
void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp,
int cachetype, __be32 *statp);
int nfsd_reply_cache_stats_show(struct seq_file *m, void *v);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index b7da17e53007..7b641095a665 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -426,8 +426,7 @@ static int check_export(struct path *path, int *flags, unsigned char *uuid)
return -EINVAL;
}
- if (!inode->i_sb->s_export_op ||
- !inode->i_sb->s_export_op->fh_to_dentry) {
+ if (!exportfs_can_decode_fh(inode->i_sb->s_export_op)) {
dprintk("exp_export: export of invalid fs type.\n");
return -EINVAL;
}
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 07bf219f9ae4..ef063f93fde9 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -521,11 +521,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
return ret;
}
-static struct shrinker nfsd_file_shrinker = {
- .scan_objects = nfsd_file_lru_scan,
- .count_objects = nfsd_file_lru_count,
- .seeks = 1,
-};
+static struct shrinker *nfsd_file_shrinker;
/**
* nfsd_file_cond_queue - conditionally unhash and queue a nfsd_file
@@ -746,12 +742,19 @@ nfsd_file_cache_init(void)
goto out_err;
}
- ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache");
- if (ret) {
- pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret);
+ nfsd_file_shrinker = shrinker_alloc(0, "nfsd-filecache");
+ if (!nfsd_file_shrinker) {
+ ret = -ENOMEM;
+ pr_err("nfsd: failed to allocate nfsd_file_shrinker\n");
goto out_lru;
}
+ nfsd_file_shrinker->count_objects = nfsd_file_lru_count;
+ nfsd_file_shrinker->scan_objects = nfsd_file_lru_scan;
+ nfsd_file_shrinker->seeks = 1;
+
+ shrinker_register(nfsd_file_shrinker);
+
ret = lease_register_notifier(&nfsd_file_lease_notifier);
if (ret) {
pr_err("nfsd: unable to register lease notifier: %d\n", ret);
@@ -774,7 +777,7 @@ out:
out_notifier:
lease_unregister_notifier(&nfsd_file_lease_notifier);
out_shrinker:
- unregister_shrinker(&nfsd_file_shrinker);
+ shrinker_free(nfsd_file_shrinker);
out_lru:
list_lru_destroy(&nfsd_file_lru);
out_err:
@@ -891,7 +894,7 @@ nfsd_file_cache_shutdown(void)
return;
lease_unregister_notifier(&nfsd_file_lease_notifier);
- unregister_shrinker(&nfsd_file_shrinker);
+ shrinker_free(nfsd_file_shrinker);
/*
* make sure all callers of nfsd_file_lru_cb are done before
* calling nfsd_file_cache_purge
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index ec49b200b797..ab303a8b77d5 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -177,7 +177,7 @@ struct nfsd_net {
/* size of cache when we saw the longest hash chain */
unsigned int longest_chain_cachesize;
- struct shrinker nfsd_reply_cache_shrinker;
+ struct shrinker *nfsd_reply_cache_shrinker;
/* tracking server-to-server copy mounts */
spinlock_t nfsd_ssc_lock;
@@ -195,7 +195,7 @@ struct nfsd_net {
int nfs4_max_clients;
atomic_t nfsd_courtesy_clients;
- struct shrinker nfsd_client_shrinker;
+ struct shrinker *nfsd_client_shrinker;
struct work_struct nfsd_shrinker_work;
};
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 65fd5510323a..40415929e2ae 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2804,7 +2804,7 @@ static int client_opens_release(struct inode *inode, struct file *file)
/* XXX: alternatively, we could get/drop in seq start/stop */
drop_client(clp);
- return 0;
+ return seq_release(inode, file);
}
static const struct file_operations client_states_fops = {
@@ -4452,8 +4452,7 @@ static unsigned long
nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
{
int count;
- struct nfsd_net *nn = container_of(shrink,
- struct nfsd_net, nfsd_client_shrinker);
+ struct nfsd_net *nn = shrink->private_data;
count = atomic_read(&nn->nfsd_courtesy_clients);
if (!count)
@@ -8235,12 +8234,16 @@ static int nfs4_state_create_net(struct net *net)
INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker);
get_net(net);
- nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan;
- nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count;
- nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
-
- if (register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client"))
+ nn->nfsd_client_shrinker = shrinker_alloc(0, "nfsd-client");
+ if (!nn->nfsd_client_shrinker)
goto err_shrinker;
+
+ nn->nfsd_client_shrinker->scan_objects = nfsd4_state_shrinker_scan;
+ nn->nfsd_client_shrinker->count_objects = nfsd4_state_shrinker_count;
+ nn->nfsd_client_shrinker->private_data = nn;
+
+ shrinker_register(nn->nfsd_client_shrinker);
+
return 0;
err_shrinker:
@@ -8338,7 +8341,7 @@ nfs4_state_shutdown_net(struct net *net)
struct list_head *pos, *next, reaplist;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- unregister_shrinker(&nn->nfsd_client_shrinker);
+ shrinker_free(nn->nfsd_client_shrinker);
cancel_work(&nn->nfsd_shrinker_work);
cancel_delayed_work_sync(&nn->laundromat_work);
locks_end_grace(&nn->nfsd4_manager);
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 80621a709510..d3273a396659 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -201,26 +201,29 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
{
unsigned int hashsize;
unsigned int i;
- int status = 0;
nn->max_drc_entries = nfsd_cache_size_limit();
atomic_set(&nn->num_drc_entries, 0);
hashsize = nfsd_hashsize(nn->max_drc_entries);
nn->maskbits = ilog2(hashsize);
- nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
- nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
- nn->nfsd_reply_cache_shrinker.seeks = 1;
- status = register_shrinker(&nn->nfsd_reply_cache_shrinker,
- "nfsd-reply:%s", nn->nfsd_name);
- if (status)
- return status;
-
nn->drc_hashtbl = kvzalloc(array_size(hashsize,
sizeof(*nn->drc_hashtbl)), GFP_KERNEL);
if (!nn->drc_hashtbl)
+ return -ENOMEM;
+
+ nn->nfsd_reply_cache_shrinker = shrinker_alloc(0, "nfsd-reply:%s",
+ nn->nfsd_name);
+ if (!nn->nfsd_reply_cache_shrinker)
goto out_shrinker;
+ nn->nfsd_reply_cache_shrinker->scan_objects = nfsd_reply_cache_scan;
+ nn->nfsd_reply_cache_shrinker->count_objects = nfsd_reply_cache_count;
+ nn->nfsd_reply_cache_shrinker->seeks = 1;
+ nn->nfsd_reply_cache_shrinker->private_data = nn;
+
+ shrinker_register(nn->nfsd_reply_cache_shrinker);
+
for (i = 0; i < hashsize; i++) {
INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head);
spin_lock_init(&nn->drc_hashtbl[i].cache_lock);
@@ -229,7 +232,7 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
return 0;
out_shrinker:
- unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
+ kvfree(nn->drc_hashtbl);
printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
return -ENOMEM;
}
@@ -239,7 +242,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
struct nfsd_cacherep *rp;
unsigned int i;
- unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
+ shrinker_free(nn->nfsd_reply_cache_shrinker);
for (i = 0; i < nn->drc_hashsize; i++) {
struct list_head *head = &nn->drc_hashtbl[i].lru_head;
@@ -323,8 +326,7 @@ nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b,
static unsigned long
nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
- struct nfsd_net *nn = container_of(shrink,
- struct nfsd_net, nfsd_reply_cache_shrinker);
+ struct nfsd_net *nn = shrink->private_data;
return atomic_read(&nn->num_drc_entries);
}
@@ -343,8 +345,7 @@ nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
static unsigned long
nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- struct nfsd_net *nn = container_of(shrink,
- struct nfsd_net, nfsd_reply_cache_shrinker);
+ struct nfsd_net *nn = shrink->private_data;
unsigned long freed = 0;
LIST_HEAD(dispose);
unsigned int i;
@@ -368,33 +369,52 @@ nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
return freed;
}
-/*
- * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
+/**
+ * nfsd_cache_csum - Checksum incoming NFS Call arguments
+ * @buf: buffer containing a whole RPC Call message
+ * @start: starting byte of the NFS Call header
+ * @remaining: size of the NFS Call header, in bytes
+ *
+ * Compute a weak checksum of the leading bytes of an NFS procedure
+ * call header to help verify that a retransmitted Call matches an
+ * entry in the duplicate reply cache.
+ *
+ * To avoid assumptions about how the RPC message is laid out in
+ * @buf and what else it might contain (eg, a GSS MIC suffix), the
+ * caller passes us the exact location and length of the NFS Call
+ * header.
+ *
+ * Returns a 32-bit checksum value, as defined in RFC 793.
*/
-static __wsum
-nfsd_cache_csum(struct svc_rqst *rqstp)
+static __wsum nfsd_cache_csum(struct xdr_buf *buf, unsigned int start,
+ unsigned int remaining)
{
+ unsigned int base, len;
+ struct xdr_buf subbuf;
+ __wsum csum = 0;
+ void *p;
int idx;
- unsigned int base;
- __wsum csum;
- struct xdr_buf *buf = &rqstp->rq_arg;
- const unsigned char *p = buf->head[0].iov_base;
- size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len,
- RC_CSUMLEN);
- size_t len = min(buf->head[0].iov_len, csum_len);
+
+ if (remaining > RC_CSUMLEN)
+ remaining = RC_CSUMLEN;
+ if (xdr_buf_subsegment(buf, &subbuf, start, remaining))
+ return csum;
/* rq_arg.head first */
- csum = csum_partial(p, len, 0);
- csum_len -= len;
+ if (subbuf.head[0].iov_len) {
+ len = min_t(unsigned int, subbuf.head[0].iov_len, remaining);
+ csum = csum_partial(subbuf.head[0].iov_base, len, csum);
+ remaining -= len;
+ }
/* Continue into page array */
- idx = buf->page_base / PAGE_SIZE;
- base = buf->page_base & ~PAGE_MASK;
- while (csum_len) {
- p = page_address(buf->pages[idx]) + base;
- len = min_t(size_t, PAGE_SIZE - base, csum_len);
+ idx = subbuf.page_base / PAGE_SIZE;
+ base = subbuf.page_base & ~PAGE_MASK;
+ while (remaining) {
+ p = page_address(subbuf.pages[idx]) + base;
+ len = min_t(unsigned int, PAGE_SIZE - base, remaining);
csum = csum_partial(p, len, csum);
- csum_len -= len;
+ remaining -= len;
base = 0;
++idx;
}
@@ -465,6 +485,8 @@ out:
/**
* nfsd_cache_lookup - Find an entry in the duplicate reply cache
* @rqstp: Incoming Call to find
+ * @start: starting byte in @rqstp->rq_arg of the NFS Call header
+ * @len: size of the NFS Call header, in bytes
* @cacherep: OUT: DRC entry for this request
*
* Try to find an entry matching the current call in the cache. When none
@@ -478,7 +500,8 @@ out:
* %RC_REPLY: Reply from cache
* %RC_DROPIT: Do not process the request further
*/
-int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep)
+int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
+ unsigned int len, struct nfsd_cacherep **cacherep)
{
struct nfsd_net *nn;
struct nfsd_cacherep *rp, *found;
@@ -494,7 +517,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep)
goto out;
}
- csum = nfsd_cache_csum(rqstp);
+ csum = nfsd_cache_csum(&rqstp->rq_arg, start, len);
/*
* Since the common case is a cache miss followed by an insert,
@@ -640,24 +663,17 @@ void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp,
return;
}
-/*
- * Copy cached reply to current reply buffer. Should always fit.
- * FIXME as reply is in a page, we should just attach the page, and
- * keep a refcount....
- */
static int
nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
{
- struct kvec *vec = &rqstp->rq_res.head[0];
-
- if (vec->iov_len + data->iov_len > PAGE_SIZE) {
- printk(KERN_WARNING "nfsd: cached reply too large (%zd).\n",
- data->iov_len);
- return 0;
- }
- memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len);
- vec->iov_len += data->iov_len;
- return 1;
+ __be32 *p;
+
+ p = xdr_reserve_space(&rqstp->rq_res_stream, data->iov_len);
+ if (unlikely(!p))
+ return false;
+ memcpy(p, data->iov_base, data->iov_len);
+ xdr_commit_encode(&rqstp->rq_res_stream);
+ return true;
}
/*
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index d6122bb2d167..fe61d9bbcc1f 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -981,6 +981,8 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
const struct svc_procedure *proc = rqstp->rq_procinfo;
__be32 *statp = rqstp->rq_accept_statp;
struct nfsd_cacherep *rp;
+ unsigned int start, len;
+ __be32 *nfs_reply;
/*
* Give the xdr decoder a chance to change this if it wants
@@ -988,6 +990,13 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
*/
rqstp->rq_cachetype = proc->pc_cachetype;
+ /*
+ * ->pc_decode advances the argument stream past the NFS
+ * Call header, so grab the header's starting location and
+ * size now for the call to nfsd_cache_lookup().
+ */
+ start = xdr_stream_pos(&rqstp->rq_arg_stream);
+ len = xdr_stream_remaining(&rqstp->rq_arg_stream);
if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
goto out_decode_err;
@@ -1001,7 +1010,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1);
rp = NULL;
- switch (nfsd_cache_lookup(rqstp, &rp)) {
+ switch (nfsd_cache_lookup(rqstp, start, len, &rp)) {
case RC_DOIT:
break;
case RC_REPLY:
@@ -1010,6 +1019,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
goto out_dropit;
}
+ nfs_reply = xdr_inline_decode(&rqstp->rq_res_stream, 0);
*statp = proc->pc_func(rqstp);
if (test_bit(RQ_DROPME, &rqstp->rq_flags))
goto out_update_drop;
@@ -1023,7 +1033,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
*/
smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1);
- nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1);
+ nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply);
out_cached_reply:
return 1;
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 19c8158605ed..c97c77a39668 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -356,30 +356,28 @@ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
*/
int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
{
- pgoff_t index = (pgoff_t)block >>
- (PAGE_SHIFT - inode->i_blkbits);
- struct page *page;
- unsigned long first_block;
+ pgoff_t index = block >> (PAGE_SHIFT - inode->i_blkbits);
+ struct folio *folio;
+ struct buffer_head *bh;
int ret = 0;
int still_dirty;
- page = find_lock_page(inode->i_mapping, index);
- if (!page)
+ folio = filemap_lock_folio(inode->i_mapping, index);
+ if (IS_ERR(folio))
return -ENOENT;
- wait_on_page_writeback(page);
-
- first_block = (unsigned long)index <<
- (PAGE_SHIFT - inode->i_blkbits);
- if (page_has_buffers(page)) {
- struct buffer_head *bh;
+ folio_wait_writeback(folio);
- bh = nilfs_page_get_nth_block(page, block - first_block);
+ bh = folio_buffers(folio);
+ if (bh) {
+ unsigned long first_block = index <<
+ (PAGE_SHIFT - inode->i_blkbits);
+ bh = get_nth_bh(bh, block - first_block);
nilfs_forget_buffer(bh);
}
- still_dirty = PageDirty(page);
- unlock_page(page);
- put_page(page);
+ still_dirty = folio_test_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
if (still_dirty ||
invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0)
@@ -560,17 +558,19 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
{
struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
struct buffer_head *bh_frozen;
- struct page *page;
+ struct folio *folio;
int blkbits = inode->i_blkbits;
- page = grab_cache_page(shadow->inode->i_mapping, bh->b_folio->index);
- if (!page)
- return -ENOMEM;
+ folio = filemap_grab_folio(shadow->inode->i_mapping,
+ bh->b_folio->index);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << blkbits, 0);
+ bh_frozen = folio_buffers(folio);
+ if (!bh_frozen)
+ bh_frozen = create_empty_buffers(folio, 1 << blkbits, 0);
- bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
+ bh_frozen = get_nth_bh(bh_frozen, bh_offset(bh) >> blkbits);
if (!buffer_uptodate(bh_frozen))
nilfs_copy_buffer(bh_frozen, bh);
@@ -582,8 +582,8 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
brelse(bh_frozen); /* already frozen */
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return 0;
}
@@ -592,17 +592,19 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
{
struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
struct buffer_head *bh_frozen = NULL;
- struct page *page;
+ struct folio *folio;
int n;
- page = find_lock_page(shadow->inode->i_mapping, bh->b_folio->index);
- if (page) {
- if (page_has_buffers(page)) {
+ folio = filemap_lock_folio(shadow->inode->i_mapping,
+ bh->b_folio->index);
+ if (!IS_ERR(folio)) {
+ bh_frozen = folio_buffers(folio);
+ if (bh_frozen) {
n = bh_offset(bh) >> inode->i_blkbits;
- bh_frozen = nilfs_page_get_nth_block(page, n);
+ bh_frozen = get_nth_bh(bh_frozen, n);
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
return bh_frozen;
}
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index b4e54d079b7d..06b04758f289 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -25,19 +25,19 @@
(BIT(BH_Uptodate) | BIT(BH_Mapped) | BIT(BH_NILFS_Node) | \
BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked))
-static struct buffer_head *
-__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
- int blkbits, unsigned long b_state)
+static struct buffer_head *__nilfs_get_folio_block(struct folio *folio,
+ unsigned long block, pgoff_t index, int blkbits,
+ unsigned long b_state)
{
unsigned long first_block;
- struct buffer_head *bh;
+ struct buffer_head *bh = folio_buffers(folio);
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << blkbits, b_state);
+ if (!bh)
+ bh = create_empty_buffers(folio, 1 << blkbits, b_state);
first_block = (unsigned long)index << (PAGE_SHIFT - blkbits);
- bh = nilfs_page_get_nth_block(page, block - first_block);
+ bh = get_nth_bh(bh, block - first_block);
touch_buffer(bh);
wait_on_buffer(bh);
@@ -51,17 +51,17 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
{
int blkbits = inode->i_blkbits;
pgoff_t index = blkoff >> (PAGE_SHIFT - blkbits);
- struct page *page;
+ struct folio *folio;
struct buffer_head *bh;
- page = grab_cache_page(mapping, index);
- if (unlikely(!page))
+ folio = filemap_grab_folio(mapping, index);
+ if (IS_ERR(folio))
return NULL;
- bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state);
+ bh = __nilfs_get_folio_block(folio, blkoff, index, blkbits, b_state);
if (unlikely(!bh)) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return NULL;
}
return bh;
@@ -184,30 +184,32 @@ void nilfs_page_bug(struct page *page)
}
/**
- * nilfs_copy_page -- copy the page with buffers
- * @dst: destination page
- * @src: source page
- * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
+ * nilfs_copy_folio -- copy the folio with buffers
+ * @dst: destination folio
+ * @src: source folio
+ * @copy_dirty: flag whether to copy dirty states on the folio's buffer heads.
*
- * This function is for both data pages and btnode pages. The dirty flag
- * should be treated by caller. The page must not be under i/o.
- * Both src and dst page must be locked
+ * This function is for both data folios and btnode folios. The dirty flag
+ * should be treated by caller. The folio must not be under i/o.
+ * Both src and dst folio must be locked
*/
-static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
+static void nilfs_copy_folio(struct folio *dst, struct folio *src,
+ bool copy_dirty)
{
struct buffer_head *dbh, *dbufs, *sbh;
unsigned long mask = NILFS_BUFFER_INHERENT_BITS;
- BUG_ON(PageWriteback(dst));
+ BUG_ON(folio_test_writeback(dst));
- sbh = page_buffers(src);
- if (!page_has_buffers(dst))
- create_empty_buffers(dst, sbh->b_size, 0);
+ sbh = folio_buffers(src);
+ dbh = folio_buffers(dst);
+ if (!dbh)
+ dbh = create_empty_buffers(dst, sbh->b_size, 0);
if (copy_dirty)
mask |= BIT(BH_Dirty);
- dbh = dbufs = page_buffers(dst);
+ dbufs = dbh;
do {
lock_buffer(sbh);
lock_buffer(dbh);
@@ -218,16 +220,16 @@ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
dbh = dbh->b_this_page;
} while (dbh != dbufs);
- copy_highpage(dst, src);
+ folio_copy(dst, src);
- if (PageUptodate(src) && !PageUptodate(dst))
- SetPageUptodate(dst);
- else if (!PageUptodate(src) && PageUptodate(dst))
- ClearPageUptodate(dst);
- if (PageMappedToDisk(src) && !PageMappedToDisk(dst))
- SetPageMappedToDisk(dst);
- else if (!PageMappedToDisk(src) && PageMappedToDisk(dst))
- ClearPageMappedToDisk(dst);
+ if (folio_test_uptodate(src) && !folio_test_uptodate(dst))
+ folio_mark_uptodate(dst);
+ else if (!folio_test_uptodate(src) && folio_test_uptodate(dst))
+ folio_clear_uptodate(dst);
+ if (folio_test_mappedtodisk(src) && !folio_test_mappedtodisk(dst))
+ folio_set_mappedtodisk(dst);
+ else if (!folio_test_mappedtodisk(src) && folio_test_mappedtodisk(dst))
+ folio_clear_mappedtodisk(dst);
do {
unlock_buffer(sbh);
@@ -269,7 +271,7 @@ repeat:
NILFS_PAGE_BUG(&folio->page,
"found empty page in dat page cache");
- nilfs_copy_page(&dfolio->page, &folio->page, 1);
+ nilfs_copy_folio(dfolio, folio, true);
filemap_dirty_folio(folio_mapping(dfolio), dfolio);
folio_unlock(dfolio);
@@ -314,7 +316,7 @@ repeat:
if (!IS_ERR(dfolio)) {
/* overwrite existing folio in the destination cache */
WARN_ON(folio_test_dirty(dfolio));
- nilfs_copy_page(&dfolio->page, &folio->page, 0);
+ nilfs_copy_folio(dfolio, folio, false);
folio_unlock(dfolio);
folio_put(dfolio);
/* Do we not need to remove folio from smap here? */
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 21ddcdd4d63e..d249ea1cefff 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -52,15 +52,4 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
#define NILFS_PAGE_BUG(page, m, a...) \
do { nilfs_page_bug(page); BUG(); } while (0)
-static inline struct buffer_head *
-nilfs_page_get_nth_block(struct page *page, unsigned int count)
-{
- struct buffer_head *bh = page_buffers(page);
-
- while (count-- > 0)
- bh = bh->b_this_page;
- get_bh(bh);
- return bh;
-}
-
#endif /* _NILFS_PAGE_H */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 7ec16879756e..55e31cc903d1 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -731,10 +731,9 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
continue;
}
head = folio_buffers(folio);
- if (!head) {
- create_empty_buffers(&folio->page, i_blocksize(inode), 0);
- head = folio_buffers(folio);
- }
+ if (!head)
+ head = create_empty_buffers(folio,
+ i_blocksize(inode), 0);
folio_unlock(folio);
bh = head;
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 869b016014d2..1cb9ad7e884e 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -39,9 +39,9 @@ static void __init dnotify_sysctl_init(void)
#define dnotify_sysctl_init() do { } while (0)
#endif
-static struct kmem_cache *dnotify_struct_cache __read_mostly;
-static struct kmem_cache *dnotify_mark_cache __read_mostly;
-static struct fsnotify_group *dnotify_group __read_mostly;
+static struct kmem_cache *dnotify_struct_cache __ro_after_init;
+static struct kmem_cache *dnotify_mark_cache __ro_after_init;
+static struct fsnotify_group *dnotify_group __ro_after_init;
/*
* dnotify will attach one of these to each inode (i_fsnotify_marks) which
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 62fe0b679e58..4d765c72496f 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -112,10 +112,10 @@ static void __init fanotify_sysctls_init(void)
extern const struct fsnotify_ops fanotify_fsnotify_ops;
-struct kmem_cache *fanotify_mark_cache __read_mostly;
-struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
-struct kmem_cache *fanotify_path_event_cachep __read_mostly;
-struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
+struct kmem_cache *fanotify_mark_cache __ro_after_init;
+struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
+struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
+struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
#define FANOTIFY_EVENT_ALIGN 4
#define FANOTIFY_FID_INFO_HDR_LEN \
@@ -1595,7 +1595,7 @@ static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
* file handles so user can use name_to_handle_at() to compare fids
* reported with events to the file handle of watched objects.
*/
- if (!nop)
+ if (!exportfs_can_encode_fid(nop))
return -EOPNOTSUPP;
/*
@@ -1603,7 +1603,7 @@ static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
* supports decoding file handles, so user has a way to map back the
* reported fids to filesystem objects.
*/
- if (mark_type != FAN_MARK_INODE && !nop->fh_to_dentry)
+ if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop))
return -EOPNOTSUPP;
return 0;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1c4bfdab008d..a3809ae92170 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -49,7 +49,7 @@
/* configurable via /proc/sys/fs/inotify/ */
static int inotify_max_queued_events __read_mostly;
-struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
+struct kmem_cache *inotify_inode_mark_cachep __ro_after_init;
#ifdef CONFIG_SYSCTL
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 4e158bce4192..71e31e789b29 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -145,13 +145,12 @@ still_busy:
}
/**
- * ntfs_read_block - fill a @page of an address space with data
- * @page: page cache page to fill with data
+ * ntfs_read_block - fill a @folio of an address space with data
+ * @folio: page cache folio to fill with data
*
- * Fill the page @page of the address space belonging to the @page->host inode.
* We read each buffer asynchronously and when all buffers are read in, our io
* completion handler ntfs_end_buffer_read_async(), if required, automatically
- * applies the mst fixups to the page before finally marking it uptodate and
+ * applies the mst fixups to the folio before finally marking it uptodate and
* unlocking it.
*
* We only enforce allocated_size limit because i_size is checked for in
@@ -161,7 +160,7 @@ still_busy:
*
* Contains an adapted version of fs/buffer.c::block_read_full_folio().
*/
-static int ntfs_read_block(struct page *page)
+static int ntfs_read_block(struct folio *folio)
{
loff_t i_size;
VCN vcn;
@@ -178,7 +177,7 @@ static int ntfs_read_block(struct page *page)
int i, nr;
unsigned char blocksize_bits;
- vi = page->mapping->host;
+ vi = folio->mapping->host;
ni = NTFS_I(vi);
vol = ni->vol;
@@ -188,15 +187,10 @@ static int ntfs_read_block(struct page *page)
blocksize = vol->sb->s_blocksize;
blocksize_bits = vol->sb->s_blocksize_bits;
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, blocksize, 0);
- if (unlikely(!page_has_buffers(page))) {
- unlock_page(page);
- return -ENOMEM;
- }
- }
- bh = head = page_buffers(page);
- BUG_ON(!bh);
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
+ bh = head;
/*
* We may be racing with truncate. To avoid some of the problems we
@@ -205,11 +199,11 @@ static int ntfs_read_block(struct page *page)
* may leave some buffers unmapped which are now allocated. This is
* not a problem since these buffers will just get mapped when a write
* occurs. In case of a shrinking truncate, we will detect this later
- * on due to the runlist being incomplete and if the page is being
+ * on due to the runlist being incomplete and if the folio is being
* fully truncated, truncate will throw it away as soon as we unlock
* it so no need to worry what we do with it.
*/
- iblock = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
+ iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
read_lock_irqsave(&ni->size_lock, flags);
lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
init_size = ni->initialized_size;
@@ -221,7 +215,7 @@ static int ntfs_read_block(struct page *page)
}
zblock = (init_size + blocksize - 1) >> blocksize_bits;
- /* Loop through all the buffers in the page. */
+ /* Loop through all the buffers in the folio. */
rl = NULL;
nr = i = 0;
do {
@@ -299,7 +293,7 @@ lock_retry_remap:
if (!err)
err = -EIO;
bh->b_blocknr = -1;
- SetPageError(page);
+ folio_set_error(folio);
ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
"attribute type 0x%x, vcn 0x%llx, "
"offset 0x%x because its location on "
@@ -312,13 +306,13 @@ lock_retry_remap:
/*
* Either iblock was outside lblock limits or
* ntfs_rl_vcn_to_lcn() returned error. Just zero that portion
- * of the page and set the buffer uptodate.
+ * of the folio and set the buffer uptodate.
*/
handle_hole:
bh->b_blocknr = -1UL;
clear_buffer_mapped(bh);
handle_zblock:
- zero_user(page, i * blocksize, blocksize);
+ folio_zero_range(folio, i * blocksize, blocksize);
if (likely(!err))
set_buffer_uptodate(bh);
} while (i++, iblock++, (bh = bh->b_this_page) != head);
@@ -349,11 +343,11 @@ handle_zblock:
return 0;
}
/* No i/o was scheduled on any of the buffers. */
- if (likely(!PageError(page)))
- SetPageUptodate(page);
+ if (likely(!folio_test_error(folio)))
+ folio_mark_uptodate(folio);
else /* Signal synchronous i/o error. */
nr = -EIO;
- unlock_page(page);
+ folio_unlock(folio);
return nr;
}
@@ -433,7 +427,7 @@ retry_readpage:
/* NInoNonResident() == NInoIndexAllocPresent() */
if (NInoNonResident(ni)) {
/* Normal, non-resident data stream. */
- return ntfs_read_block(page);
+ return ntfs_read_block(folio);
}
/*
* Attribute is resident, implying it is not compressed or encrypted.
@@ -507,28 +501,29 @@ err_out:
#ifdef NTFS_RW
/**
- * ntfs_write_block - write a @page to the backing store
- * @page: page cache page to write out
+ * ntfs_write_block - write a @folio to the backing store
+ * @folio: page cache folio to write out
* @wbc: writeback control structure
*
- * This function is for writing pages belonging to non-resident, non-mst
+ * This function is for writing folios belonging to non-resident, non-mst
* protected attributes to their backing store.
*
- * For a page with buffers, map and write the dirty buffers asynchronously
- * under page writeback. For a page without buffers, create buffers for the
- * page, then proceed as above.
+ * For a folio with buffers, map and write the dirty buffers asynchronously
+ * under folio writeback. For a folio without buffers, create buffers for the
+ * folio, then proceed as above.
*
- * If a page doesn't have buffers the page dirty state is definitive. If a page
- * does have buffers, the page dirty state is just a hint, and the buffer dirty
- * state is definitive. (A hint which has rules: dirty buffers against a clean
- * page is illegal. Other combinations are legal and need to be handled. In
- * particular a dirty page containing clean buffers for example.)
+ * If a folio doesn't have buffers the folio dirty state is definitive. If
+ * a folio does have buffers, the folio dirty state is just a hint,
+ * and the buffer dirty state is definitive. (A hint which has rules:
+ * dirty buffers against a clean folio is illegal. Other combinations are
+ * legal and need to be handled. In particular a dirty folio containing
+ * clean buffers for example.)
*
* Return 0 on success and -errno on error.
*
* Based on ntfs_read_block() and __block_write_full_folio().
*/
-static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
+static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc)
{
VCN vcn;
LCN lcn;
@@ -546,41 +541,29 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
bool need_end_writeback;
unsigned char blocksize_bits;
- vi = page->mapping->host;
+ vi = folio->mapping->host;
ni = NTFS_I(vi);
vol = ni->vol;
ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
- "0x%lx.", ni->mft_no, ni->type, page->index);
+ "0x%lx.", ni->mft_no, ni->type, folio->index);
BUG_ON(!NInoNonResident(ni));
BUG_ON(NInoMstProtected(ni));
blocksize = vol->sb->s_blocksize;
blocksize_bits = vol->sb->s_blocksize_bits;
- if (!page_has_buffers(page)) {
- BUG_ON(!PageUptodate(page));
- create_empty_buffers(page, blocksize,
+ head = folio_buffers(folio);
+ if (!head) {
+ BUG_ON(!folio_test_uptodate(folio));
+ head = create_empty_buffers(folio, blocksize,
(1 << BH_Uptodate) | (1 << BH_Dirty));
- if (unlikely(!page_has_buffers(page))) {
- ntfs_warning(vol->sb, "Error allocating page "
- "buffers. Redirtying page so we try "
- "again later.");
- /*
- * Put the page back on mapping->dirty_pages, but leave
- * its buffers' dirty state as-is.
- */
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
- }
}
- bh = head = page_buffers(page);
- BUG_ON(!bh);
+ bh = head;
/* NOTE: Different naming scheme to ntfs_read_block()! */
- /* The first block in the page. */
- block = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
+ /* The first block in the folio. */
+ block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
read_lock_irqsave(&ni->size_lock, flags);
i_size = i_size_read(vi);
@@ -597,14 +580,14 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
* Be very careful. We have no exclusion from block_dirty_folio
* here, and the (potentially unmapped) buffers may become dirty at
* any time. If a buffer becomes dirty here after we've inspected it
- * then we just miss that fact, and the page stays dirty.
+ * then we just miss that fact, and the folio stays dirty.
*
* Buffers outside i_size may be dirtied by block_dirty_folio;
* handle that here by just cleaning them.
*/
/*
- * Loop through all the buffers in the page, mapping all the dirty
+ * Loop through all the buffers in the folio, mapping all the dirty
* buffers to disk addresses and handling any aliases from the
* underlying block device's mapping.
*/
@@ -616,13 +599,13 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
if (unlikely(block >= dblock)) {
/*
* Mapped buffers outside i_size will occur, because
- * this page can be outside i_size when there is a
+ * this folio can be outside i_size when there is a
* truncate in progress. The contents of such buffers
* were zeroed by ntfs_writepage().
*
* FIXME: What about the small race window where
* ntfs_writepage() has not done any clearing because
- * the page was within i_size but before we get here,
+ * the folio was within i_size but before we get here,
* vmtruncate() modifies i_size?
*/
clear_buffer_dirty(bh);
@@ -638,38 +621,38 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
if (unlikely((block >= iblock) &&
(initialized_size < i_size))) {
/*
- * If this page is fully outside initialized
- * size, zero out all pages between the current
- * initialized size and the current page. Just
+ * If this folio is fully outside initialized
+ * size, zero out all folios between the current
+ * initialized size and the current folio. Just
* use ntfs_read_folio() to do the zeroing
* transparently.
*/
if (block > iblock) {
// TODO:
- // For each page do:
- // - read_cache_page()
- // Again for each page do:
- // - wait_on_page_locked()
- // - Check (PageUptodate(page) &&
- // !PageError(page))
+ // For each folio do:
+ // - read_cache_folio()
+ // Again for each folio do:
+ // - wait_on_folio_locked()
+ // - Check (folio_test_uptodate(folio) &&
+ // !folio_test_error(folio))
// Update initialized size in the attribute and
// in the inode.
- // Again, for each page do:
+ // Again, for each folio do:
// block_dirty_folio();
- // put_page()
+ // folio_put()
// We don't need to wait on the writes.
// Update iblock.
}
/*
- * The current page straddles initialized size. Zero
+ * The current folio straddles initialized size. Zero
* all non-uptodate buffers and set them uptodate (and
* dirty?). Note, there aren't any non-uptodate buffers
- * if the page is uptodate.
- * FIXME: For an uptodate page, the buffers may need to
+ * if the folio is uptodate.
+ * FIXME: For an uptodate folio, the buffers may need to
* be written out because they were not initialized on
* disk before.
*/
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
// TODO:
// Zero any non-uptodate buffers up to i_size.
// Set them uptodate and dirty.
@@ -727,14 +710,14 @@ lock_retry_remap:
unsigned long *bpos, *bend;
/* Check if the buffer is zero. */
- kaddr = kmap_atomic(page);
- bpos = (unsigned long *)(kaddr + bh_offset(bh));
- bend = (unsigned long *)((u8*)bpos + blocksize);
+ kaddr = kmap_local_folio(folio, bh_offset(bh));
+ bpos = (unsigned long *)kaddr;
+ bend = (unsigned long *)(kaddr + blocksize);
do {
if (unlikely(*bpos))
break;
} while (likely(++bpos < bend));
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
if (bpos == bend) {
/*
* Buffer is zero and sparse, no need to write
@@ -774,7 +757,7 @@ lock_retry_remap:
if (err == -ENOENT || lcn == LCN_ENOENT) {
bh->b_blocknr = -1;
clear_buffer_dirty(bh);
- zero_user(page, bh_offset(bh), blocksize);
+ folio_zero_range(folio, bh_offset(bh), blocksize);
set_buffer_uptodate(bh);
err = 0;
continue;
@@ -801,7 +784,7 @@ lock_retry_remap:
bh = head;
/* Just an optimization, so ->read_folio() is not called later. */
- if (unlikely(!PageUptodate(page))) {
+ if (unlikely(!folio_test_uptodate(folio))) {
int uptodate = 1;
do {
if (!buffer_uptodate(bh)) {
@@ -811,7 +794,7 @@ lock_retry_remap:
}
} while ((bh = bh->b_this_page) != head);
if (uptodate)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
/* Setup all mapped, dirty buffers for async write i/o. */
@@ -826,7 +809,7 @@ lock_retry_remap:
} else if (unlikely(err)) {
/*
* For the error case. The buffer may have been set
- * dirty during attachment to a dirty page.
+ * dirty during attachment to a dirty folio.
*/
if (err != -ENOMEM)
clear_buffer_dirty(bh);
@@ -839,20 +822,20 @@ lock_retry_remap:
err = 0;
else if (err == -ENOMEM) {
ntfs_warning(vol->sb, "Error allocating memory. "
- "Redirtying page so we try again "
+ "Redirtying folio so we try again "
"later.");
/*
- * Put the page back on mapping->dirty_pages, but
+ * Put the folio back on mapping->dirty_pages, but
* leave its buffer's dirty state as-is.
*/
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
err = 0;
} else
- SetPageError(page);
+ folio_set_error(folio);
}
- BUG_ON(PageWriteback(page));
- set_page_writeback(page); /* Keeps try_to_free_buffers() away. */
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio); /* Keeps try_to_free_buffers() away. */
/* Submit the prepared buffers for i/o. */
need_end_writeback = true;
@@ -864,11 +847,11 @@ lock_retry_remap:
}
bh = next;
} while (bh != head);
- unlock_page(page);
+ folio_unlock(folio);
- /* If no i/o was started, need to end_page_writeback(). */
+ /* If no i/o was started, need to end writeback here. */
if (unlikely(need_end_writeback))
- end_page_writeback(page);
+ folio_end_writeback(folio);
ntfs_debug("Done.");
return err;
@@ -1337,8 +1320,9 @@ done:
*/
static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
loff_t i_size;
- struct inode *vi = page->mapping->host;
+ struct inode *vi = folio->mapping->host;
ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
char *addr;
ntfs_attr_search_ctx *ctx = NULL;
@@ -1347,14 +1331,13 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
int err;
retry_writepage:
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
i_size = i_size_read(vi);
- /* Is the page fully outside i_size? (truncate in progress) */
- if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
+ /* Is the folio fully outside i_size? (truncate in progress) */
+ if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >>
PAGE_SHIFT)) {
- struct folio *folio = page_folio(page);
/*
- * The page may have dirty, unmapped buffers. Make them
+ * The folio may have dirty, unmapped buffers. Make them
* freeable here, so the page does not leak.
*/
block_invalidate_folio(folio, 0, folio_size(folio));
@@ -1373,7 +1356,7 @@ retry_writepage:
if (ni->type != AT_INDEX_ALLOCATION) {
/* If file is encrypted, deny access, just like NT4. */
if (NInoEncrypted(ni)) {
- unlock_page(page);
+ folio_unlock(folio);
BUG_ON(ni->type != AT_DATA);
ntfs_debug("Denying write access to encrypted file.");
return -EACCES;
@@ -1384,14 +1367,14 @@ retry_writepage:
BUG_ON(ni->name_len);
// TODO: Implement and replace this with
// return ntfs_write_compressed_block(page);
- unlock_page(page);
+ folio_unlock(folio);
ntfs_error(vi->i_sb, "Writing to compressed files is "
"not supported yet. Sorry.");
return -EOPNOTSUPP;
}
// TODO: Implement and remove this check.
if (NInoNonResident(ni) && NInoSparse(ni)) {
- unlock_page(page);
+ folio_unlock(folio);
ntfs_error(vi->i_sb, "Writing to sparse files is not "
"supported yet. Sorry.");
return -EOPNOTSUPP;
@@ -1400,34 +1383,34 @@ retry_writepage:
/* NInoNonResident() == NInoIndexAllocPresent() */
if (NInoNonResident(ni)) {
/* We have to zero every time due to mmap-at-end-of-file. */
- if (page->index >= (i_size >> PAGE_SHIFT)) {
- /* The page straddles i_size. */
- unsigned int ofs = i_size & ~PAGE_MASK;
- zero_user_segment(page, ofs, PAGE_SIZE);
+ if (folio->index >= (i_size >> PAGE_SHIFT)) {
+ /* The folio straddles i_size. */
+ unsigned int ofs = i_size & (folio_size(folio) - 1);
+ folio_zero_segment(folio, ofs, folio_size(folio));
}
/* Handle mst protected attributes. */
if (NInoMstProtected(ni))
return ntfs_write_mst_block(page, wbc);
/* Normal, non-resident data stream. */
- return ntfs_write_block(page, wbc);
+ return ntfs_write_block(folio, wbc);
}
/*
* Attribute is resident, implying it is not compressed, encrypted, or
* mst protected. This also means the attribute is smaller than an mft
- * record and hence smaller than a page, so can simply return error on
- * any pages with index above 0. Note the attribute can actually be
+ * record and hence smaller than a folio, so can simply return error on
+ * any folios with index above 0. Note the attribute can actually be
* marked compressed but if it is resident the actual data is not
* compressed so we are ok to ignore the compressed flag here.
*/
- BUG_ON(page_has_buffers(page));
- BUG_ON(!PageUptodate(page));
- if (unlikely(page->index > 0)) {
- ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0. "
- "Aborting write.", page->index);
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
- end_page_writeback(page);
+ BUG_ON(folio_buffers(folio));
+ BUG_ON(!folio_test_uptodate(folio));
+ if (unlikely(folio->index > 0)) {
+ ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0. "
+ "Aborting write.", folio->index);
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+ folio_end_writeback(folio);
return -EIO;
}
if (!NInoAttr(ni))
@@ -1460,12 +1443,12 @@ retry_writepage:
if (unlikely(err))
goto err_out;
/*
- * Keep the VM happy. This must be done otherwise the radix-tree tag
- * PAGECACHE_TAG_DIRTY remains set even though the page is clean.
+ * Keep the VM happy. This must be done otherwise
+ * PAGECACHE_TAG_DIRTY remains set even though the folio is clean.
*/
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio);
+ folio_unlock(folio);
attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
i_size = i_size_read(vi);
if (unlikely(attr_len > i_size)) {
@@ -1480,18 +1463,18 @@ retry_writepage:
/* Shrinking cannot fail. */
BUG_ON(err);
}
- addr = kmap_atomic(page);
- /* Copy the data from the page to the mft record. */
+ addr = kmap_local_folio(folio, 0);
+ /* Copy the data from the folio to the mft record. */
memcpy((u8*)ctx->attr +
le16_to_cpu(ctx->attr->data.resident.value_offset),
addr, attr_len);
- /* Zero out of bounds area in the page cache page. */
- memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
- kunmap_atomic(addr);
- flush_dcache_page(page);
+ /* Zero out of bounds area in the page cache folio. */
+ memset(addr + attr_len, 0, folio_size(folio) - attr_len);
+ kunmap_local(addr);
+ flush_dcache_folio(folio);
flush_dcache_mft_record_page(ctx->ntfs_ino);
- /* We are done with the page. */
- end_page_writeback(page);
+ /* We are done with the folio. */
+ folio_end_writeback(folio);
/* Finally, mark the mft record dirty, so it gets written back. */
mark_mft_record_dirty(ctx->ntfs_ino);
ntfs_attr_put_search_ctx(ctx);
@@ -1502,18 +1485,18 @@ err_out:
ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
"page so we try again later.");
/*
- * Put the page back on mapping->dirty_pages, but leave its
+ * Put the folio back on mapping->dirty_pages, but leave its
* buffers' dirty state as-is.
*/
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
err = 0;
} else {
ntfs_error(vi->i_sb, "Resident attribute write failed with "
"error %i.", err);
- SetPageError(page);
+ folio_set_error(folio);
NVolSetErrors(ni->vol);
}
- unlock_page(page);
+ folio_unlock(folio);
if (ctx)
ntfs_attr_put_search_ctx(ctx);
if (m)
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index cbc545999cfe..297c0b9db621 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -567,7 +567,7 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
LCN lcn;
s64 bh_pos, vcn_len, end, initialized_size;
sector_t lcn_block;
- struct page *page;
+ struct folio *folio;
struct inode *vi;
ntfs_inode *ni, *base_ni = NULL;
ntfs_volume *vol;
@@ -601,20 +601,6 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
(long long)pos, bytes);
blocksize = vol->sb->s_blocksize;
blocksize_bits = vol->sb->s_blocksize_bits;
- u = 0;
- do {
- page = pages[u];
- BUG_ON(!page);
- /*
- * create_empty_buffers() will create uptodate/dirty buffers if
- * the page is uptodate/dirty.
- */
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, blocksize, 0);
- if (unlikely(!page_has_buffers(page)))
- return -ENOMEM;
- }
- } while (++u < nr_pages);
rl_write_locked = false;
rl = NULL;
err = 0;
@@ -626,14 +612,21 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
end = pos + bytes;
cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
/*
- * Loop over each page and for each page over each buffer. Use goto to
+ * Loop over each buffer in each folio. Use goto to
* reduce indentation.
*/
u = 0;
-do_next_page:
- page = pages[u];
- bh_pos = (s64)page->index << PAGE_SHIFT;
- bh = head = page_buffers(page);
+do_next_folio:
+ folio = page_folio(pages[u]);
+ bh_pos = folio_pos(folio);
+ head = folio_buffers(folio);
+ if (!head)
+ /*
+ * create_empty_buffers() will create uptodate/dirty
+ * buffers if the folio is uptodate/dirty.
+ */
+ head = create_empty_buffers(folio, blocksize, 0);
+ bh = head;
do {
VCN cdelta;
s64 bh_end;
@@ -653,15 +646,15 @@ do_next_page:
if (buffer_uptodate(bh))
continue;
/*
- * The buffer is not uptodate. If the page is uptodate
+ * The buffer is not uptodate. If the folio is uptodate
* set the buffer uptodate and otherwise ignore it.
*/
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
set_buffer_uptodate(bh);
continue;
}
/*
- * Neither the page nor the buffer are uptodate. If
+ * Neither the folio nor the buffer are uptodate. If
* the buffer is only partially being written to, we
* need to read it in before the write, i.e. now.
*/
@@ -679,7 +672,7 @@ do_next_page:
ntfs_submit_bh_for_read(bh);
*wait_bh++ = bh;
} else {
- zero_user(page, bh_offset(bh),
+ folio_zero_range(folio, bh_offset(bh),
blocksize);
set_buffer_uptodate(bh);
}
@@ -706,7 +699,7 @@ map_buffer_cached:
(bh_cofs >> blocksize_bits);
set_buffer_mapped(bh);
/*
- * If the page is uptodate so is the buffer. If the
+ * If the folio is uptodate so is the buffer. If the
* buffer is fully outside the write, we ignore it if
* it was already allocated and we mark it dirty so it
* gets written out if we allocated it. On the other
@@ -714,7 +707,7 @@ map_buffer_cached:
* marking it dirty we set buffer_new so we can do
* error recovery.
*/
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
if (unlikely(was_hole)) {
@@ -754,7 +747,8 @@ map_buffer_cached:
ntfs_submit_bh_for_read(bh);
*wait_bh++ = bh;
} else {
- zero_user(page, bh_offset(bh),
+ folio_zero_range(folio,
+ bh_offset(bh),
blocksize);
set_buffer_uptodate(bh);
}
@@ -773,7 +767,7 @@ map_buffer_cached:
*/
if (bh_end <= pos || bh_pos >= end) {
if (!buffer_uptodate(bh)) {
- zero_user(page, bh_offset(bh),
+ folio_zero_range(folio, bh_offset(bh),
blocksize);
set_buffer_uptodate(bh);
}
@@ -786,7 +780,7 @@ map_buffer_cached:
u8 *kaddr;
unsigned pofs;
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_folio(folio, 0);
if (bh_pos < pos) {
pofs = bh_pos & ~PAGE_MASK;
memset(kaddr + pofs, 0, pos - bh_pos);
@@ -795,8 +789,8 @@ map_buffer_cached:
pofs = end & ~PAGE_MASK;
memset(kaddr + pofs, 0, bh_end - end);
}
- kunmap_atomic(kaddr);
- flush_dcache_page(page);
+ kunmap_local(kaddr);
+ flush_dcache_folio(folio);
}
continue;
}
@@ -809,11 +803,12 @@ map_buffer_cached:
initialized_size = ni->allocated_size;
read_unlock_irqrestore(&ni->size_lock, flags);
if (bh_pos > initialized_size) {
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
} else if (!buffer_uptodate(bh)) {
- zero_user(page, bh_offset(bh), blocksize);
+ folio_zero_range(folio, bh_offset(bh),
+ blocksize);
set_buffer_uptodate(bh);
}
continue;
@@ -927,17 +922,17 @@ rl_not_mapped_enoent:
bh->b_blocknr = -1;
/*
* If the buffer is uptodate we skip it. If it
- * is not but the page is uptodate, we can set
- * the buffer uptodate. If the page is not
+ * is not but the folio is uptodate, we can set
+ * the buffer uptodate. If the folio is not
* uptodate, we can clear the buffer and set it
* uptodate. Whether this is worthwhile is
* debatable and this could be removed.
*/
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
} else if (!buffer_uptodate(bh)) {
- zero_user(page, bh_offset(bh),
+ folio_zero_range(folio, bh_offset(bh),
blocksize);
set_buffer_uptodate(bh);
}
@@ -1167,7 +1162,7 @@ rl_not_mapped_enoent:
} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
/* If there are no errors, do the next page. */
if (likely(!err && ++u < nr_pages))
- goto do_next_page;
+ goto do_next_folio;
/* If there are no errors, release the runlist lock if we took it. */
if (likely(!err)) {
if (unlikely(rl_write_locked)) {
@@ -1185,9 +1180,8 @@ rl_not_mapped_enoent:
bh = *--wait_bh;
wait_on_buffer(bh);
if (likely(buffer_uptodate(bh))) {
- page = bh->b_page;
- bh_pos = ((s64)page->index << PAGE_SHIFT) +
- bh_offset(bh);
+ folio = bh->b_folio;
+ bh_pos = folio_pos(folio) + bh_offset(bh);
/*
* If the buffer overflows the initialized size, need
* to zero the overflowing region.
@@ -1197,7 +1191,7 @@ rl_not_mapped_enoent:
if (likely(bh_pos < initialized_size))
ofs = initialized_size - bh_pos;
- zero_user_segment(page, bh_offset(bh) + ofs,
+ folio_zero_segment(folio, bh_offset(bh) + ofs,
blocksize);
}
} else /* if (unlikely(!buffer_uptodate(bh))) */
@@ -1324,21 +1318,20 @@ rl_not_mapped_enoent:
u = 0;
end = bh_cpos << vol->cluster_size_bits;
do {
- page = pages[u];
- bh = head = page_buffers(page);
+ folio = page_folio(pages[u]);
+ bh = head = folio_buffers(folio);
do {
if (u == nr_pages &&
- ((s64)page->index << PAGE_SHIFT) +
- bh_offset(bh) >= end)
+ folio_pos(folio) + bh_offset(bh) >= end)
break;
if (!buffer_new(bh))
continue;
clear_buffer_new(bh);
if (!buffer_uptodate(bh)) {
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
else {
- zero_user(page, bh_offset(bh),
+ folio_zero_range(folio, bh_offset(bh),
blocksize);
set_buffer_uptodate(bh);
}
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index ab44f2db533b..d7498ddc4a72 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -384,6 +384,7 @@ static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid,
* and due to using iget() whereas NTFS needs ntfs_iget().
*/
const struct export_operations ntfs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.get_parent = ntfs_get_parent, /* Find the parent of a given
directory. */
.fh_to_dentry = ntfs_fh_to_dentry,
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index ad4a70b5d432..a5a30a24ce5d 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -187,7 +187,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
struct buffer_head *head, *bh;
u32 bh_next, bh_off, to;
sector_t iblock;
- struct page *page;
+ struct folio *folio;
for (; idx < idx_end; idx += 1, from = 0) {
page_off = (loff_t)idx << PAGE_SHIFT;
@@ -195,16 +195,17 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
PAGE_SIZE;
iblock = page_off >> inode->i_blkbits;
- page = find_or_create_page(mapping, idx,
- mapping_gfp_constraint(mapping,
- ~__GFP_FS));
- if (!page)
- return -ENOMEM;
+ folio = __filemap_get_folio(mapping, idx,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
- bh = head = page_buffers(page);
+ bh = head;
bh_off = 0;
do {
bh_next = bh_off + blocksize;
@@ -220,14 +221,14 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
}
/* Ok, it's mapped. Make sure it's up-to-date. */
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh)) {
err = bh_read(bh, 0);
if (err < 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
goto out;
}
}
@@ -237,10 +238,10 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
} while (bh_off = bh_next, iblock += 1,
head != (bh = bh->b_this_page));
- zero_user_segment(page, from, to);
+ folio_zero_segment(folio, from, to);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
cond_resched();
}
out:
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index f763e3256ccc..9153dffde950 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -811,6 +811,7 @@ static int ntfs_nfs_commit_metadata(struct inode *inode)
}
static const struct export_operations ntfs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = ntfs_fh_to_dentry,
.fh_to_parent = ntfs_fh_to_parent,
.get_parent = ntfs3_get_parent,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f0937902f7b4..91b32b2377ac 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -967,7 +967,14 @@ int ocfs2_num_free_extents(struct ocfs2_extent_tree *et)
el = &eb->h_list;
}
- BUG_ON(el->l_tree_depth != 0);
+ if (el->l_tree_depth != 0) {
+ retval = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has leaf extent block %llu with an invalid l_tree_depth of %u\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ (unsigned long long)last_eb_blk,
+ le16_to_cpu(el->l_tree_depth));
+ goto bail;
+ }
retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
bail:
@@ -7642,7 +7649,7 @@ out_mutex:
goto next_group;
}
out:
- range->len = trimmed * sb->s_blocksize;
+ range->len = trimmed * osb->s_clustersize;
return ret;
}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 6ab03494fc6e..ba790219d528 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -568,10 +568,10 @@ static void ocfs2_clear_page_regions(struct page *page,
* read-in the blocks at the tail of our file. Avoid reading them by
* testing i_size against each block offset.
*/
-static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
+static int ocfs2_should_read_blk(struct inode *inode, struct folio *folio,
unsigned int block_start)
{
- u64 offset = page_offset(page) + block_start;
+ u64 offset = folio_pos(folio) + block_start;
if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
return 1;
@@ -593,15 +593,16 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
struct inode *inode, unsigned int from,
unsigned int to, int new)
{
+ struct folio *folio = page_folio(page);
int ret = 0;
struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
unsigned int block_end, block_start;
unsigned int bsize = i_blocksize(inode);
- if (!page_has_buffers(page))
- create_empty_buffers(page, bsize, 0);
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, bsize, 0);
- head = page_buffers(page);
for (bh = head, block_start = 0; bh != head || !block_start;
bh = bh->b_this_page, block_start += bsize) {
block_end = block_start + bsize;
@@ -613,7 +614,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
* they may belong to unallocated clusters.
*/
if (block_start >= to || block_end <= from) {
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
continue;
}
@@ -630,11 +631,11 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
clean_bdev_bh_alias(bh);
}
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
set_buffer_uptodate(bh);
} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_new(bh) &&
- ocfs2_should_read_blk(inode, page, block_start) &&
+ ocfs2_should_read_blk(inode, folio, block_start) &&
(block_start < from || block_end > to)) {
bh_read_nowait(bh, 0);
*wait_bh++=bh;
@@ -668,7 +669,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
if (block_start >= to)
break;
- zero_user(page, block_start, bh->b_size);
+ folio_zero_range(folio, block_start, bh->b_size);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 196638a22b48..cdb9b9bdea1f 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -158,7 +158,7 @@ read_failure:
if (new_bh && bh) {
/* If middle bh fails, let previous bh
* finish its read and then put it to
- * aovoid bh leak
+ * avoid bh leak
*/
if (!buffer_jbd(bh))
wait_on_buffer(bh);
@@ -345,7 +345,7 @@ read_failure:
if (new_bh && bh) {
/* If middle bh fails, let previous bh
* finish its read and then put it to
- * aovoid bh leak
+ * avoid bh leak
*/
if (!buffer_jbd(bh))
wait_on_buffer(bh);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 9b57d012fd5c..85215162c9dd 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -80,8 +80,7 @@ static int param_set_dlmfs_capabilities(const char *val,
static int param_get_dlmfs_capabilities(char *buffer,
const struct kernel_param *kp)
{
- return strlcpy(buffer, DLMFS_CAPABILITIES,
- strlen(DLMFS_CAPABILITIES) + 1);
+ return sysfs_emit(buffer, DLMFS_CAPABILITIES);
}
module_param_call(capabilities, param_set_dlmfs_capabilities,
param_get_dlmfs_capabilities, NULL, 0444);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ce215565d061..604fea3a26ff 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -90,7 +90,7 @@ enum ocfs2_replay_state {
struct ocfs2_replay_map {
unsigned int rm_slots;
enum ocfs2_replay_state rm_state;
- unsigned char rm_replay_slots[];
+ unsigned char rm_replay_slots[] __counted_by(rm_slots);
};
static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 681e9501cdd3..814733ba2f4b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1597,6 +1597,10 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
if (update_dot_dot) {
status = ocfs2_update_entry(old_inode, handle,
&old_inode_dot_dot_res, new_dir);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
drop_nlink(old_dir);
if (new_inode) {
drop_nlink(new_inode);
@@ -1636,6 +1640,10 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
INODE_CACHE(old_dir),
old_dir_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
ocfs2_set_links_count(fe, old_dir->i_nlink);
ocfs2_journal_dirty(handle, old_dir_bh);
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index dfaae1e52412..e09842fc9d4d 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -1240,6 +1240,10 @@ int ocfs2_create_local_dquot(struct dquot *dquot)
&od->dq_local_phys_blk,
&pcount,
NULL);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
/* Initialize dquot structure on disk */
status = ocfs2_local_write_dquot(dquot);
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile
index 4e173d56b11f..5648954f8588 100644
--- a/fs/overlayfs/Makefile
+++ b/fs/overlayfs/Makefile
@@ -6,4 +6,4 @@
obj-$(CONFIG_OVERLAY_FS) += overlay.o
overlay-objs := super.o namei.o util.o inode.o file.o dir.o readdir.o \
- copy_up.o export.o params.o
+ copy_up.o export.o params.o xattrs.o
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index ada3fcc9c6d5..4382881b0709 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -252,7 +252,9 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
return PTR_ERR(old_file);
/* Try to use clone_file_range to clone up within the same fs */
+ ovl_start_write(dentry);
cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
+ ovl_end_write(dentry);
if (cloned == len)
goto out_fput;
/* Couldn't clone, so now we try to copy the data */
@@ -287,8 +289,12 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
* it may not recognize all kind of holes and sometimes
* only skips partial of hole area. However, it will be
* enough for most of the use cases.
+ *
+ * We do not hold upper sb_writers throughout the loop to avert
+ * lockdep warning with llseek of lower file in nested overlay:
+ * - upper sb_writers
+ * -- lower ovl_inode_lock (ovl_llseek)
*/
-
if (skip_hole && data_pos < old_pos) {
data_pos = vfs_llseek(old_file, old_pos, SEEK_DATA);
if (data_pos > old_pos) {
@@ -303,9 +309,11 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
}
}
+ ovl_start_write(dentry);
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
+ ovl_end_write(dentry);
if (bytes <= 0) {
error = bytes;
break;
@@ -426,29 +434,29 @@ out_err:
return ERR_PTR(err);
}
-int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower,
- struct dentry *upper)
+struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin)
{
- const struct ovl_fh *fh = NULL;
- int err;
-
/*
* When lower layer doesn't support export operations store a 'null' fh,
* so we can use the overlay.origin xattr to distignuish between a copy
* up and a pure upper inode.
*/
- if (ovl_can_decode_fh(lower->d_sb)) {
- fh = ovl_encode_real_fh(ofs, lower, false);
- if (IS_ERR(fh))
- return PTR_ERR(fh);
- }
+ if (!ovl_can_decode_fh(origin->d_sb))
+ return NULL;
+
+ return ovl_encode_real_fh(ofs, origin, false);
+}
+
+int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh,
+ struct dentry *upper)
+{
+ int err;
/*
* Do not fail when upper doesn't support xattrs.
*/
err = ovl_check_setxattr(ofs, upper, OVL_XATTR_ORIGIN, fh->buf,
fh ? fh->fb.len : 0, 0);
- kfree(fh);
/* Ignore -EPERM from setting "user.*" on symlink/special */
return err == -EPERM ? 0 : err;
@@ -476,7 +484,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper,
*
* Caller must hold i_mutex on indexdir.
*/
-static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
+static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh,
struct dentry *upper)
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
@@ -502,7 +510,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry))))
return -EIO;
- err = ovl_get_index_name(ofs, origin, &name);
+ err = ovl_get_index_name_fh(fh, &name);
if (err)
return err;
@@ -541,6 +549,7 @@ struct ovl_copy_up_ctx {
struct dentry *destdir;
struct qstr destname;
struct dentry *workdir;
+ const struct ovl_fh *origin_fh;
bool origin;
bool indexed;
bool metacopy;
@@ -555,14 +564,16 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *udir = d_inode(upperdir);
+ ovl_start_write(c->dentry);
+
/* Mark parent "impure" because it may now contain non-pure upper */
err = ovl_set_impure(c->parent, upperdir);
if (err)
- return err;
+ goto out;
err = ovl_set_nlink_lower(c->dentry);
if (err)
- return err;
+ goto out;
inode_lock_nested(udir, I_MUTEX_PARENT);
upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir,
@@ -581,10 +592,12 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
}
inode_unlock(udir);
if (err)
- return err;
+ goto out;
err = ovl_set_nlink_upper(c->dentry);
+out:
+ ovl_end_write(c->dentry);
return err;
}
@@ -637,7 +650,7 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
* hard link.
*/
if (c->origin) {
- err = ovl_set_origin(ofs, c->lowerpath.dentry, temp);
+ err = ovl_set_origin_fh(ofs, c->origin_fh, temp);
if (err)
return err;
}
@@ -719,21 +732,19 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
.link = c->link
};
- /* workdir and destdir could be the same when copying up to indexdir */
- err = -EIO;
- if (lock_rename(c->workdir, c->destdir) != NULL)
- goto unlock;
-
err = ovl_prep_cu_creds(c->dentry, &cc);
if (err)
- goto unlock;
+ return err;
+ ovl_start_write(c->dentry);
+ inode_lock(wdir);
temp = ovl_create_temp(ofs, c->workdir, &cattr);
+ inode_unlock(wdir);
+ ovl_end_write(c->dentry);
ovl_revert_cu_creds(&cc);
- err = PTR_ERR(temp);
if (IS_ERR(temp))
- goto unlock;
+ return PTR_ERR(temp);
/*
* Copy up data first and then xattrs. Writing data after
@@ -741,15 +752,28 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
*/
path.dentry = temp;
err = ovl_copy_up_data(c, &path);
- if (err)
+ /*
+ * We cannot hold lock_rename() throughout this helper, because or
+ * lock ordering with sb_writers, which shouldn't be held when calling
+ * ovl_copy_up_data(), so lock workdir and destdir and make sure that
+ * temp wasn't moved before copy up completion or cleanup.
+ * If temp was moved, abort without the cleanup.
+ */
+ ovl_start_write(c->dentry);
+ if (lock_rename(c->workdir, c->destdir) != NULL ||
+ temp->d_parent != c->workdir) {
+ err = -EIO;
+ goto unlock;
+ } else if (err) {
goto cleanup;
+ }
err = ovl_copy_up_metadata(c, temp);
if (err)
goto cleanup;
if (S_ISDIR(c->stat.mode) && c->indexed) {
- err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp);
+ err = ovl_create_index(c->dentry, c->origin_fh, temp);
if (err)
goto cleanup;
}
@@ -779,6 +803,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
ovl_set_flag(OVL_WHITEOUTS, inode);
unlock:
unlock_rename(c->workdir, c->destdir);
+ ovl_end_write(c->dentry);
return err;
@@ -802,9 +827,10 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
if (err)
return err;
+ ovl_start_write(c->dentry);
tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode);
+ ovl_end_write(c->dentry);
ovl_revert_cu_creds(&cc);
-
if (IS_ERR(tmpfile))
return PTR_ERR(tmpfile);
@@ -815,9 +841,11 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
goto out_fput;
}
+ ovl_start_write(c->dentry);
+
err = ovl_copy_up_metadata(c, temp);
if (err)
- goto out_fput;
+ goto out;
inode_lock_nested(udir, I_MUTEX_PARENT);
@@ -831,7 +859,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
inode_unlock(udir);
if (err)
- goto out_fput;
+ goto out;
if (c->metacopy_digest)
ovl_set_flag(OVL_HAS_DIGEST, d_inode(c->dentry));
@@ -843,6 +871,8 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
ovl_set_upperdata(d_inode(c->dentry));
ovl_inode_update(d_inode(c->dentry), dget(temp));
+out:
+ ovl_end_write(c->dentry);
out_fput:
fput(tmpfile);
return err;
@@ -861,6 +891,8 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
{
int err;
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
+ struct dentry *origin = c->lowerpath.dentry;
+ struct ovl_fh *fh = NULL;
bool to_index = false;
/*
@@ -877,25 +909,35 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
to_index = true;
}
- if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index)
+ if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index) {
+ fh = ovl_get_origin_fh(ofs, origin);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+
+ /* origin_fh may be NULL */
+ c->origin_fh = fh;
c->origin = true;
+ }
if (to_index) {
c->destdir = ovl_indexdir(c->dentry->d_sb);
- err = ovl_get_index_name(ofs, c->lowerpath.dentry, &c->destname);
+ err = ovl_get_index_name(ofs, origin, &c->destname);
if (err)
- return err;
+ goto out_free_fh;
} else if (WARN_ON(!c->parent)) {
/* Disconnected dentry must be copied up to index dir */
- return -EIO;
+ err = -EIO;
+ goto out_free_fh;
} else {
/*
* Mark parent "impure" because it may now contain non-pure
* upper
*/
+ ovl_start_write(c->dentry);
err = ovl_set_impure(c->parent, c->destdir);
+ ovl_end_write(c->dentry);
if (err)
- return err;
+ goto out_free_fh;
}
/* Should we copyup with O_TMPFILE or with workdir? */
@@ -909,6 +951,7 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
if (c->indexed)
ovl_set_flag(OVL_INDEX, d_inode(c->dentry));
+ ovl_start_write(c->dentry);
if (to_index) {
/* Initialize nlink for copy up of disconnected dentry */
err = ovl_set_nlink_upper(c->dentry);
@@ -923,10 +966,13 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
ovl_dentry_set_upper_alias(c->dentry);
ovl_dentry_update_reval(c->dentry, ovl_dentry_upper(c->dentry));
}
+ ovl_end_write(c->dentry);
out:
if (to_index)
kfree(c->destname.name);
+out_free_fh:
+ kfree(fh);
return err;
}
@@ -1011,15 +1057,16 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
* Writing to upper file will clear security.capability xattr. We
* don't want that to happen for normal copy-up operation.
*/
+ ovl_start_write(c->dentry);
if (capability) {
err = ovl_do_setxattr(ofs, upperpath.dentry, XATTR_NAME_CAPS,
capability, cap_size, 0);
- if (err)
- goto out_free;
}
-
-
- err = ovl_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY);
+ if (!err) {
+ err = ovl_removexattr(ofs, upperpath.dentry,
+ OVL_XATTR_METACOPY);
+ }
+ ovl_end_write(c->dentry);
if (err)
goto out_free;
@@ -1170,17 +1217,10 @@ static bool ovl_open_need_copy_up(struct dentry *dentry, int flags)
int ovl_maybe_copy_up(struct dentry *dentry, int flags)
{
- int err = 0;
-
- if (ovl_open_need_copy_up(dentry, flags)) {
- err = ovl_want_write(dentry);
- if (!err) {
- err = ovl_copy_up_flags(dentry, flags);
- ovl_drop_write(dentry);
- }
- }
+ if (!ovl_open_need_copy_up(dentry, flags))
+ return 0;
- return err;
+ return ovl_copy_up_flags(dentry, flags);
}
int ovl_copy_up_with_data(struct dentry *dentry)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 033fc0458a3d..aab3f5d93556 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -477,7 +477,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
goto out_unlock;
err = -ESTALE;
- if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper)))
+ if (d_is_negative(upper) || !ovl_upper_is_whiteout(ofs, upper))
goto out_dput;
newdentry = ovl_create_temp(ofs, workdir, cattr);
@@ -559,10 +559,6 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
struct cred *override_cred;
struct dentry *parent = dentry->d_parent;
- err = ovl_copy_up(parent);
- if (err)
- return err;
-
old_cred = ovl_override_creds(dentry->d_sb);
/*
@@ -626,6 +622,10 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
.link = link,
};
+ err = ovl_copy_up(dentry->d_parent);
+ if (err)
+ return err;
+
err = ovl_want_write(dentry);
if (err)
goto out;
@@ -700,28 +700,24 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
int err;
struct inode *inode;
- err = ovl_want_write(old);
+ err = ovl_copy_up(old);
if (err)
goto out;
- err = ovl_copy_up(old);
+ err = ovl_copy_up(new->d_parent);
if (err)
- goto out_drop_write;
+ goto out;
- err = ovl_copy_up(new->d_parent);
+ err = ovl_nlink_start(old);
if (err)
- goto out_drop_write;
+ goto out;
if (ovl_is_metacopy_dentry(old)) {
err = ovl_set_link_redirect(old);
if (err)
- goto out_drop_write;
+ goto out_nlink_end;
}
- err = ovl_nlink_start(old);
- if (err)
- goto out_drop_write;
-
inode = d_inode(old);
ihold(inode);
@@ -731,9 +727,8 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
if (err)
iput(inode);
+out_nlink_end:
ovl_nlink_end(old);
-out_drop_write:
- ovl_drop_write(old);
out:
return err;
}
@@ -891,17 +886,13 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
goto out;
}
- err = ovl_want_write(dentry);
- if (err)
- goto out;
-
err = ovl_copy_up(dentry->d_parent);
if (err)
- goto out_drop_write;
+ goto out;
err = ovl_nlink_start(dentry);
if (err)
- goto out_drop_write;
+ goto out;
old_cred = ovl_override_creds(dentry->d_sb);
if (!lower_positive)
@@ -926,8 +917,6 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
if (ovl_dentry_upper(dentry))
ovl_copyattr(d_inode(dentry));
-out_drop_write:
- ovl_drop_write(dentry);
out:
ovl_cache_free(&list);
return err;
@@ -1131,29 +1120,32 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
}
}
- err = ovl_want_write(old);
- if (err)
- goto out;
-
err = ovl_copy_up(old);
if (err)
- goto out_drop_write;
+ goto out;
err = ovl_copy_up(new->d_parent);
if (err)
- goto out_drop_write;
+ goto out;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
- goto out_drop_write;
+ goto out;
} else if (d_inode(new)) {
err = ovl_nlink_start(new);
if (err)
- goto out_drop_write;
+ goto out;
update_nlink = true;
}
+ if (!update_nlink) {
+ /* ovl_nlink_start() took ovl_want_write() */
+ err = ovl_want_write(old);
+ if (err)
+ goto out;
+ }
+
old_cred = ovl_override_creds(old->d_sb);
if (!list_empty(&list)) {
@@ -1219,7 +1211,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
}
} else {
if (!d_is_negative(newdentry)) {
- if (!new_opaque || !ovl_is_whiteout(newdentry))
+ if (!new_opaque || !ovl_upper_is_whiteout(ofs, newdentry))
goto out_dput;
} else {
if (flags & RENAME_EXCHANGE)
@@ -1286,8 +1278,8 @@ out_revert_creds:
revert_creds(old_cred);
if (update_nlink)
ovl_nlink_end(new);
-out_drop_write:
- ovl_drop_write(old);
+ else
+ ovl_drop_write(old);
out:
dput(opaquedir);
ovl_cache_free(&list);
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 26b782c53910..7e16bbcad95e 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -23,12 +23,7 @@ static int ovl_encode_maybe_copy_up(struct dentry *dentry)
if (ovl_dentry_upper(dentry))
return 0;
- err = ovl_want_write(dentry);
- if (!err) {
- err = ovl_copy_up(dentry);
- ovl_drop_write(dentry);
- }
-
+ err = ovl_copy_up(dentry);
if (err) {
pr_warn_ratelimited("failed to copy up on encode (%pd2, err=%i)\n",
dentry, err);
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index ec3671ca140c..131621daeb13 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -15,10 +15,15 @@
#include <linux/fs.h>
#include "overlayfs.h"
+#include "../internal.h" /* for sb_init_dio_done_wq */
+
struct ovl_aio_req {
struct kiocb iocb;
refcount_t ref;
struct kiocb *orig_iocb;
+ /* used for aio completion */
+ struct work_struct work;
+ long res;
};
static struct kmem_cache *ovl_aio_request_cachep;
@@ -235,6 +240,12 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
return ret;
}
+static void ovl_file_modified(struct file *file)
+{
+ /* Update size/mtime */
+ ovl_copyattr(file_inode(file));
+}
+
static void ovl_file_accessed(struct file *file)
{
struct inode *inode, *upperinode;
@@ -263,20 +274,12 @@ static void ovl_file_accessed(struct file *file)
touch_atime(&file->f_path);
}
-static rwf_t ovl_iocb_to_rwf(int ifl)
+#define OVL_IOCB_MASK \
+ (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND)
+
+static rwf_t iocb_to_rw_flags(int flags)
{
- rwf_t flags = 0;
-
- if (ifl & IOCB_NOWAIT)
- flags |= RWF_NOWAIT;
- if (ifl & IOCB_HIPRI)
- flags |= RWF_HIPRI;
- if (ifl & IOCB_DSYNC)
- flags |= RWF_DSYNC;
- if (ifl & IOCB_SYNC)
- flags |= RWF_SYNC;
-
- return flags;
+ return (__force rwf_t)(flags & OVL_IOCB_MASK);
}
static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
@@ -293,10 +296,8 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
struct kiocb *orig_iocb = aio_req->orig_iocb;
if (iocb->ki_flags & IOCB_WRITE) {
- struct inode *inode = file_inode(orig_iocb->ki_filp);
-
kiocb_end_write(iocb);
- ovl_copyattr(inode);
+ ovl_file_modified(orig_iocb->ki_filp);
}
orig_iocb->ki_pos = iocb->ki_pos;
@@ -313,6 +314,37 @@ static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
orig_iocb->ki_complete(orig_iocb, res);
}
+static void ovl_aio_complete_work(struct work_struct *work)
+{
+ struct ovl_aio_req *aio_req = container_of(work,
+ struct ovl_aio_req, work);
+
+ ovl_aio_rw_complete(&aio_req->iocb, aio_req->res);
+}
+
+static void ovl_aio_queue_completion(struct kiocb *iocb, long res)
+{
+ struct ovl_aio_req *aio_req = container_of(iocb,
+ struct ovl_aio_req, iocb);
+ struct kiocb *orig_iocb = aio_req->orig_iocb;
+
+ /*
+ * Punt to a work queue to serialize updates of mtime/size.
+ */
+ aio_req->res = res;
+ INIT_WORK(&aio_req->work, ovl_aio_complete_work);
+ queue_work(file_inode(orig_iocb->ki_filp)->i_sb->s_dio_done_wq,
+ &aio_req->work);
+}
+
+static int ovl_init_aio_done_wq(struct super_block *sb)
+{
+ if (sb->s_dio_done_wq)
+ return 0;
+
+ return sb_init_dio_done_wq(sb);
+}
+
static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
@@ -334,8 +366,9 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
old_cred = ovl_override_creds(file_inode(file)->i_sb);
if (is_sync_kiocb(iocb)) {
- ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
- ovl_iocb_to_rwf(iocb->ki_flags));
+ rwf_t rwf = iocb_to_rw_flags(iocb->ki_flags);
+
+ ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, rwf);
} else {
struct ovl_aio_req *aio_req;
@@ -401,15 +434,20 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
old_cred = ovl_override_creds(file_inode(file)->i_sb);
if (is_sync_kiocb(iocb)) {
+ rwf_t rwf = iocb_to_rw_flags(ifl);
+
file_start_write(real.file);
- ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
- ovl_iocb_to_rwf(ifl));
+ ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf);
file_end_write(real.file);
/* Update size */
- ovl_copyattr(inode);
+ ovl_file_modified(file);
} else {
struct ovl_aio_req *aio_req;
+ ret = ovl_init_aio_done_wq(inode->i_sb);
+ if (ret)
+ goto out;
+
ret = -ENOMEM;
aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
if (!aio_req)
@@ -418,7 +456,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
aio_req->orig_iocb = iocb;
kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
aio_req->iocb.ki_flags = ifl;
- aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+ aio_req->iocb.ki_complete = ovl_aio_queue_completion;
refcount_set(&aio_req->ref, 2);
kiocb_start_write(&aio_req->iocb);
ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
@@ -492,7 +530,7 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
file_end_write(real.file);
/* Update size */
- ovl_copyattr(inode);
+ ovl_file_modified(out);
revert_creds(old_cred);
fdput(real);
@@ -573,7 +611,7 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
revert_creds(old_cred);
/* Update size */
- ovl_copyattr(inode);
+ ovl_file_modified(file);
fdput(real);
@@ -657,7 +695,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
revert_creds(old_cred);
/* Update size */
- ovl_copyattr(inode_out);
+ ovl_file_modified(file_out);
fdput(real_in);
fdput(real_out);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index b6e98a7d36ce..345b8f161ca4 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -32,10 +32,6 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (err)
return err;
- err = ovl_want_write(dentry);
- if (err)
- goto out;
-
if (attr->ia_valid & ATTR_SIZE) {
/* Truncate should trigger data copy up as well */
full_copy_up = true;
@@ -54,7 +50,7 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
winode = d_inode(upperdentry);
err = get_write_access(winode);
if (err)
- goto out_drop_write;
+ goto out;
}
if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
@@ -78,6 +74,10 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
*/
attr->ia_valid &= ~ATTR_OPEN;
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out_put_write;
+
inode_lock(upperdentry->d_inode);
old_cred = ovl_override_creds(dentry->d_sb);
err = ovl_do_notify_change(ofs, upperdentry, attr);
@@ -85,12 +85,12 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (!err)
ovl_copyattr(dentry->d_inode);
inode_unlock(upperdentry->d_inode);
+ ovl_drop_write(dentry);
+out_put_write:
if (winode)
put_write_access(winode);
}
-out_drop_write:
- ovl_drop_write(dentry);
out:
return err;
}
@@ -339,130 +339,6 @@ static const char *ovl_get_link(struct dentry *dentry,
return p;
}
-bool ovl_is_private_xattr(struct super_block *sb, const char *name)
-{
- struct ovl_fs *ofs = OVL_FS(sb);
-
- if (ofs->config.userxattr)
- return strncmp(name, OVL_XATTR_USER_PREFIX,
- sizeof(OVL_XATTR_USER_PREFIX) - 1) == 0;
- else
- return strncmp(name, OVL_XATTR_TRUSTED_PREFIX,
- sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1) == 0;
-}
-
-int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
-{
- int err;
- struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
- struct dentry *upperdentry = ovl_i_dentry_upper(inode);
- struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
- struct path realpath;
- const struct cred *old_cred;
-
- err = ovl_want_write(dentry);
- if (err)
- goto out;
-
- if (!value && !upperdentry) {
- ovl_path_lower(dentry, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0);
- revert_creds(old_cred);
- if (err < 0)
- goto out_drop_write;
- }
-
- if (!upperdentry) {
- err = ovl_copy_up(dentry);
- if (err)
- goto out_drop_write;
-
- realdentry = ovl_dentry_upper(dentry);
- }
-
- old_cred = ovl_override_creds(dentry->d_sb);
- if (value) {
- err = ovl_do_setxattr(ofs, realdentry, name, value, size,
- flags);
- } else {
- WARN_ON(flags != XATTR_REPLACE);
- err = ovl_do_removexattr(ofs, realdentry, name);
- }
- revert_creds(old_cred);
-
- /* copy c/mtime */
- ovl_copyattr(inode);
-
-out_drop_write:
- ovl_drop_write(dentry);
-out:
- return err;
-}
-
-int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
- void *value, size_t size)
-{
- ssize_t res;
- const struct cred *old_cred;
- struct path realpath;
-
- ovl_i_path_real(inode, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
- res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size);
- revert_creds(old_cred);
- return res;
-}
-
-static bool ovl_can_list(struct super_block *sb, const char *s)
-{
- /* Never list private (.overlay) */
- if (ovl_is_private_xattr(sb, s))
- return false;
-
- /* List all non-trusted xattrs */
- if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
- return true;
-
- /* list other trusted for superuser only */
- return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
-}
-
-ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
-{
- struct dentry *realdentry = ovl_dentry_real(dentry);
- ssize_t res;
- size_t len;
- char *s;
- const struct cred *old_cred;
-
- old_cred = ovl_override_creds(dentry->d_sb);
- res = vfs_listxattr(realdentry, list, size);
- revert_creds(old_cred);
- if (res <= 0 || size == 0)
- return res;
-
- /* filter out private xattrs */
- for (s = list, len = res; len;) {
- size_t slen = strnlen(s, len) + 1;
-
- /* underlying fs providing us with an broken xattr list? */
- if (WARN_ON(slen > len))
- return -EIO;
-
- len -= slen;
- if (!ovl_can_list(dentry->d_sb, s)) {
- res -= slen;
- memmove(s, s + slen, len);
- } else {
- s += slen;
- }
- }
-
- return res;
-}
-
#ifdef CONFIG_FS_POSIX_ACL
/*
* Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone
@@ -611,10 +487,6 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
struct dentry *upperdentry = ovl_dentry_upper(dentry);
struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
- err = ovl_want_write(dentry);
- if (err)
- return err;
-
/*
* If ACL is to be removed from a lower file, check if it exists in
* the first place before copying it up.
@@ -630,7 +502,7 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
revert_creds(old_cred);
if (IS_ERR(real_acl)) {
err = PTR_ERR(real_acl);
- goto out_drop_write;
+ goto out;
}
posix_acl_release(real_acl);
}
@@ -638,23 +510,26 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
if (!upperdentry) {
err = ovl_copy_up(dentry);
if (err)
- goto out_drop_write;
+ goto out;
realdentry = ovl_dentry_upper(dentry);
}
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
old_cred = ovl_override_creds(dentry->d_sb);
if (acl)
err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
else
err = ovl_do_remove_acl(ofs, realdentry, acl_name);
revert_creds(old_cred);
+ ovl_drop_write(dentry);
/* copy c/mtime */
ovl_copyattr(inode);
-
-out_drop_write:
- ovl_drop_write(dentry);
+out:
return err;
}
@@ -778,14 +653,14 @@ int ovl_fileattr_set(struct mnt_idmap *idmap,
unsigned int flags;
int err;
- err = ovl_want_write(dentry);
- if (err)
- goto out;
-
err = ovl_copy_up(dentry);
if (!err) {
ovl_path_real(dentry, &upperpath);
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
old_cred = ovl_override_creds(inode->i_sb);
/*
* Store immutable/append-only flags in xattr and clear them
@@ -798,6 +673,7 @@ int ovl_fileattr_set(struct mnt_idmap *idmap,
if (!err)
err = ovl_real_fileattr_set(&upperpath, fa);
revert_creds(old_cred);
+ ovl_drop_write(dentry);
/*
* Merge real inode flags with inode flags read from
@@ -812,7 +688,6 @@ int ovl_fileattr_set(struct mnt_idmap *idmap,
/* Update ctime */
ovl_copyattr(inode);
}
- ovl_drop_write(dentry);
out:
return err;
}
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 80391c687c2a..03bc8d5dfa31 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -251,7 +251,10 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
err = -EREMOTE;
goto out_err;
}
- if (ovl_is_whiteout(this)) {
+
+ path.dentry = this;
+ path.mnt = d->mnt;
+ if (ovl_path_is_whiteout(OVL_FS(d->sb), &path)) {
d->stop = d->opaque = true;
goto put_and_out;
}
@@ -264,8 +267,6 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
goto put_and_out;
}
- path.dentry = this;
- path.mnt = d->mnt;
if (!d_can_lookup(this)) {
if (d->is_dir || !last_element) {
d->stop = true;
@@ -438,7 +439,7 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
else if (IS_ERR(origin))
return PTR_ERR(origin);
- if (upperdentry && !ovl_is_whiteout(upperdentry) &&
+ if (upperdentry && !ovl_upper_is_whiteout(ofs, upperdentry) &&
inode_wrong_type(d_inode(upperdentry), d_inode(origin)->i_mode))
goto invalid;
@@ -507,6 +508,19 @@ static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry,
return err;
}
+int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, const struct ovl_fh *fh,
+ bool is_upper, bool set)
+{
+ int err;
+
+ err = ovl_verify_fh(ofs, dentry, ox, fh);
+ if (set && err == -ENODATA)
+ err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len);
+
+ return err;
+}
+
/*
* Verify that @real dentry matches the file handle stored in xattr @name.
*
@@ -515,9 +529,9 @@ static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry,
*
* Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error.
*/
-int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
- enum ovl_xattr ox, struct dentry *real, bool is_upper,
- bool set)
+int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, struct dentry *real,
+ bool is_upper, bool set)
{
struct inode *inode;
struct ovl_fh *fh;
@@ -530,9 +544,7 @@ int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
goto fail;
}
- err = ovl_verify_fh(ofs, dentry, ox, fh);
- if (set && err == -ENODATA)
- err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len);
+ err = ovl_verify_set_fh(ofs, dentry, ox, fh, is_upper, set);
if (err)
goto fail;
@@ -548,6 +560,7 @@ fail:
goto out;
}
+
/* Get upper dentry from index */
struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index,
bool connected)
@@ -684,7 +697,7 @@ orphan:
goto out;
}
-static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name)
+int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name)
{
char *n, *s;
@@ -873,20 +886,27 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry,
struct dentry *lower, struct dentry *upper)
{
+ const struct ovl_fh *fh;
int err;
if (ovl_check_origin_xattr(ofs, upper))
return 0;
+ fh = ovl_get_origin_fh(ofs, lower);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+
err = ovl_want_write(dentry);
if (err)
- return err;
+ goto out;
- err = ovl_set_origin(ofs, lower, upper);
+ err = ovl_set_origin_fh(ofs, fh, upper);
if (!err)
err = ovl_set_impure(dentry->d_parent, upper->d_parent);
ovl_drop_write(dentry);
+out:
+ kfree(fh);
return err;
}
@@ -1383,7 +1403,11 @@ bool ovl_lower_positive(struct dentry *dentry)
break;
}
} else {
- positive = !ovl_is_whiteout(this);
+ struct path path = {
+ .dentry = this,
+ .mnt = parentpath->layer->mnt,
+ };
+ positive = !ovl_path_is_whiteout(OVL_FS(dentry->d_sb), &path);
done = true;
dput(this);
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 9817b2dcb132..ca88b2636a57 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -28,7 +28,16 @@ enum ovl_path_type {
#define OVL_XATTR_NAMESPACE "overlay."
#define OVL_XATTR_TRUSTED_PREFIX XATTR_TRUSTED_PREFIX OVL_XATTR_NAMESPACE
+#define OVL_XATTR_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1)
#define OVL_XATTR_USER_PREFIX XATTR_USER_PREFIX OVL_XATTR_NAMESPACE
+#define OVL_XATTR_USER_PREFIX_LEN (sizeof(OVL_XATTR_USER_PREFIX) - 1)
+
+#define OVL_XATTR_ESCAPE_PREFIX OVL_XATTR_NAMESPACE
+#define OVL_XATTR_ESCAPE_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_PREFIX) - 1)
+#define OVL_XATTR_ESCAPE_TRUSTED_PREFIX OVL_XATTR_TRUSTED_PREFIX OVL_XATTR_ESCAPE_PREFIX
+#define OVL_XATTR_ESCAPE_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_TRUSTED_PREFIX) - 1)
+#define OVL_XATTR_ESCAPE_USER_PREFIX OVL_XATTR_USER_PREFIX OVL_XATTR_ESCAPE_PREFIX
+#define OVL_XATTR_ESCAPE_USER_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_USER_PREFIX) - 1)
enum ovl_xattr {
OVL_XATTR_OPAQUE,
@@ -40,6 +49,8 @@ enum ovl_xattr {
OVL_XATTR_UUID,
OVL_XATTR_METACOPY,
OVL_XATTR_PROTATTR,
+ OVL_XATTR_XWHITEOUT,
+ OVL_XATTR_XWHITEOUTS,
};
enum ovl_inode_flag {
@@ -398,6 +409,10 @@ static inline bool ovl_open_flags_need_copy_up(int flags)
}
/* util.c */
+int ovl_get_write_access(struct dentry *dentry);
+void ovl_put_write_access(struct dentry *dentry);
+void ovl_start_write(struct dentry *dentry);
+void ovl_end_write(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
struct dentry *ovl_workdir(struct dentry *dentry);
@@ -460,6 +475,7 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
void ovl_dir_modified(struct dentry *dentry, bool impurity);
u64 ovl_inode_version_get(struct inode *inode);
bool ovl_is_whiteout(struct dentry *dentry);
+bool ovl_path_is_whiteout(struct ovl_fs *ofs, const struct path *path);
struct file *ovl_path_open(const struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry, int flags);
void ovl_copy_up_end(struct dentry *dentry);
@@ -467,9 +483,21 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags);
bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
enum ovl_xattr ox);
bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path);
+bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path);
+bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
const struct path *upperpath);
+static inline bool ovl_upper_is_whiteout(struct ovl_fs *ofs,
+ struct dentry *upperdentry)
+{
+ struct path upperpath = {
+ .dentry = upperdentry,
+ .mnt = ovl_upper_mnt(ofs),
+ };
+ return ovl_path_is_whiteout(ofs, &upperpath);
+}
+
static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs,
struct dentry *upperdentry)
{
@@ -624,11 +652,15 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
struct dentry *upperdentry, struct ovl_path **stackp);
int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
- enum ovl_xattr ox, struct dentry *real, bool is_upper,
- bool set);
+ enum ovl_xattr ox, const struct ovl_fh *fh,
+ bool is_upper, bool set);
+int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, struct dentry *real,
+ bool is_upper, bool set);
struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index,
bool connected);
int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index);
+int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name);
int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
struct qstr *name);
struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh);
@@ -640,17 +672,24 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
bool ovl_lower_positive(struct dentry *dentry);
+static inline int ovl_verify_origin_fh(struct ovl_fs *ofs, struct dentry *upper,
+ const struct ovl_fh *fh, bool set)
+{
+ return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, fh, false, set);
+}
+
static inline int ovl_verify_origin(struct ovl_fs *ofs, struct dentry *upper,
struct dentry *origin, bool set)
{
- return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, origin,
- false, set);
+ return ovl_verify_origin_xattr(ofs, upper, OVL_XATTR_ORIGIN, origin,
+ false, set);
}
static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index,
struct dentry *upper, bool set)
{
- return ovl_verify_set_fh(ofs, index, OVL_XATTR_UPPER, upper, true, set);
+ return ovl_verify_origin_xattr(ofs, index, OVL_XATTR_UPPER, upper,
+ true, set);
}
/* readdir.c */
@@ -684,17 +723,8 @@ int ovl_set_nlink_lower(struct dentry *dentry);
unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
struct dentry *upperdentry,
unsigned int fallback);
-int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
- struct iattr *attr);
-int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
- struct kstat *stat, u32 request_mask, unsigned int flags);
int ovl_permission(struct mnt_idmap *idmap, struct inode *inode,
int mask);
-int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
- const void *value, size_t size, int flags);
-int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
- void *value, size_t size);
-ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap,
@@ -815,8 +845,9 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentr
int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat);
struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
bool is_upper);
-int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower,
- struct dentry *upper);
+struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin);
+int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh,
+ struct dentry *upper);
/* export.c */
extern const struct export_operations ovl_export_operations;
@@ -830,3 +861,12 @@ static inline bool ovl_force_readonly(struct ovl_fs *ofs)
{
return (!ovl_upper_mnt(ofs) || !ofs->workdir);
}
+
+/* xattr.c */
+
+const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs);
+int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr);
+int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags);
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index f6ff23fd101c..3fe2dde1598f 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -43,8 +43,10 @@ module_param_named(metacopy, ovl_metacopy_def, bool, 0644);
MODULE_PARM_DESC(metacopy,
"Default to on or off for the metadata only copy up feature");
-enum {
+enum ovl_opt {
Opt_lowerdir,
+ Opt_lowerdir_add,
+ Opt_datadir_add,
Opt_upperdir,
Opt_workdir,
Opt_default_permissions,
@@ -140,8 +142,11 @@ static int ovl_verity_mode_def(void)
#define fsparam_string_empty(NAME, OPT) \
__fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL)
+
const struct fs_parameter_spec ovl_parameter_spec[] = {
fsparam_string_empty("lowerdir", Opt_lowerdir),
+ fsparam_string("lowerdir+", Opt_lowerdir_add),
+ fsparam_string("datadir+", Opt_datadir_add),
fsparam_string("upperdir", Opt_upperdir),
fsparam_string("workdir", Opt_workdir),
fsparam_flag("default_permissions", Opt_default_permissions),
@@ -238,19 +243,8 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path)
pr_err("failed to resolve '%s': %i\n", name, err);
goto out;
}
- err = -EINVAL;
- if (ovl_dentry_weird(path->dentry)) {
- pr_err("filesystem on '%s' not supported\n", name);
- goto out_put;
- }
- if (!d_is_dir(path->dentry)) {
- pr_err("'%s' not a directory\n", name);
- goto out_put;
- }
return 0;
-out_put:
- path_put_init(path);
out:
return err;
}
@@ -268,7 +262,7 @@ static void ovl_unescape(char *s)
}
}
-static int ovl_mount_dir(const char *name, struct path *path, bool upper)
+static int ovl_mount_dir(const char *name, struct path *path)
{
int err = -ENOMEM;
char *tmp = kstrdup(name, GFP_KERNEL);
@@ -276,68 +270,147 @@ static int ovl_mount_dir(const char *name, struct path *path, bool upper)
if (tmp) {
ovl_unescape(tmp);
err = ovl_mount_dir_noesc(tmp, path);
-
- if (!err && upper && path->dentry->d_flags & DCACHE_OP_REAL) {
- pr_err("filesystem on '%s' not supported as upperdir\n",
- tmp);
- path_put_init(path);
- err = -EINVAL;
- }
kfree(tmp);
}
return err;
}
-static int ovl_parse_param_upperdir(const char *name, struct fs_context *fc,
- bool workdir)
+static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path,
+ enum ovl_opt layer, const char *name, bool upper)
{
- int err;
- struct ovl_fs *ofs = fc->s_fs_info;
- struct ovl_config *config = &ofs->config;
struct ovl_fs_context *ctx = fc->fs_private;
- struct path path;
- char *dup;
- err = ovl_mount_dir(name, &path, true);
- if (err)
- return err;
+ if (ovl_dentry_weird(path->dentry))
+ return invalfc(fc, "filesystem on %s not supported", name);
+
+ if (!d_is_dir(path->dentry))
+ return invalfc(fc, "%s is not a directory", name);
+
/*
* Check whether upper path is read-only here to report failures
* early. Don't forget to recheck when the superblock is created
* as the mount attributes could change.
*/
- if (__mnt_is_readonly(path.mnt)) {
- path_put(&path);
- return -EINVAL;
+ if (upper) {
+ if (path->dentry->d_flags & DCACHE_OP_REAL)
+ return invalfc(fc, "filesystem on %s not supported as upperdir", name);
+ if (__mnt_is_readonly(path->mnt))
+ return invalfc(fc, "filesystem on %s is read-only", name);
+ } else {
+ if (ctx->lowerdir_all && layer != Opt_lowerdir)
+ return invalfc(fc, "lowerdir+ and datadir+ cannot follow lowerdir");
+ if (ctx->nr_data && layer == Opt_lowerdir_add)
+ return invalfc(fc, "regular lower layers cannot follow data layers");
+ if (ctx->nr == OVL_MAX_STACK)
+ return invalfc(fc, "too many lower directories, limit is %d",
+ OVL_MAX_STACK);
}
+ return 0;
+}
- dup = kstrdup(name, GFP_KERNEL);
- if (!dup) {
- path_put(&path);
+static int ovl_ctx_realloc_lower(struct fs_context *fc)
+{
+ struct ovl_fs_context *ctx = fc->fs_private;
+ struct ovl_fs_context_layer *l;
+ size_t nr;
+
+ if (ctx->nr < ctx->capacity)
+ return 0;
+
+ nr = min_t(size_t, max(4096 / sizeof(*l), ctx->capacity * 2),
+ OVL_MAX_STACK);
+ l = krealloc_array(ctx->lower, nr, sizeof(*l), GFP_KERNEL_ACCOUNT);
+ if (!l)
return -ENOMEM;
+
+ ctx->lower = l;
+ ctx->capacity = nr;
+ return 0;
+}
+
+static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer,
+ struct path *path, char **pname)
+{
+ struct ovl_fs *ofs = fc->s_fs_info;
+ struct ovl_config *config = &ofs->config;
+ struct ovl_fs_context *ctx = fc->fs_private;
+ struct ovl_fs_context_layer *l;
+
+ switch (layer) {
+ case Opt_workdir:
+ swap(config->workdir, *pname);
+ swap(ctx->work, *path);
+ break;
+ case Opt_upperdir:
+ swap(config->upperdir, *pname);
+ swap(ctx->upper, *path);
+ break;
+ case Opt_datadir_add:
+ ctx->nr_data++;
+ fallthrough;
+ case Opt_lowerdir_add:
+ WARN_ON(ctx->nr >= ctx->capacity);
+ l = &ctx->lower[ctx->nr++];
+ memset(l, 0, sizeof(*l));
+ swap(l->name, *pname);
+ swap(l->path, *path);
+ break;
+ default:
+ WARN_ON(1);
}
+}
- if (workdir) {
- kfree(config->workdir);
- config->workdir = dup;
- path_put(&ctx->work);
- ctx->work = path;
- } else {
- kfree(config->upperdir);
- config->upperdir = dup;
- path_put(&ctx->upper);
- ctx->upper = path;
+static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param,
+ enum ovl_opt layer)
+{
+ char *name = kstrdup(param->string, GFP_KERNEL);
+ bool upper = (layer == Opt_upperdir || layer == Opt_workdir);
+ struct path path;
+ int err;
+
+ if (!name)
+ return -ENOMEM;
+
+ if (upper)
+ err = ovl_mount_dir(name, &path);
+ else
+ err = ovl_mount_dir_noesc(name, &path);
+ if (err)
+ goto out_free;
+
+ err = ovl_mount_dir_check(fc, &path, layer, name, upper);
+ if (err)
+ goto out_put;
+
+ if (!upper) {
+ err = ovl_ctx_realloc_lower(fc);
+ if (err)
+ goto out_put;
}
- return 0;
+
+ /* Store the user provided path string in ctx to show in mountinfo */
+ ovl_add_layer(fc, layer, &path, &name);
+
+out_put:
+ path_put(&path);
+out_free:
+ kfree(name);
+ return err;
}
-static void ovl_parse_param_drop_lowerdir(struct ovl_fs_context *ctx)
+static void ovl_reset_lowerdirs(struct ovl_fs_context *ctx)
{
- for (size_t nr = 0; nr < ctx->nr; nr++) {
- path_put(&ctx->lower[nr].path);
- kfree(ctx->lower[nr].name);
- ctx->lower[nr].name = NULL;
+ struct ovl_fs_context_layer *l = ctx->lower;
+
+ // Reset old user provided lowerdir string
+ kfree(ctx->lowerdir_all);
+ ctx->lowerdir_all = NULL;
+
+ for (size_t nr = 0; nr < ctx->nr; nr++, l++) {
+ path_put(&l->path);
+ kfree(l->name);
+ l->name = NULL;
}
ctx->nr = 0;
ctx->nr_data = 0;
@@ -346,7 +419,7 @@ static void ovl_parse_param_drop_lowerdir(struct ovl_fs_context *ctx)
/*
* Parse lowerdir= mount option:
*
- * (1) lowerdir=/lower1:/lower2:/lower3::/data1::/data2
+ * e.g.: lowerdir=/lower1:/lower2:/lower3::/data1::/data2
* Set "/lower1", "/lower2", and "/lower3" as lower layers and
* "/data1" and "/data2" as data lower layers. Any existing lower
* layers are replaced.
@@ -356,9 +429,9 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
int err;
struct ovl_fs_context *ctx = fc->fs_private;
struct ovl_fs_context_layer *l;
- char *dup = NULL, *dup_iter;
- ssize_t nr_lower = 0, nr = 0, nr_data = 0;
- bool append = false, data_layer = false;
+ char *dup = NULL, *iter;
+ ssize_t nr_lower, nr;
+ bool data_layer = false;
/*
* Ensure we're backwards compatible with mount(2)
@@ -366,16 +439,21 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
*/
/* drop all existing lower layers */
- if (!*name) {
- ovl_parse_param_drop_lowerdir(ctx);
+ ovl_reset_lowerdirs(ctx);
+
+ if (!*name)
return 0;
- }
if (*name == ':') {
pr_err("cannot append lower layer");
return -EINVAL;
}
+ // Store user provided lowerdir string to show in mount options
+ ctx->lowerdir_all = kstrdup(name, GFP_KERNEL);
+ if (!ctx->lowerdir_all)
+ return -ENOMEM;
+
dup = kstrdup(name, GFP_KERNEL);
if (!dup)
return -ENOMEM;
@@ -385,36 +463,11 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
if (nr_lower < 0)
goto out_err;
- if ((nr_lower > OVL_MAX_STACK) ||
- (append && (size_add(ctx->nr, nr_lower) > OVL_MAX_STACK))) {
+ if (nr_lower > OVL_MAX_STACK) {
pr_err("too many lower directories, limit is %d\n", OVL_MAX_STACK);
goto out_err;
}
- if (!append)
- ovl_parse_param_drop_lowerdir(ctx);
-
- /*
- * (1) append
- *
- * We want nr <= nr_lower <= capacity We know nr > 0 and nr <=
- * capacity. If nr == 0 this wouldn't be append. If nr +
- * nr_lower is <= capacity then nr <= nr_lower <= capacity
- * already holds. If nr + nr_lower exceeds capacity, we realloc.
- *
- * (2) replace
- *
- * Ensure we're backwards compatible with mount(2) which allows
- * "lowerdir=/a:/b:/c,lowerdir=/d:/e:/f" causing the last
- * specified lowerdir mount option to win.
- *
- * We want nr <= nr_lower <= capacity We know either (i) nr == 0
- * or (ii) nr > 0. We also know nr_lower > 0. The capacity
- * could've been changed multiple times already so we only know
- * nr <= capacity. If nr + nr_lower > capacity we realloc,
- * otherwise nr <= nr_lower <= capacity holds already.
- */
- nr_lower += ctx->nr;
if (nr_lower > ctx->capacity) {
err = -ENOMEM;
l = krealloc_array(ctx->lower, nr_lower, sizeof(*ctx->lower),
@@ -426,59 +479,40 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
ctx->capacity = nr_lower;
}
- /*
- * (3) By (1) and (2) we know nr <= nr_lower <= capacity.
- * (4) If ctx->nr == 0 => replace
- * We have verified above that the lowerdir mount option
- * isn't an append, i.e., the lowerdir mount option
- * doesn't start with ":" or "::".
- * (4.1) The lowerdir mount options only contains regular lower
- * layers ":".
- * => Nothing to verify.
- * (4.2) The lowerdir mount options contains regular ":" and
- * data "::" layers.
- * => We need to verify that data lower layers "::" aren't
- * followed by regular ":" lower layers
- * (5) If ctx->nr > 0 => append
- * We know that there's at least one regular layer
- * otherwise we would've failed when parsing the previous
- * lowerdir mount option.
- * (5.1) The lowerdir mount option is a regular layer ":" append
- * => We need to verify that no data layers have been
- * specified before.
- * (5.2) The lowerdir mount option is a data layer "::" append
- * We know that there's at least one regular layer or
- * other data layers. => There's nothing to verify.
- */
- dup_iter = dup;
- for (nr = ctx->nr; nr < nr_lower; nr++) {
- l = &ctx->lower[nr];
+ iter = dup;
+ l = ctx->lower;
+ for (nr = 0; nr < nr_lower; nr++, l++) {
+ ctx->nr++;
memset(l, 0, sizeof(*l));
- err = ovl_mount_dir(dup_iter, &l->path, false);
+ err = ovl_mount_dir(iter, &l->path);
+ if (err)
+ goto out_put;
+
+ err = ovl_mount_dir_check(fc, &l->path, Opt_lowerdir, iter, false);
if (err)
goto out_put;
err = -ENOMEM;
- l->name = kstrdup(dup_iter, GFP_KERNEL_ACCOUNT);
+ l->name = kstrdup(iter, GFP_KERNEL_ACCOUNT);
if (!l->name)
goto out_put;
if (data_layer)
- nr_data++;
+ ctx->nr_data++;
/* Calling strchr() again would overrun. */
- if ((nr + 1) == nr_lower)
+ if (ctx->nr == nr_lower)
break;
err = -EINVAL;
- dup_iter = strchr(dup_iter, '\0') + 1;
- if (*dup_iter) {
+ iter = strchr(iter, '\0') + 1;
+ if (*iter) {
/*
* This is a regular layer so we require that
* there are no data layers.
*/
- if ((ctx->nr_data + nr_data) > 0) {
+ if (ctx->nr_data > 0) {
pr_err("regular lower layers cannot follow data lower layers");
goto out_put;
}
@@ -489,29 +523,13 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
/* This is a data lower layer. */
data_layer = true;
- dup_iter++;
+ iter++;
}
- ctx->nr = nr_lower;
- ctx->nr_data += nr_data;
kfree(dup);
return 0;
out_put:
- /*
- * We know nr >= ctx->nr < nr_lower. If we failed somewhere
- * we want to undo until nr == ctx->nr. This is correct for
- * both ctx->nr == 0 and ctx->nr > 0.
- */
- for (; nr >= ctx->nr; nr--) {
- l = &ctx->lower[nr];
- kfree(l->name);
- l->name = NULL;
- path_put(&l->path);
-
- /* don't overflow */
- if (nr == 0)
- break;
- }
+ ovl_reset_lowerdirs(ctx);
out_err:
kfree(dup);
@@ -556,11 +574,11 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_lowerdir:
err = ovl_parse_param_lowerdir(param->string, fc);
break;
+ case Opt_lowerdir_add:
+ case Opt_datadir_add:
case Opt_upperdir:
- fallthrough;
case Opt_workdir:
- err = ovl_parse_param_upperdir(param->string, fc,
- (Opt_workdir == opt));
+ err = ovl_parse_layer(fc, param, opt);
break;
case Opt_default_permissions:
config->default_permissions = true;
@@ -617,7 +635,7 @@ static int ovl_get_tree(struct fs_context *fc)
static inline void ovl_fs_context_free(struct ovl_fs_context *ctx)
{
- ovl_parse_param_drop_lowerdir(ctx);
+ ovl_reset_lowerdirs(ctx);
path_put(&ctx->upper);
path_put(&ctx->work);
kfree(ctx->lower);
@@ -933,23 +951,28 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry)
{
struct super_block *sb = dentry->d_sb;
struct ovl_fs *ofs = OVL_FS(sb);
- size_t nr, nr_merged_lower = ofs->numlayer - ofs->numdatalayer;
+ size_t nr, nr_merged_lower, nr_lower = 0;
+ char **lowerdirs = ofs->config.lowerdirs;
/*
- * lowerdirs[] starts from offset 1, then
- * >= 0 regular lower layers prefixed with : and
- * >= 0 data-only lower layers prefixed with ::
- *
- * we need to escase comma and space like seq_show_option() does and
- * we also need to escape the colon separator from lowerdir paths.
+ * lowerdirs[0] holds the colon separated list that user provided
+ * with lowerdir mount option.
+ * lowerdirs[1..numlayer] hold the lowerdir paths that were added
+ * using the lowerdir+ and datadir+ mount options.
+ * For now, we do not allow mixing the legacy lowerdir mount option
+ * with the new lowerdir+ and datadir+ mount options.
*/
- seq_puts(m, ",lowerdir=");
- for (nr = 1; nr < ofs->numlayer; nr++) {
- if (nr > 1)
- seq_putc(m, ':');
- if (nr >= nr_merged_lower)
- seq_putc(m, ':');
- seq_escape(m, ofs->config.lowerdirs[nr], ":, \t\n\\");
+ if (lowerdirs[0]) {
+ seq_show_option(m, "lowerdir", lowerdirs[0]);
+ } else {
+ nr_lower = ofs->numlayer;
+ nr_merged_lower = nr_lower - ofs->numdatalayer;
+ }
+ for (nr = 1; nr < nr_lower; nr++) {
+ if (nr < nr_merged_lower)
+ seq_show_option(m, "lowerdir+", lowerdirs[nr]);
+ else
+ seq_show_option(m, "datadir+", lowerdirs[nr]);
}
if (ofs->config.upperdir) {
seq_show_option(m, "upperdir", ofs->config.upperdir);
diff --git a/fs/overlayfs/params.h b/fs/overlayfs/params.h
index 8750da68ab2a..c96d93982021 100644
--- a/fs/overlayfs/params.h
+++ b/fs/overlayfs/params.h
@@ -32,6 +32,7 @@ struct ovl_fs_context {
size_t nr_data;
struct ovl_opt_set set;
struct ovl_fs_context_layer *lower;
+ char *lowerdir_all; /* user provided lowerdir string */
};
int ovl_init_fs_context(struct fs_context *fc);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index de39e067ae65..a490fc47c3e7 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -25,6 +25,7 @@ struct ovl_cache_entry {
struct ovl_cache_entry *next_maybe_whiteout;
bool is_upper;
bool is_whiteout;
+ bool check_xwhiteout;
char name[];
};
@@ -47,6 +48,7 @@ struct ovl_readdir_data {
int err;
bool is_upper;
bool d_type_supported;
+ bool in_xwhiteouts_dir;
};
struct ovl_dir_file {
@@ -162,6 +164,8 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
p->ino = 0;
p->is_upper = rdd->is_upper;
p->is_whiteout = false;
+ /* Defer check for overlay.whiteout to ovl_iterate() */
+ p->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG;
if (d_type == DT_CHR) {
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
@@ -301,6 +305,8 @@ static inline int ovl_dir_read(const struct path *realpath,
if (IS_ERR(realfile))
return PTR_ERR(realfile);
+ rdd->in_xwhiteouts_dir = rdd->dentry &&
+ ovl_path_check_xwhiteouts_xattr(OVL_FS(rdd->dentry->d_sb), realpath);
rdd->first_maybe_whiteout = NULL;
rdd->ctx.pos = 0;
do {
@@ -447,7 +453,7 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
}
/*
- * Set d_ino for upper entries. Non-upper entries should always report
+ * Set d_ino for upper entries if needed. Non-upper entries should always report
* the uppermost real inode ino and should not call this function.
*
* When not all layer are on same fs, report real ino also for upper.
@@ -455,8 +461,11 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
* When all layers are on the same fs, and upper has a reference to
* copy up origin, call vfs_getattr() on the overlay entry to make
* sure that d_ino will be consistent with st_ino from stat(2).
+ *
+ * Also checks the overlay.whiteout xattr by doing a full lookup which will return
+ * negative in this case.
*/
-static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry *p)
+static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p, bool update_ino)
{
struct dentry *dir = path->dentry;
@@ -467,7 +476,7 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry
int xinobits = ovl_xino_bits(ofs);
int err = 0;
- if (!ovl_same_dev(ofs))
+ if (!ovl_same_dev(ofs) && !p->check_xwhiteout)
goto out;
if (p->name[0] == '.') {
@@ -481,6 +490,7 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry
goto get;
}
}
+ /* This checks also for xwhiteouts */
this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len);
if (IS_ERR_OR_NULL(this) || !this->d_inode) {
/* Mark a stale entry */
@@ -494,6 +504,9 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry
}
get:
+ if (!ovl_same_dev(ofs) || !update_ino)
+ goto out;
+
type = ovl_path_type(this);
if (OVL_TYPE_ORIGIN(type)) {
struct kstat stat;
@@ -572,7 +585,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list,
list_for_each_entry_safe(p, n, list, l_node) {
if (strcmp(p->name, ".") != 0 &&
strcmp(p->name, "..") != 0) {
- err = ovl_cache_update_ino(path, p);
+ err = ovl_cache_update(path, p, true);
if (err)
return err;
}
@@ -778,13 +791,13 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
if (!p->is_whiteout) {
- if (!p->ino) {
- err = ovl_cache_update_ino(&file->f_path, p);
+ if (!p->ino || p->check_xwhiteout) {
+ err = ovl_cache_update(&file->f_path, p, !p->ino);
if (err)
goto out;
}
}
- /* ovl_cache_update_ino() sets is_whiteout on stale entry */
+ /* ovl_cache_update() sets is_whiteout on stale entry */
if (!p->is_whiteout) {
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
break;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 17864a8d2b85..a0967bb25003 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -445,68 +445,6 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
return ok;
}
-static int ovl_own_xattr_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, void *buffer, size_t size)
-{
- return -EOPNOTSUPP;
-}
-
-static int ovl_own_xattr_set(const struct xattr_handler *handler,
- struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static int ovl_other_xattr_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, void *buffer, size_t size)
-{
- return ovl_xattr_get(dentry, inode, name, buffer, size);
-}
-
-static int ovl_other_xattr_set(const struct xattr_handler *handler,
- struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- return ovl_xattr_set(dentry, inode, name, value, size, flags);
-}
-
-static const struct xattr_handler ovl_own_trusted_xattr_handler = {
- .prefix = OVL_XATTR_TRUSTED_PREFIX,
- .get = ovl_own_xattr_get,
- .set = ovl_own_xattr_set,
-};
-
-static const struct xattr_handler ovl_own_user_xattr_handler = {
- .prefix = OVL_XATTR_USER_PREFIX,
- .get = ovl_own_xattr_get,
- .set = ovl_own_xattr_set,
-};
-
-static const struct xattr_handler ovl_other_xattr_handler = {
- .prefix = "", /* catch all */
- .get = ovl_other_xattr_get,
- .set = ovl_other_xattr_set,
-};
-
-static const struct xattr_handler * const ovl_trusted_xattr_handlers[] = {
- &ovl_own_trusted_xattr_handler,
- &ovl_other_xattr_handler,
- NULL
-};
-
-static const struct xattr_handler * const ovl_user_xattr_handlers[] = {
- &ovl_own_user_xattr_handler,
- &ovl_other_xattr_handler,
- NULL
-};
-
static int ovl_setup_trap(struct super_block *sb, struct dentry *dir,
struct inode **ptrap, const char *name)
{
@@ -647,7 +585,7 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
if (IS_ERR(whiteout))
goto cleanup_temp;
- err = ovl_is_whiteout(whiteout);
+ err = ovl_upper_is_whiteout(ofs, whiteout);
/* Best effort cleanup of whiteout and temp file */
if (err)
@@ -887,15 +825,20 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
{
struct vfsmount *mnt = ovl_upper_mnt(ofs);
struct dentry *indexdir;
+ struct dentry *origin = ovl_lowerstack(oe)->dentry;
+ const struct ovl_fh *fh;
int err;
+ fh = ovl_get_origin_fh(ofs, origin);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+
err = mnt_want_write(mnt);
if (err)
- return err;
+ goto out_free_fh;
/* Verify lower root is upper root origin */
- err = ovl_verify_origin(ofs, upperpath->dentry,
- ovl_lowerstack(oe)->dentry, true);
+ err = ovl_verify_origin_fh(ofs, upperpath->dentry, fh, true);
if (err) {
pr_err("failed to verify upper root origin\n");
goto out;
@@ -927,9 +870,10 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
* directory entries.
*/
if (ovl_check_origin_xattr(ofs, ofs->indexdir)) {
- err = ovl_verify_set_fh(ofs, ofs->indexdir,
- OVL_XATTR_ORIGIN,
- upperpath->dentry, true, false);
+ err = ovl_verify_origin_xattr(ofs, ofs->indexdir,
+ OVL_XATTR_ORIGIN,
+ upperpath->dentry, true,
+ false);
if (err)
pr_err("failed to verify index dir 'origin' xattr\n");
}
@@ -947,6 +891,8 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
out:
mnt_drop_write(mnt);
+out_free_fh:
+ kfree(fh);
return err;
}
@@ -1382,8 +1328,11 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
ofs->layers = layers;
/*
* Layer 0 is reserved for upper even if there's no upper.
- * For consistency, config.lowerdirs[0] is NULL.
+ * config.lowerdirs[0] is used for storing the user provided colon
+ * separated lowerdir string.
*/
+ ofs->config.lowerdirs[0] = ctx->lowerdir_all;
+ ctx->lowerdir_all = NULL;
ofs->numlayer = 1;
sb->s_stack_depth = 0;
@@ -1493,8 +1442,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
sb->s_magic = OVERLAYFS_SUPER_MAGIC;
- sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
- ovl_trusted_xattr_handlers;
+ sb->s_xattr = ovl_xattr_handlers(ofs);
sb->s_fs_info = ofs;
#ifdef CONFIG_FS_POSIX_ACL
sb->s_flags |= SB_POSIXACL;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 868afd8834c3..c3f020ca13a8 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -17,12 +17,38 @@
#include <linux/ratelimit.h>
#include "overlayfs.h"
+/* Get write access to upper mnt - may fail if upper sb was remounted ro */
+int ovl_get_write_access(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ return mnt_get_write_access(ovl_upper_mnt(ofs));
+}
+
+/* Get write access to upper sb - may block if upper sb is frozen */
+void ovl_start_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ sb_start_write(ovl_upper_mnt(ofs)->mnt_sb);
+}
+
int ovl_want_write(struct dentry *dentry)
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
return mnt_want_write(ovl_upper_mnt(ofs));
}
+void ovl_put_write_access(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ mnt_put_write_access(ovl_upper_mnt(ofs));
+}
+
+void ovl_end_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ sb_end_write(ovl_upper_mnt(ofs)->mnt_sb);
+}
+
void ovl_drop_write(struct dentry *dentry)
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
@@ -55,7 +81,7 @@ int ovl_can_decode_fh(struct super_block *sb)
if (!capable(CAP_DAC_READ_SEARCH))
return 0;
- if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry)
+ if (!exportfs_can_decode_fh(sb->s_export_op))
return 0;
return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN;
@@ -575,6 +601,16 @@ bool ovl_is_whiteout(struct dentry *dentry)
return inode && IS_WHITEOUT(inode);
}
+/*
+ * Use this over ovl_is_whiteout for upper and lower files, as it also
+ * handles overlay.whiteout xattr whiteout files.
+ */
+bool ovl_path_is_whiteout(struct ovl_fs *ofs, const struct path *path)
+{
+ return ovl_is_whiteout(path->dentry) ||
+ ovl_path_check_xwhiteout_xattr(ofs, path);
+}
+
struct file *ovl_path_open(const struct path *path, int flags)
{
struct inode *inode = d_inode(path->dentry);
@@ -644,22 +680,36 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags)
return false;
}
+/*
+ * The copy up "transaction" keeps an elevated mnt write count on upper mnt,
+ * but leaves taking freeze protection on upper sb to lower level helpers.
+ */
int ovl_copy_up_start(struct dentry *dentry, int flags)
{
struct inode *inode = d_inode(dentry);
int err;
err = ovl_inode_lock_interruptible(inode);
- if (!err && ovl_already_copied_up_locked(dentry, flags)) {
+ if (err)
+ return err;
+
+ if (ovl_already_copied_up_locked(dentry, flags))
err = 1; /* Already copied up */
- ovl_inode_unlock(inode);
- }
+ else
+ err = ovl_get_write_access(dentry);
+ if (err)
+ goto out_unlock;
+
+ return 0;
+out_unlock:
+ ovl_inode_unlock(inode);
return err;
}
void ovl_copy_up_end(struct dentry *dentry)
{
+ ovl_put_write_access(dentry);
ovl_inode_unlock(d_inode(dentry));
}
@@ -676,6 +726,32 @@ bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path)
return false;
}
+bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path)
+{
+ struct dentry *dentry = path->dentry;
+ int res;
+
+ /* xattr.whiteout must be a zero size regular file */
+ if (!d_is_reg(dentry) || i_size_read(d_inode(dentry)) != 0)
+ return false;
+
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUT, NULL, 0);
+ return res >= 0;
+}
+
+bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path)
+{
+ struct dentry *dentry = path->dentry;
+ int res;
+
+ /* xattr.whiteouts must be a directory */
+ if (!d_is_dir(dentry))
+ return false;
+
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUTS, NULL, 0);
+ return res >= 0;
+}
+
/*
* Load persistent uuid from xattr into s_uuid if found, or store a new
* random generated value in s_uuid and in xattr.
@@ -760,6 +836,8 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
#define OVL_XATTR_UUID_POSTFIX "uuid"
#define OVL_XATTR_METACOPY_POSTFIX "metacopy"
#define OVL_XATTR_PROTATTR_POSTFIX "protattr"
+#define OVL_XATTR_XWHITEOUT_POSTFIX "whiteout"
+#define OVL_XATTR_XWHITEOUTS_POSTFIX "whiteouts"
#define OVL_XATTR_TAB_ENTRY(x) \
[x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \
@@ -775,6 +853,8 @@ const char *const ovl_xattr_table[][2] = {
OVL_XATTR_TAB_ENTRY(OVL_XATTR_UUID),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUTS),
};
int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
@@ -898,7 +978,7 @@ int ovl_set_protattr(struct inode *inode, struct dentry *upper,
return 0;
}
-/**
+/*
* Caller must hold a reference to inode to prevent it from being freed while
* it is marked inuse.
*/
@@ -973,12 +1053,18 @@ static void ovl_cleanup_index(struct dentry *dentry)
struct dentry *index = NULL;
struct inode *inode;
struct qstr name = { };
+ bool got_write = false;
int err;
err = ovl_get_index_name(ofs, lowerdentry, &name);
if (err)
goto fail;
+ err = ovl_want_write(dentry);
+ if (err)
+ goto fail;
+
+ got_write = true;
inode = d_inode(upperdentry);
if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) {
pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n",
@@ -1016,6 +1102,8 @@ static void ovl_cleanup_index(struct dentry *dentry)
goto fail;
out:
+ if (got_write)
+ ovl_drop_write(dentry);
kfree(name.name);
dput(index);
return;
@@ -1062,8 +1150,12 @@ int ovl_nlink_start(struct dentry *dentry)
if (err)
return err;
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out_unlock;
+
if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode))
- goto out;
+ return 0;
old_cred = ovl_override_creds(dentry->d_sb);
/*
@@ -1074,10 +1166,15 @@ int ovl_nlink_start(struct dentry *dentry)
*/
err = ovl_set_nlink_upper(dentry);
revert_creds(old_cred);
-
-out:
if (err)
- ovl_inode_unlock(inode);
+ goto out_drop_write;
+
+ return 0;
+
+out_drop_write:
+ ovl_drop_write(dentry);
+out_unlock:
+ ovl_inode_unlock(inode);
return err;
}
@@ -1086,6 +1183,8 @@ void ovl_nlink_end(struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
+ ovl_drop_write(dentry);
+
if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) {
const struct cred *old_cred;
@@ -1403,6 +1502,7 @@ void ovl_copyattr(struct inode *inode)
realinode = ovl_i_path_real(inode, &realpath);
real_idmap = mnt_idmap(realpath.mnt);
+ spin_lock(&inode->i_lock);
vfsuid = i_uid_into_vfsuid(real_idmap, realinode);
vfsgid = i_gid_into_vfsgid(real_idmap, realinode);
@@ -1413,4 +1513,5 @@ void ovl_copyattr(struct inode *inode)
inode_set_mtime_to_ts(inode, inode_get_mtime(realinode));
inode_set_ctime_to_ts(inode, inode_get_ctime(realinode));
i_size_write(inode, i_size_read(realinode));
+ spin_unlock(&inode->i_lock);
}
diff --git a/fs/overlayfs/xattrs.c b/fs/overlayfs/xattrs.c
new file mode 100644
index 000000000000..383978e4663c
--- /dev/null
+++ b/fs/overlayfs/xattrs.c
@@ -0,0 +1,271 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/fs.h>
+#include <linux/xattr.h>
+#include "overlayfs.h"
+
+static bool ovl_is_escaped_xattr(struct super_block *sb, const char *name)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ if (ofs->config.userxattr)
+ return strncmp(name, OVL_XATTR_ESCAPE_USER_PREFIX,
+ OVL_XATTR_ESCAPE_USER_PREFIX_LEN) == 0;
+ else
+ return strncmp(name, OVL_XATTR_ESCAPE_TRUSTED_PREFIX,
+ OVL_XATTR_ESCAPE_TRUSTED_PREFIX_LEN - 1) == 0;
+}
+
+static bool ovl_is_own_xattr(struct super_block *sb, const char *name)
+{
+ struct ovl_fs *ofs = OVL_FS(sb);
+
+ if (ofs->config.userxattr)
+ return strncmp(name, OVL_XATTR_USER_PREFIX,
+ OVL_XATTR_USER_PREFIX_LEN) == 0;
+ else
+ return strncmp(name, OVL_XATTR_TRUSTED_PREFIX,
+ OVL_XATTR_TRUSTED_PREFIX_LEN) == 0;
+}
+
+bool ovl_is_private_xattr(struct super_block *sb, const char *name)
+{
+ return ovl_is_own_xattr(sb, name) && !ovl_is_escaped_xattr(sb, name);
+}
+
+static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ int err;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct dentry *upperdentry = ovl_i_dentry_upper(inode);
+ struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
+ struct path realpath;
+ const struct cred *old_cred;
+
+ if (!value && !upperdentry) {
+ ovl_path_lower(dentry, &realpath);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0);
+ revert_creds(old_cred);
+ if (err < 0)
+ goto out;
+ }
+
+ if (!upperdentry) {
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out;
+
+ realdentry = ovl_dentry_upper(dentry);
+ }
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ if (value) {
+ err = ovl_do_setxattr(ofs, realdentry, name, value, size,
+ flags);
+ } else {
+ WARN_ON(flags != XATTR_REPLACE);
+ err = ovl_do_removexattr(ofs, realdentry, name);
+ }
+ revert_creds(old_cred);
+ ovl_drop_write(dentry);
+
+ /* copy c/mtime */
+ ovl_copyattr(inode);
+out:
+ return err;
+}
+
+static int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
+ void *value, size_t size)
+{
+ ssize_t res;
+ const struct cred *old_cred;
+ struct path realpath;
+
+ ovl_i_path_real(inode, &realpath);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size);
+ revert_creds(old_cred);
+ return res;
+}
+
+static bool ovl_can_list(struct super_block *sb, const char *s)
+{
+ /* Never list private (.overlay) */
+ if (ovl_is_private_xattr(sb, s))
+ return false;
+
+ /* List all non-trusted xattrs */
+ if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
+ return true;
+
+ /* list other trusted for superuser only */
+ return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
+}
+
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ struct dentry *realdentry = ovl_dentry_real(dentry);
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ ssize_t res;
+ size_t len;
+ char *s;
+ const struct cred *old_cred;
+ size_t prefix_len, name_len;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ res = vfs_listxattr(realdentry, list, size);
+ revert_creds(old_cred);
+ if (res <= 0 || size == 0)
+ return res;
+
+ prefix_len = ofs->config.userxattr ?
+ OVL_XATTR_USER_PREFIX_LEN : OVL_XATTR_TRUSTED_PREFIX_LEN;
+
+ /* filter out private xattrs */
+ for (s = list, len = res; len;) {
+ size_t slen = strnlen(s, len) + 1;
+
+ /* underlying fs providing us with an broken xattr list? */
+ if (WARN_ON(slen > len))
+ return -EIO;
+
+ len -= slen;
+ if (!ovl_can_list(dentry->d_sb, s)) {
+ res -= slen;
+ memmove(s, s + slen, len);
+ } else if (ovl_is_escaped_xattr(dentry->d_sb, s)) {
+ res -= OVL_XATTR_ESCAPE_PREFIX_LEN;
+ name_len = slen - prefix_len - OVL_XATTR_ESCAPE_PREFIX_LEN;
+ s += prefix_len;
+ memmove(s, s + OVL_XATTR_ESCAPE_PREFIX_LEN, name_len + len);
+ s += name_len;
+ } else {
+ s += slen;
+ }
+ }
+
+ return res;
+}
+
+static char *ovl_xattr_escape_name(const char *prefix, const char *name)
+{
+ size_t prefix_len = strlen(prefix);
+ size_t name_len = strlen(name);
+ size_t escaped_len;
+ char *escaped, *s;
+
+ escaped_len = prefix_len + OVL_XATTR_ESCAPE_PREFIX_LEN + name_len;
+ if (escaped_len > XATTR_NAME_MAX)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ escaped = kmalloc(escaped_len + 1, GFP_KERNEL);
+ if (escaped == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ s = escaped;
+ memcpy(s, prefix, prefix_len);
+ s += prefix_len;
+ memcpy(s, OVL_XATTR_ESCAPE_PREFIX, OVL_XATTR_ESCAPE_PREFIX_LEN);
+ s += OVL_XATTR_ESCAPE_PREFIX_LEN;
+ memcpy(s, name, name_len + 1);
+
+ return escaped;
+}
+
+static int ovl_own_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ char *escaped;
+ int r;
+
+ escaped = ovl_xattr_escape_name(handler->prefix, name);
+ if (IS_ERR(escaped))
+ return PTR_ERR(escaped);
+
+ r = ovl_xattr_get(dentry, inode, escaped, buffer, size);
+
+ kfree(escaped);
+
+ return r;
+}
+
+static int ovl_own_xattr_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ char *escaped;
+ int r;
+
+ escaped = ovl_xattr_escape_name(handler->prefix, name);
+ if (IS_ERR(escaped))
+ return PTR_ERR(escaped);
+
+ r = ovl_xattr_set(dentry, inode, escaped, value, size, flags);
+
+ kfree(escaped);
+
+ return r;
+}
+
+static int ovl_other_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ return ovl_xattr_get(dentry, inode, name, buffer, size);
+}
+
+static int ovl_other_xattr_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return ovl_xattr_set(dentry, inode, name, value, size, flags);
+}
+
+static const struct xattr_handler ovl_own_trusted_xattr_handler = {
+ .prefix = OVL_XATTR_TRUSTED_PREFIX,
+ .get = ovl_own_xattr_get,
+ .set = ovl_own_xattr_set,
+};
+
+static const struct xattr_handler ovl_own_user_xattr_handler = {
+ .prefix = OVL_XATTR_USER_PREFIX,
+ .get = ovl_own_xattr_get,
+ .set = ovl_own_xattr_set,
+};
+
+static const struct xattr_handler ovl_other_xattr_handler = {
+ .prefix = "", /* catch all */
+ .get = ovl_other_xattr_get,
+ .set = ovl_other_xattr_set,
+};
+
+static const struct xattr_handler * const ovl_trusted_xattr_handlers[] = {
+ &ovl_own_trusted_xattr_handler,
+ &ovl_other_xattr_handler,
+ NULL
+};
+
+static const struct xattr_handler * const ovl_user_xattr_handlers[] = {
+ &ovl_own_user_xattr_handler,
+ &ovl_other_xattr_handler,
+ NULL
+};
+
+const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs)
+{
+ return ofs->config.userxattr ? ovl_user_xattr_handlers :
+ ovl_trusted_xattr_handlers;
+}
+
diff --git a/fs/pipe.c b/fs/pipe.c
index 8916c455a469..804a7d789452 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -864,7 +864,7 @@ void free_pipe_info(struct pipe_inode_info *pipe)
kfree(pipe);
}
-static struct vfsmount *pipe_mnt __read_mostly;
+static struct vfsmount *pipe_mnt __ro_after_init;
/*
* pipefs_dname() is called from d_path().
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 2c2efbe685d8..ff08a8957552 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -536,12 +536,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
/* add up live thread stats at the group level */
if (whole) {
- struct task_struct *t = task;
- do {
+ struct task_struct *t;
+
+ __for_each_thread(sig, t) {
min_flt += t->min_flt;
maj_flt += t->maj_flt;
gtime += task_gtime(t);
- } while_each_thread(task, t);
+ }
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 83396ab14998..dd31e3b6bf77 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1153,11 +1153,10 @@ err_unlock:
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int oom_adj;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count)) {
@@ -1213,11 +1212,10 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int oom_score_adj;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count)) {
@@ -1358,13 +1356,13 @@ static ssize_t proc_fault_inject_write(struct file * file,
const char __user * buf, size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int make_it_fail;
int rv;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- memset(buffer, 0, sizeof(buffer));
+
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1509,11 +1507,10 @@ sched_autogroup_write(struct file *file, const char __user *buf,
{
struct inode *inode = file_inode(file);
struct task_struct *p;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int nice;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1666,10 +1663,9 @@ static ssize_t comm_write(struct file *file, const char __user *buf,
{
struct inode *inode = file_inode(file);
struct task_struct *p;
- char buffer[TASK_COMM_LEN];
+ char buffer[TASK_COMM_LEN] = {};
const size_t maxlen = sizeof(buffer) - 1;
- memset(buffer, 0, sizeof(buffer));
if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
return -EFAULT;
@@ -2976,8 +2972,7 @@ static const struct file_operations proc_coredump_filter_operations = {
#ifdef CONFIG_TASK_IO_ACCOUNTING
static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
{
- struct task_io_accounting acct = task->ioac;
- unsigned long flags;
+ struct task_io_accounting acct;
int result;
result = down_read_killable(&task->signal->exec_update_lock);
@@ -2989,15 +2984,28 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
goto out_unlock;
}
- if (whole && lock_task_sighand(task, &flags)) {
- struct task_struct *t = task;
+ if (whole) {
+ struct signal_struct *sig = task->signal;
+ struct task_struct *t;
+ unsigned int seq = 1;
+ unsigned long flags;
+
+ rcu_read_lock();
+ do {
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
- task_io_accounting_add(&acct, &task->signal->ioac);
- while_each_thread(task, t)
- task_io_accounting_add(&acct, &t->ioac);
+ acct = sig->ioac;
+ __for_each_thread(sig, t)
+ task_io_accounting_add(&acct, &t->ioac);
- unlock_task_sighand(task, &flags);
+ } while (need_seqretry(&sig->stats_lock, seq));
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+ rcu_read_unlock();
+ } else {
+ acct = task->ioac;
}
+
seq_printf(m,
"rchar: %llu\n"
"wchar: %llu\n"
@@ -3818,7 +3826,7 @@ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
for_each_thread(task, pos) {
if (!nr--)
goto found;
- };
+ }
fail:
pos = NULL;
goto out;
@@ -3840,10 +3848,8 @@ static struct task_struct *next_tid(struct task_struct *start)
struct task_struct *pos = NULL;
rcu_read_lock();
if (pid_alive(start)) {
- pos = next_thread(start);
- if (thread_group_leader(pos))
- pos = NULL;
- else
+ pos = __next_thread(start);
+ if (pos)
get_task_struct(pos);
}
rcu_read_unlock();
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 592ed2516f47..b33e490e3fd9 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -110,18 +110,15 @@ void __init proc_init_kmemcache(void)
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
{
- struct inode *inode;
- struct proc_inode *ei;
struct hlist_node *node;
struct super_block *old_sb = NULL;
rcu_read_lock();
- for (;;) {
+ while ((node = hlist_first_rcu(inodes))) {
+ struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes);
struct super_block *sb;
- node = hlist_first_rcu(inodes);
- if (!node)
- break;
- ei = hlist_entry(node, struct proc_inode, sibling_inodes);
+ struct inode *inode;
+
spin_lock(lock);
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(lock);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 23fc24d16b31..6422e569b080 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -546,7 +546,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
* and explicitly excluded physical ranges.
*/
if (!page || PageOffline(page) ||
- is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
+ is_page_hwpoison(page) || !pfn_is_ram(pfn) ||
+ pfn_is_unaccepted_memory(pfn)) {
if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9191248f2dac..b55dbc70287b 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -188,7 +188,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
/* procfs dentries and inodes don't require IO to create */
- s->s_shrink.seeks = 0;
+ s->s_shrink->seeks = 0;
pde_get(&proc_root);
root_inode = proc_get_inode(s, &proc_root);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 1593940ca01e..ef2eb12906da 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -20,6 +20,8 @@
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
#include <linux/pkeys.h>
+#include <linux/minmax.h>
+#include <linux/overflow.h>
#include <asm/elf.h>
#include <asm/tlb.h>
@@ -849,9 +851,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
static int show_smap(struct seq_file *m, void *v)
{
struct vm_area_struct *vma = v;
- struct mem_size_stats mss;
-
- memset(&mss, 0, sizeof(mss));
+ struct mem_size_stats mss = {};
smap_gather_stats(vma, &mss, 0);
@@ -877,7 +877,7 @@ static int show_smap(struct seq_file *m, void *v)
static int show_smaps_rollup(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
- struct mem_size_stats mss;
+ struct mem_size_stats mss = {};
struct mm_struct *mm = priv->mm;
struct vm_area_struct *vma;
unsigned long vma_start = 0, last_vma_end = 0;
@@ -893,8 +893,6 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
goto out_put_task;
}
- memset(&mss, 0, sizeof(mss));
-
ret = mmap_read_lock_killable(mm);
if (ret)
goto out_put_mm;
@@ -1246,14 +1244,13 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
struct mm_struct *mm;
struct vm_area_struct *vma;
enum clear_refs_types type;
int itype;
int rv;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1761,11 +1758,737 @@ static int pagemap_release(struct inode *inode, struct file *file)
return 0;
}
+#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
+ PAGE_IS_FILE | PAGE_IS_PRESENT | \
+ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
+ PAGE_IS_HUGE)
+#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
+
+struct pagemap_scan_private {
+ struct pm_scan_arg arg;
+ unsigned long masks_of_interest, cur_vma_category;
+ struct page_region *vec_buf;
+ unsigned long vec_buf_len, vec_buf_index, found_pages;
+ struct page_region __user *vec_out;
+};
+
+static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
+ struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte)
+{
+ unsigned long categories = 0;
+
+ if (pte_present(pte)) {
+ struct page *page;
+
+ categories |= PAGE_IS_PRESENT;
+ if (!pte_uffd_wp(pte))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ page = vm_normal_page(vma, addr, pte);
+ if (page && !PageAnon(page))
+ categories |= PAGE_IS_FILE;
+ }
+
+ if (is_zero_pfn(pte_pfn(pte)))
+ categories |= PAGE_IS_PFNZERO;
+ } else if (is_swap_pte(pte)) {
+ swp_entry_t swp;
+
+ categories |= PAGE_IS_SWAPPED;
+ if (!pte_swp_uffd_wp_any(pte))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ swp = pte_to_swp_entry(pte);
+ if (is_pfn_swap_entry(swp) &&
+ !PageAnon(pfn_swap_entry_to_page(swp)))
+ categories |= PAGE_IS_FILE;
+ }
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+ pte_t ptent = ptep_get(pte);
+
+ if (pte_present(ptent)) {
+ pte_t old_pte;
+
+ old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_mkuffd_wp(ptent);
+ ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+ } else if (is_swap_pte(ptent)) {
+ ptent = pte_swp_mkuffd_wp(ptent);
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
+ } else {
+ set_pte_at(vma->vm_mm, addr, pte,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
+ }
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
+ struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd)
+{
+ unsigned long categories = PAGE_IS_HUGE;
+
+ if (pmd_present(pmd)) {
+ struct page *page;
+
+ categories |= PAGE_IS_PRESENT;
+ if (!pmd_uffd_wp(pmd))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (page && !PageAnon(page))
+ categories |= PAGE_IS_FILE;
+ }
+
+ if (is_zero_pfn(pmd_pfn(pmd)))
+ categories |= PAGE_IS_PFNZERO;
+ } else if (is_swap_pmd(pmd)) {
+ swp_entry_t swp;
+
+ categories |= PAGE_IS_SWAPPED;
+ if (!pmd_swp_uffd_wp(pmd))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ swp = pmd_to_swp_entry(pmd);
+ if (is_pfn_swap_entry(swp) &&
+ !PageAnon(pfn_swap_entry_to_page(swp)))
+ categories |= PAGE_IS_FILE;
+ }
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old, pmd = *pmdp;
+
+ if (pmd_present(pmd)) {
+ old = pmdp_invalidate_ad(vma, addr, pmdp);
+ pmd = pmd_mkuffd_wp(old);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+ pmd = pmd_swp_mkuffd_wp(pmd);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long pagemap_hugetlb_category(pte_t pte)
+{
+ unsigned long categories = PAGE_IS_HUGE;
+
+ /*
+ * According to pagemap_hugetlb_range(), file-backed HugeTLB
+ * page cannot be swapped. So PAGE_IS_FILE is not checked for
+ * swapped pages.
+ */
+ if (pte_present(pte)) {
+ categories |= PAGE_IS_PRESENT;
+ if (!huge_pte_uffd_wp(pte))
+ categories |= PAGE_IS_WRITTEN;
+ if (!PageAnon(pte_page(pte)))
+ categories |= PAGE_IS_FILE;
+ if (is_zero_pfn(pte_pfn(pte)))
+ categories |= PAGE_IS_PFNZERO;
+ } else if (is_swap_pte(pte)) {
+ categories |= PAGE_IS_SWAPPED;
+ if (!pte_swp_uffd_wp_any(pte))
+ categories |= PAGE_IS_WRITTEN;
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t ptent)
+{
+ unsigned long psize;
+
+ if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
+ return;
+
+ psize = huge_page_size(hstate_vma(vma));
+
+ if (is_hugetlb_entry_migration(ptent))
+ set_huge_pte_at(vma->vm_mm, addr, ptep,
+ pte_swp_mkuffd_wp(ptent), psize);
+ else if (!huge_pte_none(ptent))
+ huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
+ huge_pte_mkuffd_wp(ptent));
+ else
+ set_huge_pte_at(vma->vm_mm, addr, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP), psize);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long end)
+{
+ struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+ if (cur_buf->start != addr)
+ cur_buf->end = addr;
+ else
+ cur_buf->start = cur_buf->end = 0;
+
+ p->found_pages -= (end - addr) / PAGE_SIZE;
+}
+#endif
+
+static bool pagemap_scan_is_interesting_page(unsigned long categories,
+ const struct pagemap_scan_private *p)
+{
+ categories ^= p->arg.category_inverted;
+ if ((categories & p->arg.category_mask) != p->arg.category_mask)
+ return false;
+ if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask))
+ return false;
+
+ return true;
+}
+
+static bool pagemap_scan_is_interesting_vma(unsigned long categories,
+ const struct pagemap_scan_private *p)
+{
+ unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED;
+
+ categories ^= p->arg.category_inverted;
+ if ((categories & required) != required)
+ return false;
+
+ return true;
+}
+
+static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long vma_category = 0;
+
+ if (userfaultfd_wp_async(vma) && userfaultfd_wp_use_markers(vma))
+ vma_category |= PAGE_IS_WPALLOWED;
+ else if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
+ return -EPERM;
+
+ if (vma->vm_flags & VM_PFNMAP)
+ return 1;
+
+ if (!pagemap_scan_is_interesting_vma(vma_category, p))
+ return 1;
+
+ p->cur_vma_category = vma_category;
+
+ return 0;
+}
+
+static bool pagemap_scan_push_range(unsigned long categories,
+ struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long end)
+{
+ struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+ /*
+ * When there is no output buffer provided at all, the sentinel values
+ * won't match here. There is no other way for `cur_buf->end` to be
+ * non-zero other than it being non-empty.
+ */
+ if (addr == cur_buf->end && categories == cur_buf->categories) {
+ cur_buf->end = end;
+ return true;
+ }
+
+ if (cur_buf->end) {
+ if (p->vec_buf_index >= p->vec_buf_len - 1)
+ return false;
+
+ cur_buf = &p->vec_buf[++p->vec_buf_index];
+ }
+
+ cur_buf->start = addr;
+ cur_buf->end = end;
+ cur_buf->categories = categories;
+
+ return true;
+}
+
+static int pagemap_scan_output(unsigned long categories,
+ struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long *end)
+{
+ unsigned long n_pages, total_pages;
+ int ret = 0;
+
+ if (!p->vec_buf)
+ return 0;
+
+ categories &= p->arg.return_mask;
+
+ n_pages = (*end - addr) / PAGE_SIZE;
+ if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
+ total_pages > p->arg.max_pages) {
+ size_t n_too_much = total_pages - p->arg.max_pages;
+ *end -= n_too_much * PAGE_SIZE;
+ n_pages -= n_too_much;
+ ret = -ENOSPC;
+ }
+
+ if (!pagemap_scan_push_range(categories, p, addr, *end)) {
+ *end = addr;
+ n_pages = 0;
+ ret = -ENOSPC;
+ }
+
+ p->found_pages += n_pages;
+ if (ret)
+ p->arg.walk_end = *end;
+
+ return ret;
+}
+
+static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long categories;
+ spinlock_t *ptl;
+ int ret = 0;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return -ENOENT;
+
+ categories = p->cur_vma_category |
+ pagemap_thp_category(p, vma, start, *pmd);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ goto out_unlock;
+
+ ret = pagemap_scan_output(categories, p, start, &end);
+ if (start == end)
+ goto out_unlock;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ goto out_unlock;
+ if (~categories & PAGE_IS_WRITTEN)
+ goto out_unlock;
+
+ /*
+ * Break huge page into small pages if the WP operation
+ * needs to be performed on a portion of the huge page.
+ */
+ if (end != start + HPAGE_SIZE) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, start);
+ pagemap_scan_backout_range(p, start, end);
+ /* Report as if there was no THP */
+ return -ENOENT;
+ }
+
+ make_uffd_wp_pmd(vma, start, pmd);
+ flush_tlb_range(vma, start, end);
+out_unlock:
+ spin_unlock(ptl);
+ return ret;
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+ return -ENOENT;
+#endif
+}
+
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long addr, flush_end = 0;
+ pte_t *pte, *start_pte;
+ spinlock_t *ptl;
+ int ret;
+
+ arch_enter_lazy_mmu_mode();
+
+ ret = pagemap_scan_thp_entry(pmd, start, end, walk);
+ if (ret != -ENOENT) {
+ arch_leave_lazy_mmu_mode();
+ return ret;
+ }
+
+ ret = 0;
+ start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+ if (!pte) {
+ arch_leave_lazy_mmu_mode();
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+
+ if (!p->vec_out) {
+ /* Fast path for performing exclusive WP */
+ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+ if (pte_uffd_wp(ptep_get(pte)))
+ continue;
+ make_uffd_wp_pte(vma, addr, pte);
+ if (!flush_end)
+ start = addr;
+ flush_end = addr + PAGE_SIZE;
+ }
+ goto flush_and_return;
+ }
+
+ if (!p->arg.category_anyof_mask && !p->arg.category_inverted &&
+ p->arg.category_mask == PAGE_IS_WRITTEN &&
+ p->arg.return_mask == PAGE_IS_WRITTEN) {
+ for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
+ unsigned long next = addr + PAGE_SIZE;
+
+ if (pte_uffd_wp(ptep_get(pte)))
+ continue;
+ ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
+ p, addr, &next);
+ if (next == addr)
+ break;
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ continue;
+ make_uffd_wp_pte(vma, addr, pte);
+ if (!flush_end)
+ start = addr;
+ flush_end = next;
+ }
+ goto flush_and_return;
+ }
+
+ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+ unsigned long categories = p->cur_vma_category |
+ pagemap_page_category(p, vma, addr, ptep_get(pte));
+ unsigned long next = addr + PAGE_SIZE;
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ continue;
+
+ ret = pagemap_scan_output(categories, p, addr, &next);
+ if (next == addr)
+ break;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ continue;
+ if (~categories & PAGE_IS_WRITTEN)
+ continue;
+
+ make_uffd_wp_pte(vma, addr, pte);
+ if (!flush_end)
+ start = addr;
+ flush_end = next;
+ }
+
+flush_and_return:
+ if (flush_end)
+ flush_tlb_range(vma, start, addr);
+
+ pte_unmap_unlock(start_pte, ptl);
+ arch_leave_lazy_mmu_mode();
+
+ cond_resched();
+ return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long categories;
+ spinlock_t *ptl;
+ int ret = 0;
+ pte_t pte;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
+ /* Go the short route when not write-protecting pages. */
+
+ pte = huge_ptep_get(ptep);
+ categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ return 0;
+
+ return pagemap_scan_output(categories, p, start, &end);
+ }
+
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
+
+ pte = huge_ptep_get(ptep);
+ categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ goto out_unlock;
+
+ ret = pagemap_scan_output(categories, p, start, &end);
+ if (start == end)
+ goto out_unlock;
+
+ if (~categories & PAGE_IS_WRITTEN)
+ goto out_unlock;
+
+ if (end != start + HPAGE_SIZE) {
+ /* Partial HugeTLB page WP isn't possible. */
+ pagemap_scan_backout_range(p, start, end);
+ p->arg.walk_end = start;
+ ret = 0;
+ goto out_unlock;
+ }
+
+ make_uffd_wp_huge_pte(vma, start, ptep, pte);
+ flush_hugetlb_tlb_range(vma, start, end);
+
+out_unlock:
+ spin_unlock(ptl);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+
+ return ret;
+}
+#else
+#define pagemap_scan_hugetlb_entry NULL
+#endif
+
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
+ int depth, struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ int ret, err;
+
+ if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
+ return 0;
+
+ ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
+ if (addr == end)
+ return ret;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ return ret;
+
+ err = uffd_wp_range(vma, addr, end - addr, true);
+ if (err < 0)
+ ret = err;
+
+ return ret;
+}
+
+static const struct mm_walk_ops pagemap_scan_ops = {
+ .test_walk = pagemap_scan_test_walk,
+ .pmd_entry = pagemap_scan_pmd_entry,
+ .pte_hole = pagemap_scan_pte_hole,
+ .hugetlb_entry = pagemap_scan_hugetlb_entry,
+};
+
+static int pagemap_scan_get_args(struct pm_scan_arg *arg,
+ unsigned long uarg)
+{
+ if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
+ return -EFAULT;
+
+ if (arg->size != sizeof(struct pm_scan_arg))
+ return -EINVAL;
+
+ /* Validate requested features */
+ if (arg->flags & ~PM_SCAN_FLAGS)
+ return -EINVAL;
+ if ((arg->category_inverted | arg->category_mask |
+ arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
+ return -EINVAL;
+
+ arg->start = untagged_addr((unsigned long)arg->start);
+ arg->end = untagged_addr((unsigned long)arg->end);
+ arg->vec = untagged_addr((unsigned long)arg->vec);
+
+ /* Validate memory pointers */
+ if (!IS_ALIGNED(arg->start, PAGE_SIZE))
+ return -EINVAL;
+ if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
+ return -EFAULT;
+ if (!arg->vec && arg->vec_len)
+ return -EINVAL;
+ if (arg->vec && !access_ok((void __user *)(long)arg->vec,
+ arg->vec_len * sizeof(struct page_region)))
+ return -EFAULT;
+
+ /* Fixup default values */
+ arg->end = ALIGN(arg->end, PAGE_SIZE);
+ arg->walk_end = 0;
+ if (!arg->max_pages)
+ arg->max_pages = ULONG_MAX;
+
+ return 0;
+}
+
+static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
+ unsigned long uargl)
+{
+ struct pm_scan_arg __user *uarg = (void __user *)uargl;
+
+ if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
+{
+ if (!p->arg.vec_len)
+ return 0;
+
+ p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
+ p->arg.vec_len);
+ p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
+ GFP_KERNEL);
+ if (!p->vec_buf)
+ return -ENOMEM;
+
+ p->vec_buf->start = p->vec_buf->end = 0;
+ p->vec_out = (struct page_region __user *)(long)p->arg.vec;
+
+ return 0;
+}
+
+static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
+{
+ const struct page_region *buf = p->vec_buf;
+ long n = p->vec_buf_index;
+
+ if (!p->vec_buf)
+ return 0;
+
+ if (buf[n].end != buf[n].start)
+ n++;
+
+ if (!n)
+ return 0;
+
+ if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
+ return -EFAULT;
+
+ p->arg.vec_len -= n;
+ p->vec_out += n;
+
+ p->vec_buf_index = 0;
+ p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
+ p->vec_buf->start = p->vec_buf->end = 0;
+
+ return n;
+}
+
+static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
+{
+ struct mmu_notifier_range range;
+ struct pagemap_scan_private p = {0};
+ unsigned long walk_start;
+ size_t n_ranges_out = 0;
+ int ret;
+
+ ret = pagemap_scan_get_args(&p.arg, uarg);
+ if (ret)
+ return ret;
+
+ p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
+ p.arg.return_mask;
+ ret = pagemap_scan_init_bounce_buffer(&p);
+ if (ret)
+ return ret;
+
+ /* Protection change for the range is going to happen. */
+ if (p.arg.flags & PM_SCAN_WP_MATCHING) {
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
+ mm, p.arg.start, p.arg.end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+
+ for (walk_start = p.arg.start; walk_start < p.arg.end;
+ walk_start = p.arg.walk_end) {
+ long n_out;
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ break;
+ ret = walk_page_range(mm, walk_start, p.arg.end,
+ &pagemap_scan_ops, &p);
+ mmap_read_unlock(mm);
+
+ n_out = pagemap_scan_flush_buffer(&p);
+ if (n_out < 0)
+ ret = n_out;
+ else
+ n_ranges_out += n_out;
+
+ if (ret != -ENOSPC)
+ break;
+
+ if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
+ break;
+ }
+
+ /* ENOSPC signifies early stop (buffer full) from the walk. */
+ if (!ret || ret == -ENOSPC)
+ ret = n_ranges_out;
+
+ /* The walk_end isn't set when ret is zero */
+ if (!p.arg.walk_end)
+ p.arg.walk_end = p.arg.end;
+ if (pagemap_scan_writeback_args(&p.arg, uarg))
+ ret = -EFAULT;
+
+ if (p.arg.flags & PM_SCAN_WP_MATCHING)
+ mmu_notifier_invalidate_range_end(&range);
+
+ kfree(p.vec_buf);
+ return ret;
+}
+
+static long do_pagemap_cmd(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mm_struct *mm = file->private_data;
+
+ switch (cmd) {
+ case PAGEMAP_SCAN:
+ return do_pagemap_scan(mm, arg);
+
+ default:
+ return -EINVAL;
+ }
+}
+
const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
.open = pagemap_open,
.release = pagemap_release,
+ .unlocked_ioctl = do_pagemap_cmd,
+ .compat_ioctl = do_pagemap_cmd,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
@@ -1945,8 +2668,9 @@ static int show_numa_map(struct seq_file *m, void *v)
struct numa_maps *md = &numa_priv->md;
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
- struct mempolicy *pol;
char buffer[64];
+ struct mempolicy *pol;
+ pgoff_t ilx;
int nid;
if (!mm)
@@ -1955,7 +2679,7 @@ static int show_numa_map(struct seq_file *m, void *v)
/* Ensure we start with an empty set of numa_maps statistics. */
memset(md, 0, sizeof(*md));
- pol = __get_vma_policy(vma, vma->vm_start);
+ pol = __get_vma_policy(vma, vma->vm_start, &ilx);
if (pol) {
mpol_to_str(buffer, sizeof(buffer), pol);
mpol_cond_put(pol);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 023b91b4e1f0..58b5de081b57 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -803,12 +803,6 @@ dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]));
}
-static struct shrinker dqcache_shrinker = {
- .count_objects = dqcache_shrink_count,
- .scan_objects = dqcache_shrink_scan,
- .seeks = DEFAULT_SEEKS,
-};
-
/*
* Safely release dquot and put reference to dquot.
*/
@@ -2982,6 +2976,7 @@ static int __init dquot_init(void)
{
int i, ret;
unsigned long nr_hash, order;
+ struct shrinker *dqcache_shrinker;
printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
@@ -3016,8 +3011,14 @@ static int __init dquot_init(void)
pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
" %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));
- if (register_shrinker(&dqcache_shrinker, "dquota-cache"))
- panic("Cannot register dquot shrinker");
+ dqcache_shrinker = shrinker_alloc(0, "dquota-cache");
+ if (!dqcache_shrinker)
+ panic("Cannot allocate dquot shrinker");
+
+ dqcache_shrinker->count_objects = dqcache_shrink_count;
+ dqcache_shrinker->scan_objects = dqcache_shrink_scan;
+
+ shrinker_register(dqcache_shrinker);
return 0;
}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index c8572346556f..1d825459ee6e 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2503,10 +2503,10 @@ out:
* start/recovery path as __block_write_full_folio, along with special
* code to handle reiserfs tails.
*/
-static int reiserfs_write_full_page(struct page *page,
+static int reiserfs_write_full_folio(struct folio *folio,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
unsigned long end_index = inode->i_size >> PAGE_SHIFT;
int error = 0;
unsigned long block;
@@ -2514,7 +2514,7 @@ static int reiserfs_write_full_page(struct page *page,
struct buffer_head *head, *bh;
int partial = 0;
int nr = 0;
- int checked = PageChecked(page);
+ int checked = folio_test_checked(folio);
struct reiserfs_transaction_handle th;
struct super_block *s = inode->i_sb;
int bh_per_page = PAGE_SIZE / s->s_blocksize;
@@ -2522,47 +2522,46 @@ static int reiserfs_write_full_page(struct page *page,
/* no logging allowed when nonblocking or from PF_MEMALLOC */
if (checked && (current->flags & PF_MEMALLOC)) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
return 0;
}
/*
- * The page dirty bit is cleared before writepage is called, which
+ * The folio dirty bit is cleared before writepage is called, which
* means we have to tell create_empty_buffers to make dirty buffers
- * The page really should be up to date at this point, so tossing
+ * The folio really should be up to date at this point, so tossing
* in the BH_Uptodate is just a sanity check.
*/
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, s->s_blocksize,
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, s->s_blocksize,
(1 << BH_Dirty) | (1 << BH_Uptodate));
- }
- head = page_buffers(page);
/*
- * last page in the file, zero out any contents past the
+ * last folio in the file, zero out any contents past the
* last byte in the file
*/
- if (page->index >= end_index) {
+ if (folio->index >= end_index) {
unsigned last_offset;
last_offset = inode->i_size & (PAGE_SIZE - 1);
- /* no file contents in this page */
- if (page->index >= end_index + 1 || !last_offset) {
- unlock_page(page);
+ /* no file contents in this folio */
+ if (folio->index >= end_index + 1 || !last_offset) {
+ folio_unlock(folio);
return 0;
}
- zero_user_segment(page, last_offset, PAGE_SIZE);
+ folio_zero_segment(folio, last_offset, folio_size(folio));
}
bh = head;
- block = page->index << (PAGE_SHIFT - s->s_blocksize_bits);
+ block = folio->index << (PAGE_SHIFT - s->s_blocksize_bits);
last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
/* first map all the buffers, logging any direct items we find */
do {
if (block > last_block) {
/*
* This can happen when the block size is less than
- * the page size. The corresponding bytes in the page
+ * the folio size. The corresponding bytes in the folio
* were zero filled above
*/
clear_buffer_dirty(bh);
@@ -2589,7 +2588,7 @@ static int reiserfs_write_full_page(struct page *page,
* blocks we're going to log
*/
if (checked) {
- ClearPageChecked(page);
+ folio_clear_checked(folio);
reiserfs_write_lock(s);
error = journal_begin(&th, s, bh_per_page + 1);
if (error) {
@@ -2598,7 +2597,7 @@ static int reiserfs_write_full_page(struct page *page,
}
reiserfs_update_inode_transaction(inode);
}
- /* now go through and lock any dirty buffers on the page */
+ /* now go through and lock any dirty buffers on the folio */
do {
get_bh(bh);
if (!buffer_mapped(bh))
@@ -2619,7 +2618,7 @@ static int reiserfs_write_full_page(struct page *page,
lock_buffer(bh);
} else {
if (!trylock_buffer(bh)) {
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
continue;
}
}
@@ -2636,13 +2635,13 @@ static int reiserfs_write_full_page(struct page *page,
if (error)
goto fail;
}
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio);
+ folio_unlock(folio);
/*
- * since any buffer might be the only dirty buffer on the page,
- * the first submit_bh can bring the page out of writeback.
+ * since any buffer might be the only dirty buffer on the folio,
+ * the first submit_bh can bring the folio out of writeback.
* be careful with the buffers.
*/
do {
@@ -2659,10 +2658,10 @@ static int reiserfs_write_full_page(struct page *page,
done:
if (nr == 0) {
/*
- * if this page only had a direct item, it is very possible for
+ * if this folio only had a direct item, it is very possible for
* no io to be required without there being an error. Or,
* someone else could have locked them and sent them down the
- * pipe without locking the page
+ * pipe without locking the folio
*/
bh = head;
do {
@@ -2673,18 +2672,18 @@ done:
bh = bh->b_this_page;
} while (bh != head);
if (!partial)
- SetPageUptodate(page);
- end_page_writeback(page);
+ folio_mark_uptodate(folio);
+ folio_end_writeback(folio);
}
return error;
fail:
/*
* catches various errors, we need to make sure any valid dirty blocks
- * get to the media. The page is currently locked and not marked for
+ * get to the media. The folio is currently locked and not marked for
* writeback
*/
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
bh = head;
do {
get_bh(bh);
@@ -2694,16 +2693,16 @@ fail:
} else {
/*
* clear any dirty bits that might have come from
- * getting attached to a dirty page
+ * getting attached to a dirty folio
*/
clear_buffer_dirty(bh);
}
bh = bh->b_this_page;
} while (bh != head);
- SetPageError(page);
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
+ folio_set_error(folio);
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio);
+ folio_unlock(folio);
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
@@ -2724,9 +2723,10 @@ static int reiserfs_read_folio(struct file *f, struct folio *folio)
static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct folio *folio = page_folio(page);
+ struct inode *inode = folio->mapping->host;
reiserfs_wait_on_write_block(inode->i_sb);
- return reiserfs_write_full_page(page, wbc);
+ return reiserfs_write_full_folio(folio, wbc);
}
static void reiserfs_truncate_failed_write(struct inode *inode)
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index fe1bf5b6e0cb..59f6b8e32cc9 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -32,7 +32,7 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,
* fully cached or it may be in the process of
* being deleted due to a lease break.
*/
- if (!cfid->has_lease) {
+ if (!cfid->time || !cfid->has_lease) {
spin_unlock(&cfids->cfid_list_lock);
return NULL;
}
@@ -193,10 +193,20 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
npath = path_no_prefix(cifs_sb, path);
if (IS_ERR(npath)) {
rc = PTR_ERR(npath);
- kfree(utf16_path);
- return rc;
+ goto out;
}
+ if (!npath[0]) {
+ dentry = dget(cifs_sb->root);
+ } else {
+ dentry = path_to_dentry(cifs_sb, npath);
+ if (IS_ERR(dentry)) {
+ rc = -ENOENT;
+ goto out;
+ }
+ }
+ cfid->dentry = dentry;
+
/*
* We do not hold the lock for the open because in case
* SMB2_open needs to reconnect.
@@ -249,6 +259,15 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
smb2_set_related(&rqst[1]);
+ /*
+ * Set @cfid->has_lease to true before sending out compounded request so
+ * its lease reference can be put in cached_dir_lease_break() due to a
+ * potential lease break right after the request is sent or while @cfid
+ * is still being cached. Concurrent processes won't be to use it yet
+ * due to @cfid->time being zero.
+ */
+ cfid->has_lease = true;
+
rc = compound_send_recv(xid, ses, server,
flags, 2, rqst,
resp_buftype, rsp_iov);
@@ -263,6 +282,8 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
cfid->tcon = tcon;
cfid->is_open = true;
+ spin_lock(&cfids->cfid_list_lock);
+
o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
oparms.fid->persistent_fid = o_rsp->PersistentFileId;
oparms.fid->volatile_fid = o_rsp->VolatileFileId;
@@ -270,18 +291,25 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId);
#endif /* CIFS_DEBUG2 */
- if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE)
+ rc = -EINVAL;
+ if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) {
+ spin_unlock(&cfids->cfid_list_lock);
goto oshr_free;
+ }
smb2_parse_contexts(server, o_rsp,
&oparms.fid->epoch,
oparms.fid->lease_key, &oplock,
NULL, NULL);
- if (!(oplock & SMB2_LEASE_READ_CACHING_HE))
+ if (!(oplock & SMB2_LEASE_READ_CACHING_HE)) {
+ spin_unlock(&cfids->cfid_list_lock);
goto oshr_free;
+ }
qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
- if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info))
+ if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) {
+ spin_unlock(&cfids->cfid_list_lock);
goto oshr_free;
+ }
if (!smb2_validate_and_copy_iov(
le16_to_cpu(qi_rsp->OutputBufferOffset),
sizeof(struct smb2_file_all_info),
@@ -289,37 +317,24 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
(char *)&cfid->file_all_info))
cfid->file_all_info_is_valid = true;
- if (!npath[0])
- dentry = dget(cifs_sb->root);
- else {
- dentry = path_to_dentry(cifs_sb, npath);
- if (IS_ERR(dentry)) {
- rc = -ENOENT;
- goto oshr_free;
- }
- }
- spin_lock(&cfids->cfid_list_lock);
- cfid->dentry = dentry;
cfid->time = jiffies;
- cfid->has_lease = true;
spin_unlock(&cfids->cfid_list_lock);
+ /* At this point the directory handle is fully cached */
+ rc = 0;
oshr_free:
- kfree(utf16_path);
SMB2_open_free(&rqst[0]);
SMB2_query_info_free(&rqst[1]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
- spin_lock(&cfids->cfid_list_lock);
- if (!cfid->has_lease) {
- if (rc) {
- if (cfid->on_list) {
- list_del(&cfid->entry);
- cfid->on_list = false;
- cfids->num_entries--;
- }
- rc = -ENOENT;
- } else {
+ if (rc) {
+ spin_lock(&cfids->cfid_list_lock);
+ if (cfid->on_list) {
+ list_del(&cfid->entry);
+ cfid->on_list = false;
+ cfids->num_entries--;
+ }
+ if (cfid->has_lease) {
/*
* We are guaranteed to have two references at this
* point. One for the caller and one for a potential
@@ -327,25 +342,24 @@ oshr_free:
* will be closed when the caller closes the cached
* handle.
*/
+ cfid->has_lease = false;
spin_unlock(&cfids->cfid_list_lock);
kref_put(&cfid->refcount, smb2_close_cached_fid);
goto out;
}
+ spin_unlock(&cfids->cfid_list_lock);
}
- spin_unlock(&cfids->cfid_list_lock);
+out:
if (rc) {
if (cfid->is_open)
SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
cfid->fid.volatile_fid);
free_cached_dir(cfid);
- cfid = NULL;
- }
-out:
- if (rc == 0) {
+ } else {
*ret_cfid = cfid;
atomic_inc(&tcon->num_remote_opens);
}
-
+ kfree(utf16_path);
return rc;
}
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index 76922fcc4bc6..5596c9f30ccb 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -136,6 +136,11 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan)
{
struct TCP_Server_Info *server = chan->server;
+ if (!server) {
+ seq_printf(m, "\n\n\t\tChannel: %d DISABLED", i+1);
+ return;
+ }
+
seq_printf(m, "\n\n\t\tChannel: %d ConnectionId: 0x%llx"
"\n\t\tNumber of credits: %d,%d,%d Dialect 0x%x"
"\n\t\tTCP status: %d Instance: %d"
@@ -279,6 +284,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifs_server_iface *iface;
+ size_t iface_weight = 0, iface_min_speed = 0;
+ struct cifs_server_iface *last_iface = NULL;
int c, i, j;
seq_puts(m,
@@ -427,6 +434,8 @@ skip_rdma:
if (server->nosharesock)
seq_printf(m, " nosharesock");
+ seq_printf(m, "\nServer capabilities: 0x%x", server->capabilities);
+
if (server->rdma)
seq_printf(m, "\nRDMA ");
seq_printf(m, "\nTCP status: %d Instance: %d"
@@ -452,6 +461,11 @@ skip_rdma:
seq_printf(m, "\n\n\tSessions: ");
i = 0;
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ spin_lock(&ses->ses_lock);
+ if (ses->ses_status == SES_EXITING) {
+ spin_unlock(&ses->ses_lock);
+ continue;
+ }
i++;
if ((ses->serverDomain == NULL) ||
(ses->serverOS == NULL) ||
@@ -472,6 +486,7 @@ skip_rdma:
ses->ses_count, ses->serverOS, ses->serverNOS,
ses->capabilities, ses->ses_status);
}
+ spin_unlock(&ses->ses_lock);
seq_printf(m, "\n\tSecurity type: %s ",
get_security_type_str(server->ops->select_sectype(server, ses->sectype)));
@@ -536,11 +551,25 @@ skip_rdma:
"\tLast updated: %lu seconds ago",
ses->iface_count,
(jiffies - ses->iface_last_update) / HZ);
+
+ last_iface = list_last_entry(&ses->iface_list,
+ struct cifs_server_iface,
+ iface_head);
+ iface_min_speed = last_iface->speed;
+
j = 0;
list_for_each_entry(iface, &ses->iface_list,
iface_head) {
seq_printf(m, "\n\t%d)", ++j);
cifs_dump_iface(m, iface);
+
+ iface_weight = iface->speed / iface_min_speed;
+ seq_printf(m, "\t\tWeight (cur,total): (%zu,%zu)"
+ "\n\t\tAllocated channels: %u\n",
+ iface->weight_fulfilled,
+ iface_weight,
+ iface->num_channels);
+
if (is_ses_using_iface(ses, iface))
seq_puts(m, "\t\t[CONNECTED]\n");
}
@@ -738,14 +767,14 @@ static ssize_t name##_write(struct file *file, const char __user *buffer, \
size_t count, loff_t *ppos) \
{ \
int rc; \
- rc = kstrtoint_from_user(buffer, count, 10, & name); \
+ rc = kstrtoint_from_user(buffer, count, 10, &name); \
if (rc) \
return rc; \
return count; \
} \
static int name##_proc_show(struct seq_file *m, void *v) \
{ \
- seq_printf(m, "%d\n", name ); \
+ seq_printf(m, "%d\n", name); \
return 0; \
} \
static int name##_open(struct inode *inode, struct file *file) \
diff --git a/fs/smb/client/cifs_ioctl.h b/fs/smb/client/cifs_ioctl.h
index 332588e77c31..26327442e383 100644
--- a/fs/smb/client/cifs_ioctl.h
+++ b/fs/smb/client/cifs_ioctl.h
@@ -26,6 +26,11 @@ struct smb_mnt_fs_info {
__u64 cifs_posix_caps;
} __packed;
+struct smb_mnt_tcon_info {
+ __u32 tid;
+ __u64 session_id;
+} __packed;
+
struct smb_snapshot_array {
__u32 number_of_snapshots;
__u32 number_of_snapshots_returned;
@@ -108,6 +113,7 @@ struct smb3_notify_info {
#define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify)
#define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info)
#define CIFS_IOC_NOTIFY_INFO _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info)
+#define CIFS_IOC_GET_TCON_INFO _IOR(CIFS_IOCTL_MAGIC, 12, struct smb_mnt_tcon_info)
#define CIFS_IOC_SHUTDOWN _IOR('X', 125, __u32)
/*
diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c
index 6f3285f1dfee..af7849e5974f 100644
--- a/fs/smb/client/cifs_spnego.c
+++ b/fs/smb/client/cifs_spnego.c
@@ -64,8 +64,8 @@ struct key_type cifs_spnego_key_type = {
* strlen(";sec=ntlmsspi") */
#define MAX_MECH_STR_LEN 13
-/* strlen of "host=" */
-#define HOST_KEY_LEN 5
+/* strlen of ";host=" */
+#define HOST_KEY_LEN 6
/* strlen of ";ip4=" or ";ip6=" */
#define IP_KEY_LEN 5
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 22869cda1356..ea3a7a668b45 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -1191,6 +1191,7 @@ const char *cifs_get_link(struct dentry *dentry, struct inode *inode,
const struct inode_operations cifs_symlink_inode_ops = {
.get_link = cifs_get_link,
+ .setattr = cifs_setattr,
.permission = cifs_permission,
.listxattr = cifs_listxattr,
};
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 8ca3d7606bb4..3adea10aa9da 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -152,6 +152,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 45
-#define CIFS_VERSION "2.45"
+#define SMB3_PRODUCT_BUILD 46
+#define CIFS_VERSION "2.46"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 02082621d8e0..6ffbd81bd109 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -650,6 +650,7 @@ struct TCP_Server_Info {
bool noautotune; /* do not autotune send buf sizes */
bool nosharesock;
bool tcp_nodelay;
+ bool terminate;
unsigned int credits; /* send no more requests at once */
unsigned int max_credits; /* can override large 32000 default at mnt */
unsigned int in_flight; /* number of requests on the wire to server */
@@ -969,6 +970,8 @@ struct cifs_server_iface {
struct list_head iface_head;
struct kref refcount;
size_t speed;
+ size_t weight_fulfilled;
+ unsigned int num_channels;
unsigned int rdma_capable : 1;
unsigned int rss_capable : 1;
unsigned int is_active : 1; /* unset if non existent */
@@ -1050,6 +1053,7 @@ struct cifs_ses {
spinlock_t chan_lock;
/* ========= begin: protected by chan_lock ======== */
#define CIFS_MAX_CHANNELS 16
+#define CIFS_INVAL_CHAN_INDEX (-1)
#define CIFS_ALL_CHANNELS_SET(ses) \
((1UL << (ses)->chan_count) - 1)
#define CIFS_ALL_CHANS_GOOD(ses) \
@@ -2143,6 +2147,7 @@ static inline int cifs_get_num_sgs(const struct smb_rqst *rqst,
unsigned int len, skip;
unsigned int nents = 0;
unsigned long addr;
+ size_t data_size;
int i, j;
/*
@@ -2158,17 +2163,21 @@ static inline int cifs_get_num_sgs(const struct smb_rqst *rqst,
* rqst[1+].rq_iov[0+] data to be encrypted/decrypted
*/
for (i = 0; i < num_rqst; i++) {
+ data_size = iov_iter_count(&rqst[i].rq_iter);
+
/* We really don't want a mixture of pinned and unpinned pages
* in the sglist. It's hard to keep track of which is what.
* Instead, we convert to a BVEC-type iterator higher up.
*/
- if (WARN_ON_ONCE(user_backed_iter(&rqst[i].rq_iter)))
+ if (data_size &&
+ WARN_ON_ONCE(user_backed_iter(&rqst[i].rq_iter)))
return -EIO;
/* We also don't want to have any extra refs or pins to clean
* up in the sglist.
*/
- if (WARN_ON_ONCE(iov_iter_extract_will_pin(&rqst[i].rq_iter)))
+ if (data_size &&
+ WARN_ON_ONCE(iov_iter_extract_will_pin(&rqst[i].rq_iter)))
return -EIO;
for (j = 0; j < rqst[i].rq_nvec; j++) {
@@ -2184,7 +2193,8 @@ static inline int cifs_get_num_sgs(const struct smb_rqst *rqst,
}
skip = 0;
}
- nents += iov_iter_npages(&rqst[i].rq_iter, INT_MAX);
+ if (data_size)
+ nents += iov_iter_npages(&rqst[i].rq_iter, INT_MAX);
}
nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE);
return nents;
diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h
index e17222fec9d2..a75220db5c1e 100644
--- a/fs/smb/client/cifspdu.h
+++ b/fs/smb/client/cifspdu.h
@@ -2570,7 +2570,7 @@ typedef struct {
struct win_dev {
- unsigned char type[8]; /* IntxCHR or IntxBLK */
+ unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO*/
__le64 major;
__le64 minor;
} __attribute__((packed));
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 0c37eefa18a5..d87e2c26cce2 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -81,7 +81,7 @@ extern char *cifs_build_path_to_root(struct smb3_fs_context *ctx,
extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
char *cifs_build_devname(char *nodename, const char *prepath);
extern void delete_mid(struct mid_q_entry *mid);
-extern void release_mid(struct mid_q_entry *mid);
+void __release_mid(struct kref *refcount);
extern void cifs_wake_up_task(struct mid_q_entry *mid);
extern int cifs_handle_standard(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
@@ -132,6 +132,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
struct smb_hdr *in_buf,
struct smb_hdr *out_buf,
int *bytes_returned);
+
void
cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
bool all_channels);
@@ -610,13 +611,13 @@ void cifs_free_hash(struct shash_desc **sdesc);
struct cifs_chan *
cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server);
-int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses);
+int cifs_try_adding_channels(struct cifs_ses *ses);
bool is_server_using_iface(struct TCP_Server_Info *server,
struct cifs_server_iface *iface);
bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface);
void cifs_ses_mark_for_reconnect(struct cifs_ses *ses);
-unsigned int
+int
cifs_ses_get_chan_index(struct cifs_ses *ses,
struct TCP_Server_Info *server);
void
@@ -640,6 +641,8 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses,
bool
cifs_chan_is_iface_active(struct cifs_ses *ses,
struct TCP_Server_Info *server);
+void
+cifs_disable_secondary_channels(struct cifs_ses *ses);
int
cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server);
int
@@ -740,4 +743,9 @@ static inline bool dfs_src_pathname_equal(const char *s1, const char *s2)
return true;
}
+static inline void release_mid(struct mid_q_entry *mid)
+{
+ kref_put(&mid->refcount, __release_mid);
+}
+
#endif /* _CIFSPROTO_H */
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 7b923e36501b..f896f60c924b 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -119,6 +119,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
static void smb2_query_server_interfaces(struct work_struct *work)
{
int rc;
+ int xid;
struct cifs_tcon *tcon = container_of(work,
struct cifs_tcon,
query_interfaces.work);
@@ -126,8 +127,14 @@ static void smb2_query_server_interfaces(struct work_struct *work)
/*
* query server network interfaces, in case they change
*/
- rc = SMB3_request_interfaces(0, tcon, false);
+ xid = get_xid();
+ rc = SMB3_request_interfaces(xid, tcon, false);
+ free_xid(xid);
+
if (rc) {
+ if (rc == -EOPNOTSUPP)
+ return;
+
cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
__func__, rc);
}
@@ -156,20 +163,25 @@ cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
/* If server is a channel, select the primary channel */
pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
- spin_lock(&pserver->srv_lock);
+ /* if we need to signal just this channel */
if (!all_channels) {
- pserver->tcpStatus = CifsNeedReconnect;
- spin_unlock(&pserver->srv_lock);
+ spin_lock(&server->srv_lock);
+ if (server->tcpStatus != CifsExiting)
+ server->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&server->srv_lock);
return;
}
- spin_unlock(&pserver->srv_lock);
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
spin_lock(&ses->chan_lock);
for (i = 0; i < ses->chan_count; i++) {
+ if (!ses->chans[i].server)
+ continue;
+
spin_lock(&ses->chans[i].server->srv_lock);
- ses->chans[i].server->tcpStatus = CifsNeedReconnect;
+ if (ses->chans[i].server->tcpStatus != CifsExiting)
+ ses->chans[i].server->tcpStatus = CifsNeedReconnect;
spin_unlock(&ses->chans[i].server->srv_lock);
}
spin_unlock(&ses->chan_lock);
@@ -207,6 +219,14 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) {
+ /*
+ * if channel has been marked for termination, nothing to do
+ * for the channel. in fact, we cannot find the channel for the
+ * server. So safe to exit here
+ */
+ if (server->terminate)
+ break;
+
/* check if iface is still active */
if (!cifs_chan_is_iface_active(ses, server))
cifs_chan_update_iface(ses, server);
@@ -241,6 +261,8 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
spin_lock(&tcon->tc_lock);
tcon->status = TID_NEED_RECON;
spin_unlock(&tcon->tc_lock);
+
+ cancel_delayed_work(&tcon->query_interfaces);
}
if (ses->tcon_ipc) {
ses->tcon_ipc->need_reconnect = true;
@@ -380,7 +402,13 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
spin_unlock(&server->srv_lock);
cifs_swn_reset_server_dstaddr(server);
cifs_server_unlock(server);
- mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
+
+ /* increase ref count which reconnect work will drop */
+ spin_lock(&cifs_tcp_ses_lock);
+ server->srv_count++;
+ spin_unlock(&cifs_tcp_ses_lock);
+ if (mod_delayed_work(cifsiod_wq, &server->reconnect, 0))
+ cifs_put_tcp_session(server, false);
}
} while (server->tcpStatus == CifsNeedReconnect);
@@ -510,7 +538,13 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server)
spin_unlock(&server->srv_lock);
cifs_swn_reset_server_dstaddr(server);
cifs_server_unlock(server);
- mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
+
+ /* increase ref count which reconnect work will drop */
+ spin_lock(&cifs_tcp_ses_lock);
+ server->srv_count++;
+ spin_unlock(&cifs_tcp_ses_lock);
+ if (mod_delayed_work(cifsiod_wq, &server->reconnect, 0))
+ cifs_put_tcp_session(server, false);
} while (server->tcpStatus == CifsNeedReconnect);
mutex_lock(&server->refpath_lock);
@@ -1592,16 +1626,19 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
cancel_delayed_work_sync(&server->echo);
- if (from_reconnect)
+ if (from_reconnect) {
/*
* Avoid deadlock here: reconnect work calls
* cifs_put_tcp_session() at its end. Need to be sure
* that reconnect work does nothing with server pointer after
* that step.
*/
- cancel_delayed_work(&server->reconnect);
- else
- cancel_delayed_work_sync(&server->reconnect);
+ if (cancel_delayed_work(&server->reconnect))
+ cifs_put_tcp_session(server, from_reconnect);
+ } else {
+ if (cancel_delayed_work_sync(&server->reconnect))
+ cifs_put_tcp_session(server, from_reconnect);
+ }
spin_lock(&server->srv_lock);
server->tcpStatus = CifsExiting;
@@ -1969,9 +2006,10 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
void __cifs_put_smb_ses(struct cifs_ses *ses)
{
- unsigned int rc, xid;
- unsigned int chan_count;
struct TCP_Server_Info *server = ses->server;
+ unsigned int xid;
+ size_t i;
+ int rc;
spin_lock(&ses->ses_lock);
if (ses->ses_status == SES_EXITING) {
@@ -2017,20 +2055,20 @@ void __cifs_put_smb_ses(struct cifs_ses *ses)
list_del_init(&ses->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
- chan_count = ses->chan_count;
-
/* close any extra channels */
- if (chan_count > 1) {
- int i;
-
- for (i = 1; i < chan_count; i++) {
- if (ses->chans[i].iface) {
- kref_put(&ses->chans[i].iface->refcount, release_iface);
- ses->chans[i].iface = NULL;
- }
- cifs_put_tcp_session(ses->chans[i].server, 0);
- ses->chans[i].server = NULL;
+ for (i = 1; i < ses->chan_count; i++) {
+ if (ses->chans[i].iface) {
+ kref_put(&ses->chans[i].iface->refcount, release_iface);
+ ses->chans[i].iface = NULL;
}
+ cifs_put_tcp_session(ses->chans[i].server, 0);
+ ses->chans[i].server = NULL;
+ }
+
+ /* we now account for primary channel in iface->refcount */
+ if (ses->chans[0].iface) {
+ kref_put(&ses->chans[0].iface->refcount, release_iface);
+ ses->chans[0].server = NULL;
}
sesInfoFree(ses);
@@ -3560,7 +3598,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
ctx->prepath = NULL;
out:
- cifs_try_adding_channels(cifs_sb, mnt_ctx.ses);
+ cifs_try_adding_channels(mnt_ctx.ses);
rc = mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon);
if (rc)
goto error;
@@ -3849,8 +3887,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
spin_unlock(&ses->chan_lock);
- if (!is_binding)
+ if (!is_binding) {
ses->ses_status = SES_IN_SETUP;
+
+ /* force iface_list refresh */
+ ses->iface_last_update = 0;
+ }
spin_unlock(&ses->ses_lock);
/* update ses ip_addr only for primary chan */
diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c
index 81b84151450d..a8a1d386da65 100644
--- a/fs/smb/client/dfs.c
+++ b/fs/smb/client/dfs.c
@@ -263,15 +263,23 @@ out:
return rc;
}
-/* Resolve UNC hostname in @ctx->source and set ip addr in @ctx->dstaddr */
+/*
+ * If @ctx->dfs_automount, then update @ctx->dstaddr earlier with the DFS root
+ * server from where we'll start following any referrals. Otherwise rely on the
+ * value provided by mount(2) as the user might not have dns_resolver key set up
+ * and therefore failing to upcall to resolve UNC hostname under @ctx->source.
+ */
static int update_fs_context_dstaddr(struct smb3_fs_context *ctx)
{
struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr;
- int rc;
+ int rc = 0;
- rc = dns_resolve_server_name_to_ip(ctx->source, addr, NULL);
- if (!rc)
- cifs_set_port(addr, ctx->port);
+ if (!ctx->nodfs && ctx->dfs_automount) {
+ rc = dns_resolve_server_name_to_ip(ctx->source, addr, NULL);
+ if (!rc)
+ cifs_set_port(addr, ctx->port);
+ ctx->dfs_automount = false;
+ }
return rc;
}
diff --git a/fs/smb/client/export.c b/fs/smb/client/export.c
index 37c28415df1e..d606e8cbcb7d 100644
--- a/fs/smb/client/export.c
+++ b/fs/smb/client/export.c
@@ -41,13 +41,12 @@ static struct dentry *cifs_get_parent(struct dentry *dentry)
}
const struct export_operations cifs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.get_parent = cifs_get_parent,
-/* Following five export operations are unneeded so far and can default:
- .get_dentry =
- .get_name =
- .find_exported_dentry =
- .decode_fh =
- .encode_fs = */
+/*
+ * Following export operations are mandatory for NFS export support:
+ * .fh_to_dentry =
+ */
};
#endif /* CONFIG_CIFS_NFSD_EXPORT */
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index 9d8d34af0211..cf46916286d0 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -268,6 +268,7 @@ struct smb3_fs_context {
bool witness:1; /* use witness protocol */
char *leaf_fullpath;
struct cifs_ses *dfs_root_ses;
+ bool dfs_automount:1; /* set for dfs automount only */
};
extern const struct fs_parameter_spec smb3_fs_parameters[];
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 3abfe77bfa46..86fbd3f847d6 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -594,6 +594,10 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
cifs_dbg(FYI, "Symlink\n");
fattr->cf_mode |= S_IFLNK;
fattr->cf_dtype = DT_LNK;
+ } else if (memcmp("LnxFIFO", pbuf, 8) == 0) {
+ cifs_dbg(FYI, "FIFO\n");
+ fattr->cf_mode |= S_IFIFO;
+ fattr->cf_dtype = DT_FIFO;
} else {
fattr->cf_mode |= S_IFREG; /* file? */
fattr->cf_dtype = DT_REG;
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index f7160003e0ed..e2f92c21fff5 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -117,6 +117,20 @@ out_drop_write:
return rc;
}
+static long smb_mnt_get_tcon_info(struct cifs_tcon *tcon, void __user *arg)
+{
+ int rc = 0;
+ struct smb_mnt_tcon_info tcon_inf;
+
+ tcon_inf.tid = tcon->tid;
+ tcon_inf.session_id = tcon->ses->Suid;
+
+ if (copy_to_user(arg, &tcon_inf, sizeof(struct smb_mnt_tcon_info)))
+ rc = -EFAULT;
+
+ return rc;
+}
+
static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon,
void __user *arg)
{
@@ -129,6 +143,7 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon,
fsinf->version = 1;
fsinf->protocol_id = tcon->ses->server->vals->protocol_id;
+ fsinf->tcon_flags = tcon->Flags;
fsinf->device_characteristics =
le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics);
fsinf->device_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
@@ -414,6 +429,17 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
tcon = tlink_tcon(pSMBFile->tlink);
rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg);
break;
+ case CIFS_IOC_GET_TCON_INFO:
+ cifs_sb = CIFS_SB(inode->i_sb);
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink)) {
+ rc = PTR_ERR(tlink);
+ break;
+ }
+ tcon = tlink_tcon(tlink);
+ rc = smb_mnt_get_tcon_info(tcon, (void __user *)arg);
+ cifs_put_tlink(tlink);
+ break;
case CIFS_ENUMERATE_SNAPSHOTS:
if (pSMBFile == NULL)
break;
diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c
index c66be4904e1f..a1da50e66fbb 100644
--- a/fs/smb/client/link.c
+++ b/fs/smb/client/link.c
@@ -42,23 +42,11 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
rc = cifs_alloc_hash("md5", &md5);
if (rc)
- goto symlink_hash_err;
+ return rc;
- rc = crypto_shash_init(md5);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__);
- goto symlink_hash_err;
- }
- rc = crypto_shash_update(md5, link_str, link_len);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__);
- goto symlink_hash_err;
- }
- rc = crypto_shash_final(md5, md5_hash);
+ rc = crypto_shash_digest(md5, link_str, link_len, md5_hash);
if (rc)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
-
-symlink_hash_err:
cifs_free_hash(&md5);
return rc;
}
diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c
index c8f5ed8a69f1..a6968573b775 100644
--- a/fs/smb/client/namespace.c
+++ b/fs/smb/client/namespace.c
@@ -117,6 +117,18 @@ cifs_build_devname(char *nodename, const char *prepath)
return dev;
}
+static bool is_dfs_mount(struct dentry *dentry)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ bool ret;
+
+ spin_lock(&tcon->tc_lock);
+ ret = !!tcon->origin_fullpath;
+ spin_unlock(&tcon->tc_lock);
+ return ret;
+}
+
/* Return full path out of a dentry set for automount */
static char *automount_fullpath(struct dentry *dentry, void *page)
{
@@ -212,8 +224,9 @@ static struct vfsmount *cifs_do_automount(struct path *path)
ctx->source = NULL;
goto out;
}
- cifs_dbg(FYI, "%s: ctx: source=%s UNC=%s prepath=%s\n",
- __func__, ctx->source, ctx->UNC, ctx->prepath);
+ ctx->dfs_automount = is_dfs_mount(mntpt);
+ cifs_dbg(FYI, "%s: ctx: source=%s UNC=%s prepath=%s dfs_automount=%d\n",
+ __func__, ctx->source, ctx->UNC, ctx->prepath, ctx->dfs_automount);
mnt = fc_mount(fc);
out:
diff --git a/fs/smb/client/ntlmssp.h b/fs/smb/client/ntlmssp.h
index 2c5dde2ece58..875de43b72de 100644
--- a/fs/smb/client/ntlmssp.h
+++ b/fs/smb/client/ntlmssp.h
@@ -133,8 +133,8 @@ typedef struct _AUTHENTICATE_MESSAGE {
SECURITY_BUFFER WorkstationName;
SECURITY_BUFFER SessionKey;
__le32 NegotiateFlags;
- /* SECURITY_BUFFER for version info not present since we
- do not set the version is present flag */
+ struct ntlmssp_version Version;
+ /* SECURITY_BUFFER */
char UserString[];
} __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE;
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index 79f26c560edf..8b2d7c1ca428 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -24,7 +24,7 @@
#include "fs_context.h"
static int
-cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
+cifs_ses_add_channel(struct cifs_ses *ses,
struct cifs_server_iface *iface);
bool
@@ -69,7 +69,7 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface)
/* channel helper functions. assumed that chan_lock is held by caller. */
-unsigned int
+int
cifs_ses_get_chan_index(struct cifs_ses *ses,
struct TCP_Server_Info *server)
{
@@ -85,14 +85,17 @@ cifs_ses_get_chan_index(struct cifs_ses *ses,
cifs_dbg(VFS, "unable to get chan index for server: 0x%llx",
server->conn_id);
WARN_ON(1);
- return 0;
+ return CIFS_INVAL_CHAN_INDEX;
}
void
cifs_chan_set_in_reconnect(struct cifs_ses *ses,
struct TCP_Server_Info *server)
{
- unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return;
ses->chans[chan_index].in_reconnect = true;
}
@@ -103,6 +106,9 @@ cifs_chan_clear_in_reconnect(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return;
+
ses->chans[chan_index].in_reconnect = false;
}
@@ -112,6 +118,9 @@ cifs_chan_in_reconnect(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return true; /* err on the safer side */
+
return CIFS_CHAN_IN_RECONNECT(ses, chan_index);
}
@@ -121,6 +130,9 @@ cifs_chan_set_need_reconnect(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return;
+
set_bit(chan_index, &ses->chans_need_reconnect);
cifs_dbg(FYI, "Set reconnect bitmask for chan %u; now 0x%lx\n",
chan_index, ses->chans_need_reconnect);
@@ -132,6 +144,9 @@ cifs_chan_clear_need_reconnect(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return;
+
clear_bit(chan_index, &ses->chans_need_reconnect);
cifs_dbg(FYI, "Cleared reconnect bitmask for chan %u; now 0x%lx\n",
chan_index, ses->chans_need_reconnect);
@@ -143,6 +158,9 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return true; /* err on the safer side */
+
return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index);
}
@@ -152,19 +170,24 @@ cifs_chan_is_iface_active(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return true; /* err on the safer side */
+
return ses->chans[chan_index].iface &&
ses->chans[chan_index].iface->is_active;
}
/* returns number of channels added */
-int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
+int cifs_try_adding_channels(struct cifs_ses *ses)
{
struct TCP_Server_Info *server = ses->server;
int old_chan_count, new_chan_count;
int left;
int rc = 0;
int tries = 0;
+ size_t iface_weight = 0, iface_min_speed = 0;
struct cifs_server_iface *iface = NULL, *niface = NULL;
+ struct cifs_server_iface *last_iface = NULL;
spin_lock(&ses->chan_lock);
@@ -186,28 +209,17 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
}
if (!(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
- ses->chan_max = 1;
spin_unlock(&ses->chan_lock);
cifs_server_dbg(VFS, "no multichannel support\n");
return 0;
}
spin_unlock(&ses->chan_lock);
- /*
- * Keep connecting to same, fastest, iface for all channels as
- * long as its RSS. Try next fastest one if not RSS or channel
- * creation fails.
- */
- spin_lock(&ses->iface_lock);
- iface = list_first_entry(&ses->iface_list, struct cifs_server_iface,
- iface_head);
- spin_unlock(&ses->iface_lock);
-
while (left > 0) {
tries++;
if (tries > 3*ses->chan_max) {
- cifs_dbg(FYI, "too many channel open attempts (%d channels left to open)\n",
+ cifs_dbg(VFS, "too many channel open attempts (%d channels left to open)\n",
left);
break;
}
@@ -215,23 +227,41 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
spin_lock(&ses->iface_lock);
if (!ses->iface_count) {
spin_unlock(&ses->iface_lock);
+ cifs_dbg(VFS, "server %s does not advertise interfaces\n",
+ ses->server->hostname);
break;
}
+ if (!iface)
+ iface = list_first_entry(&ses->iface_list, struct cifs_server_iface,
+ iface_head);
+ last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface,
+ iface_head);
+ iface_min_speed = last_iface->speed;
+
list_for_each_entry_safe_from(iface, niface, &ses->iface_list,
iface_head) {
+ /* do not mix rdma and non-rdma interfaces */
+ if (iface->rdma_capable != ses->server->rdma)
+ continue;
+
/* skip ifaces that are unusable */
if (!iface->is_active ||
(is_ses_using_iface(ses, iface) &&
- !iface->rss_capable)) {
+ !iface->rss_capable))
+ continue;
+
+ /* check if we already allocated enough channels */
+ iface_weight = iface->speed / iface_min_speed;
+
+ if (iface->weight_fulfilled >= iface_weight)
continue;
- }
/* take ref before unlock */
kref_get(&iface->refcount);
spin_unlock(&ses->iface_lock);
- rc = cifs_ses_add_channel(cifs_sb, ses, iface);
+ rc = cifs_ses_add_channel(ses, iface);
spin_lock(&ses->iface_lock);
if (rc) {
@@ -242,10 +272,21 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
continue;
}
- cifs_dbg(FYI, "successfully opened new channel on iface:%pIS\n",
+ iface->num_channels++;
+ iface->weight_fulfilled++;
+ cifs_dbg(VFS, "successfully opened new channel on iface:%pIS\n",
&iface->sockaddr);
break;
}
+
+ /* reached end of list. reset weight_fulfilled and start over */
+ if (list_entry_is_head(iface, &ses->iface_list, iface_head)) {
+ list_for_each_entry(iface, &ses->iface_list, iface_head)
+ iface->weight_fulfilled = 0;
+ spin_unlock(&ses->iface_lock);
+ iface = NULL;
+ continue;
+ }
spin_unlock(&ses->iface_lock);
left--;
@@ -256,6 +297,64 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
}
/*
+ * called when multichannel is disabled by the server.
+ * this always gets called from smb2_reconnect
+ * and cannot get called in parallel threads.
+ */
+void
+cifs_disable_secondary_channels(struct cifs_ses *ses)
+{
+ int i, chan_count;
+ struct TCP_Server_Info *server;
+ struct cifs_server_iface *iface;
+
+ spin_lock(&ses->chan_lock);
+ chan_count = ses->chan_count;
+ if (chan_count == 1)
+ goto done;
+
+ ses->chan_count = 1;
+
+ /* for all secondary channels reset the need reconnect bit */
+ ses->chans_need_reconnect &= 1;
+
+ for (i = 1; i < chan_count; i++) {
+ iface = ses->chans[i].iface;
+ server = ses->chans[i].server;
+
+ /*
+ * remove these references first, since we need to unlock
+ * the chan_lock here, since iface_lock is a higher lock
+ */
+ ses->chans[i].iface = NULL;
+ ses->chans[i].server = NULL;
+ spin_unlock(&ses->chan_lock);
+
+ if (iface) {
+ spin_lock(&ses->iface_lock);
+ kref_put(&iface->refcount, release_iface);
+ iface->num_channels--;
+ if (iface->weight_fulfilled)
+ iface->weight_fulfilled--;
+ spin_unlock(&ses->iface_lock);
+ }
+
+ if (server) {
+ if (!server->terminate) {
+ server->terminate = true;
+ cifs_signal_cifsd_for_reconnect(server, false);
+ }
+ cifs_put_tcp_session(server, false);
+ }
+
+ spin_lock(&ses->chan_lock);
+ }
+
+done:
+ spin_unlock(&ses->chan_lock);
+}
+
+/*
* update the iface for the channel if necessary.
* will return 0 when iface is updated, 1 if removed, 2 otherwise
* Must be called with chan_lock held.
@@ -264,13 +363,16 @@ int
cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
{
unsigned int chan_index;
+ size_t iface_weight = 0, iface_min_speed = 0;
struct cifs_server_iface *iface = NULL;
struct cifs_server_iface *old_iface = NULL;
+ struct cifs_server_iface *last_iface = NULL;
+ struct sockaddr_storage ss;
int rc = 0;
spin_lock(&ses->chan_lock);
chan_index = cifs_ses_get_chan_index(ses, server);
- if (!chan_index) {
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
spin_unlock(&ses->chan_lock);
return 0;
}
@@ -284,14 +386,49 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
}
spin_unlock(&ses->chan_lock);
+ spin_lock(&server->srv_lock);
+ ss = server->dstaddr;
+ spin_unlock(&server->srv_lock);
+
spin_lock(&ses->iface_lock);
+ if (!ses->iface_count) {
+ spin_unlock(&ses->iface_lock);
+ cifs_dbg(VFS, "server %s does not advertise interfaces\n", ses->server->hostname);
+ return 0;
+ }
+
+ last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface,
+ iface_head);
+ iface_min_speed = last_iface->speed;
+
/* then look for a new one */
list_for_each_entry(iface, &ses->iface_list, iface_head) {
+ if (!chan_index) {
+ /* if we're trying to get the updated iface for primary channel */
+ if (!cifs_match_ipaddr((struct sockaddr *) &ss,
+ (struct sockaddr *) &iface->sockaddr))
+ continue;
+
+ kref_get(&iface->refcount);
+ break;
+ }
+
+ /* do not mix rdma and non-rdma interfaces */
+ if (iface->rdma_capable != server->rdma)
+ continue;
+
if (!iface->is_active ||
(is_ses_using_iface(ses, iface) &&
!iface->rss_capable)) {
continue;
}
+
+ /* check if we already allocated enough channels */
+ iface_weight = iface->speed / iface_min_speed;
+
+ if (iface->weight_fulfilled >= iface_weight)
+ continue;
+
kref_get(&iface->refcount);
break;
}
@@ -302,16 +439,41 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
cifs_dbg(FYI, "unable to find a suitable iface\n");
}
+ if (!chan_index && !iface) {
+ cifs_dbg(FYI, "unable to get the interface matching: %pIS\n",
+ &ss);
+ spin_unlock(&ses->iface_lock);
+ return 0;
+ }
+
/* now drop the ref to the current iface */
if (old_iface && iface) {
cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n",
&old_iface->sockaddr,
&iface->sockaddr);
+
+ old_iface->num_channels--;
+ if (old_iface->weight_fulfilled)
+ old_iface->weight_fulfilled--;
+ iface->num_channels++;
+ iface->weight_fulfilled++;
+
kref_put(&old_iface->refcount, release_iface);
} else if (old_iface) {
cifs_dbg(FYI, "releasing ref to iface: %pIS\n",
&old_iface->sockaddr);
+
+ old_iface->num_channels--;
+ if (old_iface->weight_fulfilled)
+ old_iface->weight_fulfilled--;
+
kref_put(&old_iface->refcount, release_iface);
+ } else if (!chan_index) {
+ /* special case: update interface for primary channel */
+ cifs_dbg(FYI, "referencing primary channel iface: %pIS\n",
+ &iface->sockaddr);
+ iface->num_channels++;
+ iface->weight_fulfilled++;
} else {
WARN_ON(!iface);
cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr);
@@ -320,6 +482,11 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
spin_lock(&ses->chan_lock);
chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
+ spin_unlock(&ses->chan_lock);
+ return 0;
+ }
+
ses->chans[chan_index].iface = iface;
/* No iface is found. if secondary chan, drop connection */
@@ -355,7 +522,7 @@ cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server)
}
static int
-cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
+cifs_ses_add_channel(struct cifs_ses *ses,
struct cifs_server_iface *iface)
{
struct TCP_Server_Info *chan_server;
@@ -434,7 +601,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
* This will be used for encoding/decoding user/domain/pw
* during sess setup auth.
*/
- ctx->local_nls = cifs_sb->local_nls;
+ ctx->local_nls = ses->local_nls;
/* Use RDMA if possible */
ctx->rdma = iface->rdma_capable;
@@ -480,20 +647,16 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
rc = cifs_negotiate_protocol(xid, ses, chan->server);
if (!rc)
- rc = cifs_setup_session(xid, ses, chan->server, cifs_sb->local_nls);
+ rc = cifs_setup_session(xid, ses, chan->server, ses->local_nls);
mutex_unlock(&ses->session_mutex);
out:
if (rc && chan->server) {
- /*
- * we should avoid race with these delayed works before we
- * remove this channel
- */
- cancel_delayed_work_sync(&chan->server->echo);
- cancel_delayed_work_sync(&chan->server->reconnect);
+ cifs_put_tcp_session(chan->server, 0);
spin_lock(&ses->chan_lock);
+
/* we rely on all bits beyond chan_count to be clear */
cifs_chan_clear_need_reconnect(ses, chan->server);
ses->chan_count--;
@@ -503,8 +666,6 @@ out:
*/
WARN_ON(ses->chan_count < 1);
spin_unlock(&ses->chan_lock);
-
- cifs_put_tcp_session(chan->server, 0);
}
kfree(ctx->UNC);
@@ -536,8 +697,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses,
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
- /* BB verify whether signing required on neg or just on auth frame
- (and NTLM case) */
+ /* BB verify whether signing required on neg or just auth frame (and NTLM case) */
capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
@@ -594,8 +754,10 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
/* copy domain */
if (ses->domainName == NULL) {
- /* Sending null domain better than using a bogus domain name (as
- we did briefly in 2.6.18) since server will use its default */
+ /*
+ * Sending null domain better than using a bogus domain name (as
+ * we did briefly in 2.6.18) since server will use its default
+ */
*bcc_ptr = 0;
*(bcc_ptr+1) = 0;
bytes_ret = 0;
@@ -614,8 +776,7 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
char *bcc_ptr = *pbcc_area;
int bytes_ret = 0;
- /* BB FIXME add check that strings total less
- than 335 or will need to send them as arrays */
+ /* BB FIXME add check that strings less than 335 or will need to send as arrays */
/* copy user */
if (ses->user_name == NULL) {
@@ -660,8 +821,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
if (WARN_ON_ONCE(len < 0))
len = CIFS_MAX_DOMAINNAME_LEN - 1;
bcc_ptr += len;
- } /* else we will send a null domain name
- so the server will default to its own domain */
+ } /* else we send a null domain name so server will default to its own domain */
*bcc_ptr = 0;
bcc_ptr++;
@@ -757,11 +917,14 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
if (len > bleft)
return;
- /* No domain field in LANMAN case. Domain is
- returned by old servers in the SMB negprot response */
- /* BB For newer servers which do not support Unicode,
- but thus do return domain here we could add parsing
- for it later, but it is not very important */
+ /*
+ * No domain field in LANMAN case. Domain is
+ * returned by old servers in the SMB negprot response
+ *
+ * BB For newer servers which do not support Unicode,
+ * but thus do return domain here, we could add parsing
+ * for it later, but it is not very important
+ */
cifs_dbg(FYI, "ascii: bytes left %d\n", bleft);
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
@@ -817,9 +980,12 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
ses->ntlmssp->server_flags = server_flags;
memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
- /* In particular we can examine sign flags */
- /* BB spec says that if AvId field of MsvAvTimestamp is populated then
- we must set the MIC field of the AUTHENTICATE_MESSAGE */
+ /*
+ * In particular we can examine sign flags
+ *
+ * BB spec says that if AvId field of MsvAvTimestamp is populated then
+ * we must set the MIC field of the AUTHENTICATE_MESSAGE
+ */
tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
@@ -1060,10 +1226,16 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer,
memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
sec_blob->MessageType = NtLmAuthenticate;
+ /* send version information in ntlmssp authenticate also */
flags = ses->ntlmssp->server_flags | NTLMSSP_REQUEST_TARGET |
- NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED;
- /* we only send version information in ntlmssp negotiate, so do not set this flag */
- flags = flags & ~NTLMSSP_NEGOTIATE_VERSION;
+ NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_VERSION |
+ NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED;
+
+ sec_blob->Version.ProductMajorVersion = LINUX_VERSION_MAJOR;
+ sec_blob->Version.ProductMinorVersion = LINUX_VERSION_PATCHLEVEL;
+ sec_blob->Version.ProductBuild = cpu_to_le16(SMB3_PRODUCT_BUILD);
+ sec_blob->Version.NTLMRevisionCurrent = NTLMSSP_REVISION_W2K3;
+
tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE);
sec_blob->NegotiateFlags = cpu_to_le32(flags);
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index 25f7cd6f23d6..32dfa0f7a78c 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -787,7 +787,7 @@ __smb2_handle_cancelled_cmd(struct cifs_tcon *tcon, __u16 cmd, __u64 mid,
{
struct close_cancelled_open *cancelled;
- cancelled = kzalloc(sizeof(*cancelled), GFP_ATOMIC);
+ cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL);
if (!cancelled)
return -ENOMEM;
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index f4849a8ad40b..a959ed2c9b22 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -756,6 +756,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_
unsigned int ret_data_len = 0;
struct network_interface_info_ioctl_rsp *out_buf = NULL;
struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *pserver;
/* do not query too frequently */
if (ses->iface_last_update &&
@@ -780,6 +781,11 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_
if (rc)
goto out;
+ /* check if iface is still active */
+ pserver = ses->chans[0].server;
+ if (pserver && !cifs_chan_is_iface_active(ses, pserver))
+ cifs_chan_update_iface(ses, pserver);
+
out:
kfree(out_buf);
return rc;
@@ -5089,7 +5095,7 @@ smb2_make_node(unsigned int xid, struct inode *inode,
* over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions
*/
- if (!S_ISCHR(mode) && !S_ISBLK(mode))
+ if (!S_ISCHR(mode) && !S_ISBLK(mode) && !S_ISFIFO(mode))
return rc;
cifs_dbg(FYI, "sfu compat create special file\n");
@@ -5137,6 +5143,12 @@ smb2_make_node(unsigned int xid, struct inode *inode,
pdev->minor = cpu_to_le64(MINOR(dev));
rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
&bytes_written, iov, 1);
+ } else if (S_ISFIFO(mode)) {
+ memcpy(pdev->type, "LnxFIFO", 8);
+ pdev->major = 0;
+ pdev->minor = 0;
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
}
tcon->ses->server->ops->close(xid, tcon, &fid);
d_drop(dentry);
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index c75a80bb6d9e..2eb29fa278c3 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -163,6 +163,9 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
int rc = 0;
struct nls_table *nls_codepage = NULL;
struct cifs_ses *ses;
+ int xid;
+ struct TCP_Server_Info *pserver;
+ unsigned int chan_index;
/*
* SMB2s NegProt, SessSetup, Logoff do not have tcon yet so
@@ -223,6 +226,12 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
return -EAGAIN;
}
}
+
+ /* if server is marked for termination, cifsd will cleanup */
+ if (server->terminate) {
+ spin_unlock(&server->srv_lock);
+ return -EHOSTDOWN;
+ }
spin_unlock(&server->srv_lock);
again:
@@ -242,11 +251,23 @@ again:
mutex_lock(&ses->session_mutex);
/*
+ * if this is called by delayed work, and the channel has been disabled
+ * in parallel, the delayed work can continue to execute in parallel
+ * there's a chance that this channel may not exist anymore
+ */
+ spin_lock(&server->srv_lock);
+ if (server->tcpStatus == CifsExiting) {
+ spin_unlock(&server->srv_lock);
+ mutex_unlock(&ses->session_mutex);
+ rc = -EHOSTDOWN;
+ goto out;
+ }
+
+ /*
* Recheck after acquire mutex. If another thread is negotiating
* and the server never sends an answer the socket will be closed
* and tcpStatus set to reconnect.
*/
- spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsNeedReconnect) {
spin_unlock(&server->srv_lock);
mutex_unlock(&ses->session_mutex);
@@ -283,6 +304,53 @@ again:
rc = cifs_negotiate_protocol(0, ses, server);
if (!rc) {
+ /*
+ * if server stopped supporting multichannel
+ * and the first channel reconnected, disable all the others.
+ */
+ if (ses->chan_count > 1 &&
+ !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
+ if (SERVER_IS_CHAN(server)) {
+ cifs_dbg(VFS, "server %s does not support " \
+ "multichannel anymore. skipping secondary channel\n",
+ ses->server->hostname);
+
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
+ spin_unlock(&ses->chan_lock);
+ goto skip_terminate;
+ }
+
+ ses->chans[chan_index].server = NULL;
+ spin_unlock(&ses->chan_lock);
+
+ /*
+ * the above reference of server by channel
+ * needs to be dropped without holding chan_lock
+ * as cifs_put_tcp_session takes a higher lock
+ * i.e. cifs_tcp_ses_lock
+ */
+ cifs_put_tcp_session(server, 1);
+
+ server->terminate = true;
+ cifs_signal_cifsd_for_reconnect(server, false);
+
+ /* mark primary server as needing reconnect */
+ pserver = server->primary_server;
+ cifs_signal_cifsd_for_reconnect(pserver, false);
+
+skip_terminate:
+ mutex_unlock(&ses->session_mutex);
+ rc = -EHOSTDOWN;
+ goto out;
+ } else {
+ cifs_server_dbg(VFS, "does not support " \
+ "multichannel anymore. disabling all other channels\n");
+ cifs_disable_secondary_channels(ses);
+ }
+ }
+
rc = cifs_setup_session(0, ses, server, nls_codepage);
if ((rc == -EACCES) && !tcon->retry) {
mutex_unlock(&ses->session_mutex);
@@ -307,17 +375,44 @@ skip_sess_setup:
tcon->need_reopen_files = true;
rc = cifs_tree_connect(0, tcon, nls_codepage);
- mutex_unlock(&ses->session_mutex);
cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
if (rc) {
/* If sess reconnected but tcon didn't, something strange ... */
+ mutex_unlock(&ses->session_mutex);
cifs_dbg(VFS, "reconnect tcon failed rc = %d\n", rc);
goto out;
}
+ if (!rc &&
+ (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
+ mutex_unlock(&ses->session_mutex);
+
+ /*
+ * query server network interfaces, in case they change
+ */
+ xid = get_xid();
+ rc = SMB3_request_interfaces(xid, tcon, false);
+ free_xid(xid);
+
+ if (rc)
+ cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
+ __func__, rc);
+
+ if (ses->chan_max > ses->chan_count &&
+ !SERVER_IS_CHAN(server)) {
+ if (ses->chan_count == 1)
+ cifs_server_dbg(VFS, "supports multichannel now\n");
+
+ cifs_try_adding_channels(ses);
+ }
+ } else {
+ mutex_unlock(&ses->session_mutex);
+ }
+
if (smb2_command != SMB2_INTERNAL_CMD)
- mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
+ if (mod_delayed_work(cifsiod_wq, &server->reconnect, 0))
+ cifs_put_tcp_session(server, false);
atomic_inc(&tconInfoReconnectCount);
out:
@@ -3808,6 +3903,13 @@ void smb2_reconnect_server(struct work_struct *work)
/* Prevent simultaneous reconnects that can corrupt tcon->rlist list */
mutex_lock(&pserver->reconnect_mutex);
+ /* if the server is marked for termination, drop the ref count here */
+ if (server->terminate) {
+ cifs_put_tcp_session(server, true);
+ mutex_unlock(&pserver->reconnect_mutex);
+ return;
+ }
+
INIT_LIST_HEAD(&tmp_list);
INIT_LIST_HEAD(&tmp_ses_list);
cifs_dbg(FYI, "Reconnecting tcons and channels\n");
@@ -3852,12 +3954,6 @@ void smb2_reconnect_server(struct work_struct *work)
}
spin_unlock(&ses->chan_lock);
}
- /*
- * Get the reference to server struct to be sure that the last call of
- * cifs_put_tcon() in the loop below won't release the server pointer.
- */
- if (tcon_exist || ses_exist)
- server->srv_count++;
spin_unlock(&cifs_tcp_ses_lock);
@@ -3905,13 +4001,17 @@ void smb2_reconnect_server(struct work_struct *work)
done:
cifs_dbg(FYI, "Reconnecting tcons and channels finished\n");
- if (resched)
+ if (resched) {
queue_delayed_work(cifsiod_wq, &server->reconnect, 2 * HZ);
+ mutex_unlock(&pserver->reconnect_mutex);
+
+ /* no need to put tcp session as we're retrying */
+ return;
+ }
mutex_unlock(&pserver->reconnect_mutex);
/* now we can safely release srv struct */
- if (tcon_exist || ses_exist)
- cifs_put_tcp_session(server, 1);
+ cifs_put_tcp_session(server, true);
}
int
@@ -3931,7 +4031,12 @@ SMB2_echo(struct TCP_Server_Info *server)
server->ops->need_neg(server)) {
spin_unlock(&server->srv_lock);
/* No need to send echo on newly established connections */
- mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
+ spin_lock(&cifs_tcp_ses_lock);
+ server->srv_count++;
+ spin_unlock(&cifs_tcp_ses_lock);
+ if (mod_delayed_work(cifsiod_wq, &server->reconnect, 0))
+ cifs_put_tcp_session(server, false);
+
return rc;
}
spin_unlock(&server->srv_lock);
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index 23c50ed7d4b5..5a3ca62d2f07 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -413,7 +413,13 @@ generate_smb3signingkey(struct cifs_ses *ses,
ses->ses_status == SES_GOOD);
chan_index = cifs_ses_get_chan_index(ses, server);
- /* TODO: introduce ref counting for channels when the can be freed */
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
+ spin_unlock(&ses->chan_lock);
+ spin_unlock(&ses->ses_lock);
+
+ return -EINVAL;
+ }
+
spin_unlock(&ses->chan_lock);
spin_unlock(&ses->ses_lock);
@@ -452,6 +458,8 @@ generate_smb3signingkey(struct cifs_ses *ses,
ptriplet->encryption.context,
ses->smb3encryptionkey,
SMB3_ENC_DEC_KEY_SIZE);
+ if (rc)
+ return rc;
rc = generate_key(ses, ptriplet->decryption.label,
ptriplet->decryption.context,
ses->smb3decryptionkey,
@@ -460,9 +468,6 @@ generate_smb3signingkey(struct cifs_ses *ses,
return rc;
}
- if (rc)
- return rc;
-
#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS
cifs_dbg(VFS, "%s: dumping generated AES session keys\n", __func__);
/*
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 14710afdc2a3..4f717ad7c21b 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -76,7 +76,7 @@ alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
return temp;
}
-static void __release_mid(struct kref *refcount)
+void __release_mid(struct kref *refcount)
{
struct mid_q_entry *midEntry =
container_of(refcount, struct mid_q_entry, refcount);
@@ -156,15 +156,6 @@ static void __release_mid(struct kref *refcount)
mempool_free(midEntry, cifs_mid_poolp);
}
-void release_mid(struct mid_q_entry *mid)
-{
- struct TCP_Server_Info *server = mid->server;
-
- spin_lock(&server->mid_lock);
- kref_put(&mid->refcount, __release_mid);
- spin_unlock(&server->mid_lock);
-}
-
void
delete_mid(struct mid_q_entry *mid)
{
@@ -1032,7 +1023,7 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
spin_lock(&ses->chan_lock);
for (i = 0; i < ses->chan_count; i++) {
server = ses->chans[i].server;
- if (!server)
+ if (!server || server->terminate)
continue;
/*
diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c
index ac199160bce6..6780aa3e98a1 100644
--- a/fs/smb/client/xattr.c
+++ b/fs/smb/client/xattr.c
@@ -150,10 +150,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto out;
- if (pTcon->ses->server->ops->set_EA)
+ if (pTcon->ses->server->ops->set_EA) {
rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
full_path, name, value, (__u16)size,
cifs_sb->local_nls, cifs_sb);
+ if (rc == 0)
+ inode_set_ctime_current(inode);
+ }
break;
case XATTR_CIFS_ACL:
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index 319fb9ffc6a0..8983f45f8430 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -34,6 +34,7 @@
#define SMB2_QUERY_INFO_HE 0x0010
#define SMB2_SET_INFO_HE 0x0011
#define SMB2_OPLOCK_BREAK_HE 0x0012
+#define SMB2_SERVER_TO_CLIENT_NOTIFICATION 0x0013
/* The same list in little endian */
#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE)
@@ -411,6 +412,7 @@ struct smb2_tree_disconnect_rsp {
#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */
#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_NOTIFICATIONS 0x00000080 /* New to SMB3.1.1 */
/* Internal types */
#define SMB2_NT_FIND 0x00100000
#define SMB2_LARGE_FILES 0x00200000
@@ -981,6 +983,19 @@ struct smb2_change_notify_rsp {
__u8 Buffer[]; /* array of file notify structs */
} __packed;
+/*
+ * SMB2_SERVER_TO_CLIENT_NOTIFICATION: See MS-SMB2 section 2.2.44
+ */
+
+#define SMB2_NOTIFY_SESSION_CLOSED 0x0000
+
+struct smb2_server_client_notification {
+ struct smb2_hdr hdr;
+ __le16 StructureSize;
+ __u16 Reserved; /* MBZ */
+ __le32 NotificationType;
+ __u8 NotificationBuffer[4]; /* MBZ */
+} __packed;
/*
* SMB2_CREATE See MS-SMB2 section 2.2.13
@@ -1097,16 +1112,23 @@ struct smb2_change_notify_rsp {
#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002)
#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004)
#define FILE_NO_INTERMEDIATE_BUFFERING_LE cpu_to_le32(0x00000008)
+/* FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010) should be zero, ignored */
+/* FILE_SYNCHRONOUS_IO_NONALERT cpu_to_le32(0x00000020) should be zero, ignored */
#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040)
#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100)
#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200)
+/* FILE_OPEN_REMOTE_INSTANCE cpu_to_le32(0x00000400) should be zero, ignored */
#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800)
-#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000)
+#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000) /* MBZ */
#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000)
#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000)
#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000)
+/* FILE_OPEN_REQUIRING_OPLOCK cpu_to_le32(0x00010000) should be zero, ignored */
+/* FILE_DISALLOW_EXCLUSIVE cpu_to_le32(0x00020000) should be zero, ignored */
+/* FILE_RESERVE_OPFILTER cpu_to_le32(0x00100000) MBZ */
#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000)
#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000)
+/* #define FILE_OPEN_FOR_FREE_SPACE_QUERY cpu_to_le32(0x00800000) should be zero, ignored */
#define CREATE_OPTIONS_MASK_LE cpu_to_le32(0x00FFFFFF)
#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \
diff --git a/fs/smb/server/ksmbd_spnego_negtokeninit.asn1 b/fs/smb/server/ksmbd_spnego_negtokeninit.asn1
index 0065f191b54b..001513806fc0 100644
--- a/fs/smb/server/ksmbd_spnego_negtokeninit.asn1
+++ b/fs/smb/server/ksmbd_spnego_negtokeninit.asn1
@@ -1,3 +1,11 @@
+-- SPDX-License-Identifier: BSD-3-Clause
+--
+-- Copyright (C) 1998, 2000 IETF Trust and the persons identified as authors
+-- of the code
+--
+-- https://www.rfc-editor.org/rfc/rfc2478#section-3.2.1
+-- https://www.rfc-editor.org/rfc/rfc2743#section-3.1
+
GSSAPI ::=
[APPLICATION 0] IMPLICIT SEQUENCE {
thisMech
diff --git a/fs/smb/server/ksmbd_spnego_negtokentarg.asn1 b/fs/smb/server/ksmbd_spnego_negtokentarg.asn1
index 1151933e7b9c..797e485d57f1 100644
--- a/fs/smb/server/ksmbd_spnego_negtokentarg.asn1
+++ b/fs/smb/server/ksmbd_spnego_negtokentarg.asn1
@@ -1,3 +1,10 @@
+-- SPDX-License-Identifier: BSD-3-Clause
+--
+-- Copyright (C) 1998 IETF Trust and the persons identified as authors
+-- of the code
+--
+-- https://www.rfc-editor.org/rfc/rfc2478#section-3.2.1
+
GSSAPI ::=
CHOICE {
negTokenInit
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index e6ba1e9b8589..6691ae68af0c 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -366,11 +366,22 @@ static int smb1_allocate_rsp_buf(struct ksmbd_work *work)
return 0;
}
+/**
+ * set_smb1_rsp_status() - set error type in smb response header
+ * @work: smb work containing smb response header
+ * @err: error code to set in response
+ */
+static void set_smb1_rsp_status(struct ksmbd_work *work, __le32 err)
+{
+ work->send_no_response = 1;
+}
+
static struct smb_version_ops smb1_server_ops = {
.get_cmd_val = get_smb1_cmd_val,
.init_rsp_hdr = init_smb1_rsp_hdr,
.allocate_rsp_buf = smb1_allocate_rsp_buf,
.check_user_session = smb1_check_user_session,
+ .set_rsp_status = set_smb1_rsp_status,
};
static int smb1_negotiate(struct ksmbd_work *work)
diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c
index 6c0305be895e..51b8bfab7481 100644
--- a/fs/smb/server/smbacl.c
+++ b/fs/smb/server/smbacl.c
@@ -1107,6 +1107,7 @@ pass:
struct smb_acl *pdacl;
struct smb_sid *powner_sid = NULL, *pgroup_sid = NULL;
int powner_sid_size = 0, pgroup_sid_size = 0, pntsd_size;
+ int pntsd_alloc_size;
if (parent_pntsd->osidoffset) {
powner_sid = (struct smb_sid *)((char *)parent_pntsd +
@@ -1119,9 +1120,10 @@ pass:
pgroup_sid_size = 1 + 1 + 6 + (pgroup_sid->num_subauth * 4);
}
- pntsd = kzalloc(sizeof(struct smb_ntsd) + powner_sid_size +
- pgroup_sid_size + sizeof(struct smb_acl) +
- nt_size, GFP_KERNEL);
+ pntsd_alloc_size = sizeof(struct smb_ntsd) + powner_sid_size +
+ pgroup_sid_size + sizeof(struct smb_acl) + nt_size;
+
+ pntsd = kzalloc(pntsd_alloc_size, GFP_KERNEL);
if (!pntsd) {
rc = -ENOMEM;
goto free_aces_base;
@@ -1136,6 +1138,27 @@ pass:
pntsd->gsidoffset = parent_pntsd->gsidoffset;
pntsd->dacloffset = parent_pntsd->dacloffset;
+ if ((u64)le32_to_cpu(pntsd->osidoffset) + powner_sid_size >
+ pntsd_alloc_size) {
+ rc = -EINVAL;
+ kfree(pntsd);
+ goto free_aces_base;
+ }
+
+ if ((u64)le32_to_cpu(pntsd->gsidoffset) + pgroup_sid_size >
+ pntsd_alloc_size) {
+ rc = -EINVAL;
+ kfree(pntsd);
+ goto free_aces_base;
+ }
+
+ if ((u64)le32_to_cpu(pntsd->dacloffset) + sizeof(struct smb_acl) + nt_size >
+ pntsd_alloc_size) {
+ rc = -EINVAL;
+ kfree(pntsd);
+ goto free_aces_base;
+ }
+
if (pntsd->osidoffset) {
struct smb_sid *owner_sid = (struct smb_sid *)((char *)pntsd +
le32_to_cpu(pntsd->osidoffset));
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 1053127f71ad..c53dea5598fc 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -1177,9 +1177,10 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
/**
* ksmbd_vfs_kern_path_locked() - lookup a file and get path info
- * @name: file path that is relative to share
- * @flags: lookup flags
- * @path: if lookup succeed, return path info
+ * @name: file path that is relative to share
+ * @flags: lookup flags
+ * @parent_path: if lookup succeed, return parent_path info
+ * @path: if lookup succeed, return path info
* @caseless: caseless filename lookup
*
* Return: 0 on success, otherwise error
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 723763746238..62972f0ff868 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -173,6 +173,7 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
const struct export_operations squashfs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = squashfs_fh_to_dentry,
.fh_to_parent = squashfs_fh_to_parent,
.get_parent = squashfs_get_parent
diff --git a/fs/super.c b/fs/super.c
index c7b452e12e4c..076392396e72 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -178,7 +178,7 @@ static void super_wake(struct super_block *sb, unsigned int flag)
* One thing we have to be careful of with a per-sb shrinker is that we don't
* drop the last active reference to the superblock from within the shrinker.
* If that happens we could trigger unregistering the shrinker from within the
- * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
+ * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we
* take a passive reference to the superblock to avoid this from occurring.
*/
static unsigned long super_cache_scan(struct shrinker *shrink,
@@ -191,7 +191,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
long dentries;
long inodes;
- sb = container_of(shrink, struct super_block, s_shrink);
+ sb = shrink->private_data;
/*
* Deadlock avoidance. We may hold various FS locks, and we don't want
@@ -244,7 +244,7 @@ static unsigned long super_cache_count(struct shrinker *shrink,
struct super_block *sb;
long total_objects = 0;
- sb = container_of(shrink, struct super_block, s_shrink);
+ sb = shrink->private_data;
/*
* We don't call super_trylock_shared() here as it is a scalability
@@ -306,7 +306,7 @@ static void destroy_unused_super(struct super_block *s)
security_sb_free(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
- free_prealloced_shrinker(&s->s_shrink);
+ shrinker_free(s->s_shrink);
/* no delays needed */
destroy_super_work(&s->destroy_work);
}
@@ -383,16 +383,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
s->s_time_min = TIME64_MIN;
s->s_time_max = TIME64_MAX;
- s->s_shrink.seeks = DEFAULT_SEEKS;
- s->s_shrink.scan_objects = super_cache_scan;
- s->s_shrink.count_objects = super_cache_count;
- s->s_shrink.batch = 1024;
- s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
- if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name))
+ s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
+ "sb-%s", type->name);
+ if (!s->s_shrink)
goto fail;
- if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
+
+ s->s_shrink->scan_objects = super_cache_scan;
+ s->s_shrink->count_objects = super_cache_count;
+ s->s_shrink->batch = 1024;
+ s->s_shrink->private_data = s;
+
+ if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink))
goto fail;
- if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
+ if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
goto fail;
return s;
@@ -477,7 +480,7 @@ void deactivate_locked_super(struct super_block *s)
{
struct file_system_type *fs = s->s_type;
if (atomic_dec_and_test(&s->s_active)) {
- unregister_shrinker(&s->s_shrink);
+ shrinker_free(s->s_shrink);
fs->kill_sb(s);
kill_super_notify(s);
@@ -818,7 +821,7 @@ retry:
hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
spin_unlock(&sb_lock);
get_filesystem(s->s_type);
- register_shrinker_prepared(&s->s_shrink);
+ shrinker_register(s->s_shrink);
return s;
share_extant_sb:
@@ -901,7 +904,7 @@ retry:
hlist_add_head(&s->s_instances, &type->fs_supers);
spin_unlock(&sb_lock);
get_filesystem(type);
- register_shrinker_prepared(&s->s_shrink);
+ shrinker_register(s->s_shrink);
return s;
}
EXPORT_SYMBOL(sget);
@@ -1540,7 +1543,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
mutex_unlock(&bdev->bd_fsfreeze_mutex);
snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
- shrinker_debugfs_rename(&sb->s_shrink, "sb-%s:%s", sb->s_type->name,
+ shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name,
sb->s_id);
sb_set_blocksize(sb, block_size(bdev));
return 0;
@@ -2157,3 +2160,4 @@ int sb_init_dio_done_wq(struct super_block *sb)
destroy_workqueue(wq);
return 0;
}
+EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index a12ac0356c69..6b7652fb8050 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -167,6 +167,18 @@ static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
return battr->mmap(of->file, kobj, battr, vma);
}
+static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset,
+ int whence)
+{
+ struct bin_attribute *battr = of->kn->priv;
+ struct kobject *kobj = of->kn->parent->priv;
+
+ if (battr->llseek)
+ return battr->llseek(of->file, kobj, battr, offset, whence);
+ else
+ return generic_file_llseek(of->file, offset, whence);
+}
+
static int sysfs_kf_bin_open(struct kernfs_open_file *of)
{
struct bin_attribute *battr = of->kn->priv;
@@ -249,6 +261,7 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = {
.write = sysfs_kf_bin_write,
.mmap = sysfs_kf_bin_mmap,
.open = sysfs_kf_bin_open,
+ .llseek = sysfs_kf_bin_llseek,
};
int sysfs_add_file_mode_ns(struct kernfs_node *parent,
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 8c8d64e76103..f8a594a50ae6 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -2,8 +2,9 @@
/*
* event_inode.c - part of tracefs, a pseudo file system for activating tracing
*
- * Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org>
+ * Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt <rostedt@goodmis.org>
* Copyright (C) 2020-23 VMware Inc, author: Ajay Kaher <akaher@vmware.com>
+ * Copyright (C) 2023 Google, author: Steven Rostedt <rostedt@goodmis.org>
*
* eventfs is used to dynamically create inodes and dentries based on the
* meta data provided by the tracing system.
@@ -23,48 +24,30 @@
#include <linux/delay.h>
#include "internal.h"
-struct eventfs_inode {
- struct list_head e_top_files;
-};
+/*
+ * eventfs_mutex protects the eventfs_inode (ei) dentry. Any access
+ * to the ei->dentry must be done under this mutex and after checking
+ * if ei->is_freed is not set. The ei->dentry is released under the
+ * mutex at the same time ei->is_freed is set. If ei->is_freed is set
+ * then the ei->dentry is invalid.
+ */
+static DEFINE_MUTEX(eventfs_mutex);
/*
- * struct eventfs_file - hold the properties of the eventfs files and
- * directories.
- * @name: the name of the file or directory to create
- * @d_parent: holds parent's dentry
- * @dentry: once accessed holds dentry
- * @list: file or directory to be added to parent directory
- * @ei: list of files and directories within directory
- * @fop: file_operations for file or directory
- * @iop: inode_operations for file or directory
- * @data: something that the caller will want to get to later on
- * @mode: the permission that the file or directory should have
+ * The eventfs_inode (ei) itself is protected by SRCU. It is released from
+ * its parent's list and will have is_freed set (under eventfs_mutex).
+ * After the SRCU grace period is over, the ei may be freed.
*/
-struct eventfs_file {
- const char *name;
- struct dentry *d_parent;
- struct dentry *dentry;
- struct list_head list;
- struct eventfs_inode *ei;
- const struct file_operations *fop;
- const struct inode_operations *iop;
- /*
- * Union - used for deletion
- * @del_list: list of eventfs_file to delete
- * @rcu: eventfs_file to delete in RCU
- * @is_freed: node is freed if one of the above is set
- */
- union {
- struct list_head del_list;
- struct rcu_head rcu;
- unsigned long is_freed;
- };
- void *data;
- umode_t mode;
+DEFINE_STATIC_SRCU(eventfs_srcu);
+
+/* Mode is unsigned short, use the upper bits for flags */
+enum {
+ EVENTFS_SAVE_MODE = BIT(16),
+ EVENTFS_SAVE_UID = BIT(17),
+ EVENTFS_SAVE_GID = BIT(18),
};
-static DEFINE_MUTEX(eventfs_mutex);
-DEFINE_STATIC_SRCU(eventfs_srcu);
+#define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1)
static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *dentry,
@@ -73,8 +56,88 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file);
static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx);
static int eventfs_release(struct inode *inode, struct file *file);
+static void update_attr(struct eventfs_attr *attr, struct iattr *iattr)
+{
+ unsigned int ia_valid = iattr->ia_valid;
+
+ if (ia_valid & ATTR_MODE) {
+ attr->mode = (attr->mode & ~EVENTFS_MODE_MASK) |
+ (iattr->ia_mode & EVENTFS_MODE_MASK) |
+ EVENTFS_SAVE_MODE;
+ }
+ if (ia_valid & ATTR_UID) {
+ attr->mode |= EVENTFS_SAVE_UID;
+ attr->uid = iattr->ia_uid;
+ }
+ if (ia_valid & ATTR_GID) {
+ attr->mode |= EVENTFS_SAVE_GID;
+ attr->gid = iattr->ia_gid;
+ }
+}
+
+static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *iattr)
+{
+ const struct eventfs_entry *entry;
+ struct eventfs_inode *ei;
+ const char *name;
+ int ret;
+
+ mutex_lock(&eventfs_mutex);
+ ei = dentry->d_fsdata;
+ if (ei->is_freed) {
+ /* Do not allow changes if the event is about to be removed. */
+ mutex_unlock(&eventfs_mutex);
+ return -ENODEV;
+ }
+
+ /* Preallocate the children mode array if necessary */
+ if (!(dentry->d_inode->i_mode & S_IFDIR)) {
+ if (!ei->entry_attrs) {
+ ei->entry_attrs = kzalloc(sizeof(*ei->entry_attrs) * ei->nr_entries,
+ GFP_KERNEL);
+ if (!ei->entry_attrs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+
+ ret = simple_setattr(idmap, dentry, iattr);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * If this is a dir, then update the ei cache, only the file
+ * mode is saved in the ei->m_children, and the ownership is
+ * determined by the parent directory.
+ */
+ if (dentry->d_inode->i_mode & S_IFDIR) {
+ update_attr(&ei->attr, iattr);
+
+ } else {
+ name = dentry->d_name.name;
+
+ for (int i = 0; i < ei->nr_entries; i++) {
+ entry = &ei->entries[i];
+ if (strcmp(name, entry->name) == 0) {
+ update_attr(&ei->entry_attrs[i], iattr);
+ break;
+ }
+ }
+ }
+ out:
+ mutex_unlock(&eventfs_mutex);
+ return ret;
+}
+
static const struct inode_operations eventfs_root_dir_inode_operations = {
.lookup = eventfs_root_lookup,
+ .setattr = eventfs_set_attr,
+};
+
+static const struct inode_operations eventfs_file_inode_operations = {
+ .setattr = eventfs_set_attr,
};
static const struct file_operations eventfs_file_operations = {
@@ -85,26 +148,40 @@ static const struct file_operations eventfs_file_operations = {
.release = eventfs_release,
};
+static void update_inode_attr(struct inode *inode, struct eventfs_attr *attr, umode_t mode)
+{
+ if (!attr) {
+ inode->i_mode = mode;
+ return;
+ }
+
+ if (attr->mode & EVENTFS_SAVE_MODE)
+ inode->i_mode = attr->mode & EVENTFS_MODE_MASK;
+ else
+ inode->i_mode = mode;
+
+ if (attr->mode & EVENTFS_SAVE_UID)
+ inode->i_uid = attr->uid;
+
+ if (attr->mode & EVENTFS_SAVE_GID)
+ inode->i_gid = attr->gid;
+}
+
/**
* create_file - create a file in the tracefs filesystem
* @name: the name of the file to create.
* @mode: the permission that the file should have.
+ * @attr: saved attributes changed by user
* @parent: parent dentry for this file.
* @data: something that the caller will want to get to later on.
* @fop: struct file_operations that should be used for this file.
*
- * This is the basic "create a file" function for tracefs. It allows for a
- * wide range of flexibility in creating a file.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the tracefs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %NULL will be returned.
- *
- * If tracefs is not enabled in the kernel, the value -%ENODEV will be
- * returned.
+ * This function creates a dentry that represents a file in the eventsfs_inode
+ * directory. The inode.i_private pointer will point to @data in the open()
+ * call.
*/
static struct dentry *create_file(const char *name, umode_t mode,
+ struct eventfs_attr *attr,
struct dentry *parent, void *data,
const struct file_operations *fop)
{
@@ -118,6 +195,7 @@ static struct dentry *create_file(const char *name, umode_t mode,
if (WARN_ON_ONCE(!S_ISREG(mode)))
return NULL;
+ WARN_ON_ONCE(!parent);
dentry = eventfs_start_creating(name, parent);
if (IS_ERR(dentry))
@@ -127,7 +205,10 @@ static struct dentry *create_file(const char *name, umode_t mode,
if (unlikely(!inode))
return eventfs_failed_creating(dentry);
- inode->i_mode = mode;
+ /* If the user updated the directory's attributes, use them */
+ update_inode_attr(inode, attr, mode);
+
+ inode->i_op = &eventfs_file_inode_operations;
inode->i_fop = fop;
inode->i_private = data;
@@ -140,28 +221,19 @@ static struct dentry *create_file(const char *name, umode_t mode,
/**
* create_dir - create a dir in the tracefs filesystem
- * @name: the name of the file to create.
+ * @ei: the eventfs_inode that represents the directory to create
* @parent: parent dentry for this file.
- * @data: something that the caller will want to get to later on.
*
- * This is the basic "create a dir" function for eventfs. It allows for a
- * wide range of flexibility in creating a dir.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the tracefs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %NULL will be returned.
- *
- * If tracefs is not enabled in the kernel, the value -%ENODEV will be
- * returned.
+ * This function will create a dentry for a directory represented by
+ * a eventfs_inode.
*/
-static struct dentry *create_dir(const char *name, struct dentry *parent, void *data)
+static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent)
{
struct tracefs_inode *ti;
struct dentry *dentry;
struct inode *inode;
- dentry = eventfs_start_creating(name, parent);
+ dentry = eventfs_start_creating(ei->name, parent);
if (IS_ERR(dentry))
return dentry;
@@ -169,10 +241,11 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void *
if (unlikely(!inode))
return eventfs_failed_creating(dentry);
- inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ /* If the user updated the directory's attributes, use them */
+ update_inode_attr(inode, &ei->attr, S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
+
inode->i_op = &eventfs_root_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;
- inode->i_private = data;
ti = get_tracefs(inode);
ti->flags |= TRACEFS_EVENT_INODE;
@@ -184,117 +257,198 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void *
return eventfs_end_creating(dentry);
}
+static void free_ei(struct eventfs_inode *ei)
+{
+ kfree_const(ei->name);
+ kfree(ei->d_children);
+ kfree(ei->entry_attrs);
+ kfree(ei);
+}
+
/**
- * eventfs_set_ef_status_free - set the ef->status to free
+ * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode
* @ti: the tracefs_inode of the dentry
- * @dentry: dentry who's status to be freed
+ * @dentry: dentry which has the reference to remove.
*
- * eventfs_set_ef_status_free will be called if no more
- * references remain
+ * Remove the association between a dentry from an eventfs_inode.
*/
-void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
+void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
{
- struct tracefs_inode *ti_parent;
struct eventfs_inode *ei;
- struct eventfs_file *ef, *tmp;
-
- /* The top level events directory may be freed by this */
- if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) {
- LIST_HEAD(ef_del_list);
+ int i;
- mutex_lock(&eventfs_mutex);
+ mutex_lock(&eventfs_mutex);
- ei = ti->private;
+ ei = dentry->d_fsdata;
+ if (!ei)
+ goto out;
- /* Record all the top level files */
- list_for_each_entry_srcu(ef, &ei->e_top_files, list,
- lockdep_is_held(&eventfs_mutex)) {
- list_add_tail(&ef->del_list, &ef_del_list);
+ /* This could belong to one of the files of the ei */
+ if (ei->dentry != dentry) {
+ for (i = 0; i < ei->nr_entries; i++) {
+ if (ei->d_children[i] == dentry)
+ break;
}
+ if (WARN_ON_ONCE(i == ei->nr_entries))
+ goto out;
+ ei->d_children[i] = NULL;
+ } else if (ei->is_freed) {
+ free_ei(ei);
+ } else {
+ ei->dentry = NULL;
+ }
+
+ dentry->d_fsdata = NULL;
+ out:
+ mutex_unlock(&eventfs_mutex);
+}
- /* Nothing should access this, but just in case! */
- ti->private = NULL;
+/**
+ * create_file_dentry - create a dentry for a file of an eventfs_inode
+ * @ei: the eventfs_inode that the file will be created under
+ * @idx: the index into the d_children[] of the @ei
+ * @parent: The parent dentry of the created file.
+ * @name: The name of the file to create
+ * @mode: The mode of the file.
+ * @data: The data to use to set the inode of the file with on open()
+ * @fops: The fops of the file to be created.
+ * @lookup: If called by the lookup routine, in which case, dput() the created dentry.
+ *
+ * Create a dentry for a file of an eventfs_inode @ei and place it into the
+ * address located at @e_dentry. If the @e_dentry already has a dentry, then
+ * just do a dget() on it and return. Otherwise create the dentry and attach it.
+ */
+static struct dentry *
+create_file_dentry(struct eventfs_inode *ei, int idx,
+ struct dentry *parent, const char *name, umode_t mode, void *data,
+ const struct file_operations *fops, bool lookup)
+{
+ struct eventfs_attr *attr = NULL;
+ struct dentry **e_dentry = &ei->d_children[idx];
+ struct dentry *dentry;
+ bool invalidate = false;
+ mutex_lock(&eventfs_mutex);
+ if (ei->is_freed) {
mutex_unlock(&eventfs_mutex);
+ return NULL;
+ }
+ /* If the e_dentry already has a dentry, use it */
+ if (*e_dentry) {
+ /* lookup does not need to up the ref count */
+ if (!lookup)
+ dget(*e_dentry);
+ mutex_unlock(&eventfs_mutex);
+ return *e_dentry;
+ }
- /* Now safely free the top level files and their children */
- list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
- list_del(&ef->del_list);
- eventfs_remove(ef);
- }
+ /* ei->entry_attrs are protected by SRCU */
+ if (ei->entry_attrs)
+ attr = &ei->entry_attrs[idx];
- kfree(ei);
- return;
- }
+ mutex_unlock(&eventfs_mutex);
- mutex_lock(&eventfs_mutex);
+ /* The lookup already has the parent->d_inode locked */
+ if (!lookup)
+ inode_lock(parent->d_inode);
- ti_parent = get_tracefs(dentry->d_parent->d_inode);
- if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE))
- goto out;
+ dentry = create_file(name, mode, attr, parent, data, fops);
- ef = dentry->d_fsdata;
- if (!ef)
- goto out;
+ if (!lookup)
+ inode_unlock(parent->d_inode);
- /*
- * If ef was freed, then the LSB bit is set for d_fsdata.
- * But this should not happen, as it should still have a
- * ref count that prevents it. Warn in case it does.
- */
- if (WARN_ON_ONCE((unsigned long)ef & 1))
- goto out;
+ mutex_lock(&eventfs_mutex);
- dentry->d_fsdata = NULL;
- ef->dentry = NULL;
-out:
+ if (IS_ERR_OR_NULL(dentry)) {
+ /*
+ * When the mutex was released, something else could have
+ * created the dentry for this e_dentry. In which case
+ * use that one.
+ *
+ * Note, with the mutex held, the e_dentry cannot have content
+ * and the ei->is_freed be true at the same time.
+ */
+ dentry = *e_dentry;
+ if (WARN_ON_ONCE(dentry && ei->is_freed))
+ dentry = NULL;
+ /* The lookup does not need to up the dentry refcount */
+ if (dentry && !lookup)
+ dget(dentry);
+ mutex_unlock(&eventfs_mutex);
+ return dentry;
+ }
+
+ if (!*e_dentry && !ei->is_freed) {
+ *e_dentry = dentry;
+ dentry->d_fsdata = ei;
+ } else {
+ /*
+ * Should never happen unless we get here due to being freed.
+ * Otherwise it means two dentries exist with the same name.
+ */
+ WARN_ON_ONCE(!ei->is_freed);
+ invalidate = true;
+ }
mutex_unlock(&eventfs_mutex);
+
+ if (invalidate)
+ d_invalidate(dentry);
+
+ if (lookup || invalidate)
+ dput(dentry);
+
+ return invalidate ? NULL : dentry;
}
/**
* eventfs_post_create_dir - post create dir routine
- * @ef: eventfs_file of recently created dir
+ * @ei: eventfs_inode of recently created dir
*
* Map the meta-data of files within an eventfs dir to their parent dentry
*/
-static void eventfs_post_create_dir(struct eventfs_file *ef)
+static void eventfs_post_create_dir(struct eventfs_inode *ei)
{
- struct eventfs_file *ef_child;
+ struct eventfs_inode *ei_child;
struct tracefs_inode *ti;
+ lockdep_assert_held(&eventfs_mutex);
+
/* srcu lock already held */
/* fill parent-child relation */
- list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
srcu_read_lock_held(&eventfs_srcu)) {
- ef_child->d_parent = ef->dentry;
+ ei_child->d_parent = ei->dentry;
}
- ti = get_tracefs(ef->dentry->d_inode);
- ti->private = ef->ei;
+ ti = get_tracefs(ei->dentry->d_inode);
+ ti->private = ei;
}
/**
- * create_dentry - helper function to create dentry
- * @ef: eventfs_file of file or directory to create
- * @parent: parent dentry
- * @lookup: true if called from lookup routine
+ * create_dir_dentry - Create a directory dentry for the eventfs_inode
+ * @pei: The eventfs_inode parent of ei.
+ * @ei: The eventfs_inode to create the directory for
+ * @parent: The dentry of the parent of this directory
+ * @lookup: True if this is called by the lookup code
*
- * Used to create a dentry for file/dir, executes post dentry creation routine
+ * This creates and attaches a directory dentry to the eventfs_inode @ei.
*/
static struct dentry *
-create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup)
+create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei,
+ struct dentry *parent, bool lookup)
{
bool invalidate = false;
- struct dentry *dentry;
+ struct dentry *dentry = NULL;
mutex_lock(&eventfs_mutex);
- if (ef->is_freed) {
+ if (pei->is_freed || ei->is_freed) {
mutex_unlock(&eventfs_mutex);
return NULL;
}
- if (ef->dentry) {
- dentry = ef->dentry;
- /* On dir open, up the ref count */
+ if (ei->dentry) {
+ /* If the dentry already has a dentry, use it */
+ dentry = ei->dentry;
+ /* lookup does not need to up the ref count */
if (!lookup)
dget(dentry);
mutex_unlock(&eventfs_mutex);
@@ -302,42 +456,44 @@ create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup)
}
mutex_unlock(&eventfs_mutex);
+ /* The lookup already has the parent->d_inode locked */
if (!lookup)
inode_lock(parent->d_inode);
- if (ef->ei)
- dentry = create_dir(ef->name, parent, ef->data);
- else
- dentry = create_file(ef->name, ef->mode, parent,
- ef->data, ef->fop);
+ dentry = create_dir(ei, parent);
if (!lookup)
inode_unlock(parent->d_inode);
mutex_lock(&eventfs_mutex);
- if (IS_ERR_OR_NULL(dentry)) {
- /* If the ef was already updated get it */
- dentry = ef->dentry;
+
+ if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) {
+ /*
+ * When the mutex was released, something else could have
+ * created the dentry for this e_dentry. In which case
+ * use that one.
+ *
+ * Note, with the mutex held, the e_dentry cannot have content
+ * and the ei->is_freed be true at the same time.
+ */
+ dentry = ei->dentry;
if (dentry && !lookup)
dget(dentry);
mutex_unlock(&eventfs_mutex);
return dentry;
}
- if (!ef->dentry && !ef->is_freed) {
- ef->dentry = dentry;
- if (ef->ei)
- eventfs_post_create_dir(ef);
- dentry->d_fsdata = ef;
+ if (!ei->dentry && !ei->is_freed) {
+ ei->dentry = dentry;
+ eventfs_post_create_dir(ei);
+ dentry->d_fsdata = ei;
} else {
- /* A race here, should try again (unless freed) */
- invalidate = true;
-
/*
* Should never happen unless we get here due to being freed.
* Otherwise it means two dentries exist with the same name.
*/
- WARN_ON_ONCE(!ef->is_freed);
+ WARN_ON_ONCE(!ei->is_freed);
+ invalidate = true;
}
mutex_unlock(&eventfs_mutex);
if (invalidate)
@@ -349,50 +505,90 @@ create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup)
return invalidate ? NULL : dentry;
}
-static bool match_event_file(struct eventfs_file *ef, const char *name)
-{
- bool ret;
-
- mutex_lock(&eventfs_mutex);
- ret = !ef->is_freed && strcmp(ef->name, name) == 0;
- mutex_unlock(&eventfs_mutex);
-
- return ret;
-}
-
/**
* eventfs_root_lookup - lookup routine to create file/dir
* @dir: in which a lookup is being done
* @dentry: file/dir dentry
- * @flags: to pass as flags parameter to simple lookup
+ * @flags: Just passed to simple_lookup()
*
- * Used to create a dynamic file/dir within @dir. Use the eventfs_inode
- * list of meta data to find the information needed to create the file/dir.
+ * Used to create dynamic file/dir with-in @dir, search with-in @ei
+ * list, if @dentry found go ahead and create the file/dir
*/
+
static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags)
{
+ const struct file_operations *fops;
+ const struct eventfs_entry *entry;
+ struct eventfs_inode *ei_child;
struct tracefs_inode *ti;
struct eventfs_inode *ei;
- struct eventfs_file *ef;
+ struct dentry *ei_dentry = NULL;
struct dentry *ret = NULL;
+ const char *name = dentry->d_name.name;
+ bool created = false;
+ umode_t mode;
+ void *data;
int idx;
+ int i;
+ int r;
ti = get_tracefs(dir);
if (!(ti->flags & TRACEFS_EVENT_INODE))
return NULL;
- ei = ti->private;
+ /* Grab srcu to prevent the ei from going away */
idx = srcu_read_lock(&eventfs_srcu);
- list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+
+ /*
+ * Grab the eventfs_mutex to consistent value from ti->private.
+ * This s
+ */
+ mutex_lock(&eventfs_mutex);
+ ei = READ_ONCE(ti->private);
+ if (ei && !ei->is_freed)
+ ei_dentry = READ_ONCE(ei->dentry);
+ mutex_unlock(&eventfs_mutex);
+
+ if (!ei || !ei_dentry)
+ goto out;
+
+ data = ei->data;
+
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
srcu_read_lock_held(&eventfs_srcu)) {
- if (!match_event_file(ef, dentry->d_name.name))
+ if (strcmp(ei_child->name, name) != 0)
continue;
ret = simple_lookup(dir, dentry, flags);
- create_dentry(ef, ef->d_parent, true);
+ create_dir_dentry(ei, ei_child, ei_dentry, true);
+ created = true;
break;
}
+
+ if (created)
+ goto out;
+
+ for (i = 0; i < ei->nr_entries; i++) {
+ entry = &ei->entries[i];
+ if (strcmp(name, entry->name) == 0) {
+ void *cdata = data;
+ mutex_lock(&eventfs_mutex);
+ /* If ei->is_freed, then the event itself may be too */
+ if (!ei->is_freed)
+ r = entry->callback(name, &mode, &cdata, &fops);
+ else
+ r = -1;
+ mutex_unlock(&eventfs_mutex);
+ if (r <= 0)
+ continue;
+ ret = simple_lookup(dir, dentry, flags);
+ create_file_dentry(ei, i, ei_dentry, name, mode, cdata,
+ fops, true);
+ break;
+ }
+ }
+ out:
srcu_read_unlock(&eventfs_srcu, idx);
return ret;
}
@@ -432,29 +628,48 @@ static int eventfs_release(struct inode *inode, struct file *file)
return dcache_dir_close(inode, file);
}
+static int add_dentries(struct dentry ***dentries, struct dentry *d, int cnt)
+{
+ struct dentry **tmp;
+
+ tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_KERNEL);
+ if (!tmp)
+ return -1;
+ tmp[cnt] = d;
+ tmp[cnt + 1] = NULL;
+ *dentries = tmp;
+ return 0;
+}
+
/**
* dcache_dir_open_wrapper - eventfs open wrapper
* @inode: not used
- * @file: dir to be opened (to create its child)
+ * @file: dir to be opened (to create it's children)
*
- * Used to dynamically create the file/dir within @file. @file is really a
- * directory and all the files/dirs of the children within @file will be
- * created. If any of the files/dirs have already been created, their
- * reference count will be incremented.
+ * Used to dynamic create file/dir with-in @file, all the
+ * file/dir will be created. If already created then references
+ * will be increased
*/
static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
{
+ const struct file_operations *fops;
+ const struct eventfs_entry *entry;
+ struct eventfs_inode *ei_child;
struct tracefs_inode *ti;
struct eventfs_inode *ei;
- struct eventfs_file *ef;
struct dentry_list *dlist;
struct dentry **dentries = NULL;
- struct dentry *dentry = file_dentry(file);
+ struct dentry *parent = file_dentry(file);
struct dentry *d;
struct inode *f_inode = file_inode(file);
+ const char *name = parent->d_name.name;
+ umode_t mode;
+ void *data;
int cnt = 0;
int idx;
int ret;
+ int i;
+ int r;
ti = get_tracefs(f_inode);
if (!(ti->flags & TRACEFS_EVENT_INODE))
@@ -463,25 +678,56 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
if (WARN_ON_ONCE(file->private_data))
return -EINVAL;
+ idx = srcu_read_lock(&eventfs_srcu);
+
+ mutex_lock(&eventfs_mutex);
+ ei = READ_ONCE(ti->private);
+ mutex_unlock(&eventfs_mutex);
+
+ if (!ei) {
+ srcu_read_unlock(&eventfs_srcu, idx);
+ return -EINVAL;
+ }
+
+
+ data = ei->data;
+
dlist = kmalloc(sizeof(*dlist), GFP_KERNEL);
- if (!dlist)
+ if (!dlist) {
+ srcu_read_unlock(&eventfs_srcu, idx);
return -ENOMEM;
+ }
- ei = ti->private;
- idx = srcu_read_lock(&eventfs_srcu);
- list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
srcu_read_lock_held(&eventfs_srcu)) {
- d = create_dentry(ef, dentry, false);
+ d = create_dir_dentry(ei, ei_child, parent, false);
if (d) {
- struct dentry **tmp;
+ ret = add_dentries(&dentries, d, cnt);
+ if (ret < 0)
+ break;
+ cnt++;
+ }
+ }
- tmp = krealloc(dentries, sizeof(d) * (cnt + 2), GFP_KERNEL);
- if (!tmp)
+ for (i = 0; i < ei->nr_entries; i++) {
+ void *cdata = data;
+ entry = &ei->entries[i];
+ name = entry->name;
+ mutex_lock(&eventfs_mutex);
+ /* If ei->is_freed, then the event itself may be too */
+ if (!ei->is_freed)
+ r = entry->callback(name, &mode, &cdata, &fops);
+ else
+ r = -1;
+ mutex_unlock(&eventfs_mutex);
+ if (r <= 0)
+ continue;
+ d = create_file_dentry(ei, i, parent, name, mode, cdata, fops, false);
+ if (d) {
+ ret = add_dentries(&dentries, d, cnt);
+ if (ret < 0)
break;
- tmp[cnt] = d;
- tmp[cnt + 1] = NULL;
cnt++;
- dentries = tmp;
}
}
srcu_read_unlock(&eventfs_srcu, idx);
@@ -514,63 +760,104 @@ static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx)
}
/**
- * eventfs_prepare_ef - helper function to prepare eventfs_file
- * @name: the name of the file/directory to create.
- * @mode: the permission that the file should have.
- * @fop: struct file_operations that should be used for this file/directory.
- * @iop: struct inode_operations that should be used for this file/directory.
- * @data: something that the caller will want to get to later on. The
- * inode.i_private pointer will point to this value on the open() call.
+ * eventfs_create_dir - Create the eventfs_inode for this directory
+ * @name: The name of the directory to create.
+ * @parent: The eventfs_inode of the parent directory.
+ * @entries: A list of entries that represent the files under this directory
+ * @size: The number of @entries
+ * @data: The default data to pass to the files (an entry may override it).
+ *
+ * This function creates the descriptor to represent a directory in the
+ * eventfs. This descriptor is an eventfs_inode, and it is returned to be
+ * used to create other children underneath.
+ *
+ * The @entries is an array of eventfs_entry structures which has:
+ * const char *name
+ * eventfs_callback callback;
*
- * This function allocates and fills the eventfs_file structure.
+ * The name is the name of the file, and the callback is a pointer to a function
+ * that will be called when the file is reference (either by lookup or by
+ * reading a directory). The callback is of the prototype:
+ *
+ * int callback(const char *name, umode_t *mode, void **data,
+ * const struct file_operations **fops);
+ *
+ * When a file needs to be created, this callback will be called with
+ * name = the name of the file being created (so that the same callback
+ * may be used for multiple files).
+ * mode = a place to set the file's mode
+ * data = A pointer to @data, and the callback may replace it, which will
+ * cause the file created to pass the new data to the open() call.
+ * fops = the fops to use for the created file.
+ *
+ * NB. @callback is called while holding internal locks of the eventfs
+ * system. The callback must not call any code that might also call into
+ * the tracefs or eventfs system or it will risk creating a deadlock.
*/
-static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode,
- const struct file_operations *fop,
- const struct inode_operations *iop,
- void *data)
+struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
+ const struct eventfs_entry *entries,
+ int size, void *data)
{
- struct eventfs_file *ef;
+ struct eventfs_inode *ei;
- ef = kzalloc(sizeof(*ef), GFP_KERNEL);
- if (!ef)
+ if (!parent)
+ return ERR_PTR(-EINVAL);
+
+ ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+ if (!ei)
return ERR_PTR(-ENOMEM);
- ef->name = kstrdup(name, GFP_KERNEL);
- if (!ef->name) {
- kfree(ef);
+ ei->name = kstrdup_const(name, GFP_KERNEL);
+ if (!ei->name) {
+ kfree(ei);
return ERR_PTR(-ENOMEM);
}
- if (S_ISDIR(mode)) {
- ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL);
- if (!ef->ei) {
- kfree(ef->name);
- kfree(ef);
+ if (size) {
+ ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
+ if (!ei->d_children) {
+ kfree_const(ei->name);
+ kfree(ei);
return ERR_PTR(-ENOMEM);
}
- INIT_LIST_HEAD(&ef->ei->e_top_files);
- } else {
- ef->ei = NULL;
}
- ef->iop = iop;
- ef->fop = fop;
- ef->mode = mode;
- ef->data = data;
- return ef;
+ ei->entries = entries;
+ ei->nr_entries = size;
+ ei->data = data;
+ INIT_LIST_HEAD(&ei->children);
+ INIT_LIST_HEAD(&ei->list);
+
+ mutex_lock(&eventfs_mutex);
+ if (!parent->is_freed) {
+ list_add_tail(&ei->list, &parent->children);
+ ei->d_parent = parent->dentry;
+ }
+ mutex_unlock(&eventfs_mutex);
+
+ /* Was the parent freed? */
+ if (list_empty(&ei->list)) {
+ free_ei(ei);
+ ei = NULL;
+ }
+ return ei;
}
/**
- * eventfs_create_events_dir - create the trace event structure
- * @name: the name of the directory to create.
- * @parent: parent dentry for this file. This should be a directory dentry
- * if set. If this parameter is NULL, then the directory will be
- * created in the root of the tracefs filesystem.
+ * eventfs_create_events_dir - create the top level events directory
+ * @name: The name of the top level directory to create.
+ * @parent: Parent dentry for this file in the tracefs directory.
+ * @entries: A list of entries that represent the files under this directory
+ * @size: The number of @entries
+ * @data: The default data to pass to the files (an entry may override it).
*
* This function creates the top of the trace event directory.
+ *
+ * See eventfs_create_dir() for use of @entries.
*/
-struct dentry *eventfs_create_events_dir(const char *name,
- struct dentry *parent)
+struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
+ const struct eventfs_entry *entries,
+ int size, void *data)
{
struct dentry *dentry = tracefs_start_creating(name, parent);
struct eventfs_inode *ei;
@@ -581,19 +868,32 @@ struct dentry *eventfs_create_events_dir(const char *name,
return NULL;
if (IS_ERR(dentry))
- return dentry;
+ return ERR_CAST(dentry);
ei = kzalloc(sizeof(*ei), GFP_KERNEL);
if (!ei)
- return ERR_PTR(-ENOMEM);
+ goto fail_ei;
+
inode = tracefs_get_inode(dentry->d_sb);
- if (unlikely(!inode)) {
- kfree(ei);
- tracefs_failed_creating(dentry);
- return ERR_PTR(-ENOMEM);
+ if (unlikely(!inode))
+ goto fail;
+
+ if (size) {
+ ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
+ if (!ei->d_children)
+ goto fail;
}
- INIT_LIST_HEAD(&ei->e_top_files);
+ ei->dentry = dentry;
+ ei->entries = entries;
+ ei->nr_entries = size;
+ ei->data = data;
+ ei->name = kstrdup_const(name, GFP_KERNEL);
+ if (!ei->name)
+ goto fail;
+
+ INIT_LIST_HEAD(&ei->children);
+ INIT_LIST_HEAD(&ei->list);
ti = get_tracefs(inode);
ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE;
@@ -603,198 +903,97 @@ struct dentry *eventfs_create_events_dir(const char *name,
inode->i_op = &eventfs_root_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;
+ dentry->d_fsdata = ei;
+
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
d_instantiate(dentry, inode);
inc_nlink(dentry->d_parent->d_inode);
fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
- return tracefs_end_creating(dentry);
-}
-
-/**
- * eventfs_add_subsystem_dir - add eventfs subsystem_dir to list to create later
- * @name: the name of the file to create.
- * @parent: parent dentry for this dir.
- *
- * This function adds eventfs subsystem dir to list.
- * And all these dirs are created on the fly when they are looked up,
- * and the dentry and inodes will be removed when they are done.
- */
-struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
- struct dentry *parent)
-{
- struct tracefs_inode *ti_parent;
- struct eventfs_inode *ei_parent;
- struct eventfs_file *ef;
-
- if (security_locked_down(LOCKDOWN_TRACEFS))
- return NULL;
-
- if (!parent)
- return ERR_PTR(-EINVAL);
-
- ti_parent = get_tracefs(parent->d_inode);
- ei_parent = ti_parent->private;
+ tracefs_end_creating(dentry);
- ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL);
- if (IS_ERR(ef))
- return ef;
+ return ei;
- mutex_lock(&eventfs_mutex);
- list_add_tail(&ef->list, &ei_parent->e_top_files);
- ef->d_parent = parent;
- mutex_unlock(&eventfs_mutex);
- return ef;
+ fail:
+ kfree(ei->d_children);
+ kfree(ei);
+ fail_ei:
+ tracefs_failed_creating(dentry);
+ return ERR_PTR(-ENOMEM);
}
-/**
- * eventfs_add_dir - add eventfs dir to list to create later
- * @name: the name of the file to create.
- * @ef_parent: parent eventfs_file for this dir.
- *
- * This function adds eventfs dir to list.
- * And all these dirs are created on the fly when they are looked up,
- * and the dentry and inodes will be removed when they are done.
- */
-struct eventfs_file *eventfs_add_dir(const char *name,
- struct eventfs_file *ef_parent)
-{
- struct eventfs_file *ef;
-
- if (security_locked_down(LOCKDOWN_TRACEFS))
- return NULL;
-
- if (!ef_parent)
- return ERR_PTR(-EINVAL);
-
- ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL);
- if (IS_ERR(ef))
- return ef;
+static LLIST_HEAD(free_list);
- mutex_lock(&eventfs_mutex);
- list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
- ef->d_parent = ef_parent->dentry;
- mutex_unlock(&eventfs_mutex);
- return ef;
-}
-
-/**
- * eventfs_add_events_file - add the data needed to create a file for later reference
- * @name: the name of the file to create.
- * @mode: the permission that the file should have.
- * @parent: parent dentry for this file.
- * @data: something that the caller will want to get to later on.
- * @fop: struct file_operations that should be used for this file.
- *
- * This function is used to add the information needed to create a
- * dentry/inode within the top level events directory. The file created
- * will have the @mode permissions. The @data will be used to fill the
- * inode.i_private when the open() call is done. The dentry and inodes are
- * all created when they are referenced, and removed when they are no
- * longer referenced.
- */
-int eventfs_add_events_file(const char *name, umode_t mode,
- struct dentry *parent, void *data,
- const struct file_operations *fop)
+static void eventfs_workfn(struct work_struct *work)
{
- struct tracefs_inode *ti;
- struct eventfs_inode *ei;
- struct eventfs_file *ef;
-
- if (security_locked_down(LOCKDOWN_TRACEFS))
- return -ENODEV;
-
- if (!parent)
- return -EINVAL;
-
- if (!(mode & S_IFMT))
- mode |= S_IFREG;
-
- if (!parent->d_inode)
- return -EINVAL;
-
- ti = get_tracefs(parent->d_inode);
- if (!(ti->flags & TRACEFS_EVENT_INODE))
- return -EINVAL;
-
- ei = ti->private;
- ef = eventfs_prepare_ef(name, mode, fop, NULL, data);
-
- if (IS_ERR(ef))
- return -ENOMEM;
-
- mutex_lock(&eventfs_mutex);
- list_add_tail(&ef->list, &ei->e_top_files);
- ef->d_parent = parent;
- mutex_unlock(&eventfs_mutex);
- return 0;
+ struct eventfs_inode *ei, *tmp;
+ struct llist_node *llnode;
+
+ llnode = llist_del_all(&free_list);
+ llist_for_each_entry_safe(ei, tmp, llnode, llist) {
+ /* This dput() matches the dget() from unhook_dentry() */
+ for (int i = 0; i < ei->nr_entries; i++) {
+ if (ei->d_children[i])
+ dput(ei->d_children[i]);
+ }
+ /* This should only get here if it had a dentry */
+ if (!WARN_ON_ONCE(!ei->dentry))
+ dput(ei->dentry);
+ }
}
-/**
- * eventfs_add_file - add eventfs file to list to create later
- * @name: the name of the file to create.
- * @mode: the permission that the file should have.
- * @ef_parent: parent eventfs_file for this file.
- * @data: something that the caller will want to get to later on.
- * @fop: struct file_operations that should be used for this file.
- *
- * This function is used to add the information needed to create a
- * file within a subdirectory of the events directory. The file created
- * will have the @mode permissions. The @data will be used to fill the
- * inode.i_private when the open() call is done. The dentry and inodes are
- * all created when they are referenced, and removed when they are no
- * longer referenced.
- */
-int eventfs_add_file(const char *name, umode_t mode,
- struct eventfs_file *ef_parent,
- void *data,
- const struct file_operations *fop)
-{
- struct eventfs_file *ef;
-
- if (security_locked_down(LOCKDOWN_TRACEFS))
- return -ENODEV;
+static DECLARE_WORK(eventfs_work, eventfs_workfn);
- if (!ef_parent)
- return -EINVAL;
+static void free_rcu_ei(struct rcu_head *head)
+{
+ struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu);
- if (!(mode & S_IFMT))
- mode |= S_IFREG;
+ if (ei->dentry) {
+ /* Do not free the ei until all references of dentry are gone */
+ if (llist_add(&ei->llist, &free_list))
+ queue_work(system_unbound_wq, &eventfs_work);
+ return;
+ }
- ef = eventfs_prepare_ef(name, mode, fop, NULL, data);
- if (IS_ERR(ef))
- return -ENOMEM;
+ /* If the ei doesn't have a dentry, neither should its children */
+ for (int i = 0; i < ei->nr_entries; i++) {
+ WARN_ON_ONCE(ei->d_children[i]);
+ }
- mutex_lock(&eventfs_mutex);
- list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
- ef->d_parent = ef_parent->dentry;
- mutex_unlock(&eventfs_mutex);
- return 0;
+ free_ei(ei);
}
-static void free_ef(struct rcu_head *head)
+static void unhook_dentry(struct dentry *dentry)
{
- struct eventfs_file *ef = container_of(head, struct eventfs_file, rcu);
+ if (!dentry)
+ return;
+ /*
+ * Need to add a reference to the dentry that is expected by
+ * simple_recursive_removal(), which will include a dput().
+ */
+ dget(dentry);
- kfree(ef->name);
- kfree(ef->ei);
- kfree(ef);
+ /*
+ * Also add a reference for the dput() in eventfs_workfn().
+ * That is required as that dput() will free the ei after
+ * the SRCU grace period is over.
+ */
+ dget(dentry);
}
/**
* eventfs_remove_rec - remove eventfs dir or file from list
- * @ef: eventfs_file to be removed.
- * @head: to create list of eventfs_file to be deleted
- * @level: to check recursion depth
+ * @ei: eventfs_inode to be removed.
+ * @level: prevent recursion from going more than 3 levels deep.
*
- * The helper function eventfs_remove_rec() is used to clean up and free the
- * associated data from eventfs for both of the added functions.
+ * This function recursively removes eventfs_inodes which
+ * contains info of files and/or directories.
*/
-static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head, int level)
+static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
{
- struct eventfs_file *ef_child;
+ struct eventfs_inode *ei_child;
- if (!ef)
+ if (!ei)
return;
/*
* Check recursion depth. It should never be greater than 3:
@@ -806,100 +1005,76 @@ static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head,
if (WARN_ON_ONCE(level > 3))
return;
- if (ef->ei) {
- /* search for nested folders or files */
- list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
- lockdep_is_held(&eventfs_mutex)) {
- eventfs_remove_rec(ef_child, head, level + 1);
+ /* search for nested folders or files */
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
+ lockdep_is_held(&eventfs_mutex)) {
+ /* Children only have dentry if parent does */
+ WARN_ON_ONCE(ei_child->dentry && !ei->dentry);
+ eventfs_remove_rec(ei_child, level + 1);
+ }
+
+
+ ei->is_freed = 1;
+
+ for (int i = 0; i < ei->nr_entries; i++) {
+ if (ei->d_children[i]) {
+ /* Children only have dentry if parent does */
+ WARN_ON_ONCE(!ei->dentry);
+ unhook_dentry(ei->d_children[i]);
}
}
- list_del_rcu(&ef->list);
- list_add_tail(&ef->del_list, head);
+ unhook_dentry(ei->dentry);
+
+ list_del_rcu(&ei->list);
+ call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei);
}
/**
- * eventfs_remove - remove eventfs dir or file from list
- * @ef: eventfs_file to be removed.
+ * eventfs_remove_dir - remove eventfs dir or file from list
+ * @ei: eventfs_inode to be removed.
*
* This function acquire the eventfs_mutex lock and call eventfs_remove_rec()
*/
-void eventfs_remove(struct eventfs_file *ef)
+void eventfs_remove_dir(struct eventfs_inode *ei)
{
- struct eventfs_file *tmp;
- LIST_HEAD(ef_del_list);
- struct dentry *dentry_list = NULL;
struct dentry *dentry;
- if (!ef)
+ if (!ei)
return;
mutex_lock(&eventfs_mutex);
- eventfs_remove_rec(ef, &ef_del_list, 0);
- list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
- if (ef->dentry) {
- unsigned long ptr = (unsigned long)dentry_list;
-
- /* Keep the dentry from being freed yet */
- dget(ef->dentry);
-
- /*
- * Paranoid: The dget() above should prevent the dentry
- * from being freed and calling eventfs_set_ef_status_free().
- * But just in case, set the link list LSB pointer to 1
- * and have eventfs_set_ef_status_free() check that to
- * make sure that if it does happen, it will not think
- * the d_fsdata is an event_file.
- *
- * For this to work, no event_file should be allocated
- * on a odd space, as the ef should always be allocated
- * to be at least word aligned. Check for that too.
- */
- WARN_ON_ONCE(ptr & 1);
-
- ef->dentry->d_fsdata = (void *)(ptr | 1);
- dentry_list = ef->dentry;
- ef->dentry = NULL;
- }
- call_srcu(&eventfs_srcu, &ef->rcu, free_ef);
- }
+ dentry = ei->dentry;
+ eventfs_remove_rec(ei, 0);
mutex_unlock(&eventfs_mutex);
- while (dentry_list) {
- unsigned long ptr;
-
- dentry = dentry_list;
- ptr = (unsigned long)dentry->d_fsdata & ~1UL;
- dentry_list = (struct dentry *)ptr;
- dentry->d_fsdata = NULL;
- d_invalidate(dentry);
- mutex_lock(&eventfs_mutex);
- /* dentry should now have at least a single reference */
- WARN_ONCE((int)d_count(dentry) < 1,
- "dentry %p less than one reference (%d) after invalidate\n",
- dentry, d_count(dentry));
- mutex_unlock(&eventfs_mutex);
- dput(dentry);
- }
+ /*
+ * If any of the ei children has a dentry, then the ei itself
+ * must have a dentry.
+ */
+ if (dentry)
+ simple_recursive_removal(dentry, NULL);
}
/**
- * eventfs_remove_events_dir - remove eventfs dir or file from list
- * @dentry: events's dentry to be removed.
+ * eventfs_remove_events_dir - remove the top level eventfs directory
+ * @ei: the event_inode returned by eventfs_create_events_dir().
*
- * This function remove events main directory
+ * This function removes the events main directory
*/
-void eventfs_remove_events_dir(struct dentry *dentry)
+void eventfs_remove_events_dir(struct eventfs_inode *ei)
{
- struct tracefs_inode *ti;
-
- if (!dentry || !dentry->d_inode)
- return;
+ struct dentry *dentry;
- ti = get_tracefs(dentry->d_inode);
- if (!ti || !(ti->flags & TRACEFS_EVENT_INODE))
- return;
+ dentry = ei->dentry;
+ eventfs_remove_dir(ei);
- d_invalidate(dentry);
+ /*
+ * Matches the dget() done by tracefs_start_creating()
+ * in eventfs_create_events_dir() when it the dentry was
+ * created. In other words, it's a normal dentry that
+ * sticks around while the other ei->dentry are created
+ * and destroyed dynamically.
+ */
dput(dentry);
}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 429603d865a9..5b54948514fe 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -385,7 +385,7 @@ static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
ti = get_tracefs(inode);
if (ti && ti->flags & TRACEFS_EVENT_INODE)
- eventfs_set_ef_status_free(ti, dentry);
+ eventfs_set_ei_status_free(ti, dentry);
iput(inode);
}
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 4f2e49e2197b..ccee18ca66c7 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -13,6 +13,58 @@ struct tracefs_inode {
struct inode vfs_inode;
};
+/*
+ * struct eventfs_attr - cache the mode and ownership of a eventfs entry
+ * @mode: saved mode plus flags of what is saved
+ * @uid: saved uid if changed
+ * @gid: saved gid if changed
+ */
+struct eventfs_attr {
+ int mode;
+ kuid_t uid;
+ kgid_t gid;
+};
+
+/*
+ * struct eventfs_inode - hold the properties of the eventfs directories.
+ * @list: link list into the parent directory
+ * @entries: the array of entries representing the files in the directory
+ * @name: the name of the directory to create
+ * @children: link list into the child eventfs_inode
+ * @dentry: the dentry of the directory
+ * @d_parent: pointer to the parent's dentry
+ * @d_children: The array of dentries to represent the files when created
+ * @entry_attrs: Saved mode and ownership of the @d_children
+ * @attr: Saved mode and ownership of eventfs_inode itself
+ * @data: The private data to pass to the callbacks
+ * @is_freed: Flag set if the eventfs is on its way to be freed
+ * Note if is_freed is set, then dentry is corrupted.
+ * @nr_entries: The number of items in @entries
+ */
+struct eventfs_inode {
+ struct list_head list;
+ const struct eventfs_entry *entries;
+ const char *name;
+ struct list_head children;
+ struct dentry *dentry; /* Check is_freed to access */
+ struct dentry *d_parent;
+ struct dentry **d_children;
+ struct eventfs_attr *entry_attrs;
+ struct eventfs_attr attr;
+ void *data;
+ /*
+ * Union - used for deletion
+ * @llist: for calling dput() if needed after RCU
+ * @rcu: eventfs_inode to delete in RCU
+ */
+ union {
+ struct llist_node llist;
+ struct rcu_head rcu;
+ };
+ unsigned int is_freed:1;
+ unsigned int nr_entries:31;
+};
+
static inline struct tracefs_inode *get_tracefs(const struct inode *inode)
{
return container_of(inode, struct tracefs_inode, vfs_inode);
@@ -25,6 +77,6 @@ struct inode *tracefs_get_inode(struct super_block *sb);
struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
struct dentry *eventfs_failed_creating(struct dentry *dentry);
struct dentry *eventfs_end_creating(struct dentry *dentry);
-void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry);
+void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry);
#endif /* _TRACEFS_INTERNAL_H */
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index e564d5ff8781..0d561ecb6869 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -9,10 +9,9 @@
* This file implements various helper functions for UBIFS authentication support
*/
-#include <linux/crypto.h>
#include <linux/verification.h>
#include <crypto/hash.h>
-#include <crypto/algapi.h>
+#include <crypto/utils.h>
#include <keys/user-type.h>
#include <keys/asymmetric-type.h>
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 7af442de44c3..3b13c648d490 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -725,7 +725,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
struct inode *inode = d_inode(old_dentry);
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_inode *dir_ui = ubifs_inode(dir);
- int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ int err, sz_change;
struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
.dirtied_ino_d = ALIGN(ui->data_len, 8) };
struct fscrypt_name nm;
@@ -749,6 +749,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
if (err)
return err;
+ sz_change = CALC_DENT_SIZE(fname_len(&nm));
+
err = dbg_check_synced_i_size(c, inode);
if (err)
goto out_fname;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2e65fd2dbdc3..2d2b39f843ce 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1375,6 +1375,9 @@ static inline int mctime_update_needed(const struct inode *inode,
/**
* ubifs_update_time - update time of inode.
* @inode: inode to update
+ * @time: timespec structure to hold the current time value
+ * @flags: time updating control flag determines updating
+ * which time fields of @inode
*
* This function updates time of the inode.
*/
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index d69d2154645b..f0a5538c84b0 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -1607,6 +1607,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
ubifs_err(c, "bad data node (block %u, inode %lu)",
blk, inode->i_ino);
ubifs_dump_node(c, dn, dn_size);
+ err = -EUCLEAN;
goto out_free;
}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 4211e4456b1e..c59d47fe7939 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -23,7 +23,6 @@
#include "ubifs.h"
#include <linux/list_sort.h>
#include <crypto/hash.h>
-#include <crypto/algapi.h>
/**
* struct replay_entry - replay list entry.
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 366941d4a18a..09e270d6ed02 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -54,11 +54,7 @@ module_param_cb(default_version, &ubifs_default_version_ops, &ubifs_default_vers
static struct kmem_cache *ubifs_inode_slab;
/* UBIFS TNC shrinker description */
-static struct shrinker ubifs_shrinker_info = {
- .scan_objects = ubifs_shrink_scan,
- .count_objects = ubifs_shrink_count,
- .seeks = DEFAULT_SEEKS,
-};
+static struct shrinker *ubifs_shrinker_info;
/**
* validate_inode - validate inode.
@@ -923,8 +919,10 @@ static void free_buds(struct ubifs_info *c)
{
struct ubifs_bud *bud, *n;
- rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb)
+ rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) {
+ kfree(bud->log_hash);
kfree(bud);
+ }
}
/**
@@ -1193,6 +1191,7 @@ static void destroy_journal(struct ubifs_info *c)
bud = list_entry(c->old_buds.next, struct ubifs_bud, list);
list_del(&bud->list);
+ kfree(bud->log_hash);
kfree(bud);
}
ubifs_destroy_idx_gc(c);
@@ -2373,7 +2372,7 @@ static void inode_slab_ctor(void *obj)
static int __init ubifs_init(void)
{
- int err;
+ int err = -ENOMEM;
BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24);
@@ -2439,10 +2438,15 @@ static int __init ubifs_init(void)
if (!ubifs_inode_slab)
return -ENOMEM;
- err = register_shrinker(&ubifs_shrinker_info, "ubifs-slab");
- if (err)
+ ubifs_shrinker_info = shrinker_alloc(0, "ubifs-slab");
+ if (!ubifs_shrinker_info)
goto out_slab;
+ ubifs_shrinker_info->count_objects = ubifs_shrink_count;
+ ubifs_shrinker_info->scan_objects = ubifs_shrink_scan;
+
+ shrinker_register(ubifs_shrinker_info);
+
err = ubifs_compressors_init();
if (err)
goto out_shrinker;
@@ -2467,7 +2471,7 @@ out_dbg:
dbg_debugfs_exit();
ubifs_compressors_exit();
out_shrinker:
- unregister_shrinker(&ubifs_shrinker_info);
+ shrinker_free(ubifs_shrinker_info);
out_slab:
kmem_cache_destroy(ubifs_inode_slab);
return err;
@@ -2483,7 +2487,7 @@ static void __exit ubifs_exit(void)
dbg_debugfs_exit();
ubifs_sysfs_exit();
ubifs_compressors_exit();
- unregister_shrinker(&ubifs_shrinker_info);
+ shrinker_free(ubifs_shrinker_info);
/*
* Make sure all delayed rcu free inodes are flushed before we
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 6b7d95b65f4b..f4728e65d1bd 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -65,6 +65,7 @@ static void do_insert_old_idx(struct ubifs_info *c,
else {
ubifs_err(c, "old idx added twice!");
kfree(old_idx);
+ return;
}
}
rb_link_node(&old_idx->rb, parent, p);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 62633816d7d0..3916dc4f30ca 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -31,7 +31,7 @@
#include <linux/completion.h>
#include <crypto/hash_info.h>
#include <crypto/hash.h>
-#include <crypto/algapi.h>
+#include <crypto/utils.h>
#include <linux/fscrypt.h>
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 2436e3f82147..53c11be2b2c1 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -240,6 +240,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
unsigned int count, sector_t oldb,
sector_t newb, struct page *locked_page)
{
+ struct folio *folio, *locked_folio = page_folio(locked_page);
const unsigned blks_per_page =
1 << (PAGE_SHIFT - inode->i_blkbits);
const unsigned mask = blks_per_page - 1;
@@ -247,42 +248,39 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
pgoff_t index, cur_index, last_index;
unsigned pos, j, lblock;
sector_t end, i;
- struct page *page;
struct buffer_head *head, *bh;
UFSD("ENTER, ino %lu, count %u, oldb %llu, newb %llu\n",
inode->i_ino, count,
(unsigned long long)oldb, (unsigned long long)newb);
- BUG_ON(!locked_page);
- BUG_ON(!PageLocked(locked_page));
+ BUG_ON(!folio_test_locked(locked_folio));
- cur_index = locked_page->index;
+ cur_index = locked_folio->index;
end = count + beg;
last_index = end >> (PAGE_SHIFT - inode->i_blkbits);
for (i = beg; i < end; i = (i | mask) + 1) {
index = i >> (PAGE_SHIFT - inode->i_blkbits);
if (likely(cur_index != index)) {
- page = ufs_get_locked_page(mapping, index);
- if (!page)/* it was truncated */
+ folio = ufs_get_locked_folio(mapping, index);
+ if (!folio) /* it was truncated */
continue;
- if (IS_ERR(page)) {/* or EIO */
+ if (IS_ERR(folio)) {/* or EIO */
ufs_error(inode->i_sb, __func__,
"read of page %llu failed\n",
(unsigned long long)index);
continue;
}
} else
- page = locked_page;
+ folio = locked_folio;
- head = page_buffers(page);
+ head = folio_buffers(folio);
bh = head;
pos = i & mask;
for (j = 0; j < pos; ++j)
bh = bh->b_this_page;
-
if (unlikely(index == last_index))
lblock = end & mask;
else
@@ -313,7 +311,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
} while (bh != head);
if (likely(cur_index != index))
- ufs_put_locked_page(page);
+ ufs_put_locked_folio(folio);
}
UFSD("EXIT\n");
}
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 338e4b97312f..ebce93b08281 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -1063,7 +1063,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
unsigned i, end;
sector_t lastfrag;
- struct page *lastpage;
+ struct folio *folio;
struct buffer_head *bh;
u64 phys64;
@@ -1074,18 +1074,17 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
lastfrag--;
- lastpage = ufs_get_locked_page(mapping, lastfrag >>
+ folio = ufs_get_locked_folio(mapping, lastfrag >>
(PAGE_SHIFT - inode->i_blkbits));
- if (IS_ERR(lastpage)) {
- err = -EIO;
- goto out;
- }
-
- end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1);
- bh = page_buffers(lastpage);
- for (i = 0; i < end; ++i)
- bh = bh->b_this_page;
+ if (IS_ERR(folio)) {
+ err = -EIO;
+ goto out;
+ }
+ end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1);
+ bh = folio_buffers(folio);
+ for (i = 0; i < end; ++i)
+ bh = bh->b_this_page;
err = ufs_getfrag_block(inode, lastfrag, bh, 1);
@@ -1101,7 +1100,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
*/
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
- set_page_dirty(lastpage);
+ folio_mark_dirty(folio);
}
if (lastfrag >= UFS_IND_FRAGMENT) {
@@ -1119,7 +1118,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
}
}
out_unlock:
- ufs_put_locked_page(lastpage);
+ ufs_put_locked_folio(folio);
out:
return err;
}
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 23377c1baed9..a480810cd4e3 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -137,6 +137,7 @@ static struct dentry *ufs_get_parent(struct dentry *child)
}
static const struct export_operations ufs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = ufs_fh_to_dentry,
.fh_to_parent = ufs_fh_to_parent,
.get_parent = ufs_get_parent,
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 08ddf41eaaad..13ba34e6d64f 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -230,42 +230,40 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev
}
/**
- * ufs_get_locked_page() - locate, pin and lock a pagecache page, if not exist
+ * ufs_get_locked_folio() - locate, pin and lock a pagecache folio, if not exist
* read it from disk.
* @mapping: the address_space to search
* @index: the page index
*
- * Locates the desired pagecache page, if not exist we'll read it,
+ * Locates the desired pagecache folio, if not exist we'll read it,
* locks it, increments its reference
* count and returns its address.
*
*/
-
-struct page *ufs_get_locked_page(struct address_space *mapping,
+struct folio *ufs_get_locked_folio(struct address_space *mapping,
pgoff_t index)
{
struct inode *inode = mapping->host;
- struct page *page = find_lock_page(mapping, index);
- if (!page) {
- page = read_mapping_page(mapping, index, NULL);
+ struct folio *folio = filemap_lock_folio(mapping, index);
+ if (!folio) {
+ folio = read_mapping_folio(mapping, index, NULL);
- if (IS_ERR(page)) {
- printk(KERN_ERR "ufs_change_blocknr: "
- "read_mapping_page error: ino %lu, index: %lu\n",
+ if (IS_ERR(folio)) {
+ printk(KERN_ERR "ufs_change_blocknr: read_mapping_folio error: ino %lu, index: %lu\n",
mapping->host->i_ino, index);
- return page;
+ return folio;
}
- lock_page(page);
+ folio_lock(folio);
- if (unlikely(page->mapping == NULL)) {
+ if (unlikely(folio->mapping == NULL)) {
/* Truncate got there first */
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return NULL;
}
}
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << inode->i_blkbits, 0);
- return page;
+ if (!folio_buffers(folio))
+ create_empty_buffers(folio, 1 << inode->i_blkbits, 0);
+ return folio;
}
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 89247193d96d..0ecd2ed792f5 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -273,15 +273,13 @@ extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struc
extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned);
/* This functions works with cache pages*/
-extern struct page *ufs_get_locked_page(struct address_space *mapping,
- pgoff_t index);
-static inline void ufs_put_locked_page(struct page *page)
+struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index);
+static inline void ufs_put_locked_folio(struct folio *folio)
{
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
-
/*
* macros and inline function to get important structures from ufs_sb_private_info
*/
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 56eaae9dac1a..e8af40b05549 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -49,7 +49,7 @@ static struct ctl_table vm_userfaultfd_table[] = {
};
#endif
-static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
+static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
/*
* Start with fault_pending_wqh and fault_wqh so they're more likely
@@ -123,6 +123,11 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
return ctx->features & UFFD_FEATURE_INITIALIZED;
}
+static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
+{
+ return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
+}
+
/*
* Whether WP_UNPOPULATED is enabled on the uffd context. It is only
* meaningful when userfaultfd_wp()==true on the vma and when it's
@@ -922,20 +927,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
continue;
}
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end,
- new_flags, vma->anon_vma,
- vma->vm_file, vma->vm_pgoff,
- vma_policy(vma),
- NULL_VM_UFFD_CTX, anon_vma_name(vma));
- if (prev) {
- vma = prev;
- } else {
- prev = vma;
- }
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
+ vma->vm_end, new_flags,
+ NULL_VM_UFFD_CTX);
vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+ prev = vma;
}
mmap_write_unlock(mm);
mmput(mm);
@@ -1325,7 +1325,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
bool basic_ioctls;
unsigned long start, end, vma_end;
struct vma_iterator vmi;
- pgoff_t pgoff;
+ bool wp_async = userfaultfd_wp_async_ctx(ctx);
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1399,7 +1399,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
/* check not compatible vmas */
ret = -EINVAL;
- if (!vma_can_userfault(cur, vm_flags))
+ if (!vma_can_userfault(cur, vm_flags, wp_async))
goto out_unlock;
/*
@@ -1460,7 +1460,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vm_flags));
+ BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
vma->vm_userfaultfd_ctx.ctx != ctx);
WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@ -1478,28 +1478,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vma_end = min(end, vma->vm_end);
new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- vma_policy(vma),
- ((struct vm_userfaultfd_ctx){ ctx }),
- anon_vma_name(vma));
- if (prev) {
- /* vma_merge() invalidated the mas */
- vma = prev;
- goto next;
- }
- if (vma->vm_start < start) {
- ret = split_vma(&vmi, vma, start, 1);
- if (ret)
- break;
- }
- if (vma->vm_end > end) {
- ret = split_vma(&vmi, vma, end, 0);
- if (ret)
- break;
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ new_flags,
+ (struct vm_userfaultfd_ctx){ctx});
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ break;
}
- next:
+
/*
* In the vma_merge() successful mprotect-like case 8:
* the next vma was merged into the current one and
@@ -1561,7 +1547,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
struct vma_iterator vmi;
- pgoff_t pgoff;
+ bool wp_async = userfaultfd_wp_async_ctx(ctx);
ret = -EFAULT;
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
@@ -1615,7 +1601,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* provides for more strict behavior to notice
* unregistration errors.
*/
- if (!vma_can_userfault(cur, cur->vm_flags))
+ if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
goto out_unlock;
found = true;
@@ -1631,7 +1617,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
+ BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
/*
* Nothing to do: this vma is already registered into this
@@ -1664,26 +1650,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
uffd_wp_range(vma, start, vma_end - start, false);
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- vma_policy(vma),
- NULL_VM_UFFD_CTX, anon_vma_name(vma));
- if (prev) {
- vma = prev;
- goto next;
- }
- if (vma->vm_start < start) {
- ret = split_vma(&vmi, vma, start, 1);
- if (ret)
- break;
- }
- if (vma->vm_end > end) {
- ret = split_vma(&vmi, vma, end, 0);
- if (ret)
- break;
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ new_flags, NULL_VM_UFFD_CTX);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ break;
}
- next:
+
/*
* In the vma_merge() successful mprotect-like case 8:
* the next vma was merged into the current one and
@@ -2018,6 +1991,11 @@ out:
return ret;
}
+bool userfaultfd_wp_async(struct vm_area_struct *vma)
+{
+ return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
+}
+
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
@@ -2051,6 +2029,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = -EPERM;
if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
goto err_out;
+
+ /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
+ if (features & UFFD_FEATURE_WP_ASYNC)
+ features |= UFFD_FEATURE_WP_UNPOPULATED;
+
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
@@ -2063,6 +2046,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
#ifndef CONFIG_PTE_MARKER_UFFD_WP
uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index ed0bc8cbc703..567fb37274d3 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -147,7 +147,7 @@ config XFS_ONLINE_SCRUB_STATS
bool "XFS online metadata check usage data collection"
default y
depends on XFS_ONLINE_SCRUB
- select XFS_DEBUG
+ select DEBUG_FS
help
If you say Y here, the kernel will gather usage data about
the online metadata check subsystem. This includes the number
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 3069194527dd..100ab5931b31 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2275,16 +2275,37 @@ xfs_alloc_min_freelist(
ASSERT(mp->m_alloc_maxlevels > 0);
+ /*
+ * For a btree shorter than the maximum height, the worst case is that
+ * every level gets split and a new level is added, then while inserting
+ * another entry to refill the AGFL, every level under the old root gets
+ * split again. This is:
+ *
+ * (full height split reservation) + (AGFL refill split height)
+ * = (current height + 1) + (current height - 1)
+ * = (new height) + (new height - 2)
+ * = 2 * new height - 2
+ *
+ * For a btree of maximum height, the worst case is that every level
+ * under the root gets split, then while inserting another entry to
+ * refill the AGFL, every level under the root gets split again. This is
+ * also:
+ *
+ * 2 * (current height - 1)
+ * = 2 * (new height - 1)
+ * = 2 * new height - 2
+ */
+
/* space needed by-bno freespace btree */
min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
- mp->m_alloc_maxlevels);
+ mp->m_alloc_maxlevels) * 2 - 2;
/* space needed by-size freespace btree */
min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
- mp->m_alloc_maxlevels);
+ mp->m_alloc_maxlevels) * 2 - 2;
/* space needed reverse mapping used space btree */
if (xfs_has_rmapbt(mp))
min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
- mp->m_rmap_maxlevels);
+ mp->m_rmap_maxlevels) * 2 - 2;
return min_free;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 30c931b38853..be62acffad6c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -21,7 +21,7 @@
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
@@ -2989,7 +2989,7 @@ xfs_bmap_extsize_align(
* If realtime, and the result isn't a multiple of the realtime
* extent size we need to remove blocks until it is.
*/
- if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) {
+ if (rt && (temp = xfs_extlen_to_rtxmod(mp, align_alen))) {
/*
* We're not covering the original request, or
* we won't be able to once we fix the length.
@@ -3016,7 +3016,7 @@ xfs_bmap_extsize_align(
else {
align_alen -= orig_off - align_off;
align_off = orig_off;
- align_alen -= align_alen % mp->m_sb.sb_rextsize;
+ align_alen -= xfs_extlen_to_rtxmod(mp, align_alen);
}
/*
* Result doesn't cover the request, fail it.
@@ -4826,12 +4826,8 @@ xfs_bmap_del_extent_delay(
ASSERT(got->br_startoff <= del->br_startoff);
ASSERT(got_endoff >= del_endoff);
- if (isrt) {
- uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
-
- do_div(rtexts, mp->m_sb.sb_rextsize);
- xfs_mod_frextents(mp, rtexts);
- }
+ if (isrt)
+ xfs_mod_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount));
/*
* Update the inode delalloc counter now and wait to update the
@@ -5057,33 +5053,20 @@ xfs_bmap_del_extent_real(
flags = XFS_ILOG_CORE;
if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
- xfs_filblks_t len;
- xfs_extlen_t mod;
-
- len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize,
- &mod);
- ASSERT(mod == 0);
-
if (!(bflags & XFS_BMAPI_REMAP)) {
- xfs_fsblock_t bno;
-
- bno = div_u64_rem(del->br_startblock,
- mp->m_sb.sb_rextsize, &mod);
- ASSERT(mod == 0);
-
- error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+ error = xfs_rtfree_blocks(tp, del->br_startblock,
+ del->br_blockcount);
if (error)
goto done;
}
do_fx = 0;
- nblks = len * mp->m_sb.sb_rextsize;
qfield = XFS_TRANS_DQ_RTBCOUNT;
} else {
do_fx = 1;
- nblks = del->br_blockcount;
qfield = XFS_TRANS_DQ_BCOUNT;
}
+ nblks = del->br_blockcount;
del_endblock = del->br_startblock + del->br_blockcount;
if (cur) {
@@ -5289,7 +5272,6 @@ __xfs_bunmapi(
int tmp_logflags; /* partial logging flags */
int wasdel; /* was a delayed alloc extent */
int whichfork; /* data or attribute fork */
- xfs_fsblock_t sum;
xfs_filblks_t len = *rlen; /* length to unmap in file */
xfs_fileoff_t end;
struct xfs_iext_cursor icur;
@@ -5384,8 +5366,8 @@ __xfs_bunmapi(
if (!isrt)
goto delete;
- sum = del.br_startblock + del.br_blockcount;
- div_u64_rem(sum, mp->m_sb.sb_rextsize, &mod);
+ mod = xfs_rtb_to_rtxoff(mp,
+ del.br_startblock + del.br_blockcount);
if (mod) {
/*
* Realtime extent not lined up at the end.
@@ -5432,7 +5414,8 @@ __xfs_bunmapi(
goto error0;
goto nodelete;
}
- div_u64_rem(del.br_startblock, mp->m_sb.sb_rextsize, &mod);
+
+ mod = xfs_rtb_to_rtxoff(mp, del.br_startblock);
if (mod) {
xfs_extlen_t off = mp->m_sb.sb_rextsize - mod;
@@ -6209,8 +6192,8 @@ xfs_bmap_validate_extent(
return __this_address;
if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) {
- if (!xfs_verify_rtext(mp, irec->br_startblock,
- irec->br_blockcount))
+ if (!xfs_verify_rtbext(mp, irec->br_startblock,
+ irec->br_blockcount))
return __this_address;
} else {
if (!xfs_verify_fsbext(mp, irec->br_startblock,
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index bcfb6a4203cd..f71679ce23b9 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -245,21 +245,18 @@ xfs_defer_create_intents(
return ret;
}
-/* Abort all the intents that were committed. */
STATIC void
-xfs_defer_trans_abort(
- struct xfs_trans *tp,
- struct list_head *dop_pending)
+xfs_defer_pending_abort(
+ struct xfs_mount *mp,
+ struct list_head *dop_list)
{
struct xfs_defer_pending *dfp;
const struct xfs_defer_op_type *ops;
- trace_xfs_defer_trans_abort(tp, _RET_IP_);
-
/* Abort intent items that don't have a done item. */
- list_for_each_entry(dfp, dop_pending, dfp_list) {
+ list_for_each_entry(dfp, dop_list, dfp_list) {
ops = defer_op_types[dfp->dfp_type];
- trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
+ trace_xfs_defer_pending_abort(mp, dfp);
if (dfp->dfp_intent && !dfp->dfp_done) {
ops->abort_intent(dfp->dfp_intent);
dfp->dfp_intent = NULL;
@@ -267,6 +264,16 @@ xfs_defer_trans_abort(
}
}
+/* Abort all the intents that were committed. */
+STATIC void
+xfs_defer_trans_abort(
+ struct xfs_trans *tp,
+ struct list_head *dop_pending)
+{
+ trace_xfs_defer_trans_abort(tp, _RET_IP_);
+ xfs_defer_pending_abort(tp->t_mountp, dop_pending);
+}
+
/*
* Capture resources that the caller said not to release ("held") when the
* transaction commits. Caller is responsible for zero-initializing @dres.
@@ -756,12 +763,13 @@ xfs_defer_ops_capture(
/* Release all resources that we used to capture deferred ops. */
void
-xfs_defer_ops_capture_free(
+xfs_defer_ops_capture_abort(
struct xfs_mount *mp,
struct xfs_defer_capture *dfc)
{
unsigned short i;
+ xfs_defer_pending_abort(mp, &dfc->dfc_dfops);
xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
@@ -802,7 +810,7 @@ xfs_defer_ops_capture_and_commit(
/* Commit the transaction and add the capture structure to the list. */
error = xfs_trans_commit(tp);
if (error) {
- xfs_defer_ops_capture_free(mp, dfc);
+ xfs_defer_ops_capture_abort(mp, dfc);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 114a3a4930a3..8788ad5f6a73 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -121,7 +121,7 @@ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
struct list_head *capture_list);
void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
struct xfs_defer_resources *dres);
-void xfs_defer_ops_capture_free(struct xfs_mount *mp,
+void xfs_defer_ops_capture_abort(struct xfs_mount *mp,
struct xfs_defer_capture *d);
void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 371dc07233e0..9a88aba1589f 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -98,7 +98,7 @@ typedef struct xfs_sb {
uint32_t sb_blocksize; /* logical block size, bytes */
xfs_rfsblock_t sb_dblocks; /* number of data blocks */
xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
- xfs_rtblock_t sb_rextents; /* number of realtime extents */
+ xfs_rtbxlen_t sb_rextents; /* number of realtime extents */
uuid_t sb_uuid; /* user-visible file system unique id */
xfs_fsblock_t sb_logstart; /* starting block of log if internal */
xfs_ino_t sb_rootino; /* root inode number */
@@ -691,6 +691,22 @@ struct xfs_agfl {
xfs_daddr_to_agno(mp, (d) + (len) - 1)))
/*
+ * Realtime bitmap information is accessed by the word, which is currently
+ * stored in host-endian format.
+ */
+union xfs_rtword_raw {
+ __u32 old;
+};
+
+/*
+ * Realtime summary counts are accessed by the word, which is currently
+ * stored in host-endian format.
+ */
+union xfs_suminfo_raw {
+ __u32 old;
+};
+
+/*
* XFS Timestamps
* ==============
*
@@ -1142,24 +1158,10 @@ static inline bool xfs_dinode_has_large_extent_counts(
#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
-#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
-#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
/*
- * RT Summary and bit manipulation macros.
+ * RT bit manipulation macros.
*/
-#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
-#define XFS_SUMOFFSTOBLOCK(mp,s) \
- (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
-#define XFS_SUMPTR(mp,bp,so) \
- ((xfs_suminfo_t *)((bp)->b_addr + \
- (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
-
-#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
-#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
-#define XFS_BITTOWORD(mp,bi) \
- ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
-
#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 543f3748c2a3..137a65bda95d 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -510,6 +510,9 @@ xfs_dinode_verify(
if (mode && nextents + naextents > nblocks)
return __this_address;
+ if (nextents + naextents == 0 && nblocks != 0)
+ return __this_address;
+
if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 396648acb5be..c269d704314d 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -16,6 +16,7 @@
#include "xfs_trans.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
+#include "xfs_rtbitmap.h"
/*
* Realtime allocator bitmap functions shared with userspace.
@@ -46,25 +47,69 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
.verify_write = xfs_rtbuf_verify_write,
};
+/* Release cached rt bitmap and summary buffers. */
+void
+xfs_rtbuf_cache_relse(
+ struct xfs_rtalloc_args *args)
+{
+ if (args->rbmbp) {
+ xfs_trans_brelse(args->tp, args->rbmbp);
+ args->rbmbp = NULL;
+ args->rbmoff = NULLFILEOFF;
+ }
+ if (args->sumbp) {
+ xfs_trans_brelse(args->tp, args->sumbp);
+ args->sumbp = NULL;
+ args->sumoff = NULLFILEOFF;
+ }
+}
+
/*
* Get a buffer for the bitmap or summary file block specified.
* The buffer is returned read and locked.
*/
int
xfs_rtbuf_get(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t block, /* block number in bitmap or summary */
- int issum, /* is summary not bitmap */
- struct xfs_buf **bpp) /* output: buffer for the block */
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t block, /* block number in bitmap or summary */
+ int issum) /* is summary not bitmap */
{
- struct xfs_buf *bp; /* block buffer, result */
- xfs_inode_t *ip; /* bitmap or summary inode */
- xfs_bmbt_irec_t map;
- int nmap = 1;
- int error; /* error value */
+ struct xfs_mount *mp = args->mp;
+ struct xfs_buf **cbpp; /* cached block buffer */
+ xfs_fileoff_t *coffp; /* cached block number */
+ struct xfs_buf *bp; /* block buffer, result */
+ struct xfs_inode *ip; /* bitmap or summary inode */
+ struct xfs_bmbt_irec map;
+ enum xfs_blft type;
+ int nmap = 1;
+ int error;
- ip = issum ? mp->m_rsumip : mp->m_rbmip;
+ if (issum) {
+ cbpp = &args->sumbp;
+ coffp = &args->sumoff;
+ ip = mp->m_rsumip;
+ type = XFS_BLFT_RTSUMMARY_BUF;
+ } else {
+ cbpp = &args->rbmbp;
+ coffp = &args->rbmoff;
+ ip = mp->m_rbmip;
+ type = XFS_BLFT_RTBITMAP_BUF;
+ }
+
+ /*
+ * If we have a cached buffer, and the block number matches, use that.
+ */
+ if (*cbpp && *coffp == block)
+ return 0;
+
+ /*
+ * Otherwise we have to have to get the buffer. If there was an old
+ * one, get rid of it first.
+ */
+ if (*cbpp) {
+ xfs_trans_brelse(args->tp, *cbpp);
+ *cbpp = NULL;
+ }
error = xfs_bmapi_read(ip, block, 1, &map, &nmap, 0);
if (error)
@@ -74,15 +119,15 @@ xfs_rtbuf_get(
return -EFSCORRUPTED;
ASSERT(map.br_startblock != NULLFSBLOCK);
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map.br_startblock),
mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
if (error)
return error;
- xfs_trans_buf_set_type(tp, bp, issum ? XFS_BLFT_RTSUMMARY_BUF
- : XFS_BLFT_RTBITMAP_BUF);
- *bpp = bp;
+ xfs_trans_buf_set_type(args->tp, bp, type);
+ *cbpp = bp;
+ *coffp = block;
return 0;
}
@@ -92,47 +137,44 @@ xfs_rtbuf_get(
*/
int
xfs_rtfind_back(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to look at */
- xfs_rtblock_t limit, /* last block to look at */
- xfs_rtblock_t *rtblock) /* out: start block found */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to look at */
+ xfs_rtxnum_t limit, /* last rtext to look at */
+ xfs_rtxnum_t *rtx) /* out: start rtext found */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t firstbit; /* first useful bit in the word */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t len; /* length of inspected area */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t want; /* mask for "good" values */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error; /* error value */
+ xfs_rtxnum_t firstbit; /* first useful bit in the word */
+ xfs_rtxnum_t i; /* current bit number rel. to start */
+ xfs_rtxnum_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ xfs_rtword_t incore;
+ unsigned int word; /* word number in the buffer */
/*
* Compute and read in starting bitmap block for starting block.
*/
- block = XFS_BITTOBLOCK(mp, start);
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ block = xfs_rtx_to_rbmblock(mp, start);
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Get the first word's index & point to it.
*/
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
+ word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
len = start - limit + 1;
/*
* Compute match value, based on the bit at start: if 1 (free)
* then all-ones, else all-zeroes.
*/
- want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ incore = xfs_rtbitmap_getword(args, word);
+ want = (incore & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
/*
* If the starting position is not word-aligned, deal with the
* partial word.
@@ -149,13 +191,12 @@ xfs_rtfind_back(
* Calculate the difference between the value there
* and what we're looking for.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different. Mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i = bit - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
+ *rtx = start - i + 1;
return 0;
}
i = bit - firstbit + 1;
@@ -167,19 +208,11 @@ xfs_rtfind_back(
/*
* If done with this block, get the previous one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, --block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
- word = XFS_BLOCKWMASK(mp);
- b = &bufp[word];
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b--;
+
+ word = mp->m_blockwsize - 1;
}
} else {
/*
@@ -195,13 +228,13 @@ xfs_rtfind_back(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = *b ^ want)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = incore ^ want)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
+ *rtx = start - i + 1;
return 0;
}
i += XFS_NBWORD;
@@ -213,19 +246,11 @@ xfs_rtfind_back(
/*
* If done with this block, get the previous one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, --block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
- word = XFS_BLOCKWMASK(mp);
- b = &bufp[word];
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b--;
+
+ word = mp->m_blockwsize - 1;
}
}
/*
@@ -242,13 +267,13 @@ xfs_rtfind_back(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
+ *rtx = start - i + 1;
return 0;
} else
i = len;
@@ -256,8 +281,7 @@ xfs_rtfind_back(
/*
* No match, return that we scanned the whole area.
*/
- xfs_trans_brelse(tp, bp);
- *rtblock = start - i + 1;
+ *rtx = start - i + 1;
return 0;
}
@@ -267,47 +291,44 @@ xfs_rtfind_back(
*/
int
xfs_rtfind_forw(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to look at */
- xfs_rtblock_t limit, /* last block to look at */
- xfs_rtblock_t *rtblock) /* out: start block found */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to look at */
+ xfs_rtxnum_t limit, /* last rtext to look at */
+ xfs_rtxnum_t *rtx) /* out: start rtext found */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t lastbit; /* last useful bit in the word */
- xfs_rtblock_t len; /* length of inspected area */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t want; /* mask for "good" values */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error;
+ xfs_rtxnum_t i; /* current bit number rel. to start */
+ xfs_rtxnum_t lastbit;/* last useful bit in the word */
+ xfs_rtxnum_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ xfs_rtword_t incore;
+ unsigned int word; /* word number in the buffer */
/*
* Compute and read in starting bitmap block for starting block.
*/
- block = XFS_BITTOBLOCK(mp, start);
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ block = xfs_rtx_to_rbmblock(mp, start);
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Get the first word's index & point to it.
*/
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
+ word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
len = limit - start + 1;
/*
* Compute match value, based on the bit at start: if 1 (free)
* then all-ones, else all-zeroes.
*/
- want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ incore = xfs_rtbitmap_getword(args, word);
+ want = (incore & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
/*
* If the starting position is not word-aligned, deal with the
* partial word.
@@ -323,13 +344,12 @@ xfs_rtfind_forw(
* Calculate the difference between the value there
* and what we're looking for.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different. Mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i = XFS_RTLOBIT(wdiff) - bit;
- *rtblock = start + i - 1;
+ *rtx = start + i - 1;
return 0;
}
i = lastbit - bit;
@@ -337,22 +357,15 @@ xfs_rtfind_forw(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the previous one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b++;
}
} else {
/*
@@ -368,13 +381,13 @@ xfs_rtfind_forw(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = *b ^ want)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = incore ^ want)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_RTLOBIT(wdiff);
- *rtblock = start + i - 1;
+ *rtx = start + i - 1;
return 0;
}
i += XFS_NBWORD;
@@ -382,22 +395,15 @@ xfs_rtfind_forw(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the next one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
}
}
/*
@@ -412,13 +418,13 @@ xfs_rtfind_forw(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_RTLOBIT(wdiff);
- *rtblock = start + i - 1;
+ *rtx = start + i - 1;
return 0;
} else
i = len;
@@ -426,11 +432,25 @@ xfs_rtfind_forw(
/*
* No match, return that we scanned the whole area.
*/
- xfs_trans_brelse(tp, bp);
- *rtblock = start + i - 1;
+ *rtx = start + i - 1;
return 0;
}
+/* Log rtsummary counter at @infoword. */
+static inline void
+xfs_trans_log_rtsummary(
+ struct xfs_rtalloc_args *args,
+ unsigned int infoword)
+{
+ struct xfs_buf *bp = args->sumbp;
+ size_t first, last;
+
+ first = (void *)xfs_rsumblock_infoptr(args, infoword) - bp->b_addr;
+ last = first + sizeof(xfs_suminfo_t) - 1;
+
+ xfs_trans_log_buf(args->tp, bp, first, last);
+}
+
/*
* Read and/or modify the summary information for a given extent size,
* bitmap block combination.
@@ -442,86 +462,77 @@ xfs_rtfind_forw(
*/
int
xfs_rtmodify_summary_int(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- int delta, /* change to make to summary info */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_suminfo_t *sum) /* out: summary info for this block */
+ struct xfs_rtalloc_args *args,
+ int log, /* log2 of extent size */
+ xfs_fileoff_t bbno, /* bitmap block number */
+ int delta, /* change to make to summary info */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
{
- struct xfs_buf *bp; /* buffer for the summary block */
- int error; /* error value */
- xfs_fsblock_t sb; /* summary fsblock */
- int so; /* index into the summary file */
- xfs_suminfo_t *sp; /* pointer to returned data */
+ struct xfs_mount *mp = args->mp;
+ int error;
+ xfs_fileoff_t sb; /* summary fsblock */
+ xfs_rtsumoff_t so; /* index into the summary file */
+ unsigned int infoword;
/*
* Compute entry number in the summary file.
*/
- so = XFS_SUMOFFS(mp, log, bbno);
+ so = xfs_rtsumoffs(mp, log, bbno);
/*
* Compute the block number in the summary file.
*/
- sb = XFS_SUMOFFSTOBLOCK(mp, so);
- /*
- * If we have an old buffer, and the block number matches, use that.
- */
- if (*rbpp && *rsb == sb)
- bp = *rbpp;
- /*
- * Otherwise we have to get the buffer.
- */
- else {
- /*
- * If there was an old one, get rid of it first.
- */
- if (*rbpp)
- xfs_trans_brelse(tp, *rbpp);
- error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
- if (error) {
- return error;
- }
- /*
- * Remember this buffer and block for the next call.
- */
- *rbpp = bp;
- *rsb = sb;
- }
+ sb = xfs_rtsumoffs_to_block(mp, so);
+
+ error = xfs_rtsummary_read_buf(args, sb);
+ if (error)
+ return error;
+
/*
* Point to the summary information, modify/log it, and/or copy it out.
*/
- sp = XFS_SUMPTR(mp, bp, so);
+ infoword = xfs_rtsumoffs_to_infoword(mp, so);
if (delta) {
- uint first = (uint)((char *)sp - (char *)bp->b_addr);
+ xfs_suminfo_t val = xfs_suminfo_add(args, infoword, delta);
- *sp += delta;
if (mp->m_rsum_cache) {
- if (*sp == 0 && log == mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno]++;
- if (*sp != 0 && log < mp->m_rsum_cache[bbno])
+ if (val == 0 && log + 1 == mp->m_rsum_cache[bbno])
mp->m_rsum_cache[bbno] = log;
+ if (val != 0 && log >= mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log + 1;
}
- xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1);
+ xfs_trans_log_rtsummary(args, infoword);
+ if (sum)
+ *sum = val;
+ } else if (sum) {
+ *sum = xfs_suminfo_get(args, infoword);
}
- if (sum)
- *sum = *sp;
return 0;
}
int
xfs_rtmodify_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- int delta, /* change to make to summary info */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
+ struct xfs_rtalloc_args *args,
+ int log, /* log2 of extent size */
+ xfs_fileoff_t bbno, /* bitmap block number */
+ int delta) /* in/out: summary block number */
+{
+ return xfs_rtmodify_summary_int(args, log, bbno, delta, NULL);
+}
+
+/* Log rtbitmap block from the word @from to the byte before @next. */
+static inline void
+xfs_trans_log_rtbitmap(
+ struct xfs_rtalloc_args *args,
+ unsigned int from,
+ unsigned int next)
{
- return xfs_rtmodify_summary_int(mp, tp, log, bbno,
- delta, rbpp, rsb, NULL);
+ struct xfs_buf *bp = args->rbmbp;
+ size_t first, last;
+
+ first = (void *)xfs_rbmblock_wordptr(args, from) - bp->b_addr;
+ last = ((void *)xfs_rbmblock_wordptr(args, next) - 1) - bp->b_addr;
+
+ xfs_trans_log_buf(args->tp, bp, first, last);
}
/*
@@ -530,41 +541,37 @@ xfs_rtmodify_summary(
*/
int
xfs_rtmodify_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to modify */
- xfs_extlen_t len, /* length of extent to modify */
- int val) /* 1 for free, 0 for allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to modify */
+ xfs_rtxlen_t len, /* length of extent to modify */
+ int val) /* 1 for free, 0 for allocated */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtword_t *first; /* first used word in the buffer */
- int i; /* current bit number rel. to start */
- int lastbit; /* last useful bit in word */
- xfs_rtword_t mask; /* mask o frelevant bits for value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error;
+ int i; /* current bit number rel. to start */
+ int lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t incore;
+ unsigned int firstword; /* first word used in the buffer */
+ unsigned int word; /* word number in the buffer */
/*
* Compute starting bitmap block number.
*/
- block = XFS_BITTOBLOCK(mp, start);
+ block = xfs_rtx_to_rbmblock(mp, start);
/*
* Read the bitmap block, and point to its data.
*/
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Compute the starting word's address, and starting bit.
*/
- word = XFS_BITTOWORD(mp, start);
- first = b = &bufp[word];
+ firstword = word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
/*
* 0 (allocated) => all zeroes; 1 (free) => all ones.
@@ -583,34 +590,28 @@ xfs_rtmodify_range(
/*
* Set/clear the active bits.
*/
+ incore = xfs_rtbitmap_getword(args, word);
if (val)
- *b |= mask;
+ incore |= mask;
else
- *b &= ~mask;
+ incore &= ~mask;
+ xfs_rtbitmap_setword(args, word, incore);
i = lastbit - bit;
/*
* Go on to the next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* Log the changed part of this block.
* Get the next one.
*/
- xfs_trans_log_buf(tp, bp,
- (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp));
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ xfs_trans_log_rtbitmap(args, firstword, word);
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- first = b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer
- */
- b++;
+
+ firstword = word = 0;
}
} else {
/*
@@ -626,31 +627,23 @@ xfs_rtmodify_range(
/*
* Set the word value correctly.
*/
- *b = val;
+ xfs_rtbitmap_setword(args, word, val);
i += XFS_NBWORD;
/*
* Go on to the next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* Log the changed part of this block.
* Get the next one.
*/
- xfs_trans_log_buf(tp, bp,
- (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp));
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ xfs_trans_log_rtbitmap(args, firstword, word);
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- first = b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer
- */
- b++;
+
+ firstword = word = 0;
}
}
/*
@@ -665,18 +658,19 @@ xfs_rtmodify_range(
/*
* Set/clear the active bits.
*/
+ incore = xfs_rtbitmap_getword(args, word);
if (val)
- *b |= mask;
+ incore |= mask;
else
- *b &= ~mask;
- b++;
+ incore &= ~mask;
+ xfs_rtbitmap_setword(args, word, incore);
+ word++;
}
/*
* Log any remaining changed bytes.
*/
- if (b > first)
- xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp - 1));
+ if (word > firstword)
+ xfs_trans_log_rtbitmap(args, firstword, word);
return 0;
}
@@ -686,23 +680,21 @@ xfs_rtmodify_range(
*/
int
xfs_rtfree_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to free */
- xfs_extlen_t len, /* length to free */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to free */
+ xfs_rtxlen_t len) /* in/out: summary block number */
{
- xfs_rtblock_t end; /* end of the freed extent */
- int error; /* error value */
- xfs_rtblock_t postblock; /* first block freed > end */
- xfs_rtblock_t preblock; /* first block freed < start */
+ struct xfs_mount *mp = args->mp;
+ xfs_rtxnum_t end; /* end of the freed extent */
+ int error; /* error value */
+ xfs_rtxnum_t postblock; /* first rtext freed > end */
+ xfs_rtxnum_t preblock; /* first rtext freed < start */
end = start + len - 1;
/*
* Modify the bitmap to mark this extent freed.
*/
- error = xfs_rtmodify_range(mp, tp, start, len, 1);
+ error = xfs_rtmodify_range(args, start, len, 1);
if (error) {
return error;
}
@@ -711,15 +703,15 @@ xfs_rtfree_range(
* We need to find the beginning and end of the extent so we can
* properly update the summary.
*/
- error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ error = xfs_rtfind_back(args, start, 0, &preblock);
if (error) {
return error;
}
/*
* Find the next allocated block (end of allocated extent).
*/
- error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
- &postblock);
+ error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
if (error)
return error;
/*
@@ -727,9 +719,9 @@ xfs_rtfree_range(
* old extent, add summary data for them to be allocated.
*/
if (preblock < start) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(start - preblock),
- XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(start - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), -1);
if (error) {
return error;
}
@@ -739,9 +731,9 @@ xfs_rtfree_range(
* old extent, add summary data for them to be allocated.
*/
if (postblock > end) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock - end),
- XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(postblock - end),
+ xfs_rtx_to_rbmblock(mp, end + 1), -1);
if (error) {
return error;
}
@@ -750,10 +742,9 @@ xfs_rtfree_range(
* Increment the summary information corresponding to the entire
* (new) free extent.
*/
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
- XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
- return error;
+ return xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), 1);
}
/*
@@ -762,43 +753,39 @@ xfs_rtfree_range(
*/
int
xfs_rtcheck_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block number of extent */
- xfs_extlen_t len, /* length of extent */
- int val, /* 1 for free, 0 for allocated */
- xfs_rtblock_t *new, /* out: first block not matching */
- int *stat) /* out: 1 for matches, 0 for not */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number of extent */
+ xfs_rtxlen_t len, /* length of extent */
+ int val, /* 1 for free, 0 for allocated */
+ xfs_rtxnum_t *new, /* out: first rtext not matching */
+ int *stat) /* out: 1 for matches, 0 for not */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t lastbit; /* last useful bit in word */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error;
+ xfs_rtxnum_t i; /* current bit number rel. to start */
+ xfs_rtxnum_t lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ xfs_rtword_t incore;
+ unsigned int word; /* word number in the buffer */
/*
* Compute starting bitmap block number
*/
- block = XFS_BITTOBLOCK(mp, start);
+ block = xfs_rtx_to_rbmblock(mp, start);
/*
* Read the bitmap block.
*/
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Compute the starting word's address, and starting bit.
*/
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
+ word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
/*
* 0 (allocated) => all zero's; 1 (free) => all one's.
@@ -820,11 +807,11 @@ xfs_rtcheck_range(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ val) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ val) & mask)) {
/*
* Different, compute first wrong bit and return.
*/
- xfs_trans_brelse(tp, bp);
i = XFS_RTLOBIT(wdiff) - bit;
*new = start + i;
*stat = 0;
@@ -835,22 +822,15 @@ xfs_rtcheck_range(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the next one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
}
} else {
/*
@@ -866,11 +846,11 @@ xfs_rtcheck_range(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = *b ^ val)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = incore ^ val)) {
/*
* Different, compute first wrong bit and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_RTLOBIT(wdiff);
*new = start + i;
*stat = 0;
@@ -881,22 +861,15 @@ xfs_rtcheck_range(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the next one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
}
}
/*
@@ -911,11 +884,11 @@ xfs_rtcheck_range(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ val) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ val) & mask)) {
/*
* Different, compute first wrong bit and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_RTLOBIT(wdiff);
*new = start + i;
*stat = 0;
@@ -926,7 +899,6 @@ xfs_rtcheck_range(
/*
* Successful, return.
*/
- xfs_trans_brelse(tp, bp);
*new = start + i;
*stat = 1;
return 0;
@@ -936,58 +908,57 @@ xfs_rtcheck_range(
/*
* Check that the given extent (block range) is allocated already.
*/
-STATIC int /* error */
+STATIC int
xfs_rtcheck_alloc_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number of extent */
- xfs_extlen_t len) /* length of extent */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number of extent */
+ xfs_rtxlen_t len) /* length of extent */
{
- xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */
- int stat;
- int error;
+ xfs_rtxnum_t new; /* dummy for xfs_rtcheck_range */
+ int stat;
+ int error;
- error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat);
+ error = xfs_rtcheck_range(args, start, len, 0, &new, &stat);
if (error)
return error;
ASSERT(stat);
return 0;
}
#else
-#define xfs_rtcheck_alloc_range(m,t,b,l) (0)
+#define xfs_rtcheck_alloc_range(a,b,l) (0)
#endif
/*
* Free an extent in the realtime subvolume. Length is expressed in
* realtime extents, as is the block number.
*/
-int /* error */
+int
xfs_rtfree_extent(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to free */
- xfs_extlen_t len) /* length of extent freed */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_rtxnum_t start, /* starting rtext number to free */
+ xfs_rtxlen_t len) /* length of extent freed */
{
- int error; /* error value */
- xfs_mount_t *mp; /* file system mount structure */
- xfs_fsblock_t sb; /* summary file block number */
- struct xfs_buf *sumbp = NULL; /* summary file block buffer */
- struct timespec64 atime;
-
- mp = tp->t_mountp;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ .tp = tp,
+ };
+ int error;
+ struct timespec64 atime;
ASSERT(mp->m_rbmip->i_itemp != NULL);
ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
- error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
+ error = xfs_rtcheck_alloc_range(&args, start, len);
if (error)
return error;
/*
* Free the range of realtime blocks.
*/
- error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
- if (error) {
- return error;
- }
+ error = xfs_rtfree_range(&args, start, len);
+ if (error)
+ goto out;
+
/*
* Mark more blocks free in the superblock.
*/
@@ -1002,11 +973,47 @@ xfs_rtfree_extent(
mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
atime = inode_get_atime(VFS_I(mp->m_rbmip));
- *((uint64_t *)&atime) = 0;
+ atime.tv_sec = 0;
inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
}
- return 0;
+ error = 0;
+out:
+ xfs_rtbuf_cache_relse(&args);
+ return error;
+}
+
+/*
+ * Free some blocks in the realtime subvolume. rtbno and rtlen are in units of
+ * rt blocks, not rt extents; must be aligned to the rt extent size; and rtlen
+ * cannot exceed XFS_MAX_BMBT_EXTLEN.
+ */
+int
+xfs_rtfree_blocks(
+ struct xfs_trans *tp,
+ xfs_fsblock_t rtbno,
+ xfs_filblks_t rtlen)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_rtxnum_t start;
+ xfs_filblks_t len;
+ xfs_extlen_t mod;
+
+ ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
+
+ len = xfs_rtb_to_rtxrem(mp, rtlen, &mod);
+ if (mod) {
+ ASSERT(mod == 0);
+ return -EIO;
+ }
+
+ start = xfs_rtb_to_rtxrem(mp, rtbno, &mod);
+ if (mod) {
+ ASSERT(mod == 0);
+ return -EIO;
+ }
+
+ return xfs_rtfree_extent(tp, start, len);
}
/* Find all the free records within a given range. */
@@ -1019,10 +1026,14 @@ xfs_rtalloc_query_range(
xfs_rtalloc_query_range_fn fn,
void *priv)
{
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ .tp = tp,
+ };
struct xfs_rtalloc_rec rec;
- xfs_rtblock_t rtstart;
- xfs_rtblock_t rtend;
- xfs_rtblock_t high_key;
+ xfs_rtxnum_t rtstart;
+ xfs_rtxnum_t rtend;
+ xfs_rtxnum_t high_key;
int is_free;
int error = 0;
@@ -1038,13 +1049,13 @@ xfs_rtalloc_query_range(
rtstart = low_rec->ar_startext;
while (rtstart <= high_key) {
/* Is the first block free? */
- error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
+ error = xfs_rtcheck_range(&args, rtstart, 1, 1, &rtend,
&is_free);
if (error)
break;
/* How long does the extent go for? */
- error = xfs_rtfind_forw(mp, tp, rtstart, high_key, &rtend);
+ error = xfs_rtfind_forw(&args, rtstart, high_key, &rtend);
if (error)
break;
@@ -1060,6 +1071,7 @@ xfs_rtalloc_query_range(
rtstart = rtend + 1;
}
+ xfs_rtbuf_cache_relse(&args);
return error;
}
@@ -1085,18 +1097,79 @@ int
xfs_rtalloc_extent_is_free(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_rtblock_t start,
- xfs_extlen_t len,
+ xfs_rtxnum_t start,
+ xfs_rtxlen_t len,
bool *is_free)
{
- xfs_rtblock_t end;
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ .tp = tp,
+ };
+ xfs_rtxnum_t end;
int matches;
int error;
- error = xfs_rtcheck_range(mp, tp, start, len, 1, &end, &matches);
+ error = xfs_rtcheck_range(&args, start, len, 1, &end, &matches);
+ xfs_rtbuf_cache_relse(&args);
if (error)
return error;
*is_free = matches;
return 0;
}
+
+/*
+ * Compute the number of rtbitmap blocks needed to track the given number of rt
+ * extents.
+ */
+xfs_filblks_t
+xfs_rtbitmap_blockcount(
+ struct xfs_mount *mp,
+ xfs_rtbxlen_t rtextents)
+{
+ return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Compute the number of rtbitmap words needed to populate every block of a
+ * bitmap that is large enough to track the given number of rt extents.
+ */
+unsigned long long
+xfs_rtbitmap_wordcount(
+ struct xfs_mount *mp,
+ xfs_rtbxlen_t rtextents)
+{
+ xfs_filblks_t blocks;
+
+ blocks = xfs_rtbitmap_blockcount(mp, rtextents);
+ return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG;
+}
+
+/* Compute the number of rtsummary blocks needed to track the given rt space. */
+xfs_filblks_t
+xfs_rtsummary_blockcount(
+ struct xfs_mount *mp,
+ unsigned int rsumlevels,
+ xfs_extlen_t rbmblocks)
+{
+ unsigned long long rsumwords;
+
+ rsumwords = (unsigned long long)rsumlevels * rbmblocks;
+ return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG);
+}
+
+/*
+ * Compute the number of rtsummary info words needed to populate every block of
+ * a summary file that is large enough to track the given rt space.
+ */
+unsigned long long
+xfs_rtsummary_wordcount(
+ struct xfs_mount *mp,
+ unsigned int rsumlevels,
+ xfs_extlen_t rbmblocks)
+{
+ xfs_filblks_t blocks;
+
+ blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks);
+ return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG;
+}
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
new file mode 100644
index 000000000000..c0637057d69c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_RTBITMAP_H__
+#define __XFS_RTBITMAP_H__
+
+struct xfs_rtalloc_args {
+ struct xfs_mount *mp;
+ struct xfs_trans *tp;
+
+ struct xfs_buf *rbmbp; /* bitmap block buffer */
+ struct xfs_buf *sumbp; /* summary block buffer */
+
+ xfs_fileoff_t rbmoff; /* bitmap block number */
+ xfs_fileoff_t sumoff; /* summary block number */
+};
+
+static inline xfs_rtblock_t
+xfs_rtx_to_rtb(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t rtx)
+{
+ if (mp->m_rtxblklog >= 0)
+ return rtx << mp->m_rtxblklog;
+
+ return rtx * mp->m_sb.sb_rextsize;
+}
+
+static inline xfs_extlen_t
+xfs_rtxlen_to_extlen(
+ struct xfs_mount *mp,
+ xfs_rtxlen_t rtxlen)
+{
+ if (mp->m_rtxblklog >= 0)
+ return rtxlen << mp->m_rtxblklog;
+
+ return rtxlen * mp->m_sb.sb_rextsize;
+}
+
+/* Compute the misalignment between an extent length and a realtime extent .*/
+static inline unsigned int
+xfs_extlen_to_rtxmod(
+ struct xfs_mount *mp,
+ xfs_extlen_t len)
+{
+ if (mp->m_rtxblklog >= 0)
+ return len & mp->m_rtxblkmask;
+
+ return len % mp->m_sb.sb_rextsize;
+}
+
+static inline xfs_rtxlen_t
+xfs_extlen_to_rtxlen(
+ struct xfs_mount *mp,
+ xfs_extlen_t len)
+{
+ if (mp->m_rtxblklog >= 0)
+ return len >> mp->m_rtxblklog;
+
+ return len / mp->m_sb.sb_rextsize;
+}
+
+/* Convert an rt block number into an rt extent number. */
+static inline xfs_rtxnum_t
+xfs_rtb_to_rtx(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ if (likely(mp->m_rtxblklog >= 0))
+ return rtbno >> mp->m_rtxblklog;
+
+ return div_u64(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/* Return the offset of an rt block number within an rt extent. */
+static inline xfs_extlen_t
+xfs_rtb_to_rtxoff(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ if (likely(mp->m_rtxblklog >= 0))
+ return rtbno & mp->m_rtxblkmask;
+
+ return do_div(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/*
+ * Crack an rt block number into an rt extent number and an offset within that
+ * rt extent. Returns the rt extent number directly and the offset in @off.
+ */
+static inline xfs_rtxnum_t
+xfs_rtb_to_rtxrem(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno,
+ xfs_extlen_t *off)
+{
+ if (likely(mp->m_rtxblklog >= 0)) {
+ *off = rtbno & mp->m_rtxblkmask;
+ return rtbno >> mp->m_rtxblklog;
+ }
+
+ return div_u64_rem(rtbno, mp->m_sb.sb_rextsize, off);
+}
+
+/*
+ * Convert an rt block number into an rt extent number, rounding up to the next
+ * rt extent if the rt block is not aligned to an rt extent boundary.
+ */
+static inline xfs_rtxnum_t
+xfs_rtb_to_rtxup(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ if (likely(mp->m_rtxblklog >= 0)) {
+ if (rtbno & mp->m_rtxblkmask)
+ return (rtbno >> mp->m_rtxblklog) + 1;
+ return rtbno >> mp->m_rtxblklog;
+ }
+
+ if (do_div(rtbno, mp->m_sb.sb_rextsize))
+ rtbno++;
+ return rtbno;
+}
+
+/* Round this rtblock up to the nearest rt extent size. */
+static inline xfs_rtblock_t
+xfs_rtb_roundup_rtx(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return roundup_64(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/* Round this rtblock down to the nearest rt extent size. */
+static inline xfs_rtblock_t
+xfs_rtb_rounddown_rtx(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return rounddown_64(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/* Convert an rt extent number to a file block offset in the rt bitmap file. */
+static inline xfs_fileoff_t
+xfs_rtx_to_rbmblock(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t rtx)
+{
+ return rtx >> mp->m_blkbit_log;
+}
+
+/* Convert an rt extent number to a word offset within an rt bitmap block. */
+static inline unsigned int
+xfs_rtx_to_rbmword(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t rtx)
+{
+ return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1);
+}
+
+/* Convert a file block offset in the rt bitmap file to an rt extent number. */
+static inline xfs_rtxnum_t
+xfs_rbmblock_to_rtx(
+ struct xfs_mount *mp,
+ xfs_fileoff_t rbmoff)
+{
+ return rbmoff << mp->m_blkbit_log;
+}
+
+/* Return a pointer to a bitmap word within a rt bitmap block. */
+static inline union xfs_rtword_raw *
+xfs_rbmblock_wordptr(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_rtword_raw *words = args->rbmbp->b_addr;
+
+ return words + index;
+}
+
+/* Convert an ondisk bitmap word to its incore representation. */
+static inline xfs_rtword_t
+xfs_rtbitmap_getword(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index);
+
+ return word->old;
+}
+
+/* Set an ondisk bitmap word from an incore representation. */
+static inline void
+xfs_rtbitmap_setword(
+ struct xfs_rtalloc_args *args,
+ unsigned int index,
+ xfs_rtword_t value)
+{
+ union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index);
+
+ word->old = value;
+}
+
+/*
+ * Convert a rt extent length and rt bitmap block number to a xfs_suminfo_t
+ * offset within the rt summary file.
+ */
+static inline xfs_rtsumoff_t
+xfs_rtsumoffs(
+ struct xfs_mount *mp,
+ int log2_len,
+ xfs_fileoff_t rbmoff)
+{
+ return log2_len * mp->m_sb.sb_rbmblocks + rbmoff;
+}
+
+/*
+ * Convert an xfs_suminfo_t offset to a file block offset within the rt summary
+ * file.
+ */
+static inline xfs_fileoff_t
+xfs_rtsumoffs_to_block(
+ struct xfs_mount *mp,
+ xfs_rtsumoff_t rsumoff)
+{
+ return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t));
+}
+
+/*
+ * Convert an xfs_suminfo_t offset to an info word offset within an rt summary
+ * block.
+ */
+static inline unsigned int
+xfs_rtsumoffs_to_infoword(
+ struct xfs_mount *mp,
+ xfs_rtsumoff_t rsumoff)
+{
+ unsigned int mask = mp->m_blockmask >> XFS_SUMINFOLOG;
+
+ return rsumoff & mask;
+}
+
+/* Return a pointer to a summary info word within a rt summary block. */
+static inline union xfs_suminfo_raw *
+xfs_rsumblock_infoptr(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_suminfo_raw *info = args->sumbp->b_addr;
+
+ return info + index;
+}
+
+/* Get the current value of a summary counter. */
+static inline xfs_suminfo_t
+xfs_suminfo_get(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index);
+
+ return info->old;
+}
+
+/* Add to the current value of a summary counter and return the new value. */
+static inline xfs_suminfo_t
+xfs_suminfo_add(
+ struct xfs_rtalloc_args *args,
+ unsigned int index,
+ int delta)
+{
+ union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index);
+
+ info->old += delta;
+ return info->old;
+}
+
+/*
+ * Functions for walking free space rtextents in the realtime bitmap.
+ */
+struct xfs_rtalloc_rec {
+ xfs_rtxnum_t ar_startext;
+ xfs_rtbxlen_t ar_extcount;
+};
+
+typedef int (*xfs_rtalloc_query_range_fn)(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv);
+
+#ifdef CONFIG_XFS_RT
+void xfs_rtbuf_cache_relse(struct xfs_rtalloc_args *args);
+
+int xfs_rtbuf_get(struct xfs_rtalloc_args *args, xfs_fileoff_t block,
+ int issum);
+
+static inline int
+xfs_rtbitmap_read_buf(
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t block)
+{
+ return xfs_rtbuf_get(args, block, 0);
+}
+
+static inline int
+xfs_rtsummary_read_buf(
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t block)
+{
+ return xfs_rtbuf_get(args, block, 1);
+}
+
+int xfs_rtcheck_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxlen_t len, int val, xfs_rtxnum_t *new, int *stat);
+int xfs_rtfind_back(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock);
+int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock);
+int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxlen_t len, int val);
+int xfs_rtmodify_summary_int(struct xfs_rtalloc_args *args, int log,
+ xfs_fileoff_t bbno, int delta, xfs_suminfo_t *sum);
+int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log,
+ xfs_fileoff_t bbno, int delta);
+int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxlen_t len);
+int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *low_rec,
+ const struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn, void *priv);
+int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv);
+int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtxnum_t start, xfs_rtxlen_t len,
+ bool *is_free);
+/*
+ * Free an extent in the realtime subvolume. Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int /* error */
+xfs_rtfree_extent(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_rtxnum_t start, /* starting rtext number to free */
+ xfs_rtxlen_t len); /* length of extent freed */
+
+/* Same as above, but in units of rt blocks. */
+int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
+ xfs_filblks_t rtlen);
+
+xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t
+ rtextents);
+unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp,
+ xfs_rtbxlen_t rtextents);
+
+xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp,
+ unsigned int rsumlevels, xfs_extlen_t rbmblocks);
+unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp,
+ unsigned int rsumlevels, xfs_extlen_t rbmblocks);
+#else /* CONFIG_XFS_RT */
+# define xfs_rtfree_extent(t,b,l) (-ENOSYS)
+# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS)
+# define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS)
+# define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS)
+# define xfs_rtbitmap_read_buf(a,b) (-ENOSYS)
+# define xfs_rtsummary_read_buf(a,b) (-ENOSYS)
+# define xfs_rtbuf_cache_relse(a) (0)
+# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS)
+static inline xfs_filblks_t
+xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
+{
+ /* shut up gcc */
+ return 0;
+}
+# define xfs_rtbitmap_wordcount(mp, r) (0)
+# define xfs_rtsummary_blockcount(mp, l, b) (0)
+# define xfs_rtsummary_wordcount(mp, l, b) (0)
+#endif /* CONFIG_XFS_RT */
+
+#endif /* __XFS_RTBITMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 6264daaab37b..1f74d0cd1618 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -975,6 +975,8 @@ xfs_sb_mount_common(
mp->m_blockmask = sbp->sb_blocksize - 1;
mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
mp->m_blockwmask = mp->m_blockwsize - 1;
+ mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize);
+ mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index a5e14740ec9a..19134b23c10b 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -25,7 +25,7 @@ extern uint64_t xfs_sb_version_to_features(struct xfs_sb *sbp);
extern int xfs_update_secondary_sbs(struct xfs_mount *mp);
-#define XFS_FS_GEOM_MAX_STRUCT_VER (4)
+#define XFS_FS_GEOM_MAX_STRUCT_VER (5)
extern void xfs_fs_geometry(struct xfs_mount *mp, struct xfs_fsop_geom *geo,
int struct_version);
extern int xfs_sb_read_secondary(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 5b2f27cbdb80..6cd45e8c118d 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -19,6 +19,7 @@
#include "xfs_trans.h"
#include "xfs_qm.h"
#include "xfs_trans_space.h"
+#include "xfs_rtbitmap.h"
#define _ALLOC true
#define _FREE false
@@ -217,11 +218,12 @@ xfs_rtalloc_block_count(
struct xfs_mount *mp,
unsigned int num_ops)
{
- unsigned int blksz = XFS_FSB_TO_B(mp, 1);
- unsigned int rtbmp_bytes;
+ unsigned int rtbmp_blocks;
+ xfs_rtxlen_t rtxlen;
- rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY;
- return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
+ rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN);
+ rtbmp_blocks = xfs_rtbitmap_blockcount(mp, rtxlen);
+ return (rtbmp_blocks + 1) * num_ops;
}
/*
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 5c2765934732..c299b16c9365 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -148,10 +148,10 @@ xfs_verify_rtbno(
/* Verify that a realtime device extent is fully contained inside the volume. */
bool
-xfs_verify_rtext(
+xfs_verify_rtbext(
struct xfs_mount *mp,
xfs_rtblock_t rtbno,
- xfs_rtblock_t len)
+ xfs_filblks_t len)
{
if (rtbno + len <= rtbno)
return false;
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 851220021484..533200c4ccc2 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -11,6 +11,7 @@ typedef uint32_t prid_t; /* project ID */
typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
typedef uint32_t xfs_extlen_t; /* extent length in blocks */
+typedef uint32_t xfs_rtxlen_t; /* file extent length in rtextents */
typedef uint32_t xfs_agnumber_t; /* allocation group number */
typedef uint64_t xfs_extnum_t; /* # of extents in a file */
typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */
@@ -18,6 +19,7 @@ typedef int64_t xfs_fsize_t; /* bytes in a file */
typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */
+typedef uint32_t xfs_rtsumoff_t; /* offset of an rtsummary info word */
typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */
typedef int64_t xfs_lsn_t; /* log sequence number */
@@ -31,6 +33,8 @@ typedef uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
typedef uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
typedef uint64_t xfs_fileoff_t; /* block number in a file */
typedef uint64_t xfs_filblks_t; /* number of blocks in a file */
+typedef uint64_t xfs_rtxnum_t; /* rtextent number */
+typedef uint64_t xfs_rtbxlen_t; /* rtbitmap extent length in rtextents */
typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
@@ -47,6 +51,7 @@ typedef void * xfs_failaddr_t;
#define NULLRFSBLOCK ((xfs_rfsblock_t)-1)
#define NULLRTBLOCK ((xfs_rtblock_t)-1)
#define NULLFILEOFF ((xfs_fileoff_t)-1)
+#define NULLRTEXTNO ((xfs_rtxnum_t)-1)
#define NULLAGBLOCK ((xfs_agblock_t)-1)
#define NULLAGNUMBER ((xfs_agnumber_t)-1)
@@ -145,6 +150,7 @@ typedef uint32_t xfs_dqid_t;
*/
#define XFS_NBBYLOG 3 /* log2(NBBY) */
#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */
+#define XFS_SUMINFOLOG 2 /* log2(sizeof(xfs_suminfo_t)) */
#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG)
#define XFS_NBWORD (1 << XFS_NBWORDLOG)
#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
@@ -229,8 +235,8 @@ bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
-bool xfs_verify_rtext(struct xfs_mount *mp, xfs_rtblock_t rtbno,
- xfs_rtblock_t len);
+bool xfs_verify_rtbext(struct xfs_mount *mp, xfs_rtblock_t rtbno,
+ xfs_filblks_t len);
bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min,
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 75588915572e..06d8c1996a33 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -410,7 +410,7 @@ xchk_bmap_iextent(
/* Make sure the extent points to a valid place. */
if (info->is_rt &&
- !xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount))
+ !xfs_verify_rtbext(mp, irec->br_startblock, irec->br_blockcount))
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
if (!info->is_rt &&
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 05be757668bb..5799e9a94f1f 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -16,7 +16,7 @@
#include "xfs_health.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "scrub/scrub.h"
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 59d7912fb75f..889f556bc98f 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -20,6 +20,7 @@
#include "xfs_reflink.h"
#include "xfs_rmap.h"
#include "xfs_bmap_util.h"
+#include "xfs_rtbitmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
@@ -225,7 +226,7 @@ xchk_inode_extsize(
*/
if ((flags & XFS_DIFLAG_RTINHERIT) &&
(flags & XFS_DIFLAG_EXTSZINHERIT) &&
- value % sc->mp->m_sb.sb_rextsize > 0)
+ xfs_extlen_to_rtxmod(sc->mp, value) > 0)
xchk_ino_set_warning(sc, ino);
}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 008ddb599e13..41a1d89ae8e6 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -11,7 +11,7 @@
#include "xfs_mount.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "scrub/scrub.h"
@@ -48,12 +48,12 @@ xchk_rtbitmap_rec(
{
struct xfs_scrub *sc = priv;
xfs_rtblock_t startblock;
- xfs_rtblock_t blockcount;
+ xfs_filblks_t blockcount;
- startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
- blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
+ startblock = xfs_rtx_to_rtb(mp, rec->ar_startext);
+ blockcount = xfs_rtx_to_rtb(mp, rec->ar_extcount);
- if (!xfs_verify_rtext(mp, startblock, blockcount))
+ if (!xfs_verify_rtbext(mp, startblock, blockcount))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
return 0;
}
@@ -128,26 +128,22 @@ out:
void
xchk_xref_is_used_rt_space(
struct xfs_scrub *sc,
- xfs_rtblock_t fsbno,
+ xfs_rtblock_t rtbno,
xfs_extlen_t len)
{
- xfs_rtblock_t startext;
- xfs_rtblock_t endext;
- xfs_rtblock_t extcount;
+ xfs_rtxnum_t startext;
+ xfs_rtxnum_t endext;
bool is_free;
int error;
if (xchk_skip_xref(sc->sm))
return;
- startext = fsbno;
- endext = fsbno + len - 1;
- do_div(startext, sc->mp->m_sb.sb_rextsize);
- do_div(endext, sc->mp->m_sb.sb_rextsize);
- extcount = endext - startext + 1;
+ startext = xfs_rtb_to_rtx(sc->mp, rtbno);
+ endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
- error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
- &is_free);
+ error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext,
+ endext - startext + 1, &is_free);
if (!xchk_should_check_xref(sc, &error, NULL))
goto out_unlock;
if (is_free)
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index 437ed9acbb27..8b15c47408d0 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -13,7 +13,7 @@
#include "xfs_inode.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_bit.h"
#include "xfs_bmap.h"
#include "scrub/scrub.h"
@@ -81,34 +81,45 @@ typedef unsigned int xchk_rtsumoff_t;
static inline int
xfsum_load(
struct xfs_scrub *sc,
- xchk_rtsumoff_t sumoff,
- xfs_suminfo_t *info)
+ xfs_rtsumoff_t sumoff,
+ union xfs_suminfo_raw *rawinfo)
{
- return xfile_obj_load(sc->xfile, info, sizeof(xfs_suminfo_t),
+ return xfile_obj_load(sc->xfile, rawinfo,
+ sizeof(union xfs_suminfo_raw),
sumoff << XFS_WORDLOG);
}
static inline int
xfsum_store(
struct xfs_scrub *sc,
- xchk_rtsumoff_t sumoff,
- const xfs_suminfo_t info)
+ xfs_rtsumoff_t sumoff,
+ const union xfs_suminfo_raw rawinfo)
{
- return xfile_obj_store(sc->xfile, &info, sizeof(xfs_suminfo_t),
+ return xfile_obj_store(sc->xfile, &rawinfo,
+ sizeof(union xfs_suminfo_raw),
sumoff << XFS_WORDLOG);
}
static inline int
xfsum_copyout(
struct xfs_scrub *sc,
- xchk_rtsumoff_t sumoff,
- xfs_suminfo_t *info,
+ xfs_rtsumoff_t sumoff,
+ union xfs_suminfo_raw *rawinfo,
unsigned int nr_words)
{
- return xfile_obj_load(sc->xfile, info, nr_words << XFS_WORDLOG,
+ return xfile_obj_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG,
sumoff << XFS_WORDLOG);
}
+static inline xfs_suminfo_t
+xchk_rtsum_inc(
+ struct xfs_mount *mp,
+ union xfs_suminfo_raw *v)
+{
+ v->old += 1;
+ return v->old;
+}
+
/* Update the summary file to reflect the free extent that we've accumulated. */
STATIC int
xchk_rtsum_record_free(
@@ -121,23 +132,24 @@ xchk_rtsum_record_free(
xfs_fileoff_t rbmoff;
xfs_rtblock_t rtbno;
xfs_filblks_t rtlen;
- xchk_rtsumoff_t offs;
+ xfs_rtsumoff_t offs;
unsigned int lenlog;
- xfs_suminfo_t v = 0;
+ union xfs_suminfo_raw v;
+ xfs_suminfo_t value;
int error = 0;
if (xchk_should_terminate(sc, &error))
return error;
/* Compute the relevant location in the rtsum file. */
- rbmoff = XFS_BITTOBLOCK(mp, rec->ar_startext);
+ rbmoff = xfs_rtx_to_rbmblock(mp, rec->ar_startext);
lenlog = XFS_RTBLOCKLOG(rec->ar_extcount);
- offs = XFS_SUMOFFS(mp, lenlog, rbmoff);
+ offs = xfs_rtsumoffs(mp, lenlog, rbmoff);
- rtbno = rec->ar_startext * mp->m_sb.sb_rextsize;
- rtlen = rec->ar_extcount * mp->m_sb.sb_rextsize;
+ rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
+ rtlen = xfs_rtx_to_rtb(mp, rec->ar_extcount);
- if (!xfs_verify_rtext(mp, rtbno, rtlen)) {
+ if (!xfs_verify_rtbext(mp, rtbno, rtlen)) {
xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino);
return -EFSCORRUPTED;
}
@@ -147,9 +159,9 @@ xchk_rtsum_record_free(
if (error)
return error;
- v++;
+ value = xchk_rtsum_inc(sc->mp, &v);
trace_xchk_rtsum_record_free(mp, rec->ar_startext, rec->ar_extcount,
- lenlog, offs, v);
+ lenlog, offs, value);
return xfsum_store(sc, offs, v);
}
@@ -160,12 +172,11 @@ xchk_rtsum_compute(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
- unsigned long long rtbmp_bytes;
+ unsigned long long rtbmp_blocks;
/* If the bitmap size doesn't match the computed size, bail. */
- rtbmp_bytes = howmany_64(mp->m_sb.sb_rextents, NBBY);
- if (roundup_64(rtbmp_bytes, mp->m_sb.sb_blocksize) !=
- mp->m_rbmip->i_disk_size)
+ rtbmp_blocks = xfs_rtbitmap_blockcount(mp, mp->m_sb.sb_rextents);
+ if (XFS_FSB_TO_B(mp, rtbmp_blocks) != mp->m_rbmip->i_disk_size)
return -EFSCORRUPTED;
return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free,
@@ -177,14 +188,18 @@ STATIC int
xchk_rtsum_compare(
struct xfs_scrub *sc)
{
+ struct xfs_rtalloc_args args = {
+ .mp = sc->mp,
+ .tp = sc->tp,
+ };
struct xfs_mount *mp = sc->mp;
- struct xfs_buf *bp;
struct xfs_bmbt_irec map;
xfs_fileoff_t off;
xchk_rtsumoff_t sumoff = 0;
int nmap;
for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) {
+ union xfs_suminfo_raw *ondisk_info;
int error = 0;
if (xchk_should_terminate(sc, &error))
@@ -205,22 +220,23 @@ xchk_rtsum_compare(
}
/* Read a block's worth of ondisk rtsummary file. */
- error = xfs_rtbuf_get(mp, sc->tp, off, 1, &bp);
+ error = xfs_rtsummary_read_buf(&args, off);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
return error;
/* Read a block's worth of computed rtsummary file. */
error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize);
if (error) {
- xfs_trans_brelse(sc->tp, bp);
+ xfs_rtbuf_cache_relse(&args);
return error;
}
- if (memcmp(bp->b_addr, sc->buf,
+ ondisk_info = xfs_rsumblock_infoptr(&args, 0);
+ if (memcmp(ondisk_info, sc->buf,
mp->m_blockwsize << XFS_WORDLOG) != 0)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
- xfs_trans_brelse(sc->tp, bp);
+ xfs_rtbuf_cache_relse(&args);
sumoff += mp->m_blockwsize;
}
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 46249e7b17e0..29afa4851235 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -13,6 +13,7 @@
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index cbd4d01e253c..4a8bc6f3c8f2 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -1036,17 +1036,18 @@ TRACE_EVENT(xfarray_sort_stats,
#ifdef CONFIG_XFS_RT
TRACE_EVENT(xchk_rtsum_record_free,
- TP_PROTO(struct xfs_mount *mp, xfs_rtblock_t start,
- uint64_t len, unsigned int log, loff_t pos, xfs_suminfo_t v),
- TP_ARGS(mp, start, len, log, pos, v),
+ TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start,
+ xfs_rtbxlen_t len, unsigned int log, loff_t pos,
+ xfs_suminfo_t value),
+ TP_ARGS(mp, start, len, log, pos, value),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(dev_t, rtdev)
- __field(xfs_rtblock_t, start)
+ __field(xfs_rtxnum_t, start)
__field(unsigned long long, len)
__field(unsigned int, log)
__field(loff_t, pos)
- __field(xfs_suminfo_t, v)
+ __field(xfs_suminfo_t, value)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
@@ -1055,7 +1056,7 @@ TRACE_EVENT(xchk_rtsum_record_free,
__entry->len = len;
__entry->log = log;
__entry->pos = pos;
- __entry->v = v;
+ __entry->value = value;
),
TP_printk("dev %d:%d rtdev %d:%d rtx 0x%llx rtxcount 0x%llx log %u rsumpos 0x%llx sumcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1064,7 +1065,7 @@ TRACE_EVENT(xchk_rtsum_record_free,
__entry->len,
__entry->log,
__entry->pos,
- __entry->v)
+ __entry->value)
);
#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 40e0a1f1f753..731260a5af6d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -28,6 +28,7 @@
#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
+#include "xfs_rtbitmap.h"
/* Kernel only BMAP related definitions and functions */
@@ -75,28 +76,28 @@ xfs_bmap_rtalloc(
{
struct xfs_mount *mp = ap->ip->i_mount;
xfs_fileoff_t orig_offset = ap->offset;
- xfs_rtblock_t rtb;
- xfs_extlen_t prod = 0; /* product factor for allocators */
+ xfs_rtxnum_t rtx;
+ xfs_rtxlen_t prod = 0; /* product factor for allocators */
xfs_extlen_t mod = 0; /* product factor for allocators */
- xfs_extlen_t ralen = 0; /* realtime allocation length */
+ xfs_rtxlen_t ralen = 0; /* realtime allocation length */
xfs_extlen_t align; /* minimum allocation alignment */
xfs_extlen_t orig_length = ap->length;
xfs_extlen_t minlen = mp->m_sb.sb_rextsize;
- xfs_extlen_t raminlen;
+ xfs_rtxlen_t raminlen;
bool rtlocked = false;
bool ignore_locality = false;
int error;
align = xfs_get_extsz_hint(ap->ip);
retry:
- prod = align / mp->m_sb.sb_rextsize;
+ prod = xfs_extlen_to_rtxlen(mp, align);
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 1, ap->eof, 0,
ap->conv, &ap->offset, &ap->length);
if (error)
return error;
ASSERT(ap->length);
- ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
+ ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0);
/*
* If we shifted the file offset downward to satisfy an extent size
@@ -116,17 +117,14 @@ retry:
prod = 1;
/*
* Set ralen to be the actual requested length in rtextents.
- */
- ralen = ap->length / mp->m_sb.sb_rextsize;
- /*
+ *
* If the old value was close enough to XFS_BMBT_MAX_EXTLEN that
* we rounded up to it, cut it back so it's valid again.
* Note that if it's a really large request (bigger than
* XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't
* adjust the starting point to match it.
*/
- if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN)
- ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize;
+ ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN));
/*
* Lock out modifications to both the RT bitmap and summary inodes
@@ -144,12 +142,10 @@ retry:
* pick an extent that will space things out in the rt area.
*/
if (ap->eof && ap->offset == 0) {
- xfs_rtblock_t rtx; /* realtime extent no */
-
error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
if (error)
return error;
- ap->blkno = rtx * mp->m_sb.sb_rextsize;
+ ap->blkno = xfs_rtx_to_rtb(mp, rtx);
} else {
ap->blkno = 0;
}
@@ -160,20 +156,18 @@ retry:
* Realtime allocation, done through xfs_rtallocate_extent.
*/
if (ignore_locality)
- ap->blkno = 0;
+ rtx = 0;
else
- do_div(ap->blkno, mp->m_sb.sb_rextsize);
- rtb = ap->blkno;
- ap->length = ralen;
- raminlen = max_t(xfs_extlen_t, 1, minlen / mp->m_sb.sb_rextsize);
- error = xfs_rtallocate_extent(ap->tp, ap->blkno, raminlen, ap->length,
- &ralen, ap->wasdel, prod, &rtb);
+ rtx = xfs_rtb_to_rtx(mp, ap->blkno);
+ raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen));
+ error = xfs_rtallocate_extent(ap->tp, rtx, raminlen, ralen, &ralen,
+ ap->wasdel, prod, &rtx);
if (error)
return error;
- if (rtb != NULLRTBLOCK) {
- ap->blkno = rtb * mp->m_sb.sb_rextsize;
- ap->length = ralen * mp->m_sb.sb_rextsize;
+ if (rtx != NULLRTEXTNO) {
+ ap->blkno = xfs_rtx_to_rtb(mp, rtx);
+ ap->length = xfs_rtxlen_to_extlen(mp, ralen);
ap->ip->i_nblocks += ap->length;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
if (ap->wasdel)
@@ -690,7 +684,7 @@ xfs_can_free_eofblocks(
*/
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1)
- end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize);
+ end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
if (last_fsb <= end_fsb)
return false;
@@ -780,12 +774,10 @@ xfs_alloc_file_space(
{
xfs_mount_t *mp = ip->i_mount;
xfs_off_t count;
- xfs_filblks_t allocated_fsb;
xfs_filblks_t allocatesize_fsb;
xfs_extlen_t extsz, temp;
xfs_fileoff_t startoffset_fsb;
xfs_fileoff_t endoffset_fsb;
- int nimaps;
int rt;
xfs_trans_t *tp;
xfs_bmbt_irec_t imaps[1], *imapp;
@@ -808,7 +800,6 @@ xfs_alloc_file_space(
count = len;
imapp = &imaps[0];
- nimaps = 1;
startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
allocatesize_fsb = endoffset_fsb - startoffset_fsb;
@@ -819,6 +810,7 @@ xfs_alloc_file_space(
while (allocatesize_fsb && !error) {
xfs_fileoff_t s, e;
unsigned int dblocks, rblocks, resblks;
+ int nimaps = 1;
/*
* Determine space reservations for data/realtime.
@@ -884,15 +876,19 @@ xfs_alloc_file_space(
if (error)
break;
- allocated_fsb = imapp->br_blockcount;
-
- if (nimaps == 0) {
- error = -ENOSPC;
- break;
+ /*
+ * If the allocator cannot find a single free extent large
+ * enough to cover the start block of the requested range,
+ * xfs_bmapi_write will return 0 but leave *nimaps set to 0.
+ *
+ * In that case we simply need to keep looping with the same
+ * startoffset_fsb so that one of the following allocations
+ * will eventually reach the requested range.
+ */
+ if (nimaps) {
+ startoffset_fsb += imapp->br_blockcount;
+ allocatesize_fsb -= imapp->br_blockcount;
}
-
- startoffset_fsb += allocated_fsb;
- allocatesize_fsb -= allocated_fsb;
}
return error;
@@ -989,10 +985,8 @@ xfs_free_file_space(
/* We can only free complete realtime extents. */
if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) {
- startoffset_fsb = roundup_64(startoffset_fsb,
- mp->m_sb.sb_rextsize);
- endoffset_fsb = rounddown_64(endoffset_fsb,
- mp->m_sb.sb_rextsize);
+ startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb);
+ endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb);
}
/*
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 003e157241da..545c7991b9b5 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1913,8 +1913,7 @@ xfs_buftarg_shrink_scan(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_buftarg *btp = container_of(shrink,
- struct xfs_buftarg, bt_shrinker);
+ struct xfs_buftarg *btp = shrink->private_data;
LIST_HEAD(dispose);
unsigned long freed;
@@ -1936,8 +1935,7 @@ xfs_buftarg_shrink_count(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_buftarg *btp = container_of(shrink,
- struct xfs_buftarg, bt_shrinker);
+ struct xfs_buftarg *btp = shrink->private_data;
return list_lru_shrink_count(&btp->bt_lru, sc);
}
@@ -1945,7 +1943,7 @@ void
xfs_free_buftarg(
struct xfs_buftarg *btp)
{
- unregister_shrinker(&btp->bt_shrinker);
+ shrinker_free(btp->bt_shrinker);
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
@@ -2029,13 +2027,17 @@ xfs_alloc_buftarg(
if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
goto error_lru;
- btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
- btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
- btp->bt_shrinker.seeks = DEFAULT_SEEKS;
- btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
- if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s",
- mp->m_super->s_id))
+ btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s",
+ mp->m_super->s_id);
+ if (!btp->bt_shrinker)
goto error_pcpu;
+
+ btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
+ btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
+ btp->bt_shrinker->private_data = btp;
+
+ shrinker_register(btp->bt_shrinker);
+
return btp;
error_pcpu:
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index ada9d310b7d3..c86e16419656 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -109,7 +109,7 @@ typedef struct xfs_buftarg {
size_t bt_logical_sectormask;
/* LRU control structures */
- struct shrinker bt_shrinker;
+ struct shrinker *bt_shrinker;
struct list_lru bt_lru;
struct percpu_counter bt_io_count;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 203700278ddb..e33e5e13b95f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -214,6 +214,43 @@ xfs_ilock_iocb(
return 0;
}
+static int
+xfs_ilock_iocb_for_write(
+ struct kiocb *iocb,
+ unsigned int *lock_mode)
+{
+ ssize_t ret;
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+
+ ret = xfs_ilock_iocb(iocb, *lock_mode);
+ if (ret)
+ return ret;
+
+ if (*lock_mode == XFS_IOLOCK_EXCL)
+ return 0;
+ if (!xfs_iflags_test(ip, XFS_IREMAPPING))
+ return 0;
+
+ xfs_iunlock(ip, *lock_mode);
+ *lock_mode = XFS_IOLOCK_EXCL;
+ return xfs_ilock_iocb(iocb, *lock_mode);
+}
+
+static unsigned int
+xfs_ilock_for_write_fault(
+ struct xfs_inode *ip)
+{
+ /* get a shared lock if no remapping in progress */
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ if (!xfs_iflags_test(ip, XFS_IREMAPPING))
+ return XFS_MMAPLOCK_SHARED;
+
+ /* wait for remapping to complete */
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ return XFS_MMAPLOCK_EXCL;
+}
+
STATIC ssize_t
xfs_file_dio_read(
struct kiocb *iocb,
@@ -551,7 +588,7 @@ xfs_file_dio_write_aligned(
unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
- ret = xfs_ilock_iocb(iocb, iolock);
+ ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
@@ -618,7 +655,7 @@ retry_exclusive:
flags = IOMAP_DIO_FORCE_WAIT;
}
- ret = xfs_ilock_iocb(iocb, iolock);
+ ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
@@ -1180,7 +1217,7 @@ xfs_file_remap_range(
if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
xfs_log_force_inode(dest);
out_unlock:
- xfs_iunlock2_io_mmap(src, dest);
+ xfs_iunlock2_remapping(src, dest);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return remapped > 0 ? remapped : ret;
@@ -1328,6 +1365,7 @@ __xfs_filemap_fault(
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
vm_fault_t ret;
+ unsigned int lock_mode = 0;
trace_xfs_filemap_fault(ip, order, write_fault);
@@ -1336,25 +1374,24 @@ __xfs_filemap_fault(
file_update_time(vmf->vma->vm_file);
}
+ if (IS_DAX(inode) || write_fault)
+ lock_mode = xfs_ilock_for_write_fault(XFS_I(inode));
+
if (IS_DAX(inode)) {
pfn_t pfn;
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, order, pfn);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ } else if (write_fault) {
+ ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
} else {
- if (write_fault) {
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = iomap_page_mkwrite(vmf,
- &xfs_page_mkwrite_iomap_ops);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- } else {
- ret = filemap_fault(vmf);
- }
+ ret = filemap_fault(vmf);
}
+ if (lock_mode)
+ xfs_iunlock(XFS_I(inode), lock_mode);
+
if (write_fault)
sb_end_pagefault(inode->i_sb);
return ret;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 736e5545f584..5a72217f5feb 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -23,7 +23,7 @@
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_alloc_btree.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_ag.h"
/* Convert an xfs_fsmap to an fsmap. */
@@ -483,11 +483,11 @@ xfs_getfsmap_rtdev_rtbitmap_helper(
xfs_rtblock_t rtbno;
xfs_daddr_t rec_daddr, len_daddr;
- rtbno = rec->ar_startext * mp->m_sb.sb_rextsize;
+ rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
rec_daddr = XFS_FSB_TO_BB(mp, rtbno);
irec.rm_startblock = rtbno;
- rtbno = rec->ar_extcount * mp->m_sb.sb_rextsize;
+ rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount);
len_daddr = XFS_FSB_TO_BB(mp, rtbno);
irec.rm_blockcount = rtbno;
@@ -514,7 +514,7 @@ xfs_getfsmap_rtdev_rtbitmap(
uint64_t eofs;
int error;
- eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rextents * mp->m_sb.sb_rextsize);
+ eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents));
if (keys[0].fmr_physical >= eofs)
return 0;
start_rtb = XFS_BB_TO_FSBT(mp,
@@ -539,11 +539,8 @@ xfs_getfsmap_rtdev_rtbitmap(
* Set up query parameters to return free rtextents covering the range
* we want.
*/
- alow.ar_startext = start_rtb;
- ahigh.ar_startext = end_rtb;
- do_div(alow.ar_startext, mp->m_sb.sb_rextsize);
- if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize))
- ahigh.ar_startext++;
+ alow.ar_startext = xfs_rtb_to_rtx(mp, start_rtb);
+ ahigh.ar_startext = xfs_rtb_to_rtxup(mp, end_rtb);
error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh,
xfs_getfsmap_rtdev_rtbitmap_helper, info);
if (error)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 3c210ac83713..dba514a2c84d 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -2165,8 +2165,7 @@ xfs_inodegc_shrinker_count(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_mount *mp = container_of(shrink, struct xfs_mount,
- m_inodegc_shrinker);
+ struct xfs_mount *mp = shrink->private_data;
struct xfs_inodegc *gc;
int cpu;
@@ -2187,8 +2186,7 @@ xfs_inodegc_shrinker_scan(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_mount *mp = container_of(shrink, struct xfs_mount,
- m_inodegc_shrinker);
+ struct xfs_mount *mp = shrink->private_data;
struct xfs_inodegc *gc;
int cpu;
bool no_items = true;
@@ -2224,13 +2222,19 @@ int
xfs_inodegc_register_shrinker(
struct xfs_mount *mp)
{
- struct shrinker *shrink = &mp->m_inodegc_shrinker;
+ mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB,
+ "xfs-inodegc:%s",
+ mp->m_super->s_id);
+ if (!mp->m_inodegc_shrinker)
+ return -ENOMEM;
+
+ mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count;
+ mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan;
+ mp->m_inodegc_shrinker->seeks = 0;
+ mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH;
+ mp->m_inodegc_shrinker->private_data = mp;
- shrink->count_objects = xfs_inodegc_shrinker_count;
- shrink->scan_objects = xfs_inodegc_shrinker_scan;
- shrink->seeks = 0;
- shrink->flags = SHRINKER_NONSLAB;
- shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
+ shrinker_register(mp->m_inodegc_shrinker);
- return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id);
+ return 0;
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 36f5cf802c07..c0f1c89786c2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -918,6 +918,13 @@ xfs_droplink(
xfs_trans_t *tp,
xfs_inode_t *ip)
{
+ if (VFS_I(ip)->i_nlink == 0) {
+ xfs_alert(ip->i_mount,
+ "%s: Attempt to drop inode (%llu) with nlink zero.",
+ __func__, ip->i_ino);
+ return -EFSCORRUPTED;
+ }
+
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
drop_nlink(VFS_I(ip));
@@ -3621,6 +3628,23 @@ xfs_iunlock2_io_mmap(
inode_unlock(VFS_I(ip1));
}
+/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
+void
+xfs_iunlock2_remapping(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ xfs_iflags_clear(ip1, XFS_IREMAPPING);
+
+ if (ip1 != ip2)
+ xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+
+ if (ip1 != ip2)
+ inode_unlock_shared(VFS_I(ip1));
+ inode_unlock(VFS_I(ip2));
+}
+
/*
* Reload the incore inode list for this inode. Caller should ensure that
* the link count cannot change, either by taking ILOCK_SHARED or otherwise
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0c5bdb91152e..3dc47937da5d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -347,6 +347,14 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
/* Quotacheck is running but inode has not been added to quota counts. */
#define XFS_IQUOTAUNCHECKED (1 << 14)
+/*
+ * Remap in progress. Callers that wish to update file data while
+ * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake
+ * the lock in exclusive mode. Relocking the file will block until
+ * IREMAPPING is cleared.
+ */
+#define XFS_IREMAPPING (1U << 15)
+
/* All inode state flags related to inode reclaim. */
#define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \
XFS_IRECLAIM | \
@@ -595,6 +603,7 @@ void xfs_end_io(struct work_struct *work);
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2);
static inline bool
xfs_inode_unlinked_incomplete(
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 17c51804f9c6..cd7803fda8b1 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -19,6 +19,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_error.h"
+#include "xfs_rtbitmap.h"
#include <linux/iversion.h>
@@ -107,7 +108,7 @@ xfs_inode_item_precommit(
*/
if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
- (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
+ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) {
ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
XFS_DIFLAG_EXTSZINHERIT);
ip->i_extsize = 0;
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 0e5dba2343ea..144198a6b270 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -286,6 +286,7 @@ xlog_recover_inode_commit_pass2(
struct xfs_log_dinode *ldip;
uint isize;
int need_free = 0;
+ xfs_failaddr_t fa;
if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
in_f = item->ri_buf[0].i_addr;
@@ -369,24 +370,26 @@ xlog_recover_inode_commit_pass2(
* superblock flag to determine whether we need to look at di_flushiter
* to skip replay when the on disk inode is newer than the log one
*/
- if (!xfs_has_v3inodes(mp) &&
- ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
- /*
- * Deal with the wrap case, DI_MAX_FLUSH is less
- * than smaller numbers
- */
- if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
- ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
- /* do nothing */
- } else {
- trace_xfs_log_recover_inode_skip(log, in_f);
- error = 0;
- goto out_release;
+ if (!xfs_has_v3inodes(mp)) {
+ if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ /*
+ * Deal with the wrap case, DI_MAX_FLUSH is less
+ * than smaller numbers
+ */
+ if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
+ ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
+ /* do nothing */
+ } else {
+ trace_xfs_log_recover_inode_skip(log, in_f);
+ error = 0;
+ goto out_release;
+ }
}
+
+ /* Take the opportunity to reset the flush iteration count */
+ ldip->di_flushiter = 0;
}
- /* Take the opportunity to reset the flush iteration count */
- ldip->di_flushiter = 0;
if (unlikely(S_ISREG(ldip->di_mode))) {
if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -528,8 +531,19 @@ out_owner_change:
(dip->di_mode != 0))
error = xfs_recover_inode_owner_change(mp, dip, in_f,
buffer_list);
- /* re-generate the checksum. */
+ /* re-generate the checksum and validate the recovered inode. */
xfs_dinode_calc_crc(log->l_mp, dip);
+ fa = xfs_dinode_verify(log->l_mp, in_f->ilf_ino, dip);
+ if (fa) {
+ XFS_CORRUPTION_ERROR(
+ "Bad dinode after recovery",
+ XFS_ERRLEVEL_LOW, mp, dip, sizeof(*dip));
+ xfs_alert(mp,
+ "Metadata corruption detected at %pS, inode 0x%llx",
+ fa, in_f->ilf_ino);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
ASSERT(bp->b_mount == mp);
bp->b_flags |= _XBF_LOGRECOVERY;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 55bb01173cde..a82470e027f7 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -38,6 +38,7 @@
#include "xfs_reflink.h"
#include "xfs_ioctl.h"
#include "xfs_xattr.h"
+#include "xfs_rtbitmap.h"
#include <linux/mount.h>
#include <linux/namei.h>
@@ -1004,7 +1005,7 @@ xfs_fill_fsxattr(
* later.
*/
if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
- ip->i_extsize % mp->m_sb.sb_rextsize > 0) {
+ xfs_extlen_to_rtxmod(mp, ip->i_extsize) > 0) {
fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE |
FS_XFLAG_EXTSZINHERIT);
fa->fsx_extsize = 0;
@@ -1130,7 +1131,7 @@ xfs_ioctl_setattr_xflags(
/* If realtime flag is set then must have realtime device */
if (fa->fsx_xflags & FS_XFLAG_REALTIME) {
if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
- (ip->i_extsize % mp->m_sb.sb_rextsize))
+ xfs_extlen_to_rtxmod(mp, ip->i_extsize))
return -EINVAL;
}
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index e9d317a3dafe..d7873e0360f0 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -198,6 +198,18 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
return x;
}
+/* If @b is a power of 2, return log2(b). Else return -1. */
+static inline int8_t log2_if_power2(unsigned long b)
+{
+ return is_power_of_2(b) ? ilog2(b) : -1;
+}
+
+/* If @b is a power of 2, return a mask of the lower bits, else return zero. */
+static inline unsigned long long mask64_if_power2(unsigned long b)
+{
+ return is_power_of_2(b) ? b - 1 : 0;
+}
+
int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
char *data, enum req_op op);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 51c100c86177..ee206facf0dc 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1893,9 +1893,7 @@ xlog_write_iclog(
* the buffer manually, the code needs to be kept in sync
* with the I/O completion path.
*/
- xlog_state_done_syncing(iclog);
- up(&iclog->ic_sema);
- return;
+ goto sync;
}
/*
@@ -1925,20 +1923,17 @@ xlog_write_iclog(
* avoid shutdown re-entering this path and erroring out again.
*/
if (log->l_targ != log->l_mp->m_ddev_targp &&
- blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) {
- xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
- return;
- }
+ blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev))
+ goto shutdown;
}
if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
iclog->ic_bio.bi_opf |= REQ_FUA;
iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
- if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
- xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
- return;
- }
+ if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count))
+ goto shutdown;
+
if (is_vmalloc_addr(iclog->ic_data))
flush_kernel_vmap_range(iclog->ic_data, count);
@@ -1959,6 +1954,12 @@ xlog_write_iclog(
}
submit_bio(&iclog->ic_bio);
+ return;
+shutdown:
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+sync:
+ xlog_state_done_syncing(iclog);
+ up(&iclog->ic_sema);
}
/*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 13b94d2e605b..a1e18b24971a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2511,7 +2511,7 @@ xlog_abort_defer_ops(
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
list_del_init(&dfc->dfc_list);
- xfs_defer_ops_capture_free(mp, dfc);
+ xfs_defer_ops_capture_abort(mp, dfc);
}
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 0a0fd19573d8..aed5be5508fe 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1021,7 +1021,7 @@ xfs_mountfs(
out_log_dealloc:
xfs_log_mount_cancel(mp);
out_inodegc_shrinker:
- unregister_shrinker(&mp->m_inodegc_shrinker);
+ shrinker_free(mp->m_inodegc_shrinker);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_buftarg_drain(mp->m_logdev_targp);
@@ -1104,7 +1104,7 @@ xfs_unmountfs(
#if defined(DEBUG)
xfs_errortag_clearall(mp);
#endif
- unregister_shrinker(&mp->m_inodegc_shrinker);
+ shrinker_free(mp->m_inodegc_shrinker);
xfs_free_perag(mp);
xfs_errortag_del(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d19cca099bc3..503fe3c7edbf 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -101,9 +101,9 @@ typedef struct xfs_mount {
/*
* Optional cache of rt summary level per bitmap block with the
- * invariant that m_rsum_cache[bbno] <= the minimum i for which
- * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
- * inode lock.
+ * invariant that m_rsum_cache[bbno] > the maximum i for which
+ * rsum[i][bbno] != 0, or 0 if rsum[i][bbno] == 0 for all i.
+ * Reads and writes are serialized by the rsumip inode lock.
*/
uint8_t *m_rsum_cache;
struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
@@ -119,6 +119,7 @@ typedef struct xfs_mount {
uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
uint8_t m_agno_log; /* log #ag's */
uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
+ int8_t m_rtxblklog; /* log2 of rextsize, if possible */
uint m_blockmask; /* sb_blocksize-1 */
uint m_blockwsize; /* sb_blocksize in words */
uint m_blockwmask; /* blockwsize-1 */
@@ -152,6 +153,7 @@ typedef struct xfs_mount {
uint64_t m_features; /* active filesystem features */
uint64_t m_low_space[XFS_LOWSP_MAX];
uint64_t m_low_rtexts[XFS_LOWSP_MAX];
+ uint64_t m_rtxblkmask; /* rt extent block mask */
struct xfs_ino_geometry m_ino_geo; /* inode geometry */
struct xfs_trans_resv m_resv; /* precomputed res values */
/* low free space thresholds */
@@ -219,7 +221,7 @@ typedef struct xfs_mount {
atomic_t m_agirotor; /* last ag dir inode alloced */
/* Memory shrinker to throttle and reprioritize inodegc */
- struct shrinker m_inodegc_shrinker;
+ struct shrinker *m_inodegc_shrinker;
/*
* Workqueue item so that we can coalesce multiple inode flush attempts
* into a single flush.
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index c4cc99b70dd3..21a7e350b4c5 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -72,6 +72,10 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4);
+ /* realtime structures */
+ XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4);
+ XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4);
+
/*
* m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
* 4 bytes anyway so it's not obviously a problem. Hence for the moment
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 086e78a6143a..94a7932ac570 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -504,8 +504,7 @@ xfs_qm_shrink_scan(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_quotainfo *qi = container_of(shrink,
- struct xfs_quotainfo, qi_shrinker);
+ struct xfs_quotainfo *qi = shrink->private_data;
struct xfs_qm_isolate isol;
unsigned long freed;
int error;
@@ -539,8 +538,7 @@ xfs_qm_shrink_count(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_quotainfo *qi = container_of(shrink,
- struct xfs_quotainfo, qi_shrinker);
+ struct xfs_quotainfo *qi = shrink->private_data;
return list_lru_shrink_count(&qi->qi_lru, sc);
}
@@ -680,15 +678,18 @@ xfs_qm_init_quotainfo(
if (XFS_IS_PQUOTA_ON(mp))
xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf);
- qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
- qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
- qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
- qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
-
- error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s",
- mp->m_super->s_id);
- if (error)
+ qinf->qi_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-qm:%s",
+ mp->m_super->s_id);
+ if (!qinf->qi_shrinker) {
+ error = -ENOMEM;
goto out_free_inos;
+ }
+
+ qinf->qi_shrinker->count_objects = xfs_qm_shrink_count;
+ qinf->qi_shrinker->scan_objects = xfs_qm_shrink_scan;
+ qinf->qi_shrinker->private_data = qinf;
+
+ shrinker_register(qinf->qi_shrinker);
return 0;
@@ -718,7 +719,7 @@ xfs_qm_destroy_quotainfo(
qi = mp->m_quotainfo;
ASSERT(qi != NULL);
- unregister_shrinker(&qi->qi_shrinker);
+ shrinker_free(qi->qi_shrinker);
list_lru_destroy(&qi->qi_lru);
xfs_qm_destroy_quotainos(qi);
mutex_destroy(&qi->qi_tree_lock);
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 9683f0457d19..d5c9fc4ba591 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -63,7 +63,7 @@ struct xfs_quotainfo {
struct xfs_def_quota qi_usr_default;
struct xfs_def_quota qi_grp_default;
struct xfs_def_quota qi_prj_default;
- struct shrinker qi_shrinker;
+ struct shrinker *qi_shrinker;
/* Minimum and maximum quota expiration timestamp values. */
time64_t qi_expiry_min;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index eb9102453aff..e5b62dc28466 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -784,6 +784,7 @@ xfs_reflink_end_cow_extent(
}
}
del = got;
+ xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb);
/* Grab the corresponding mapping in the data fork. */
nmaps = 1;
@@ -1540,6 +1541,10 @@ xfs_reflink_remap_prep(
if (ret)
goto out_unlock;
+ xfs_iflags_set(src, XFS_IREMAPPING);
+ if (inode_in != inode_out)
+ xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
+
return 0;
out_unlock:
xfs_iunlock2_io_mmap(src, dest);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 2e1a4e5cd03d..88c48de5c9c8 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -19,6 +19,7 @@
#include "xfs_icache.h"
#include "xfs_rtalloc.h"
#include "xfs_sb.h"
+#include "xfs_rtbitmap.h"
/*
* Read and return the summary information for a given extent size,
@@ -28,48 +29,48 @@
*/
static int
xfs_rtget_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_suminfo_t *sum) /* out: summary info for this block */
+ struct xfs_rtalloc_args *args,
+ int log, /* log2 of extent size */
+ xfs_fileoff_t bbno, /* bitmap block number */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
{
- return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum);
+ return xfs_rtmodify_summary_int(args, log, bbno, 0, sum);
}
/*
* Return whether there are any free extents in the size range given
* by low and high, for the bitmap block bbno.
*/
-STATIC int /* error */
+STATIC int
xfs_rtany_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int low, /* low log2 extent size */
- int high, /* high log2 extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- int *stat) /* out: any good extents here? */
+ struct xfs_rtalloc_args *args,
+ int low, /* low log2 extent size */
+ int high, /* high log2 extent size */
+ xfs_fileoff_t bbno, /* bitmap block number */
+ int *maxlog) /* out: max log2 extent size free */
{
- int error; /* error value */
- int log; /* loop counter, log2 of ext. size */
- xfs_suminfo_t sum; /* summary data */
-
- /* There are no extents at levels < m_rsum_cache[bbno]. */
- if (mp->m_rsum_cache && low < mp->m_rsum_cache[bbno])
- low = mp->m_rsum_cache[bbno];
+ struct xfs_mount *mp = args->mp;
+ int error;
+ int log; /* loop counter, log2 of ext. size */
+ xfs_suminfo_t sum; /* summary data */
+
+ /* There are no extents at levels >= m_rsum_cache[bbno]. */
+ if (mp->m_rsum_cache) {
+ high = min(high, mp->m_rsum_cache[bbno] - 1);
+ if (low > high) {
+ *maxlog = -1;
+ return 0;
+ }
+ }
/*
* Loop over logs of extent sizes.
*/
- for (log = low; log <= high; log++) {
+ for (log = high; log >= low; log--) {
/*
* Get one summary datum.
*/
- error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
+ error = xfs_rtget_summary(args, log, bbno, &sum);
if (error) {
return error;
}
@@ -77,18 +78,18 @@ xfs_rtany_summary(
* If there are any, return success.
*/
if (sum) {
- *stat = 1;
+ *maxlog = log;
goto out;
}
}
/*
* Found nothing, return failure.
*/
- *stat = 0;
+ *maxlog = -1;
out:
- /* There were no extents at levels < log. */
- if (mp->m_rsum_cache && log > mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno] = log;
+ /* There were no extents at levels > log. */
+ if (mp->m_rsum_cache && log + 1 < mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log + 1;
return 0;
}
@@ -97,60 +98,54 @@ out:
* Copy and transform the summary file, given the old and new
* parameters in the mount structures.
*/
-STATIC int /* error */
+STATIC int
xfs_rtcopy_summary(
- xfs_mount_t *omp, /* old file system mount point */
- xfs_mount_t *nmp, /* new file system mount point */
- xfs_trans_t *tp) /* transaction pointer */
+ struct xfs_rtalloc_args *oargs,
+ struct xfs_rtalloc_args *nargs)
{
- xfs_rtblock_t bbno; /* bitmap block number */
- struct xfs_buf *bp; /* summary buffer */
- int error; /* error return value */
- int log; /* summary level number (log length) */
- xfs_suminfo_t sum; /* summary data */
- xfs_fsblock_t sumbno; /* summary block number */
+ xfs_fileoff_t bbno; /* bitmap block number */
+ int error;
+ int log; /* summary level number (log length) */
+ xfs_suminfo_t sum; /* summary data */
- bp = NULL;
- for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
- for (bbno = omp->m_sb.sb_rbmblocks - 1;
+ for (log = oargs->mp->m_rsumlevels - 1; log >= 0; log--) {
+ for (bbno = oargs->mp->m_sb.sb_rbmblocks - 1;
(xfs_srtblock_t)bbno >= 0;
bbno--) {
- error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
- &sumbno, &sum);
+ error = xfs_rtget_summary(oargs, log, bbno, &sum);
if (error)
- return error;
+ goto out;
if (sum == 0)
continue;
- error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
- &bp, &sumbno);
+ error = xfs_rtmodify_summary(oargs, log, bbno, -sum);
if (error)
- return error;
- error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
- &bp, &sumbno);
+ goto out;
+ error = xfs_rtmodify_summary(nargs, log, bbno, sum);
if (error)
- return error;
+ goto out;
ASSERT(sum > 0);
}
}
+ error = 0;
+out:
+ xfs_rtbuf_cache_relse(oargs);
return 0;
}
/*
* Mark an extent specified by start and len allocated.
* Updates all the summary information as well as the bitmap.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* start block to allocate */
- xfs_extlen_t len, /* length to allocate */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* start rtext to allocate */
+ xfs_rtxlen_t len) /* in/out: summary block number */
{
- xfs_rtblock_t end; /* end of the allocated extent */
- int error; /* error value */
- xfs_rtblock_t postblock = 0; /* first block allocated > end */
- xfs_rtblock_t preblock = 0; /* first block allocated < start */
+ struct xfs_mount *mp = args->mp;
+ xfs_rtxnum_t end; /* end of the allocated rtext */
+ int error;
+ xfs_rtxnum_t postblock = 0; /* first rtext allocated > end */
+ xfs_rtxnum_t preblock = 0; /* first rtext allocated < start */
end = start + len - 1;
/*
@@ -158,15 +153,15 @@ xfs_rtallocate_range(
* We need to find the beginning and end of the extent so we can
* properly update the summary.
*/
- error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ error = xfs_rtfind_back(args, start, 0, &preblock);
if (error) {
return error;
}
/*
* Find the next allocated block (end of free extent).
*/
- error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
- &postblock);
+ error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
if (error) {
return error;
}
@@ -174,9 +169,9 @@ xfs_rtallocate_range(
* Decrement the summary information corresponding to the entire
* (old) free extent.
*/
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
- XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), -1);
if (error) {
return error;
}
@@ -185,9 +180,9 @@ xfs_rtallocate_range(
* old extent, add summary data for them to be free.
*/
if (preblock < start) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(start - preblock),
- XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(start - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), 1);
if (error) {
return error;
}
@@ -197,9 +192,9 @@ xfs_rtallocate_range(
* old extent, add summary data for them to be free.
*/
if (postblock > end) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock - end),
- XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(postblock - end),
+ xfs_rtx_to_rbmblock(mp, end + 1), 1);
if (error) {
return error;
}
@@ -207,54 +202,69 @@ xfs_rtallocate_range(
/*
* Modify the bitmap to mark this extent allocated.
*/
- error = xfs_rtmodify_range(mp, tp, start, len, 0);
+ error = xfs_rtmodify_range(args, start, len, 0);
return error;
}
/*
+ * Make sure we don't run off the end of the rt volume. Be careful that
+ * adjusting maxlen downwards doesn't cause us to fail the alignment checks.
+ */
+static inline xfs_rtxlen_t
+xfs_rtallocate_clamp_len(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t startrtx,
+ xfs_rtxlen_t rtxlen,
+ xfs_rtxlen_t prod)
+{
+ xfs_rtxlen_t ret;
+
+ ret = min(mp->m_sb.sb_rextents, startrtx + rtxlen) - startrtx;
+ return rounddown(ret, prod);
+}
+
+/*
* Attempt to allocate an extent minlen<=len<=maxlen starting from
* bitmap block bbno. If we don't get maxlen then use prod to trim
- * the length, if given. Returns error; returns starting block in *rtblock.
+ * the length, if given. Returns error; returns starting block in *rtx.
* The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_block(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bbno, /* bitmap block number */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- xfs_rtblock_t *nextp, /* out: next block to try */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t bbno, /* bitmap block number */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxnum_t *nextp, /* out: next rtext to try */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- xfs_rtblock_t besti; /* best rtblock found so far */
- xfs_rtblock_t bestlen; /* best length found so far */
- xfs_rtblock_t end; /* last rtblock in chunk */
- int error; /* error value */
- xfs_rtblock_t i; /* current rtblock trying */
- xfs_rtblock_t next; /* next rtblock to try */
- int stat; /* status from internal calls */
+ struct xfs_mount *mp = args->mp;
+ xfs_rtxnum_t besti; /* best rtext found so far */
+ xfs_rtxnum_t bestlen;/* best length found so far */
+ xfs_rtxnum_t end; /* last rtext in chunk */
+ int error;
+ xfs_rtxnum_t i; /* current rtext trying */
+ xfs_rtxnum_t next; /* next rtext to try */
+ int stat; /* status from internal calls */
/*
* Loop over all the extents starting in this bitmap block,
* looking for one that's long enough.
*/
- for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0,
- end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
+ for (i = xfs_rbmblock_to_rtx(mp, bbno), besti = -1, bestlen = 0,
+ end = xfs_rbmblock_to_rtx(mp, bbno + 1) - 1;
i <= end;
i++) {
/* Make sure we don't scan off the end of the rt volume. */
- maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i;
+ maxlen = xfs_rtallocate_clamp_len(mp, i, maxlen, prod);
/*
* See if there's a free extent of maxlen starting at i.
* If it's not so then next will contain the first non-free.
*/
- error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat);
+ error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat);
if (error) {
return error;
}
@@ -262,13 +272,12 @@ xfs_rtallocate_extent_block(
/*
* i for maxlen is all free, allocate and return that.
*/
- error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp,
- rsb);
+ error = xfs_rtallocate_range(args, i, maxlen);
if (error) {
return error;
}
*len = maxlen;
- *rtblock = i;
+ *rtx = i;
return 0;
}
/*
@@ -278,7 +287,7 @@ xfs_rtallocate_extent_block(
* so far, remember it.
*/
if (minlen < maxlen) {
- xfs_rtblock_t thislen; /* this extent size */
+ xfs_rtxnum_t thislen; /* this extent size */
thislen = next - i;
if (thislen >= minlen && thislen > bestlen) {
@@ -290,7 +299,7 @@ xfs_rtallocate_extent_block(
* If not done yet, find the start of the next free space.
*/
if (next < end) {
- error = xfs_rtfind_forw(mp, tp, next, end, &i);
+ error = xfs_rtfind_forw(args, next, end, &i);
if (error) {
return error;
}
@@ -301,7 +310,7 @@ xfs_rtallocate_extent_block(
* Searched the whole thing & didn't find a maxlen free extent.
*/
if (minlen < maxlen && besti != -1) {
- xfs_extlen_t p; /* amount to trim length by */
+ xfs_rtxlen_t p; /* amount to trim length by */
/*
* If size should be a multiple of prod, make that so.
@@ -315,51 +324,49 @@ xfs_rtallocate_extent_block(
/*
* Allocate besti for bestlen & return that.
*/
- error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb);
+ error = xfs_rtallocate_range(args, besti, bestlen);
if (error) {
return error;
}
*len = bestlen;
- *rtblock = besti;
+ *rtx = besti;
return 0;
}
/*
* Allocation failed. Set *nextp to the next block to try.
*/
*nextp = next;
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
/*
* Allocate an extent of length minlen<=len<=maxlen, starting at block
* bno. If we don't get maxlen then use prod to trim the length, if given.
- * Returns error; returns starting block in *rtblock.
+ * Returns error; returns starting block in *rtx.
* The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_exact(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number to allocate */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- int error; /* error value */
- xfs_extlen_t i; /* extent length trimmed due to prod */
- int isfree; /* extent is free */
- xfs_rtblock_t next; /* next block to try (dummy) */
+ int error;
+ xfs_rtxlen_t i; /* extent length trimmed due to prod */
+ int isfree; /* extent is free */
+ xfs_rtxnum_t next; /* next rtext to try (dummy) */
- ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ ASSERT(minlen % prod == 0);
+ ASSERT(maxlen % prod == 0);
/*
* Check if the range in question (for maxlen) is free.
*/
- error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree);
+ error = xfs_rtcheck_range(args, start, maxlen, 1, &next, &isfree);
if (error) {
return error;
}
@@ -367,23 +374,23 @@ xfs_rtallocate_extent_exact(
/*
* If it is, allocate it and return success.
*/
- error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+ error = xfs_rtallocate_range(args, start, maxlen);
if (error) {
return error;
}
*len = maxlen;
- *rtblock = bno;
+ *rtx = start;
return 0;
}
/*
* If not, allocate what there is, if it's at least minlen.
*/
- maxlen = next - bno;
+ maxlen = next - start;
if (maxlen < minlen) {
/*
* Failed, return failure status.
*/
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
/*
@@ -395,81 +402,82 @@ xfs_rtallocate_extent_exact(
/*
* Now we can't do it, return failure status.
*/
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
}
/*
* Allocate what we can and return it.
*/
- error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+ error = xfs_rtallocate_range(args, start, maxlen);
if (error) {
return error;
}
*len = maxlen;
- *rtblock = bno;
+ *rtx = start;
return 0;
}
/*
* Allocate an extent of length minlen<=len<=maxlen, starting as near
- * to bno as possible. If we don't get maxlen then use prod to trim
+ * to start as possible. If we don't get maxlen then use prod to trim
* the length, if given. The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_near(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number to allocate */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- int any; /* any useful extents from summary */
- xfs_rtblock_t bbno; /* bitmap block number */
- int error; /* error value */
- int i; /* bitmap block offset (loop control) */
- int j; /* secondary loop control */
- int log2len; /* log2 of minlen */
- xfs_rtblock_t n; /* next block to try */
- xfs_rtblock_t r; /* result block */
-
- ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ struct xfs_mount *mp = args->mp;
+ int maxlog; /* max useful extent from summary */
+ xfs_fileoff_t bbno; /* bitmap block number */
+ int error;
+ int i; /* bitmap block offset (loop control) */
+ int j; /* secondary loop control */
+ int log2len; /* log2 of minlen */
+ xfs_rtxnum_t n; /* next rtext to try */
+ xfs_rtxnum_t r; /* result rtext */
+
+ ASSERT(minlen % prod == 0);
+ ASSERT(maxlen % prod == 0);
+
/*
* If the block number given is off the end, silently set it to
* the last block.
*/
- if (bno >= mp->m_sb.sb_rextents)
- bno = mp->m_sb.sb_rextents - 1;
+ if (start >= mp->m_sb.sb_rextents)
+ start = mp->m_sb.sb_rextents - 1;
/* Make sure we don't run off the end of the rt volume. */
- maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno;
+ maxlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod);
if (maxlen < minlen) {
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
/*
* Try the exact allocation first.
*/
- error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len,
- rbpp, rsb, prod, &r);
+ error = xfs_rtallocate_extent_exact(args, start, minlen, maxlen, len,
+ prod, &r);
if (error) {
return error;
}
/*
* If the exact allocation worked, return that.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
+ if (r != NULLRTEXTNO) {
+ *rtx = r;
return 0;
}
- bbno = XFS_BITTOBLOCK(mp, bno);
+ bbno = xfs_rtx_to_rbmblock(mp, start);
i = 0;
+ j = -1;
ASSERT(minlen != 0);
log2len = xfs_highbit32(minlen);
/*
@@ -480,8 +488,8 @@ xfs_rtallocate_extent_near(
* Get summary information of extents of all useful levels
* starting in this bitmap block.
*/
- error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1,
- bbno + i, rbpp, rsb, &any);
+ error = xfs_rtany_summary(args, log2len, mp->m_rsumlevels - 1,
+ bbno + i, &maxlog);
if (error) {
return error;
}
@@ -489,7 +497,10 @@ xfs_rtallocate_extent_near(
* If there are any useful extents starting here, try
* allocating one.
*/
- if (any) {
+ if (maxlog >= 0) {
+ xfs_extlen_t maxavail =
+ min_t(xfs_rtblock_t, maxlen,
+ (1ULL << (maxlog + 1)) - 1);
/*
* On the positive side of the starting location.
*/
@@ -498,17 +509,17 @@ xfs_rtallocate_extent_near(
* Try to allocate an extent starting in
* this block.
*/
- error = xfs_rtallocate_extent_block(mp, tp,
- bbno + i, minlen, maxlen, len, &n, rbpp,
- rsb, prod, &r);
+ error = xfs_rtallocate_extent_block(args,
+ bbno + i, minlen, maxavail, len,
+ &n, prod, &r);
if (error) {
return error;
}
/*
* If it worked, return it.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
+ if (r != NULLRTEXTNO) {
+ *rtx = r;
return 0;
}
}
@@ -516,68 +527,46 @@ xfs_rtallocate_extent_near(
* On the negative side of the starting location.
*/
else { /* i < 0 */
+ int maxblocks;
+
/*
- * Loop backwards through the bitmap blocks from
- * the starting point-1 up to where we are now.
- * There should be an extent which ends in this
- * bitmap block and is long enough.
+ * Loop backwards to find the end of the extent
+ * we found in the realtime summary.
+ *
+ * maxblocks is the maximum possible number of
+ * bitmap blocks from the start of the extent
+ * to the end of the extent.
*/
- for (j = -1; j > i; j--) {
- /*
- * Grab the summary information for
- * this bitmap block.
- */
- error = xfs_rtany_summary(mp, tp,
- log2len, mp->m_rsumlevels - 1,
- bbno + j, rbpp, rsb, &any);
- if (error) {
- return error;
- }
- /*
- * If there's no extent given in the
- * summary that means the extent we
- * found must carry over from an
- * earlier block. If there is an
- * extent given, we've already tried
- * that allocation, don't do it again.
- */
- if (any)
- continue;
- error = xfs_rtallocate_extent_block(mp,
- tp, bbno + j, minlen, maxlen,
- len, &n, rbpp, rsb, prod, &r);
+ if (maxlog == 0)
+ maxblocks = 0;
+ else if (maxlog < mp->m_blkbit_log)
+ maxblocks = 1;
+ else
+ maxblocks = 2 << (maxlog - mp->m_blkbit_log);
+
+ /*
+ * We need to check bbno + i + maxblocks down to
+ * bbno + i. We already checked bbno down to
+ * bbno + j + 1, so we don't need to check those
+ * again.
+ */
+ j = min(i + maxblocks, j);
+ for (; j >= i; j--) {
+ error = xfs_rtallocate_extent_block(args,
+ bbno + j, minlen,
+ maxavail, len, &n, prod,
+ &r);
if (error) {
return error;
}
/*
* If it works, return the extent.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
+ if (r != NULLRTEXTNO) {
+ *rtx = r;
return 0;
}
}
- /*
- * There weren't intervening bitmap blocks
- * with a long enough extent, or the
- * allocation didn't work for some reason
- * (i.e. it's a little * too short).
- * Try to allocate from the summary block
- * that we found.
- */
- error = xfs_rtallocate_extent_block(mp, tp,
- bbno + i, minlen, maxlen, len, &n, rbpp,
- rsb, prod, &r);
- if (error) {
- return error;
- }
- /*
- * If it works, return the extent.
- */
- if (r != NULLRTBLOCK) {
- *rtblock = r;
- return 0;
- }
}
}
/*
@@ -610,7 +599,7 @@ xfs_rtallocate_extent_near(
else
break;
}
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
@@ -619,26 +608,25 @@ xfs_rtallocate_extent_near(
* specified. If we don't get maxlen then use prod to trim
* the length, if given. The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_size(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- int error; /* error value */
- int i; /* bitmap block number */
- int l; /* level number (loop control) */
- xfs_rtblock_t n; /* next block to be tried */
- xfs_rtblock_t r; /* result block number */
- xfs_suminfo_t sum; /* summary information for extents */
-
- ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ struct xfs_mount *mp = args->mp;
+ int error;
+ xfs_fileoff_t i; /* bitmap block number */
+ int l; /* level number (loop control) */
+ xfs_rtxnum_t n; /* next rtext to be tried */
+ xfs_rtxnum_t r; /* result rtext number */
+ xfs_suminfo_t sum; /* summary information for extents */
+
+ ASSERT(minlen % prod == 0);
+ ASSERT(maxlen % prod == 0);
ASSERT(maxlen != 0);
/*
@@ -656,8 +644,7 @@ xfs_rtallocate_extent_size(
/*
* Get the summary for this level/block.
*/
- error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
- &sum);
+ error = xfs_rtget_summary(args, l, i, &sum);
if (error) {
return error;
}
@@ -669,16 +656,16 @@ xfs_rtallocate_extent_size(
/*
* Try allocating the extent.
*/
- error = xfs_rtallocate_extent_block(mp, tp, i, maxlen,
- maxlen, len, &n, rbpp, rsb, prod, &r);
+ error = xfs_rtallocate_extent_block(args, i, maxlen,
+ maxlen, len, &n, prod, &r);
if (error) {
return error;
}
/*
* If it worked, return that.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
+ if (r != NULLRTEXTNO) {
+ *rtx = r;
return 0;
}
/*
@@ -686,8 +673,8 @@ xfs_rtallocate_extent_size(
* allocator is beyond the next bitmap block,
* skip to that bitmap block.
*/
- if (XFS_BITTOBLOCK(mp, n) > i + 1)
- i = XFS_BITTOBLOCK(mp, n) - 1;
+ if (xfs_rtx_to_rbmblock(mp, n) > i + 1)
+ i = xfs_rtx_to_rbmblock(mp, n) - 1;
}
}
/*
@@ -695,7 +682,7 @@ xfs_rtallocate_extent_size(
* we're asking for a fixed size extent.
*/
if (minlen > --maxlen) {
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
ASSERT(minlen != 0);
@@ -715,8 +702,7 @@ xfs_rtallocate_extent_size(
/*
* Get the summary information for this level/block.
*/
- error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
- &sum);
+ error = xfs_rtget_summary(args, l, i, &sum);
if (error) {
return error;
}
@@ -730,18 +716,18 @@ xfs_rtallocate_extent_size(
* minlen/maxlen are in the possible range for
* this summary level.
*/
- error = xfs_rtallocate_extent_block(mp, tp, i,
+ error = xfs_rtallocate_extent_block(args, i,
XFS_RTMAX(minlen, 1 << l),
XFS_RTMIN(maxlen, (1 << (l + 1)) - 1),
- len, &n, rbpp, rsb, prod, &r);
+ len, &n, prod, &r);
if (error) {
return error;
}
/*
* If it worked, return that extent.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
+ if (r != NULLRTEXTNO) {
+ *rtx = r;
return 0;
}
/*
@@ -749,14 +735,14 @@ xfs_rtallocate_extent_size(
* allocator is beyond the next bitmap block,
* skip to that bitmap block.
*/
- if (XFS_BITTOBLOCK(mp, n) > i + 1)
- i = XFS_BITTOBLOCK(mp, n) - 1;
+ if (xfs_rtx_to_rbmblock(mp, n) > i + 1)
+ i = xfs_rtx_to_rbmblock(mp, n) - 1;
}
}
/*
* Got nothing, return failure.
*/
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
@@ -886,12 +872,14 @@ xfs_alloc_rsum_cache(
xfs_extlen_t rbmblocks) /* number of rt bitmap blocks */
{
/*
- * The rsum cache is initialized to all zeroes, which is trivially a
- * lower bound on the minimum level with any free extents. We can
- * continue without the cache if it couldn't be allocated.
+ * The rsum cache is initialized to the maximum value, which is
+ * trivially an upper bound on the maximum level with any free extents.
+ * We can continue without the cache if it couldn't be allocated.
*/
- mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL);
- if (!mp->m_rsum_cache)
+ mp->m_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL);
+ if (mp->m_rsum_cache)
+ memset(mp->m_rsum_cache, -1, rbmblocks);
+ else
xfs_warn(mp, "could not allocate realtime summary cache");
}
@@ -907,13 +895,13 @@ xfs_growfs_rt(
xfs_mount_t *mp, /* mount point for filesystem */
xfs_growfs_rt_t *in) /* growfs rt input struct */
{
- xfs_rtblock_t bmbno; /* bitmap block number */
+ xfs_fileoff_t bmbno; /* bitmap block number */
struct xfs_buf *bp; /* temporary buffer */
int error; /* error return value */
xfs_mount_t *nmp; /* new (fake) mount structure */
xfs_rfsblock_t nrblocks; /* new number of realtime blocks */
xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
- xfs_rtblock_t nrextents; /* new number of realtime extents */
+ xfs_rtxnum_t nrextents; /* new number of realtime extents */
uint8_t nrextslog; /* new log2 of sb_rextents */
xfs_extlen_t nrsumblocks; /* new number of summary blocks */
uint nrsumlevels; /* new rt summary levels */
@@ -922,7 +910,6 @@ xfs_growfs_rt(
xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */
xfs_extlen_t rsumblocks; /* current number of rt summary blks */
xfs_sb_t *sbp; /* old superblock */
- xfs_fsblock_t sumbno; /* summary block number */
uint8_t *rsum_cache; /* old summary cache */
sbp = &mp->m_sb;
@@ -954,7 +941,7 @@ xfs_growfs_rt(
return -EINVAL;
/* Unsupported realtime features. */
- if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp))
+ if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp))
return -EOPNOTSUPP;
nrblocks = in->newblocks;
@@ -976,11 +963,10 @@ xfs_growfs_rt(
*/
nrextents = nrblocks;
do_div(nrextents, in->extsize);
- nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize);
+ nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents);
nrextslog = xfs_highbit32(nrextents);
nrsumlevels = nrextslog + 1;
- nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks;
- nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+ nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks);
nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
/*
* New summary size can't be more than half the size of
@@ -1023,6 +1009,12 @@ xfs_growfs_rt(
((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
bmbno < nrbmblocks;
bmbno++) {
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ };
+ struct xfs_rtalloc_args nargs = {
+ .mp = nmp,
+ };
struct xfs_trans *tp;
xfs_rfsblock_t nrblocks_step;
@@ -1032,19 +1024,17 @@ xfs_growfs_rt(
* Calculate new sb and mount fields for this round.
*/
nsbp->sb_rextsize = in->extsize;
+ nmp->m_rtxblklog = -1; /* don't use shift or masking */
nsbp->sb_rbmblocks = bmbno + 1;
nrblocks_step = (bmbno + 1) * NBBY * nsbp->sb_blocksize *
nsbp->sb_rextsize;
nsbp->sb_rblocks = min(nrblocks, nrblocks_step);
- nsbp->sb_rextents = nsbp->sb_rblocks;
- do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
+ nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks);
ASSERT(nsbp->sb_rextents != 0);
nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
- nrsumsize =
- (uint)sizeof(xfs_suminfo_t) * nrsumlevels *
- nsbp->sb_rbmblocks;
- nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+ nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels,
+ nsbp->sb_rbmblocks);
nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
/*
* Start a transaction, get the log reservation.
@@ -1053,6 +1043,9 @@ xfs_growfs_rt(
&tp);
if (error)
break;
+ args.tp = tp;
+ nargs.tp = tp;
+
/*
* Lock out other callers by grabbing the bitmap inode lock.
*/
@@ -1086,7 +1079,7 @@ xfs_growfs_rt(
*/
if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks ||
mp->m_rsumlevels != nmp->m_rsumlevels) {
- error = xfs_rtcopy_summary(mp, nmp, tp);
+ error = xfs_rtcopy_summary(&args, &nargs);
if (error)
goto error_cancel;
}
@@ -1111,9 +1104,9 @@ xfs_growfs_rt(
/*
* Free new extent.
*/
- bp = NULL;
- error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
- nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
+ error = xfs_rtfree_range(&nargs, sbp->sb_rextents,
+ nsbp->sb_rextents - sbp->sb_rextents);
+ xfs_rtbuf_cache_relse(&nargs);
if (error) {
error_cancel:
xfs_trans_cancel(tp);
@@ -1171,59 +1164,60 @@ out_free:
* parameters. The length units are all in realtime extents, as is the
* result block number.
*/
-int /* error */
+int
xfs_rtallocate_extent(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- int wasdel, /* was a delayed allocation extent */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_trans *tp,
+ xfs_rtxnum_t start, /* starting rtext number to allocate */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ int wasdel, /* was a delayed allocation extent */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtblock) /* out: start rtext allocated */
{
- xfs_mount_t *mp = tp->t_mountp;
- int error; /* error value */
- xfs_rtblock_t r; /* result allocated block */
- xfs_fsblock_t sb; /* summary file block number */
- struct xfs_buf *sumbp; /* summary file block buffer */
-
- ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+ struct xfs_rtalloc_args args = {
+ .mp = tp->t_mountp,
+ .tp = tp,
+ };
+ int error; /* error value */
+ xfs_rtxnum_t r; /* result allocated rtext */
+
+ ASSERT(xfs_isilocked(args.mp->m_rbmip, XFS_ILOCK_EXCL));
ASSERT(minlen > 0 && minlen <= maxlen);
/*
* If prod is set then figure out what to do to minlen and maxlen.
*/
if (prod > 1) {
- xfs_extlen_t i;
+ xfs_rtxlen_t i;
if ((i = maxlen % prod))
maxlen -= i;
if ((i = minlen % prod))
minlen += prod - i;
if (maxlen < minlen) {
- *rtblock = NULLRTBLOCK;
+ *rtblock = NULLRTEXTNO;
return 0;
}
}
retry:
- sumbp = NULL;
- if (bno == 0) {
- error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len,
- &sumbp, &sb, prod, &r);
+ if (start == 0) {
+ error = xfs_rtallocate_extent_size(&args, minlen,
+ maxlen, len, prod, &r);
} else {
- error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen,
- len, &sumbp, &sb, prod, &r);
+ error = xfs_rtallocate_extent_near(&args, start, minlen,
+ maxlen, len, prod, &r);
}
+ xfs_rtbuf_cache_relse(&args);
if (error)
return error;
/*
* If it worked, update the superblock.
*/
- if (r != NULLRTBLOCK) {
+ if (r != NULLRTEXTNO) {
long slen = (long)*len;
ASSERT(*len >= minlen && *len <= maxlen);
@@ -1250,6 +1244,7 @@ xfs_rtmount_init(
struct xfs_buf *bp; /* buffer for last block of subvolume */
struct xfs_sb *sbp; /* filesystem superblock copy in mount */
xfs_daddr_t d; /* address of last block of subvolume */
+ unsigned int rsumblocks;
int error;
sbp = &mp->m_sb;
@@ -1261,10 +1256,9 @@ xfs_rtmount_init(
return -ENODEV;
}
mp->m_rsumlevels = sbp->sb_rextslog + 1;
- mp->m_rsumsize =
- (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels *
- sbp->sb_rbmblocks;
- mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize);
+ rsumblocks = xfs_rtsummary_blockcount(mp, mp->m_rsumlevels,
+ mp->m_sb.sb_rbmblocks);
+ mp->m_rsumsize = XFS_FSB_TO_B(mp, rsumblocks);
mp->m_rbmip = mp->m_rsumip = NULL;
/*
* Check that the realtime section is an ok size.
@@ -1418,27 +1412,27 @@ xfs_rtunmount_inodes(
* of rtextents and the fraction.
* The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
*/
-int /* error */
+int /* error */
xfs_rtpick_extent(
xfs_mount_t *mp, /* file system mount point */
xfs_trans_t *tp, /* transaction pointer */
- xfs_extlen_t len, /* allocation length (rtextents) */
- xfs_rtblock_t *pick) /* result rt extent */
- {
- xfs_rtblock_t b; /* result block */
+ xfs_rtxlen_t len, /* allocation length (rtextents) */
+ xfs_rtxnum_t *pick) /* result rt extent */
+{
+ xfs_rtxnum_t b; /* result rtext */
int log2; /* log of sequence number */
uint64_t resid; /* residual after log removed */
uint64_t seq; /* sequence number of file creation */
- struct timespec64 ts; /* temporary timespec64 storage */
+ struct timespec64 ts; /* timespec in inode */
ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+ ts = inode_get_atime(VFS_I(mp->m_rbmip));
if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) {
mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
seq = 0;
} else {
- ts = inode_get_atime(VFS_I(mp->m_rbmip));
- seq = (uint64_t)ts.tv_sec;
+ seq = ts.tv_sec;
}
if ((log2 = xfs_highbit64(seq)) == -1)
b = 0;
@@ -1451,7 +1445,7 @@ xfs_rtpick_extent(
if (b + len > mp->m_sb.sb_rextents)
b = mp->m_sb.sb_rextents - len;
}
- ts.tv_sec = (time64_t)seq + 1;
+ ts.tv_sec = seq + 1;
inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
*pick = b;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 62c7ad79cbb6..f7cb9ffe51ca 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -11,22 +11,6 @@
struct xfs_mount;
struct xfs_trans;
-/*
- * XXX: Most of the realtime allocation functions deal in units of realtime
- * extents, not realtime blocks. This looks funny when paired with the type
- * name and screams for a larger cleanup.
- */
-struct xfs_rtalloc_rec {
- xfs_rtblock_t ar_startext;
- xfs_rtblock_t ar_extcount;
-};
-
-typedef int (*xfs_rtalloc_query_range_fn)(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- const struct xfs_rtalloc_rec *rec,
- void *priv);
-
#ifdef CONFIG_XFS_RT
/*
* Function prototypes for exported functions.
@@ -40,23 +24,14 @@ typedef int (*xfs_rtalloc_query_range_fn)(
int /* error */
xfs_rtallocate_extent(
struct xfs_trans *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
+ xfs_rtxnum_t start, /* starting rtext number to allocate */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
int wasdel, /* was a delayed allocation extent */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock); /* out: start block allocated */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtblock); /* out: start rtext allocated */
-/*
- * Free an extent in the realtime subvolume. Length is expressed in
- * realtime extents, as is the block number.
- */
-int /* error */
-xfs_rtfree_extent(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to free */
- xfs_extlen_t len); /* length of extent freed */
/*
* Initialize realtime fields in the mount structure.
@@ -87,8 +62,8 @@ int /* error */
xfs_rtpick_extent(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
- xfs_extlen_t len, /* allocation length (rtextents) */
- xfs_rtblock_t *pick); /* result rt extent */
+ xfs_rtxlen_t len, /* allocation length (rtextents) */
+ xfs_rtxnum_t *pick); /* result rt extent */
/*
* Grow the realtime area of the filesystem.
@@ -98,55 +73,12 @@ xfs_growfs_rt(
struct xfs_mount *mp, /* file system mount structure */
xfs_growfs_rt_t *in); /* user supplied growfs struct */
-/*
- * From xfs_rtbitmap.c
- */
-int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
-int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len, int val,
- xfs_rtblock_t *new, int *stat);
-int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_rtblock_t limit,
- xfs_rtblock_t *rtblock);
-int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_rtblock_t limit,
- xfs_rtblock_t *rtblock);
-int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len, int val);
-int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp,
- int log, xfs_rtblock_t bbno, int delta,
- struct xfs_buf **rbpp, xfs_fsblock_t *rsb,
- xfs_suminfo_t *sum);
-int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
- xfs_rtblock_t bbno, int delta, struct xfs_buf **rbpp,
- xfs_fsblock_t *rsb);
-int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len,
- struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
-int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
- const struct xfs_rtalloc_rec *low_rec,
- const struct xfs_rtalloc_rec *high_rec,
- xfs_rtalloc_query_range_fn fn, void *priv);
-int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtalloc_query_range_fn fn,
- void *priv);
-bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
-int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len,
- bool *is_free);
int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
#else
-# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
-# define xfs_rtfree_extent(t,b,l) (ENOSYS)
-# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
-# define xfs_growfs_rt(mp,in) (ENOSYS)
-# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS)
-# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS)
-# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS)
-# define xfs_verify_rtbno(m, r) (false)
-# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS)
-# define xfs_rtalloc_reinit_frextents(m) (0)
+# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (-ENOSYS)
+# define xfs_rtpick_extent(m,t,l,rb) (-ENOSYS)
+# define xfs_growfs_rt(mp,in) (-ENOSYS)
+# define xfs_rtalloc_reinit_frextents(m) (0)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
@@ -157,7 +89,7 @@ xfs_rtmount_init(
xfs_warn(mp, "Not built with CONFIG_XFS_RT");
return -ENOSYS;
}
-# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS))
# define xfs_rtunmount_inodes(m)
#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f0ae07828153..764304595e8b 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -42,6 +42,7 @@
#include "xfs_xattr.h"
#include "xfs_iunlink_item.h"
#include "xfs_dahash_test.h"
+#include "xfs_rtbitmap.h"
#include "scrub/stats.h"
#include <linux/magic.h>
@@ -896,7 +897,7 @@ xfs_fs_statfs(
statp->f_blocks = sbp->sb_rblocks;
freertx = percpu_counter_sum_positive(&mp->m_frextents);
- statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize;
+ statp->f_bavail = statp->f_bfree = xfs_rtx_to_rtb(mp, freertx);
}
return 0;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 8c0bfc9a33b1..305c9d07bf1b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -24,6 +24,7 @@
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_icache.h"
+#include "xfs_rtbitmap.h"
struct kmem_cache *xfs_trans_cache;
@@ -655,6 +656,10 @@ xfs_trans_unreserve_and_mod_sb(
mp->m_sb.sb_agcount += tp->t_agcount_delta;
mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;
mp->m_sb.sb_rextsize += tp->t_rextsize_delta;
+ if (tp->t_rextsize_delta) {
+ mp->m_rtxblklog = log2_if_power2(mp->m_sb.sb_rextsize);
+ mp->m_rtxblkmask = mask64_if_power2(mp->m_sb.sb_rextsize);
+ }
mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta;
mp->m_sb.sb_rblocks += tp->t_rblocks_delta;
mp->m_sb.sb_rextents += tp->t_rextents_delta;
@@ -1196,7 +1201,7 @@ xfs_trans_alloc_inode(
retry:
error = xfs_trans_alloc(mp, resv, dblocks,
- rblocks / mp->m_sb.sb_rextsize,
+ xfs_extlen_to_rtxlen(mp, rblocks),
force ? XFS_TRANS_RESERVE : 0, &tp);
if (error)
return error;